// SPDX-License-Identifier: GPL-2.0+
/*
 * inode.c -- user mode filesystem api for usb gadget controllers
 *
 * Copyright (C) 2003-2004 David Brownell
 * Copyright (C) 2003 Agilent Technologies
 */

/* #define VERBOSE_DEBUG */

#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/pagemap.h>
#include <linux/uts.h>
#include <linux/wait.h>
#include <linux/compiler.h>
#include <linux/uaccess.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/kthread.h>
#include <linux/aio.h>
#include <linux/uio.h>
#include <linux/refcount.h>
#include <linux/delay.h>
#include <linux/device.h>
#include <linux/moduleparam.h>

#include <linux/usb/gadgetfs.h>
#include <linux/usb/gadget.h>
#include <linux/usb/composite.h> /* for USB_GADGET_DELAYED_STATUS */

/* Undef helpers from linux/usb/composite.h as gadgetfs redefines them */
#undef DBG
#undef ERROR
#undef INFO

/*
 * The gadgetfs API maps each endpoint to a
file descriptor so that you * can use standard synchronous read/write calls for I/O. There's some * O_NONBLOCK and O_ASYNC/FASYNC style i/o support. Example usermode * drivers show how this works in practice. You can also use AIO to * eliminate I/O gaps between requests, to help when streaming data. * * Key parts that must be USB-specific are protocols defining how the * read/write operations relate to the hardware state machines. There * are two types of files. One type is for the device, implementing ep0. * The other type is for each IN or OUT endpoint. In both cases, the * user mode driver must configure the hardware before using it. * * - First, dev_config() is called when /dev/gadget/$CHIP is configured * (by writing configuration and device descriptors). Afterwards it * may serve as a source of device events, used to handle all control * requests other than basic enumeration. * * - Then, after a SET_CONFIGURATION control request, ep_config() is * called when each /dev/gadget/ep* file is configured (by writing * endpoint descriptors). Afterwards these files are used to write() * IN data or to read() OUT data. To halt the endpoint, a "wrong * direction" request is issued (like reading an IN endpoint). * * Unlike "usbfs" the only ioctl()s are for things that are rare, and maybe * not possible on all hardware. For example, precise fault handling with * respect to data left in endpoint fifos after aborted operations; or * selective clearing of endpoint halts, to implement SET_INTERFACE. */ #define DRIVER_DESC "USB Gadget filesystem" #define DRIVER_VERSION "24 Aug 2004" static const char driver_desc [] = DRIVER_DESC; static const char shortname [] = "gadgetfs"; MODULE_DESCRIPTION (DRIVER_DESC); MODULE_AUTHOR ("David Brownell"); MODULE_LICENSE ("GPL"); static int ep_open(struct inode *, struct file *); /*----------------------------------------------------------------------*/ #define GADGETFS_MAGIC 0xaee71ee7 /* /dev/gadget/$CHIP represents ep0 and the whole device */ enum ep0_state { /* DISABLED is the initial state. */ STATE_DEV_DISABLED = 0, /* Only one open() of /dev/gadget/$CHIP; only one file tracks * ep0/device i/o modes and binding to the controller. Driver * must always write descriptors to initialize the device, then * the device becomes UNCONNECTED until enumeration. */ STATE_DEV_OPENED, /* From then on, ep0 fd is in either of two basic modes: * - (UN)CONNECTED: read usb_gadgetfs_event(s) from it * - SETUP: read/write will transfer control data and succeed; * or if "wrong direction", performs protocol stall */ STATE_DEV_UNCONNECTED, STATE_DEV_CONNECTED, STATE_DEV_SETUP, /* UNBOUND means the driver closed ep0, so the device won't be * accessible again (DEV_DISABLED) until all fds are closed. */ STATE_DEV_UNBOUND, }; /* enough for the whole queue: most events invalidate others */ #define N_EVENT 5 #define RBUF_SIZE 256 struct dev_data { spinlock_t lock; refcount_t count; int udc_usage; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; struct fasync_struct *fasync; u8 current_config; /* drivers reading ep0 MUST handle control requests (SETUP) * reported that way; else the host will time out. 
*/ unsigned usermode_setup : 1, setup_in : 1, setup_can_stall : 1, setup_out_ready : 1, setup_out_error : 1, setup_abort : 1, gadget_registered : 1; unsigned setup_wLength; /* the rest is basically write-once */ struct usb_config_descriptor *config, *hs_config; struct usb_device_descriptor *dev; struct usb_request *req; struct usb_gadget *gadget; struct list_head epfiles; void *buf; wait_queue_head_t wait; struct super_block *sb; struct dentry *dentry; /* except this scratch i/o buffer for ep0 */ u8 rbuf[RBUF_SIZE]; }; static inline void get_dev (struct dev_data *data) { refcount_inc (&data->count); } static void put_dev (struct dev_data *data) { if (likely (!refcount_dec_and_test (&data->count))) return; /* needs no more cleanup */ BUG_ON (waitqueue_active (&data->wait)); kfree (data); } static struct dev_data *dev_new (void) { struct dev_data *dev; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) return NULL; dev->state = STATE_DEV_DISABLED; refcount_set (&dev->count, 1); spin_lock_init (&dev->lock); INIT_LIST_HEAD (&dev->epfiles); init_waitqueue_head (&dev->wait); return dev; } /*----------------------------------------------------------------------*/ /* other /dev/gadget/$ENDPOINT files represent endpoints */ enum ep_state { STATE_EP_DISABLED = 0, STATE_EP_READY, STATE_EP_ENABLED, STATE_EP_UNBOUND, }; struct ep_data { struct mutex lock; enum ep_state state; refcount_t count; struct dev_data *dev; /* must hold dev->lock before accessing ep or req */ struct usb_ep *ep; struct usb_request *req; ssize_t status; char name [16]; struct usb_endpoint_descriptor desc, hs_desc; struct list_head epfiles; wait_queue_head_t wait; struct dentry *dentry; }; static inline void get_ep (struct ep_data *data) { refcount_inc (&data->count); } static void put_ep (struct ep_data *data) { if (likely (!refcount_dec_and_test (&data->count))) return; put_dev (data->dev); /* needs no more cleanup */ BUG_ON (!list_empty (&data->epfiles)); BUG_ON (waitqueue_active (&data->wait)); kfree (data); } /*----------------------------------------------------------------------*/ /* most "how to use the hardware" policy choices are in userspace: * mapping endpoint roles (which the driver needs) to the capabilities * which the usb controller has. most of those capabilities are exposed * implicitly, starting with the driver name and then endpoint names. */ static const char *CHIP; static DEFINE_MUTEX(sb_mutex); /* Serialize superblock operations */ /*----------------------------------------------------------------------*/ /* NOTE: don't use dev_printk calls before binding to the gadget * at the end of ep0 configuration, or after unbind. */ /* too wordy: dev_printk(level , &(d)->gadget->dev , fmt , ## args) */ #define xprintk(d,level,fmt,args...) \ printk(level "%s: " fmt , shortname , ## args) #ifdef DEBUG #define DBG(dev,fmt,args...) \ xprintk(dev , KERN_DEBUG , fmt , ## args) #else #define DBG(dev,fmt,args...) \ do { } while (0) #endif /* DEBUG */ #ifdef VERBOSE_DEBUG #define VDEBUG DBG #else #define VDEBUG(dev,fmt,args...) \ do { } while (0) #endif /* DEBUG */ #define ERROR(dev,fmt,args...) \ xprintk(dev , KERN_ERR , fmt , ## args) #define INFO(dev,fmt,args...) \ xprintk(dev , KERN_INFO , fmt , ## args) /*----------------------------------------------------------------------*/ /* SYNCHRONOUS ENDPOINT OPERATIONS (bulk/intr/iso) * * After opening, configure non-control endpoints. Then use normal * stream read() and write() requests; and maybe ioctl() to get more * precise FIFO status when recovering from cancellation. 
*/ static void epio_complete (struct usb_ep *ep, struct usb_request *req) { struct ep_data *epdata = ep->driver_data; if (!req->context) return; if (req->status) epdata->status = req->status; else epdata->status = req->actual; complete ((struct completion *)req->context); } /* tasklock endpoint, returning when it's connected. * still need dev->lock to use epdata->ep. */ static int get_ready_ep (unsigned f_flags, struct ep_data *epdata, bool is_write) { int val; if (f_flags & O_NONBLOCK) { if (!mutex_trylock(&epdata->lock)) goto nonblock; if (epdata->state != STATE_EP_ENABLED && (!is_write || epdata->state != STATE_EP_READY)) { mutex_unlock(&epdata->lock); nonblock: val = -EAGAIN; } else val = 0; return val; } val = mutex_lock_interruptible(&epdata->lock); if (val < 0) return val; switch (epdata->state) { case STATE_EP_ENABLED: return 0; case STATE_EP_READY: /* not configured yet */ if (is_write) return 0; fallthrough; case STATE_EP_UNBOUND: /* clean disconnect */ break; // case STATE_EP_DISABLED: /* "can't happen" */ default: /* error! */ pr_debug ("%s: ep %p not available, state %d\n", shortname, epdata, epdata->state); } mutex_unlock(&epdata->lock); return -ENODEV; } static ssize_t ep_io (struct ep_data *epdata, void *buf, unsigned len) { DECLARE_COMPLETION_ONSTACK (done); int value; spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { struct usb_request *req = epdata->req; req->context = &done; req->complete = epio_complete; req->buf = buf; req->length = len; value = usb_ep_queue (epdata->ep, req, GFP_ATOMIC); } else value = -ENODEV; spin_unlock_irq (&epdata->dev->lock); if (likely (value == 0)) { value = wait_for_completion_interruptible(&done); if (value != 0) { spin_lock_irq (&epdata->dev->lock); if (likely (epdata->ep != NULL)) { DBG (epdata->dev, "%s i/o interrupted\n", epdata->name); usb_ep_dequeue (epdata->ep, epdata->req); spin_unlock_irq (&epdata->dev->lock); wait_for_completion(&done); if (epdata->status == -ECONNRESET) epdata->status = -EINTR; } else { spin_unlock_irq (&epdata->dev->lock); DBG (epdata->dev, "endpoint gone\n"); wait_for_completion(&done); epdata->status = -ENODEV; } } return epdata->status; } return value; } static int ep_release (struct inode *inode, struct file *fd) { struct ep_data *data = fd->private_data; int value; value = mutex_lock_interruptible(&data->lock); if (value < 0) return value; /* clean up if this can be reopened */ if (data->state != STATE_EP_UNBOUND) { data->state = STATE_EP_DISABLED; data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; usb_ep_disable(data->ep); } mutex_unlock(&data->lock); put_ep (data); return 0; } static long ep_ioctl(struct file *fd, unsigned code, unsigned long value) { struct ep_data *data = fd->private_data; int status; if ((status = get_ready_ep (fd->f_flags, data, false)) < 0) return status; spin_lock_irq (&data->dev->lock); if (likely (data->ep != NULL)) { switch (code) { case GADGETFS_FIFO_STATUS: status = usb_ep_fifo_status (data->ep); break; case GADGETFS_FIFO_FLUSH: usb_ep_fifo_flush (data->ep); break; case GADGETFS_CLEAR_HALT: status = usb_ep_clear_halt (data->ep); break; default: status = -ENOTTY; } } else status = -ENODEV; spin_unlock_irq (&data->dev->lock); mutex_unlock(&data->lock); return status; } /*----------------------------------------------------------------------*/ /* ASYNCHRONOUS ENDPOINT I/O OPERATIONS (bulk/intr/iso) */ struct kiocb_priv { struct usb_request *req; struct ep_data *epdata; struct kiocb *iocb; struct mm_struct *mm; struct work_struct work; void 
*buf; struct iov_iter to; const void *to_free; unsigned actual; }; static int ep_aio_cancel(struct kiocb *iocb) { struct kiocb_priv *priv = iocb->private; struct ep_data *epdata; int value; local_irq_disable(); epdata = priv->epdata; // spin_lock(&epdata->dev->lock); if (likely(epdata && epdata->ep && priv->req)) value = usb_ep_dequeue (epdata->ep, priv->req); else value = -EINVAL; // spin_unlock(&epdata->dev->lock); local_irq_enable(); return value; } static void ep_user_copy_worker(struct work_struct *work) { struct kiocb_priv *priv = container_of(work, struct kiocb_priv, work); struct mm_struct *mm = priv->mm; struct kiocb *iocb = priv->iocb; size_t ret; kthread_use_mm(mm); ret = copy_to_iter(priv->buf, priv->actual, &priv->to); kthread_unuse_mm(mm); if (!ret) ret = -EFAULT; /* completing the iocb can drop the ctx and mm, don't touch mm after */ iocb->ki_complete(iocb, ret); kfree(priv->buf); kfree(priv->to_free); kfree(priv); } static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) { struct kiocb *iocb = req->context; struct kiocb_priv *priv = iocb->private; struct ep_data *epdata = priv->epdata; /* lock against disconnect (and ideally, cancel) */ spin_lock(&epdata->dev->lock); priv->req = NULL; priv->epdata = NULL; /* if this was a write or a read returning no data then we * don't need to copy anything to userspace, so we can * complete the aio request immediately. */ if (priv->to_free == NULL || unlikely(req->actual == 0)) { kfree(req->buf); kfree(priv->to_free); kfree(priv); iocb->private = NULL; iocb->ki_complete(iocb, req->actual ? req->actual : (long)req->status); } else { /* ep_copy_to_user() won't report both; we hide some faults */ if (unlikely(0 != req->status)) DBG(epdata->dev, "%s fault %d len %d\n", ep->name, req->status, req->actual); priv->buf = req->buf; priv->actual = req->actual; INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } usb_ep_free_request(ep, req); spin_unlock(&epdata->dev->lock); put_ep(epdata); } static ssize_t ep_aio(struct kiocb *iocb, struct kiocb_priv *priv, struct ep_data *epdata, char *buf, size_t len) { struct usb_request *req; ssize_t value; iocb->private = priv; priv->iocb = iocb; kiocb_set_cancel_fn(iocb, ep_aio_cancel); get_ep(epdata); priv->epdata = epdata; priv->actual = 0; priv->mm = current->mm; /* mm teardown waits for iocbs in exit_aio() */ /* each kiocb is coupled to one usb_request, but we can't * allocate or submit those if the host disconnected. 
*/ spin_lock_irq(&epdata->dev->lock); value = -ENODEV; if (unlikely(epdata->ep == NULL)) goto fail; req = usb_ep_alloc_request(epdata->ep, GFP_ATOMIC); value = -ENOMEM; if (unlikely(!req)) goto fail; priv->req = req; req->buf = buf; req->length = len; req->complete = ep_aio_complete; req->context = iocb; value = usb_ep_queue(epdata->ep, req, GFP_ATOMIC); if (unlikely(0 != value)) { usb_ep_free_request(epdata->ep, req); goto fail; } spin_unlock_irq(&epdata->dev->lock); return -EIOCBQUEUED; fail: spin_unlock_irq(&epdata->dev->lock); kfree(priv->to_free); kfree(priv); put_ep(epdata); return value; } static ssize_t ep_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct ep_data *epdata = file->private_data; size_t len = iov_iter_count(to); ssize_t value; char *buf; if ((value = get_ready_ep(file->f_flags, epdata, false)) < 0) return value; /* halt any endpoint by doing a "wrong direction" i/o call */ if (usb_endpoint_dir_in(&epdata->desc)) { if (usb_endpoint_xfer_isoc(&epdata->desc) || !is_sync_kiocb(iocb)) { mutex_unlock(&epdata->lock); return -EINVAL; } DBG (epdata->dev, "%s halt\n", epdata->name); spin_lock_irq(&epdata->dev->lock); if (likely(epdata->ep != NULL)) usb_ep_set_halt(epdata->ep); spin_unlock_irq(&epdata->dev->lock); mutex_unlock(&epdata->lock); return -EBADMSG; } buf = kmalloc(len, GFP_KERNEL); if (unlikely(!buf)) { mutex_unlock(&epdata->lock); return -ENOMEM; } if (is_sync_kiocb(iocb)) { value = ep_io(epdata, buf, len); if (value >= 0 && (copy_to_iter(buf, value, to) != value)) value = -EFAULT; } else { struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL); value = -ENOMEM; if (!priv) goto fail; priv->to_free = dup_iter(&priv->to, to, GFP_KERNEL); if (!iter_is_ubuf(&priv->to) && !priv->to_free) { kfree(priv); goto fail; } value = ep_aio(iocb, priv, epdata, buf, len); if (value == -EIOCBQUEUED) buf = NULL; } fail: kfree(buf); mutex_unlock(&epdata->lock); return value; } static ssize_t ep_config(struct ep_data *, const char *, size_t); static ssize_t ep_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct ep_data *epdata = file->private_data; size_t len = iov_iter_count(from); bool configured; ssize_t value; char *buf; if ((value = get_ready_ep(file->f_flags, epdata, true)) < 0) return value; configured = epdata->state == STATE_EP_ENABLED; /* halt any endpoint by doing a "wrong direction" i/o call */ if (configured && !usb_endpoint_dir_in(&epdata->desc)) { if (usb_endpoint_xfer_isoc(&epdata->desc) || !is_sync_kiocb(iocb)) { mutex_unlock(&epdata->lock); return -EINVAL; } DBG (epdata->dev, "%s halt\n", epdata->name); spin_lock_irq(&epdata->dev->lock); if (likely(epdata->ep != NULL)) usb_ep_set_halt(epdata->ep); spin_unlock_irq(&epdata->dev->lock); mutex_unlock(&epdata->lock); return -EBADMSG; } buf = kmalloc(len, GFP_KERNEL); if (unlikely(!buf)) { mutex_unlock(&epdata->lock); return -ENOMEM; } if (unlikely(!copy_from_iter_full(buf, len, from))) { value = -EFAULT; goto out; } if (unlikely(!configured)) { value = ep_config(epdata, buf, len); } else if (is_sync_kiocb(iocb)) { value = ep_io(epdata, buf, len); } else { struct kiocb_priv *priv = kzalloc(sizeof *priv, GFP_KERNEL); value = -ENOMEM; if (priv) { value = ep_aio(iocb, priv, epdata, buf, len); if (value == -EIOCBQUEUED) buf = NULL; } } out: kfree(buf); mutex_unlock(&epdata->lock); return value; } /*----------------------------------------------------------------------*/ /* used after endpoint configuration */ static const struct 
file_operations ep_io_operations = { .owner = THIS_MODULE, .open = ep_open, .release = ep_release, .llseek = no_llseek, .unlocked_ioctl = ep_ioctl, .read_iter = ep_read_iter, .write_iter = ep_write_iter, }; /* ENDPOINT INITIALIZATION * * fd = open ("/dev/gadget/$ENDPOINT", O_RDWR) * status = write (fd, descriptors, sizeof descriptors) * * That write establishes the endpoint configuration, configuring * the controller to process bulk, interrupt, or isochronous transfers * at the right maxpacket size, and so on. * * The descriptors are message type 1, identified by a host order u32 * at the beginning of what's written. Descriptor order is: full/low * speed descriptor, then optional high speed descriptor. */ static ssize_t ep_config (struct ep_data *data, const char *buf, size_t len) { struct usb_ep *ep; u32 tag; int value, length = len; if (data->state != STATE_EP_READY) { value = -EL2HLT; goto fail; } value = len; if (len < USB_DT_ENDPOINT_SIZE + 4) goto fail0; /* we might need to change message format someday */ memcpy(&tag, buf, 4); if (tag != 1) { DBG(data->dev, "config %s, bad tag %d\n", data->name, tag); goto fail0; } buf += 4; len -= 4; /* NOTE: audio endpoint extensions not accepted here; * just don't include the extra bytes. */ /* full/low speed descriptor, then high speed */ memcpy(&data->desc, buf, USB_DT_ENDPOINT_SIZE); if (data->desc.bLength != USB_DT_ENDPOINT_SIZE || data->desc.bDescriptorType != USB_DT_ENDPOINT) goto fail0; if (len != USB_DT_ENDPOINT_SIZE) { if (len != 2 * USB_DT_ENDPOINT_SIZE) goto fail0; memcpy(&data->hs_desc, buf + USB_DT_ENDPOINT_SIZE, USB_DT_ENDPOINT_SIZE); if (data->hs_desc.bLength != USB_DT_ENDPOINT_SIZE || data->hs_desc.bDescriptorType != USB_DT_ENDPOINT) { DBG(data->dev, "config %s, bad hs length or type\n", data->name); goto fail0; } } spin_lock_irq (&data->dev->lock); if (data->dev->state == STATE_DEV_UNBOUND) { value = -ENOENT; goto gone; } else { ep = data->ep; if (ep == NULL) { value = -ENODEV; goto gone; } } switch (data->dev->gadget->speed) { case USB_SPEED_LOW: case USB_SPEED_FULL: ep->desc = &data->desc; break; case USB_SPEED_HIGH: /* fails if caller didn't provide that descriptor... */ ep->desc = &data->hs_desc; break; default: DBG(data->dev, "unconnected, %s init abandoned\n", data->name); value = -EINVAL; goto gone; } value = usb_ep_enable(ep); if (value == 0) { data->state = STATE_EP_ENABLED; value = length; } gone: spin_unlock_irq (&data->dev->lock); if (value < 0) { fail: data->desc.bDescriptorType = 0; data->hs_desc.bDescriptorType = 0; } return value; fail0: value = -EINVAL; goto fail; } static int ep_open (struct inode *inode, struct file *fd) { struct ep_data *data = inode->i_private; int value = -EBUSY; if (mutex_lock_interruptible(&data->lock) != 0) return -EINTR; spin_lock_irq (&data->dev->lock); if (data->dev->state == STATE_DEV_UNBOUND) value = -ENOENT; else if (data->state == STATE_EP_DISABLED) { value = 0; data->state = STATE_EP_READY; get_ep (data); fd->private_data = data; VDEBUG (data->dev, "%s ready\n", data->name); } else DBG (data->dev, "%s state %d\n", data->name, data->state); spin_unlock_irq (&data->dev->lock); mutex_unlock(&data->lock); return value; } /*----------------------------------------------------------------------*/ /* EP0 IMPLEMENTATION can be partly in userspace. * * Drivers that use this facility receive various events, including * control requests the kernel doesn't handle. Drivers that don't * use this facility may be too simple-minded for real applications. 
*/ static inline void ep0_readable (struct dev_data *dev) { wake_up (&dev->wait); kill_fasync (&dev->fasync, SIGIO, POLL_IN); } static void clean_req (struct usb_ep *ep, struct usb_request *req) { struct dev_data *dev = ep->driver_data; if (req->buf != dev->rbuf) { kfree(req->buf); req->buf = dev->rbuf; } req->complete = epio_complete; dev->setup_out_ready = 0; } static void ep0_complete (struct usb_ep *ep, struct usb_request *req) { struct dev_data *dev = ep->driver_data; unsigned long flags; int free = 1; /* for control OUT, data must still get to userspace */ spin_lock_irqsave(&dev->lock, flags); if (!dev->setup_in) { dev->setup_out_error = (req->status != 0); if (!dev->setup_out_error) free = 0; dev->setup_out_ready = 1; ep0_readable (dev); } /* clean up as appropriate */ if (free && req->buf != &dev->rbuf) clean_req (ep, req); req->complete = epio_complete; spin_unlock_irqrestore(&dev->lock, flags); } static int setup_req (struct usb_ep *ep, struct usb_request *req, u16 len) { struct dev_data *dev = ep->driver_data; if (dev->setup_out_ready) { DBG (dev, "ep0 request busy!\n"); return -EBUSY; } if (len > sizeof (dev->rbuf)) req->buf = kmalloc(len, GFP_ATOMIC); if (req->buf == NULL) { req->buf = dev->rbuf; return -ENOMEM; } req->complete = ep0_complete; req->length = len; req->zero = 0; return 0; } static ssize_t ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t retval; enum ep0_state state; spin_lock_irq (&dev->lock); if (dev->state <= STATE_DEV_OPENED) { retval = -EINVAL; goto done; } /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; retval = -EIDRM; goto done; } /* control DATA stage */ if ((state = dev->state) == STATE_DEV_SETUP) { if (dev->setup_in) { /* stall IN */ VDEBUG(dev, "ep0in stall\n"); (void) usb_ep_set_halt (dev->gadget->ep0); retval = -EL2HLT; dev->state = STATE_DEV_CONNECTED; } else if (len == 0) { /* ack SET_CONFIGURATION etc */ struct usb_ep *ep = dev->gadget->ep0; struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; /* assume that was SET_CONFIGURATION */ if (dev->current_config) { unsigned power; if (gadget_is_dualspeed(dev->gadget) && (dev->gadget->speed == USB_SPEED_HIGH)) power = dev->hs_config->bMaxPower; else power = dev->config->bMaxPower; usb_gadget_vbus_draw(dev->gadget, 2 * power); } } else { /* collect OUT data */ if ((fd->f_flags & O_NONBLOCK) != 0 && !dev->setup_out_ready) { retval = -EAGAIN; goto done; } spin_unlock_irq (&dev->lock); retval = wait_event_interruptible (dev->wait, dev->setup_out_ready != 0); /* FIXME state could change from under us */ spin_lock_irq (&dev->lock); if (retval) goto done; if (dev->state != STATE_DEV_SETUP) { retval = -ECANCELED; goto done; } dev->state = STATE_DEV_CONNECTED; if (dev->setup_out_error) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); ++dev->udc_usage; spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; spin_lock_irq(&dev->lock); --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } } goto done; } /* else normal: return event data */ if (len < sizeof dev->event [0]) { retval = -EINVAL; goto done; } len -= len % sizeof (struct usb_gadgetfs_event); dev->usermode_setup = 1; 
scan: /* return queued events right away */ if (dev->ev_next != 0) { unsigned i, n; n = len / sizeof (struct usb_gadgetfs_event); if (dev->ev_next < n) n = dev->ev_next; /* ep0 i/o has special semantics during STATE_DEV_SETUP */ for (i = 0; i < n; i++) { if (dev->event [i].type == GADGETFS_SETUP) { dev->state = STATE_DEV_SETUP; n = i + 1; break; } } spin_unlock_irq (&dev->lock); len = n * sizeof (struct usb_gadgetfs_event); if (copy_to_user (buf, &dev->event, len)) retval = -EFAULT; else retval = len; if (len > 0) { /* NOTE this doesn't guard against broken drivers; * concurrent ep0 readers may lose events. */ spin_lock_irq (&dev->lock); if (dev->ev_next > n) { memmove(&dev->event[0], &dev->event[n], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - n)); } dev->ev_next -= n; spin_unlock_irq (&dev->lock); } return retval; } if (fd->f_flags & O_NONBLOCK) { retval = -EAGAIN; goto done; } switch (state) { default: DBG (dev, "fail %s, state %d\n", __func__, state); retval = -ESRCH; break; case STATE_DEV_UNCONNECTED: case STATE_DEV_CONNECTED: spin_unlock_irq (&dev->lock); DBG (dev, "%s wait\n", __func__); /* wait for events */ retval = wait_event_interruptible (dev->wait, dev->ev_next != 0); if (retval < 0) return retval; spin_lock_irq (&dev->lock); goto scan; } done: spin_unlock_irq (&dev->lock); return retval; } static struct usb_gadgetfs_event * next_event (struct dev_data *dev, enum usb_gadgetfs_event_type type) { struct usb_gadgetfs_event *event; unsigned i; switch (type) { /* these events purge the queue */ case GADGETFS_DISCONNECT: if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; fallthrough; case GADGETFS_CONNECT: dev->ev_next = 0; break; case GADGETFS_SETUP: /* previous request timed out */ case GADGETFS_SUSPEND: /* same effect */ /* these events can't be repeated */ for (i = 0; i != dev->ev_next; i++) { if (dev->event [i].type != type) continue; DBG(dev, "discard old event[%d] %d\n", i, type); dev->ev_next--; if (i == dev->ev_next) break; /* indices start at zero, for simplicity */ memmove (&dev->event [i], &dev->event [i + 1], sizeof (struct usb_gadgetfs_event) * (dev->ev_next - i)); } break; default: BUG (); } VDEBUG(dev, "event[%d] = %d\n", dev->ev_next, type); event = &dev->event [dev->ev_next++]; BUG_ON (dev->ev_next > N_EVENT); memset (event, 0, sizeof *event); event->type = type; return event; } static ssize_t ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t retval = -ESRCH; /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; retval = -EIDRM; /* data and/or status stage for control request */ } else if (dev->state == STATE_DEV_SETUP) { len = min_t(size_t, len, dev->setup_wLength); if (dev->setup_in) { retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; else { if (len < dev->setup_wLength) dev->req->zero = 1; retval = usb_ep_queue ( dev->gadget->ep0, dev->req, GFP_KERNEL); } spin_lock_irq(&dev->lock); --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else retval = len; return retval; } /* can stall some OUT transfers */ } else if (dev->setup_can_stall) { VDEBUG(dev, "ep0out stall\n"); (void) usb_ep_set_halt (dev->gadget->ep0); retval = -EL2HLT; dev->state = STATE_DEV_CONNECTED; } else { DBG(dev, "bogus ep0out stall!\n"); } } else DBG (dev, "fail %s, state 
%d\n", __func__, dev->state); return retval; } static int ep0_fasync (int f, struct file *fd, int on) { struct dev_data *dev = fd->private_data; // caller must F_SETOWN before signal delivery happens VDEBUG (dev, "%s %s\n", __func__, on ? "on" : "off"); return fasync_helper (f, fd, on, &dev->fasync); } static struct usb_gadget_driver gadgetfs_driver; static int dev_release (struct inode *inode, struct file *fd) { struct dev_data *dev = fd->private_data; /* closing ep0 === shutdown all */ if (dev->gadget_registered) { usb_gadget_unregister_driver (&gadgetfs_driver); dev->gadget_registered = false; } /* at this point "good" hardware has disconnected the * device from USB; the host won't see it any more. * alternatively, all host requests will time out. */ kfree (dev->buf); dev->buf = NULL; /* other endpoints were all decoupled from this device */ spin_lock_irq(&dev->lock); dev->state = STATE_DEV_DISABLED; spin_unlock_irq(&dev->lock); put_dev (dev); return 0; } static __poll_t ep0_poll (struct file *fd, poll_table *wait) { struct dev_data *dev = fd->private_data; __poll_t mask = 0; if (dev->state <= STATE_DEV_OPENED) return DEFAULT_POLLMASK; poll_wait(fd, &dev->wait, wait); spin_lock_irq(&dev->lock); /* report fd mode change before acting on it */ if (dev->setup_abort) { dev->setup_abort = 0; mask = EPOLLHUP; goto out; } if (dev->state == STATE_DEV_SETUP) { if (dev->setup_in || dev->setup_can_stall) mask = EPOLLOUT; } else { if (dev->ev_next != 0) mask = EPOLLIN; } out: spin_unlock_irq(&dev->lock); return mask; } static long gadget_dev_ioctl (struct file *fd, unsigned code, unsigned long value) { struct dev_data *dev = fd->private_data; struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_OPENED || dev->state == STATE_DEV_UNBOUND) { /* Not bound to a UDC */ } else if (gadget->ops->ioctl) { ++dev->udc_usage; spin_unlock_irq(&dev->lock); ret = gadget->ops->ioctl (gadget, code, value); spin_lock_irq(&dev->lock); --dev->udc_usage; } spin_unlock_irq(&dev->lock); return ret; } /*----------------------------------------------------------------------*/ /* The in-kernel gadget driver handles most ep0 issues, in particular * enumerating the single configuration (as provided from user space). * * Unrecognized ep0 requests may be handled in user space. */ static void make_qualifier (struct dev_data *dev) { struct usb_qualifier_descriptor qual; struct usb_device_descriptor *desc; qual.bLength = sizeof qual; qual.bDescriptorType = USB_DT_DEVICE_QUALIFIER; qual.bcdUSB = cpu_to_le16 (0x0200); desc = dev->dev; qual.bDeviceClass = desc->bDeviceClass; qual.bDeviceSubClass = desc->bDeviceSubClass; qual.bDeviceProtocol = desc->bDeviceProtocol; /* assumes ep0 uses the same value for both speeds ... 
*/ qual.bMaxPacketSize0 = dev->gadget->ep0->maxpacket; qual.bNumConfigurations = 1; qual.bRESERVED = 0; memcpy (dev->rbuf, &qual, sizeof qual); } static int config_buf (struct dev_data *dev, u8 type, unsigned index) { int len; int hs = 0; /* only one configuration */ if (index > 0) return -EINVAL; if (gadget_is_dualspeed(dev->gadget)) { hs = (dev->gadget->speed == USB_SPEED_HIGH); if (type == USB_DT_OTHER_SPEED_CONFIG) hs = !hs; } if (hs) { dev->req->buf = dev->hs_config; len = le16_to_cpu(dev->hs_config->wTotalLength); } else { dev->req->buf = dev->config; len = le16_to_cpu(dev->config->wTotalLength); } ((u8 *)dev->req->buf) [1] = type; return len; } static int gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) { struct dev_data *dev = get_gadget_data (gadget); struct usb_request *req = dev->req; int value = -EOPNOTSUPP; struct usb_gadgetfs_event *event; u16 w_value = le16_to_cpu(ctrl->wValue); u16 w_length = le16_to_cpu(ctrl->wLength); if (w_length > RBUF_SIZE) { if (ctrl->bRequestType & USB_DIR_IN) { /* Cast away the const, we are going to overwrite on purpose. */ __le16 *temp = (__le16 *)&ctrl->wLength; *temp = cpu_to_le16(RBUF_SIZE); w_length = RBUF_SIZE; } else { return value; } } spin_lock (&dev->lock); dev->setup_abort = 0; if (dev->state == STATE_DEV_UNCONNECTED) { if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH && dev->hs_config == NULL) { spin_unlock(&dev->lock); ERROR (dev, "no high speed config??\n"); return -EINVAL; } dev->state = STATE_DEV_CONNECTED; INFO (dev, "connected\n"); event = next_event (dev, GADGETFS_CONNECT); event->u.speed = gadget->speed; ep0_readable (dev); /* host may have given up waiting for response. we can miss control * requests handled lower down (device/endpoint status and features); * then ep0_{read,write} will report the wrong status. controller * driver will have aborted pending i/o. */ } else if (dev->state == STATE_DEV_SETUP) dev->setup_abort = 1; req->buf = dev->rbuf; req->context = NULL; switch (ctrl->bRequest) { case USB_REQ_GET_DESCRIPTOR: if (ctrl->bRequestType != USB_DIR_IN) goto unrecognized; switch (w_value >> 8) { case USB_DT_DEVICE: value = min (w_length, (u16) sizeof *dev->dev); dev->dev->bMaxPacketSize0 = dev->gadget->ep0->maxpacket; req->buf = dev->dev; break; case USB_DT_DEVICE_QUALIFIER: if (!dev->hs_config) break; value = min (w_length, (u16) sizeof (struct usb_qualifier_descriptor)); make_qualifier (dev); break; case USB_DT_OTHER_SPEED_CONFIG: case USB_DT_CONFIG: value = config_buf (dev, w_value >> 8, w_value & 0xff); if (value >= 0) value = min (w_length, (u16) value); break; case USB_DT_STRING: goto unrecognized; default: // all others are errors break; } break; /* currently one config, two speeds */ case USB_REQ_SET_CONFIGURATION: if (ctrl->bRequestType != 0) goto unrecognized; if (0 == (u8) w_value) { value = 0; dev->current_config = 0; usb_gadget_vbus_draw(gadget, 8 /* mA */ ); // user mode expected to disable endpoints } else { u8 config, power; if (gadget_is_dualspeed(gadget) && gadget->speed == USB_SPEED_HIGH) { config = dev->hs_config->bConfigurationValue; power = dev->hs_config->bMaxPower; } else { config = dev->config->bConfigurationValue; power = dev->config->bMaxPower; } if (config == (u8) w_value) { value = 0; dev->current_config = config; usb_gadget_vbus_draw(gadget, 2 * power); } } /* report SET_CONFIGURATION like any other control request, * except that usermode may not stall this. 
the next * request mustn't be allowed start until this finishes: * endpoints and threads set up, etc. * * NOTE: older PXA hardware (before PXA 255: without UDCCFR) * has bad/racey automagic that prevents synchronizing here. * even kernel mode drivers often miss them. */ if (value == 0) { INFO (dev, "configuration #%d\n", dev->current_config); usb_gadget_set_state(gadget, USB_STATE_CONFIGURED); if (dev->usermode_setup) { dev->setup_can_stall = 0; goto delegate; } } break; #ifndef CONFIG_USB_PXA25X /* PXA automagically handles this request too */ case USB_REQ_GET_CONFIGURATION: if (ctrl->bRequestType != 0x80) goto unrecognized; *(u8 *)req->buf = dev->current_config; value = min (w_length, (u16) 1); break; #endif default: unrecognized: VDEBUG (dev, "%s req%02x.%02x v%04x i%04x l%d\n", dev->usermode_setup ? "delegate" : "fail", ctrl->bRequestType, ctrl->bRequest, w_value, le16_to_cpu(ctrl->wIndex), w_length); /* if there's an ep0 reader, don't stall */ if (dev->usermode_setup) { dev->setup_can_stall = 1; delegate: dev->setup_in = (ctrl->bRequestType & USB_DIR_IN) ? 1 : 0; dev->setup_wLength = w_length; dev->setup_out_ready = 0; dev->setup_out_error = 0; /* read DATA stage for OUT right away */ if (unlikely (!dev->setup_in && w_length)) { value = setup_req (gadget->ep0, dev->req, w_length); if (value < 0) break; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; } /* we can't currently stall these */ dev->setup_can_stall = 0; } /* state changes when reader collects event */ event = next_event (dev, GADGETFS_SETUP); event->u.setup = *ctrl; ep0_readable (dev); spin_unlock (&dev->lock); /* * Return USB_GADGET_DELAYED_STATUS as a workaround to * stop some UDC drivers (e.g. dwc3) from automatically * proceeding with the status stage for 0-length * transfers. * Should be removed once all UDC drivers are fixed to * always delay the status stage until a response is * queued to EP0. */ return w_length == 0 ? USB_GADGET_DELAYED_STATUS : 0; } } /* proceed with data transfer and status phases? 
*/ if (value >= 0 && dev->state != STATE_DEV_SETUP) { req->length = value; req->zero = value < w_length; ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); spin_lock(&dev->lock); --dev->udc_usage; spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; } return value; } /* device stalls when value < 0 */ spin_unlock (&dev->lock); return value; } static void destroy_ep_files (struct dev_data *dev) { DBG (dev, "%s %d\n", __func__, dev->state); /* dev->state must prevent interference */ spin_lock_irq (&dev->lock); while (!list_empty(&dev->epfiles)) { struct ep_data *ep; struct inode *parent; struct dentry *dentry; /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); spin_unlock_irq (&dev->lock); dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; mutex_unlock(&ep->lock); wake_up (&ep->wait); put_ep (ep); /* break link to dcache */ inode_lock(parent); d_delete (dentry); dput (dentry); inode_unlock(parent); spin_lock_irq (&dev->lock); } spin_unlock_irq (&dev->lock); } static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops); static int activate_ep_files (struct dev_data *dev) { struct usb_ep *ep; struct ep_data *data; gadget_for_each_ep (ep, dev->gadget) { data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto enomem0; data->state = STATE_EP_DISABLED; mutex_init(&data->lock); init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); refcount_set (&data->count, 1); data->dev = dev; get_dev (dev); data->ep = ep; ep->driver_data = data; data->req = usb_ep_alloc_request (ep, GFP_KERNEL); if (!data->req) goto enomem1; data->dentry = gadgetfs_create_file (dev->sb, data->name, data, &ep_io_operations); if (!data->dentry) goto enomem2; list_add_tail (&data->epfiles, &dev->epfiles); } return 0; enomem2: usb_ep_free_request (ep, data->req); enomem1: put_dev (dev); kfree (data); enomem0: DBG (dev, "%s enomem\n", __func__); destroy_ep_files (dev); return -ENOMEM; } static void gadgetfs_unbind (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); DBG (dev, "%s\n", __func__); spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; while (dev->udc_usage > 0) { spin_unlock_irq(&dev->lock); usleep_range(1000, 2000); spin_lock_irq(&dev->lock); } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); gadget->ep0->driver_data = NULL; set_gadget_data (gadget, NULL); /* we've already been disconnected ... 
no i/o is active */ if (dev->req) usb_ep_free_request (gadget->ep0, dev->req); DBG (dev, "%s done\n", __func__); put_dev (dev); } static struct dev_data *the_device; static int gadgetfs_bind(struct usb_gadget *gadget, struct usb_gadget_driver *driver) { struct dev_data *dev = the_device; if (!dev) return -ESRCH; if (0 != strcmp (CHIP, gadget->name)) { pr_err("%s expected %s controller not %s\n", shortname, CHIP, gadget->name); return -ENODEV; } set_gadget_data (gadget, dev); dev->gadget = gadget; gadget->ep0->driver_data = dev; /* preallocate control response and buffer */ dev->req = usb_ep_alloc_request (gadget->ep0, GFP_KERNEL); if (!dev->req) goto enomem; dev->req->context = NULL; dev->req->complete = epio_complete; if (activate_ep_files (dev) < 0) goto enomem; INFO (dev, "bound to %s driver\n", gadget->name); spin_lock_irq(&dev->lock); dev->state = STATE_DEV_UNCONNECTED; spin_unlock_irq(&dev->lock); get_dev (dev); return 0; enomem: gadgetfs_unbind (gadget); return -ENOMEM; } static void gadgetfs_disconnect (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; spin_lock_irqsave (&dev->lock, flags); if (dev->state == STATE_DEV_UNCONNECTED) goto exit; dev->state = STATE_DEV_UNCONNECTED; INFO (dev, "disconnected\n"); next_event (dev, GADGETFS_DISCONNECT); ep0_readable (dev); exit: spin_unlock_irqrestore (&dev->lock, flags); } static void gadgetfs_suspend (struct usb_gadget *gadget) { struct dev_data *dev = get_gadget_data (gadget); unsigned long flags; INFO (dev, "suspended from state %d\n", dev->state); spin_lock_irqsave(&dev->lock, flags); switch (dev->state) { case STATE_DEV_SETUP: // VERY odd... host died?? case STATE_DEV_CONNECTED: case STATE_DEV_UNCONNECTED: next_event (dev, GADGETFS_SUSPEND); ep0_readable (dev); fallthrough; default: break; } spin_unlock_irqrestore(&dev->lock, flags); } static struct usb_gadget_driver gadgetfs_driver = { .function = (char *) driver_desc, .bind = gadgetfs_bind, .unbind = gadgetfs_unbind, .setup = gadgetfs_setup, .reset = gadgetfs_disconnect, .disconnect = gadgetfs_disconnect, .suspend = gadgetfs_suspend, .driver = { .name = shortname, }, }; /*----------------------------------------------------------------------*/ /* DEVICE INITIALIZATION * * fd = open ("/dev/gadget/$CHIP", O_RDWR) * status = write (fd, descriptors, sizeof descriptors) * * That write establishes the device configuration, so the kernel can * bind to the controller ... guaranteeing it can handle enumeration * at all necessary speeds. Descriptor order is: * * . message tag (u32, host order) ... for now, must be zero; it * would change to support features like multi-config devices * . full/low speed config ... all wTotalLength bytes (with interface, * class, altsetting, endpoint, and other descriptors) * . high speed config ... all descriptors, for high speed operation; * this one's optional except for high-speed hardware * . device descriptor * * Endpoints are not yet enabled. Drivers must wait until device * configuration and interface altsetting changes create * the need to configure (or unconfigure) them. * * After initialization, the device stays active for as long as that * $CHIP file is open. Events must then be read from that descriptor, * such as configuration notifications. 
*/ static int is_valid_config(struct usb_config_descriptor *config, unsigned int total) { return config->bDescriptorType == USB_DT_CONFIG && config->bLength == USB_DT_CONFIG_SIZE && total >= USB_DT_CONFIG_SIZE && config->bConfigurationValue != 0 && (config->bmAttributes & USB_CONFIG_ATT_ONE) != 0 && (config->bmAttributes & USB_CONFIG_ATT_WAKEUP) == 0; /* FIXME if gadget->is_otg, _must_ include an otg descriptor */ /* FIXME check lengths: walk to end */ } static ssize_t dev_config (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) { struct dev_data *dev = fd->private_data; ssize_t value, length = len; unsigned total; u32 tag; char *kbuf; spin_lock_irq(&dev->lock); if (dev->state > STATE_DEV_OPENED) { value = ep0_write(fd, buf, len, ptr); spin_unlock_irq(&dev->lock); return value; } spin_unlock_irq(&dev->lock); if ((len < (USB_DT_CONFIG_SIZE + USB_DT_DEVICE_SIZE + 4)) || (len > PAGE_SIZE * 4)) return -EINVAL; /* we might need to change message format someday */ if (copy_from_user (&tag, buf, 4)) return -EFAULT; if (tag != 0) return -EINVAL; buf += 4; length -= 4; kbuf = memdup_user(buf, length); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); spin_lock_irq (&dev->lock); value = -EINVAL; if (dev->buf) { spin_unlock_irq(&dev->lock); kfree(kbuf); return value; } dev->buf = kbuf; /* full or low speed config */ dev->config = (void *) kbuf; total = le16_to_cpu(dev->config->wTotalLength); if (!is_valid_config(dev->config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; /* optional high speed config */ if (kbuf [1] == USB_DT_CONFIG) { dev->hs_config = (void *) kbuf; total = le16_to_cpu(dev->hs_config->wTotalLength); if (!is_valid_config(dev->hs_config, total) || total > length - USB_DT_DEVICE_SIZE) goto fail; kbuf += total; length -= total; } else { dev->hs_config = NULL; } /* could support multiple configs, using another encoding! */ /* device descriptor (tweaked for paranoia) */ if (length != USB_DT_DEVICE_SIZE) goto fail; dev->dev = (void *)kbuf; if (dev->dev->bLength != USB_DT_DEVICE_SIZE || dev->dev->bDescriptorType != USB_DT_DEVICE || dev->dev->bNumConfigurations != 1) goto fail; dev->dev->bcdUSB = cpu_to_le16 (0x0200); /* triggers gadgetfs_bind(); then we can enumerate. */ spin_unlock_irq (&dev->lock); if (dev->hs_config) gadgetfs_driver.max_speed = USB_SPEED_HIGH; else gadgetfs_driver.max_speed = USB_SPEED_FULL; value = usb_gadget_register_driver(&gadgetfs_driver); if (value != 0) { spin_lock_irq(&dev->lock); goto fail; } else { /* at this point "good" hardware has for the first time * let the USB the host see us. alternatively, if users * unplug/replug that will clear all the error state. * * note: everything running before here was guaranteed * to choke driver model style diagnostics. from here * on, they can work ... except in cleanup paths that * kick in after the ep0 descriptor is closed. 
*/ value = len; dev->gadget_registered = true; } return value; fail: dev->config = NULL; dev->hs_config = NULL; dev->dev = NULL; spin_unlock_irq (&dev->lock); pr_debug ("%s: %s fail %zd, %p\n", shortname, __func__, value, dev); kfree (dev->buf); dev->buf = NULL; return value; } static int gadget_dev_open (struct inode *inode, struct file *fd) { struct dev_data *dev = inode->i_private; int value = -EBUSY; spin_lock_irq(&dev->lock); if (dev->state == STATE_DEV_DISABLED) { dev->ev_next = 0; dev->state = STATE_DEV_OPENED; fd->private_data = dev; get_dev (dev); value = 0; } spin_unlock_irq(&dev->lock); return value; } static const struct file_operations ep0_operations = { .llseek = no_llseek, .open = gadget_dev_open, .read = ep0_read, .write = dev_config, .fasync = ep0_fasync, .poll = ep0_poll, .unlocked_ioctl = gadget_dev_ioctl, .release = dev_release, }; /*----------------------------------------------------------------------*/ /* FILESYSTEM AND SUPERBLOCK OPERATIONS * * Mounting the filesystem creates a controller file, used first for * device configuration then later for event monitoring. */ /* FIXME PAM etc could set this security policy without mount options * if epfiles inherited ownership and permissons from ep0 ... */ static unsigned default_uid; static unsigned default_gid; static unsigned default_perm = S_IRUSR | S_IWUSR; module_param (default_uid, uint, 0644); module_param (default_gid, uint, 0644); module_param (default_perm, uint, 0644); static struct inode * gadgetfs_make_inode (struct super_block *sb, void *data, const struct file_operations *fops, int mode) { struct inode *inode = new_inode (sb); if (inode) { inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, default_uid); inode->i_gid = make_kgid(&init_user_ns, default_gid); simple_inode_init_ts(inode); inode->i_private = data; inode->i_fop = fops; } return inode; } /* creates in fs root directory, so non-renamable and non-linkable. * so inode and dentry are paired, until device reconfig. */ static struct dentry * gadgetfs_create_file (struct super_block *sb, char const *name, void *data, const struct file_operations *fops) { struct dentry *dentry; struct inode *inode; dentry = d_alloc_name(sb->s_root, name); if (!dentry) return NULL; inode = gadgetfs_make_inode (sb, data, fops, S_IFREG | (default_perm & S_IRWXUGO)); if (!inode) { dput(dentry); return NULL; } d_add (dentry, inode); return dentry; } static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, }; static int gadgetfs_fill_super (struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dev_data *dev; int rc; mutex_lock(&sb_mutex); if (the_device) { rc = -ESRCH; goto Done; } CHIP = usb_get_gadget_udc_name(); if (!CHIP) { rc = -ENODEV; goto Done; } /* superblock */ sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = GADGETFS_MAGIC; sb->s_op = &gadget_fs_operations; sb->s_time_gran = 1; /* root inode */ inode = gadgetfs_make_inode (sb, NULL, &simple_dir_operations, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto Enomem; inode->i_op = &simple_dir_inode_operations; if (!(sb->s_root = d_make_root (inode))) goto Enomem; /* the ep0 file is named after the controller we expect; * user mode code can use it for sanity checks, like we do. 
*/ dev = dev_new (); if (!dev) goto Enomem; dev->sb = sb; dev->dentry = gadgetfs_create_file(sb, CHIP, dev, &ep0_operations); if (!dev->dentry) { put_dev(dev); goto Enomem; } /* other endpoint files are available after hardware setup, * from binding to a controller. */ the_device = dev; rc = 0; goto Done; Enomem: kfree(CHIP); CHIP = NULL; rc = -ENOMEM; Done: mutex_unlock(&sb_mutex); return rc; } /* "mount -t gadgetfs path /dev/gadget" ends up here */ static int gadgetfs_get_tree(struct fs_context *fc) { return get_tree_single(fc, gadgetfs_fill_super); } static const struct fs_context_operations gadgetfs_context_ops = { .get_tree = gadgetfs_get_tree, }; static int gadgetfs_init_fs_context(struct fs_context *fc) { fc->ops = &gadgetfs_context_ops; return 0; } static void gadgetfs_kill_sb (struct super_block *sb) { mutex_lock(&sb_mutex); kill_litter_super (sb); if (the_device) { put_dev (the_device); the_device = NULL; } kfree(CHIP); CHIP = NULL; mutex_unlock(&sb_mutex); } /*----------------------------------------------------------------------*/ static struct file_system_type gadgetfs_type = { .owner = THIS_MODULE, .name = shortname, .init_fs_context = gadgetfs_init_fs_context, .kill_sb = gadgetfs_kill_sb, }; MODULE_ALIAS_FS("gadgetfs"); /*----------------------------------------------------------------------*/ static int __init gadgetfs_init (void) { int status; status = register_filesystem (&gadgetfs_type); if (status == 0) pr_info ("%s: %s, version " DRIVER_VERSION "\n", shortname, driver_desc); return status; } module_init (gadgetfs_init); static void __exit gadgetfs_cleanup (void) { pr_debug ("unregister %s\n", shortname); unregister_filesystem (&gadgetfs_type); } module_exit (gadgetfs_cleanup);
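/*
 * A minimal userspace sketch of the configuration write that dev_config()
 * above parses: a zero u32 tag, the full/low-speed config descriptor, an
 * optional high-speed config descriptor, then the 18-byte device descriptor.
 * The mount point (/dev/gadget, as in the "mount -t gadgetfs" comment above)
 * and the controller name "dummy_udc" are examples only; the ep0 file is
 * named after whatever usb_get_gadget_udc_name() reports.  A real gadget
 * would also pack interface/endpoint descriptors into wTotalLength.
 */
#include <endian.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <linux/usb/ch9.h>

static int ep0_configure(void)
{
	unsigned char buf[4 + USB_DT_CONFIG_SIZE + USB_DT_DEVICE_SIZE] = { 0 };
	struct usb_config_descriptor config = {
		.bLength		= USB_DT_CONFIG_SIZE,
		.bDescriptorType	= USB_DT_CONFIG,
		.wTotalLength		= htole16(USB_DT_CONFIG_SIZE),
		.bNumInterfaces		= 1,
		.bConfigurationValue	= 1,	/* must be non-zero */
		.bmAttributes		= USB_CONFIG_ATT_ONE,
		.bMaxPower		= 50,	/* 100 mA */
	};
	struct usb_device_descriptor device = {
		.bLength		= USB_DT_DEVICE_SIZE,
		.bDescriptorType	= USB_DT_DEVICE,
		.bcdUSB			= htole16(0x0200),
		.idVendor		= htole16(0x1d6b),	/* example IDs */
		.idProduct		= htole16(0x0104),
		.bNumConfigurations	= 1,	/* dev_config() requires exactly 1 */
	};
	int fd = open("/dev/gadget/dummy_udc", O_RDWR);

	if (fd < 0)
		return -1;
	/* buf[0..3] stay zero: the format tag that dev_config() checks */
	memcpy(buf + 4, &config, USB_DT_CONFIG_SIZE);
	memcpy(buf + 4 + USB_DT_CONFIG_SIZE, &device, USB_DT_DEVICE_SIZE);
	if (write(fd, buf, sizeof(buf)) != (ssize_t)sizeof(buf)) {
		close(fd);
		return -1;
	}
	return fd;	/* keep it open; it now delivers gadget events */
}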
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (c) 2001 Jean-Fredric Clere, Nikolas Zimmermann, Georg Acher
 *		      Mark Cave-Ayland, Carlo E Prelz, Dick Streefland
 * Copyright (c) 2002, 2003 Tuukka Toivonen
 * Copyright (c) 2008 Erik Andrén
 *
 * P/N 861037:      Sensor HDCS1000        ASIC STV0600
 * P/N 861050-0010: Sensor HDCS1000        ASIC STV0600
 * P/N 861050-0020: Sensor Photobit PB100  ASIC STV0600-1 - QuickCam Express
 * P/N 861055:      Sensor ST VV6410       ASIC STV0610   - LEGO cam
 * P/N 861075-0040: Sensor HDCS1000        ASIC
 * P/N 961179-0700: Sensor ST VV6410       ASIC STV0602   - Dexxa WebCam USB
 * P/N 861040-0000: Sensor ST VV6410       ASIC STV0610   - QuickCam Web
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/input.h>
#include "stv06xx_sensor.h"

MODULE_AUTHOR("Erik Andrén");
MODULE_DESCRIPTION("STV06XX USB Camera Driver");
MODULE_LICENSE("GPL");

static bool dump_bridge;
static bool dump_sensor;

int stv06xx_write_bridge(struct sd *sd, u16 address, u16 i2c_data)
{
	int err;
	struct gspca_dev *gspca_dev = (struct gspca_dev *)sd;
	struct usb_device *udev
= sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; u8 len = (i2c_data > 0xff) ? 2 : 1; buf[0] = i2c_data & 0xff; buf[1] = (i2c_data >> 8) & 0xff; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, address, 0, buf, len, STV06XX_URB_MSG_TIMEOUT); gspca_dbg(gspca_dev, D_CONF, "Written 0x%x to address 0x%x, status: %d\n", i2c_data, address, err); return (err < 0) ? err : 0; } int stv06xx_read_bridge(struct sd *sd, u16 address, u8 *i2c_data) { int err; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; err = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x04, 0xc0, address, 0, buf, 1, STV06XX_URB_MSG_TIMEOUT); *i2c_data = buf[0]; gspca_dbg(gspca_dev, D_CONF, "Reading 0x%x from address 0x%x, status %d\n", *i2c_data, address, err); return (err < 0) ? err : 0; } /* Wraps the normal write sensor bytes / words functions for writing a single value */ int stv06xx_write_sensor(struct sd *sd, u8 address, u16 value) { if (sd->sensor->i2c_len == 2) { u16 data[2] = { address, value }; return stv06xx_write_sensor_words(sd, data, 1); } else { u8 data[2] = { address, value }; return stv06xx_write_sensor_bytes(sd, data, 1); } } static int stv06xx_write_sensor_finish(struct sd *sd) { int err = 0; if (sd->bridge == BRIDGE_STV610) { struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; buf[0] = 0; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x1704, 0, buf, 1, STV06XX_URB_MSG_TIMEOUT); } return (err < 0) ? err : 0; } int stv06xx_write_sensor_bytes(struct sd *sd, const u8 *data, u8 len) { int err, i, j; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; gspca_dbg(gspca_dev, D_CONF, "I2C: Command buffer contains %d entries\n", len); for (i = 0; i < len;) { /* Build the command buffer */ memset(buf, 0, I2C_BUFFER_LENGTH); for (j = 0; j < I2C_MAX_BYTES && i < len; j++, i++) { buf[j] = data[2*i]; buf[0x10 + j] = data[2*i+1]; gspca_dbg(gspca_dev, D_CONF, "I2C: Writing 0x%02x to reg 0x%02x\n", data[2*i+1], data[2*i]); } buf[0x20] = sd->sensor->i2c_addr; buf[0x21] = j - 1; /* Number of commands to send - 1 */ buf[0x22] = I2C_WRITE_CMD; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x0400, 0, buf, I2C_BUFFER_LENGTH, STV06XX_URB_MSG_TIMEOUT); if (err < 0) return err; } return stv06xx_write_sensor_finish(sd); } int stv06xx_write_sensor_words(struct sd *sd, const u16 *data, u8 len) { int err, i, j; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; gspca_dbg(gspca_dev, D_CONF, "I2C: Command buffer contains %d entries\n", len); for (i = 0; i < len;) { /* Build the command buffer */ memset(buf, 0, I2C_BUFFER_LENGTH); for (j = 0; j < I2C_MAX_WORDS && i < len; j++, i++) { buf[j] = data[2*i]; buf[0x10 + j * 2] = data[2*i+1]; buf[0x10 + j * 2 + 1] = data[2*i+1] >> 8; gspca_dbg(gspca_dev, D_CONF, "I2C: Writing 0x%04x to reg 0x%02x\n", data[2*i+1], data[2*i]); } buf[0x20] = sd->sensor->i2c_addr; buf[0x21] = j - 1; /* Number of commands to send - 1 */ buf[0x22] = I2C_WRITE_CMD; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x0400, 0, buf, I2C_BUFFER_LENGTH, STV06XX_URB_MSG_TIMEOUT); if (err < 0) return err; } return stv06xx_write_sensor_finish(sd); } int stv06xx_read_sensor(struct sd *sd, const u8 address, u16 *value) { int err; struct gspca_dev *gspca_dev = (struct gspca_dev *)sd; 
struct usb_device *udev = sd->gspca_dev.dev; __u8 *buf = sd->gspca_dev.usb_buf; err = stv06xx_write_bridge(sd, STV_I2C_FLUSH, sd->sensor->i2c_flush); if (err < 0) return err; /* Clear mem */ memset(buf, 0, I2C_BUFFER_LENGTH); buf[0] = address; buf[0x20] = sd->sensor->i2c_addr; buf[0x21] = 0; /* Read I2C register */ buf[0x22] = I2C_READ_CMD; err = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x04, 0x40, 0x1400, 0, buf, I2C_BUFFER_LENGTH, STV06XX_URB_MSG_TIMEOUT); if (err < 0) { pr_err("I2C: Read error writing address: %d\n", err); return err; } err = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), 0x04, 0xc0, 0x1410, 0, buf, sd->sensor->i2c_len, STV06XX_URB_MSG_TIMEOUT); if (sd->sensor->i2c_len == 2) *value = buf[0] | (buf[1] << 8); else *value = buf[0]; gspca_dbg(gspca_dev, D_CONF, "I2C: Read 0x%x from address 0x%x, status: %d\n", *value, address, err); return (err < 0) ? err : 0; } /* Dumps all bridge registers */ static void stv06xx_dump_bridge(struct sd *sd) { int i; u8 data, buf; pr_info("Dumping all stv06xx bridge registers\n"); for (i = 0x1400; i < 0x160f; i++) { stv06xx_read_bridge(sd, i, &data); pr_info("Read 0x%x from address 0x%x\n", data, i); } pr_info("Testing stv06xx bridge registers for writability\n"); for (i = 0x1400; i < 0x160f; i++) { stv06xx_read_bridge(sd, i, &data); buf = data; stv06xx_write_bridge(sd, i, 0xff); stv06xx_read_bridge(sd, i, &data); if (data == 0xff) pr_info("Register 0x%x is read/write\n", i); else if (data != buf) pr_info("Register 0x%x is read/write, but only partially\n", i); else pr_info("Register 0x%x is read-only\n", i); stv06xx_write_bridge(sd, i, buf); } } /* this function is called at probe and resume time */ static int stv06xx_init(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; int err; gspca_dbg(gspca_dev, D_PROBE, "Initializing camera\n"); /* Let the usb init settle for a bit before performing the initialization */ msleep(250); err = sd->sensor->init(sd); if (dump_sensor && sd->sensor->dump) sd->sensor->dump(sd); return (err < 0) ? err : 0; } /* this function is called at probe time */ static int stv06xx_init_controls(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_PROBE, "Initializing controls\n"); gspca_dev->vdev.ctrl_handler = &gspca_dev->ctrl_handler; return sd->sensor->init_controls(sd); } /* Start the camera */ static int stv06xx_start(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *) gspca_dev; struct usb_host_interface *alt; struct usb_interface *intf; int err, packet_size; intf = usb_ifnum_to_if(sd->gspca_dev.dev, sd->gspca_dev.iface); alt = usb_altnum_to_altsetting(intf, sd->gspca_dev.alt); if (!alt) { gspca_err(gspca_dev, "Couldn't get altsetting\n"); return -EIO; } if (alt->desc.bNumEndpoints < 1) return -ENODEV; packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize); err = stv06xx_write_bridge(sd, STV_ISO_SIZE_L, packet_size); if (err < 0) return err; /* Prepare the sensor for start */ err = sd->sensor->start(sd); if (err < 0) goto out; /* Start isochronous streaming */ err = stv06xx_write_bridge(sd, STV_ISO_ENABLE, 1); out: if (err < 0) gspca_dbg(gspca_dev, D_STREAM, "Starting stream failed\n"); else gspca_dbg(gspca_dev, D_STREAM, "Started streaming\n"); return (err < 0) ? 
err : 0; } static int stv06xx_isoc_init(struct gspca_dev *gspca_dev) { struct usb_interface_cache *intfc; struct usb_host_interface *alt; struct sd *sd = (struct sd *) gspca_dev; intfc = gspca_dev->dev->actconfig->intf_cache[0]; if (intfc->num_altsetting < 2) return -ENODEV; alt = &intfc->altsetting[1]; if (alt->desc.bNumEndpoints < 1) return -ENODEV; /* Start isoc bandwidth "negotiation" at max isoc bandwidth */ alt->endpoint[0].desc.wMaxPacketSize = cpu_to_le16(sd->sensor->max_packet_size[gspca_dev->curr_mode]); return 0; } static int stv06xx_isoc_nego(struct gspca_dev *gspca_dev) { int ret, packet_size, min_packet_size; struct usb_host_interface *alt; struct sd *sd = (struct sd *) gspca_dev; /* * Existence of altsetting and endpoint was verified in * stv06xx_isoc_init() */ alt = &gspca_dev->dev->actconfig->intf_cache[0]->altsetting[1]; packet_size = le16_to_cpu(alt->endpoint[0].desc.wMaxPacketSize); min_packet_size = sd->sensor->min_packet_size[gspca_dev->curr_mode]; if (packet_size <= min_packet_size) return -EIO; packet_size -= 100; if (packet_size < min_packet_size) packet_size = min_packet_size; alt->endpoint[0].desc.wMaxPacketSize = cpu_to_le16(packet_size); ret = usb_set_interface(gspca_dev->dev, gspca_dev->iface, 1); if (ret < 0) gspca_err(gspca_dev, "set alt 1 err %d\n", ret); return ret; } static void stv06xx_stopN(struct gspca_dev *gspca_dev) { int err; struct sd *sd = (struct sd *) gspca_dev; /* stop ISO-streaming */ err = stv06xx_write_bridge(sd, STV_ISO_ENABLE, 0); if (err < 0) goto out; err = sd->sensor->stop(sd); out: if (err < 0) gspca_dbg(gspca_dev, D_STREAM, "Failed to stop stream\n"); else gspca_dbg(gspca_dev, D_STREAM, "Stopped streaming\n"); } /* * Analyse an USB packet of the data stream and store it appropriately. * Each packet contains an integral number of chunks. Each chunk has * 2-bytes identification, followed by 2-bytes that describe the chunk * length. Known/guessed chunk identifications are: * 8001/8005/C001/C005 - Begin new frame * 8002/8006/C002/C006 - End frame * 0200/4200 - Contains actual image data, bayer or compressed * 0005 - 11 bytes of unknown data * 0100 - 2 bytes of unknown data * The 0005 and 0100 chunks seem to appear only in compressed stream. */ static void stv06xx_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* isoc packet */ int len) /* iso packet length */ { struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_PACK, "Packet of length %d arrived\n", len); /* A packet may contain several frames loop until the whole packet is reached */ while (len) { int id, chunk_len; if (len < 4) { gspca_dbg(gspca_dev, D_PACK, "Packet is smaller than 4 bytes\n"); return; } /* Capture the id */ id = (data[0] << 8) | data[1]; /* Capture the chunk length */ chunk_len = (data[2] << 8) | data[3]; gspca_dbg(gspca_dev, D_PACK, "Chunk id: %x, length: %d\n", id, chunk_len); data += 4; len -= 4; if (len < chunk_len) { gspca_err(gspca_dev, "URB packet length is smaller than the specified chunk length\n"); gspca_dev->last_packet_type = DISCARD_PACKET; return; } /* First byte seem to be 02=data 2nd byte is unknown??? */ if (sd->bridge == BRIDGE_ST6422 && (id & 0xff00) == 0x0200) goto frame_data; switch (id) { case 0x0200: case 0x4200: frame_data: gspca_dbg(gspca_dev, D_PACK, "Frame data packet detected\n"); if (sd->to_skip) { int skip = (sd->to_skip < chunk_len) ? 
sd->to_skip : chunk_len; data += skip; len -= skip; chunk_len -= skip; sd->to_skip -= skip; } gspca_frame_add(gspca_dev, INTER_PACKET, data, chunk_len); break; case 0x8001: case 0x8005: case 0xc001: case 0xc005: gspca_dbg(gspca_dev, D_PACK, "Starting new frame\n"); /* Create a new frame, chunk length should be zero */ gspca_frame_add(gspca_dev, FIRST_PACKET, NULL, 0); if (sd->bridge == BRIDGE_ST6422) sd->to_skip = gspca_dev->pixfmt.width * 4; if (chunk_len) gspca_err(gspca_dev, "Chunk length is non-zero on a SOF\n"); break; case 0x8002: case 0x8006: case 0xc002: gspca_dbg(gspca_dev, D_PACK, "End of frame detected\n"); /* Complete the last frame (if any) */ gspca_frame_add(gspca_dev, LAST_PACKET, NULL, 0); if (chunk_len) gspca_err(gspca_dev, "Chunk length is non-zero on a EOF\n"); break; case 0x0005: gspca_dbg(gspca_dev, D_PACK, "Chunk 0x005 detected\n"); /* Unknown chunk with 11 bytes of data, occurs just before end of each frame in compressed mode */ break; case 0x0100: gspca_dbg(gspca_dev, D_PACK, "Chunk 0x0100 detected\n"); /* Unknown chunk with 2 bytes of data, occurs 2-3 times per USB interrupt */ break; case 0x42ff: gspca_dbg(gspca_dev, D_PACK, "Chunk 0x42ff detected\n"); /* Special chunk seen sometimes on the ST6422 */ break; default: gspca_dbg(gspca_dev, D_PACK, "Unknown chunk 0x%04x detected\n", id); /* Unknown chunk */ } data += chunk_len; len -= chunk_len; } } #if IS_ENABLED(CONFIG_INPUT) static int sd_int_pkt_scan(struct gspca_dev *gspca_dev, u8 *data, /* interrupt packet data */ int len) /* interrupt packet length */ { int ret = -EINVAL; if (len == 1 && (data[0] == 0x80 || data[0] == 0x10)) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, 1); input_sync(gspca_dev->input_dev); ret = 0; } if (len == 1 && (data[0] == 0x88 || data[0] == 0x11)) { input_report_key(gspca_dev->input_dev, KEY_CAMERA, 0); input_sync(gspca_dev->input_dev); ret = 0; } return ret; } #endif static int stv06xx_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id); static void stv06xx_probe_error(struct gspca_dev *gspca_dev) { struct sd *sd = (struct sd *)gspca_dev; kfree(sd->sensor_priv); sd->sensor_priv = NULL; } /* sub-driver description */ static const struct sd_desc sd_desc = { .name = MODULE_NAME, .config = stv06xx_config, .init = stv06xx_init, .init_controls = stv06xx_init_controls, .probe_error = stv06xx_probe_error, .start = stv06xx_start, .stopN = stv06xx_stopN, .pkt_scan = stv06xx_pkt_scan, .isoc_init = stv06xx_isoc_init, .isoc_nego = stv06xx_isoc_nego, #if IS_ENABLED(CONFIG_INPUT) .int_pkt_scan = sd_int_pkt_scan, #endif }; /* This function is called at probe time */ static int stv06xx_config(struct gspca_dev *gspca_dev, const struct usb_device_id *id) { struct sd *sd = (struct sd *) gspca_dev; gspca_dbg(gspca_dev, D_PROBE, "Configuring camera\n"); sd->bridge = id->driver_info; gspca_dev->sd_desc = &sd_desc; if (dump_bridge) stv06xx_dump_bridge(sd); sd->sensor = &stv06xx_sensor_st6422; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_vv6410; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_hdcs1x00; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_hdcs1020; if (!sd->sensor->probe(sd)) return 0; sd->sensor = &stv06xx_sensor_pb0100; if (!sd->sensor->probe(sd)) return 0; sd->sensor = NULL; return -ENODEV; } /* -- module initialisation -- */ static const struct usb_device_id device_table[] = { {USB_DEVICE(0x046d, 0x0840), .driver_info = BRIDGE_STV600 }, /* QuickCam Express */ {USB_DEVICE(0x046d, 0x0850), .driver_info = 
BRIDGE_STV610 }, /* LEGO cam / QuickCam Web */ {USB_DEVICE(0x046d, 0x0870), .driver_info = BRIDGE_STV602 }, /* Dexxa WebCam USB */ {USB_DEVICE(0x046D, 0x08F0), .driver_info = BRIDGE_ST6422 }, /* QuickCam Messenger */ {USB_DEVICE(0x046D, 0x08F5), .driver_info = BRIDGE_ST6422 }, /* QuickCam Communicate */ {USB_DEVICE(0x046D, 0x08F6), .driver_info = BRIDGE_ST6422 }, /* QuickCam Messenger (new) */ {} }; MODULE_DEVICE_TABLE(usb, device_table); /* -- device connect -- */ static int sd_probe(struct usb_interface *intf, const struct usb_device_id *id) { return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), THIS_MODULE); } static void sd_disconnect(struct usb_interface *intf) { struct gspca_dev *gspca_dev = usb_get_intfdata(intf); struct sd *sd = (struct sd *) gspca_dev; void *priv = sd->sensor_priv; gspca_dbg(gspca_dev, D_PROBE, "Disconnecting the stv06xx device\n"); sd->sensor = NULL; gspca_disconnect(intf); kfree(priv); } static struct usb_driver sd_driver = { .name = MODULE_NAME, .id_table = device_table, .probe = sd_probe, .disconnect = sd_disconnect, #ifdef CONFIG_PM .suspend = gspca_suspend, .resume = gspca_resume, .reset_resume = gspca_resume, #endif }; module_usb_driver(sd_driver); module_param(dump_bridge, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dump_bridge, "Dumps all usb bridge registers at startup"); module_param(dump_sensor, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dump_sensor, "Dumps all sensor registers at startup");
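/*
 * A stand-alone sketch of the chunk framing that stv06xx_pkt_scan() above
 * walks.  Each chunk starts with a 4-byte header, a big-endian 16-bit id
 * (e.g. 0x8001 for start of frame) followed by a big-endian 16-bit payload
 * length, and the payload follows immediately.  The callback type and the
 * function name are made up for the example; the header layout is taken from
 * the driver.
 */
#include <stddef.h>
#include <stdint.h>

typedef void (*chunk_cb)(uint16_t id, const uint8_t *payload, uint16_t len);

static void walk_chunks(const uint8_t *data, size_t len, chunk_cb cb)
{
	while (len >= 4) {
		uint16_t id = (data[0] << 8) | data[1];
		uint16_t chunk_len = (data[2] << 8) | data[3];

		data += 4;
		len -= 4;
		if (chunk_len > len)
			break;		/* truncated chunk: discard the rest */
		cb(id, data, chunk_len);
		data += chunk_len;
		len -= chunk_len;
	}
}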
/* Linux multicast routing support
 * Common logic shared by IPv4 [ipmr] and IPv6 [ip6mr] implementation
 */

#include <linux/rhashtable.h>
#include <linux/mroute_base.h>

/* Sets everything common except 'dev', since that is done under locking */
void vif_device_init(struct vif_device *v,
		     struct net_device *dev,
		     unsigned long rate_limit,
		     unsigned char threshold,
		     unsigned short flags,
		     unsigned short get_iflink_mask)
{
	RCU_INIT_POINTER(v->dev, NULL);
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->rate_limit = rate_limit;
	v->flags = flags;
	v->threshold = threshold;
	if (v->flags & get_iflink_mask)
		v->link = dev_get_iflink(dev);
	else
		v->link = dev->ifindex;
}
EXPORT_SYMBOL(vif_device_init);

struct mr_table *
mr_table_alloc(struct net *net, u32 id,
	       struct mr_table_ops *ops,
	       void (*expire_func)(struct timer_list *t),
	       void (*table_set)(struct mr_table *mrt,
				 struct net *net))
{
	struct mr_table *mrt;
	int err;

	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
	if (!mrt)
		return ERR_PTR(-ENOMEM);
	mrt->id = id;
	write_pnet(&mrt->net, net);

	mrt->ops = *ops;
	err = rhltable_init(&mrt->mfc_hash, mrt->ops.rht_params);
	if (err) {
		kfree(mrt);
		return ERR_PTR(err);
	}
	INIT_LIST_HEAD(&mrt->mfc_cache_list);
	INIT_LIST_HEAD(&mrt->mfc_unres_queue);

	timer_setup(&mrt->ipmr_expire_timer, expire_func, 0);

	mrt->mroute_reg_vif_num = -1;
	table_set(mrt, net);
	return mrt;
}
EXPORT_SYMBOL(mr_table_alloc);

void *mr_mfc_find_parent(struct mr_table *mrt, void *hasharg, int parent)
{
	struct rhlist_head *tmp, *list;
	struct mr_mfc *c;

	list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode)
		if (parent == -1 || parent == c->mfc_parent)
			return c;

	return NULL;
}
EXPORT_SYMBOL(mr_mfc_find_parent);

void
*mr_mfc_find_any_parent(struct mr_table *mrt, int vifi) { struct rhlist_head *tmp, *list; struct mr_mfc *c; list = rhltable_lookup(&mrt->mfc_hash, mrt->ops.cmparg_any, *mrt->ops.rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) if (c->mfc_un.res.ttls[vifi] < 255) return c; return NULL; } EXPORT_SYMBOL(mr_mfc_find_any_parent); void *mr_mfc_find_any(struct mr_table *mrt, int vifi, void *hasharg) { struct rhlist_head *tmp, *list; struct mr_mfc *c, *proxy; list = rhltable_lookup(&mrt->mfc_hash, hasharg, *mrt->ops.rht_params); rhl_for_each_entry_rcu(c, tmp, list, mnode) { if (c->mfc_un.res.ttls[vifi] < 255) return c; /* It's ok if the vifi is part of the static tree */ proxy = mr_mfc_find_any_parent(mrt, c->mfc_parent); if (proxy && proxy->mfc_un.res.ttls[vifi] < 255) return c; } return mr_mfc_find_any_parent(mrt, vifi); } EXPORT_SYMBOL(mr_mfc_find_any); #ifdef CONFIG_PROC_FS void *mr_vif_seq_idx(struct net *net, struct mr_vif_iter *iter, loff_t pos) { struct mr_table *mrt = iter->mrt; for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) { if (!VIF_EXISTS(mrt, iter->ct)) continue; if (pos-- == 0) return &mrt->vif_table[iter->ct]; } return NULL; } EXPORT_SYMBOL(mr_vif_seq_idx); void *mr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt = iter->mrt; ++*pos; if (v == SEQ_START_TOKEN) return mr_vif_seq_idx(net, iter, 0); while (++iter->ct < mrt->maxvif) { if (!VIF_EXISTS(mrt, iter->ct)) continue; return &mrt->vif_table[iter->ct]; } return NULL; } EXPORT_SYMBOL(mr_vif_seq_next); void *mr_mfc_seq_idx(struct net *net, struct mr_mfc_iter *it, loff_t pos) { struct mr_table *mrt = it->mrt; struct mr_mfc *mfc; rcu_read_lock(); it->cache = &mrt->mfc_cache_list; list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) if (pos-- == 0) return mfc; rcu_read_unlock(); spin_lock_bh(it->lock); it->cache = &mrt->mfc_unres_queue; list_for_each_entry(mfc, it->cache, list) if (pos-- == 0) return mfc; spin_unlock_bh(it->lock); it->cache = NULL; return NULL; } EXPORT_SYMBOL(mr_mfc_seq_idx); void *mr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct mr_mfc_iter *it = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt = it->mrt; struct mr_mfc *c = v; ++*pos; if (v == SEQ_START_TOKEN) return mr_mfc_seq_idx(net, seq->private, 0); if (c->list.next != it->cache) return list_entry(c->list.next, struct mr_mfc, list); if (it->cache == &mrt->mfc_unres_queue) goto end_of_list; /* exhausted cache_array, show unresolved */ rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; spin_lock_bh(it->lock); if (!list_empty(it->cache)) return list_first_entry(it->cache, struct mr_mfc, list); end_of_list: spin_unlock_bh(it->lock); it->cache = NULL; return NULL; } EXPORT_SYMBOL(mr_mfc_seq_next); #endif int mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, struct mr_mfc *c, struct rtmsg *rtm) { struct net_device *vif_dev; struct rta_mfc_stats mfcs; struct nlattr *mp_attr; struct rtnexthop *nhp; unsigned long lastuse; int ct; /* If cache is unresolved, don't try to parse IIF and OIF */ if (c->mfc_parent >= MAXVIFS) { rtm->rtm_flags |= RTNH_F_UNRESOLVED; return -ENOENT; } rcu_read_lock(); vif_dev = rcu_dereference(mrt->vif_table[c->mfc_parent].dev); if (vif_dev && nla_put_u32(skb, RTA_IIF, vif_dev->ifindex) < 0) { rcu_read_unlock(); return -EMSGSIZE; } rcu_read_unlock(); if (c->mfc_flags & MFC_OFFLOAD) rtm->rtm_flags |= RTNH_F_OFFLOAD; mp_attr = nla_nest_start_noflag(skb, 
RTA_MULTIPATH); if (!mp_attr) return -EMSGSIZE; rcu_read_lock(); for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { struct vif_device *vif = &mrt->vif_table[ct]; vif_dev = rcu_dereference(vif->dev); if (vif_dev && c->mfc_un.res.ttls[ct] < 255) { nhp = nla_reserve_nohdr(skb, sizeof(*nhp)); if (!nhp) { rcu_read_unlock(); nla_nest_cancel(skb, mp_attr); return -EMSGSIZE; } nhp->rtnh_flags = 0; nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; nhp->rtnh_ifindex = vif_dev->ifindex; nhp->rtnh_len = sizeof(*nhp); } } rcu_read_unlock(); nla_nest_end(skb, mp_attr); lastuse = READ_ONCE(c->mfc_un.res.lastuse); lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0; mfcs.mfcs_packets = c->mfc_un.res.pkt; mfcs.mfcs_bytes = c->mfc_un.res.bytes; mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if; if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) || nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse), RTA_PAD)) return -EMSGSIZE; rtm->rtm_type = RTN_MULTICAST; return 1; } EXPORT_SYMBOL(mr_fill_mroute); static bool mr_mfc_uses_dev(const struct mr_table *mrt, const struct mr_mfc *c, const struct net_device *dev) { int ct; for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { const struct net_device *vif_dev; const struct vif_device *vif; vif = &mrt->vif_table[ct]; vif_dev = rcu_access_pointer(vif->dev); if (vif_dev && c->mfc_un.res.ttls[ct] < 255 && vif_dev == dev) return true; } return false; } int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, struct netlink_callback *cb, int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { unsigned int e = 0, s_e = cb->args[1]; unsigned int flags = NLM_F_MULTI; struct mr_mfc *mfc; int err; if (filter->filter_set) flags |= NLM_F_DUMP_FILTERED; list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { if (e < s_e) goto next_entry; if (filter->dev && !mr_mfc_uses_dev(mrt, mfc, filter->dev)) goto next_entry; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); if (err < 0) goto out; next_entry: e++; } spin_lock_bh(lock); list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) { if (e < s_e) goto next_entry2; if (filter->dev && !mr_mfc_uses_dev(mrt, mfc, filter->dev)) goto next_entry2; err = fill(mrt, skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, mfc, RTM_NEWROUTE, flags); if (err < 0) { spin_unlock_bh(lock); goto out; } next_entry2: e++; } spin_unlock_bh(lock); err = 0; out: cb->args[1] = e; return err; } EXPORT_SYMBOL(mr_table_dump); int mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb, struct mr_table *(*iter)(struct net *net, struct mr_table *mrt), int (*fill)(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags), spinlock_t *lock, struct fib_dump_filter *filter) { unsigned int t = 0, s_t = cb->args[0]; struct net *net = sock_net(skb->sk); struct mr_table *mrt; int err; /* multicast does not track protocol or have route type other * than RTN_MULTICAST */ if (filter->filter_set) { if (filter->protocol || filter->flags || (filter->rt_type && filter->rt_type != RTN_MULTICAST)) return skb->len; } rcu_read_lock(); for (mrt = iter(net, NULL); mrt; mrt = iter(net, mrt)) { if (t < s_t) goto next_table; err = mr_table_dump(mrt, skb, cb, fill, lock, filter); if (err < 0) break; cb->args[1] = 0; next_table: t++; } rcu_read_unlock(); cb->args[0] = t; return skb->len; } 
EXPORT_SYMBOL(mr_rtm_dumproute);

int mr_dump(struct net *net, struct notifier_block *nb, unsigned short family,
	    int (*rules_dump)(struct net *net,
			      struct notifier_block *nb,
			      struct netlink_ext_ack *extack),
	    struct mr_table *(*mr_iter)(struct net *net,
					struct mr_table *mrt),
	    struct netlink_ext_ack *extack)
{
	struct mr_table *mrt;
	int err;

	err = rules_dump(net, nb, extack);
	if (err)
		return err;

	for (mrt = mr_iter(net, NULL); mrt; mrt = mr_iter(net, mrt)) {
		struct vif_device *v = &mrt->vif_table[0];
		struct net_device *vif_dev;
		struct mr_mfc *mfc;
		int vifi;

		/* Notify on table VIF entries */
		rcu_read_lock();
		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
			vif_dev = rcu_dereference(v->dev);
			if (!vif_dev)
				continue;

			err = mr_call_vif_notifier(nb, family,
						   FIB_EVENT_VIF_ADD, v,
						   vif_dev, vifi,
						   mrt->id, extack);
			if (err)
				break;
		}
		rcu_read_unlock();

		if (err)
			return err;

		/* Notify on table MFC entries */
		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
			err = mr_call_mfc_notifier(nb, family,
						   FIB_EVENT_ENTRY_ADD,
						   mfc, mrt->id, extack);
			if (err)
				return err;
		}
	}

	return 0;
}
EXPORT_SYMBOL(mr_dump);
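/*
 * A sketch of the address-family 'fill' callback whose signature
 * mr_table_dump() and mr_rtm_dumproute() above expect.  The function name
 * and the attribute layout are hypothetical (ipmr and ip6mr implement the
 * real versions); it only shows the contract: emit one RTM_NEWROUTE message
 * per mr_mfc entry, let mr_fill_mroute() add the shared IIF/OIF/stats
 * payload, and return a negative error when the skb is full so the dump can
 * stop and later resume from cb->args[1].  Assumes the usual netlink helpers
 * from <net/netlink.h>.
 */
static int example_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			       u32 portid, u32 seq, struct mr_mfc *c,
			       int cmd, int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	int err;

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = RTNL_FAMILY_IPMR;	/* RTNL_FAMILY_IP6MR for ip6mr */
	rtm->rtm_type = RTN_MULTICAST;
	rtm->rtm_flags = 0;
	/* family-specific source/group attributes would be added here */

	/* shared helper above: IIF, RTA_MULTIPATH, RTA_MFC_STATS, RTA_EXPIRES */
	err = mr_fill_mroute(mrt, skb, c, rtm);
	/* an unresolved cache entry (-ENOENT) is still worth reporting */
	if (err < 0 && err != -ENOENT) {
		nlmsg_cancel(skb, nlh);
		return err;
	}

	nlmsg_end(skb, nlh);
	return 0;
}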
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * NetLabel Network Address Lists
 *
 * This file contains network address list functions used to manage ordered
 * lists of network addresses for use by the NetLabel subsystem.  The NetLabel
 * system manages static and dynamic label mappings for network protocols such
 * as CIPSO and RIPSO.
 *
 * Author: Paul Moore <paul@paul-moore.com>
 */

/*
 * (c) Copyright Hewlett-Packard Development Company, L.P., 2008
 */

#ifndef _NETLABEL_ADDRLIST_H
#define _NETLABEL_ADDRLIST_H

#include <linux/types.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/in6.h>
#include <linux/audit.h>

/**
 * struct netlbl_af4list - NetLabel IPv4 address list
 * @addr: IPv4 address
 * @mask: IPv4 address mask
 * @valid: valid flag
 * @list: list structure, used internally
 */
struct netlbl_af4list {
	__be32 addr;
	__be32 mask;

	u32 valid;
	struct list_head list;
};

/**
 * struct netlbl_af6list - NetLabel IPv6 address list
 * @addr: IPv6 address
 * @mask: IPv6 address mask
 * @valid: valid flag
 * @list: list structure, used internally
 */
struct netlbl_af6list {
	struct in6_addr addr;
	struct in6_addr mask;

	u32 valid;
	struct list_head list;
};

#define __af4list_entry(ptr) container_of(ptr, struct netlbl_af4list, list)

static inline struct netlbl_af4list *__af4list_valid(struct list_head *s,
						     struct list_head *h)
{
	struct list_head *i = s;
	struct netlbl_af4list *n = __af4list_entry(s);
	while (i != h && !n->valid) {
		i = i->next;
		n = __af4list_entry(i);
	}
	return n;
}

static inline struct netlbl_af4list *__af4list_valid_rcu(struct list_head *s,
							 struct list_head *h)
{
	struct list_head *i = s;
	struct netlbl_af4list *n = __af4list_entry(s);
	while (i != h && !n->valid) {
		i = rcu_dereference(list_next_rcu(i));
		n = __af4list_entry(i);
	}
	return n;
}

#define netlbl_af4list_foreach(iter, head)				\
	for (iter = __af4list_valid((head)->next, head);		\
	     &iter->list != (head);					\
	     iter = __af4list_valid(iter->list.next, head))

#define netlbl_af4list_foreach_rcu(iter, head)				\
	for (iter = __af4list_valid_rcu((head)->next, head);		\
	     &iter->list != (head);					\
	     iter = __af4list_valid_rcu(iter->list.next, head))

#define netlbl_af4list_foreach_safe(iter, tmp, head)			\
	for (iter = __af4list_valid((head)->next, head),		\
	     tmp = __af4list_valid(iter->list.next, head);		\
	     &iter->list != (head);					\
	     iter = tmp, tmp = __af4list_valid(iter->list.next, head))

int netlbl_af4list_add(struct netlbl_af4list *entry,
		       struct list_head *head);
struct netlbl_af4list *netlbl_af4list_remove(__be32 addr, __be32 mask,
					     struct list_head *head);
void netlbl_af4list_remove_entry(struct netlbl_af4list *entry);
struct netlbl_af4list *netlbl_af4list_search(__be32 addr,
					     struct list_head *head);
struct netlbl_af4list *netlbl_af4list_search_exact(__be32 addr, __be32 mask,
						   struct list_head *head);

#ifdef CONFIG_AUDIT
void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src,
const char *dev, __be32 addr, __be32 mask); #else static inline void netlbl_af4list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, __be32 addr, __be32 mask) { } #endif #if IS_ENABLED(CONFIG_IPV6) #define __af6list_entry(ptr) container_of(ptr, struct netlbl_af6list, list) static inline struct netlbl_af6list *__af6list_valid(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = i->next; n = __af6list_entry(i); } return n; } static inline struct netlbl_af6list *__af6list_valid_rcu(struct list_head *s, struct list_head *h) { struct list_head *i = s; struct netlbl_af6list *n = __af6list_entry(s); while (i != h && !n->valid) { i = rcu_dereference(list_next_rcu(i)); n = __af6list_entry(i); } return n; } #define netlbl_af6list_foreach(iter, head) \ for (iter = __af6list_valid((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid(iter->list.next, head)) #define netlbl_af6list_foreach_rcu(iter, head) \ for (iter = __af6list_valid_rcu((head)->next, head); \ &iter->list != (head); \ iter = __af6list_valid_rcu(iter->list.next, head)) #define netlbl_af6list_foreach_safe(iter, tmp, head) \ for (iter = __af6list_valid((head)->next, head), \ tmp = __af6list_valid(iter->list.next, head); \ &iter->list != (head); \ iter = tmp, tmp = __af6list_valid(iter->list.next, head)) int netlbl_af6list_add(struct netlbl_af6list *entry, struct list_head *head); struct netlbl_af6list *netlbl_af6list_remove(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); void netlbl_af6list_remove_entry(struct netlbl_af6list *entry); struct netlbl_af6list *netlbl_af6list_search(const struct in6_addr *addr, struct list_head *head); struct netlbl_af6list *netlbl_af6list_search_exact(const struct in6_addr *addr, const struct in6_addr *mask, struct list_head *head); #ifdef CONFIG_AUDIT void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask); #else static inline void netlbl_af6list_audit_addr(struct audit_buffer *audit_buf, int src, const char *dev, const struct in6_addr *addr, const struct in6_addr *mask) { } #endif #endif /* IPV6 */ #endif
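/*
 * A sketch of caller-side code for the IPv4 iterators declared above.  The
 * list head and the boolean result are hypothetical, and real users go
 * through netlbl_af4list_search() instead; this only shows the intended
 * pattern: hold rcu_read_lock() around the _rcu iterator, which already
 * skips entries whose ->valid flag is clear.  It would live in a NetLabel
 * .c file that includes "netlabel_addrlist.h".
 */
static bool example_af4list_contains(__be32 addr, struct list_head *head)
{
	struct netlbl_af4list *iter;
	bool match = false;

	rcu_read_lock();
	netlbl_af4list_foreach_rcu(iter, head) {
		if (iter->addr == (addr & iter->mask)) {
			match = true;
			break;
		}
	}
	rcu_read_unlock();

	return match;
}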
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 * TODO: Merge attr_set_size/attr_data_get_block/attr_allocate_frame?
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/kernel.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/*
 * You can set external NTFS_MIN_LOG2_OF_CLUMP/NTFS_MAX_LOG2_OF_CLUMP to manage
 * preallocate algorithm.
 */
#ifndef NTFS_MIN_LOG2_OF_CLUMP
#define NTFS_MIN_LOG2_OF_CLUMP 16
#endif

#ifndef NTFS_MAX_LOG2_OF_CLUMP
#define NTFS_MAX_LOG2_OF_CLUMP 26
#endif

// 16M
#define NTFS_CLUMP_MIN (1 << (NTFS_MIN_LOG2_OF_CLUMP + 8))
// 16G
#define NTFS_CLUMP_MAX (1ull << (NTFS_MAX_LOG2_OF_CLUMP + 8))

static inline u64 get_pre_allocated(u64 size)
{
	u32 clump;
	u8 align_shift;
	u64 ret;

	if (size <= NTFS_CLUMP_MIN) {
		clump = 1 << NTFS_MIN_LOG2_OF_CLUMP;
		align_shift = NTFS_MIN_LOG2_OF_CLUMP;
	} else if (size >= NTFS_CLUMP_MAX) {
		clump = 1 << NTFS_MAX_LOG2_OF_CLUMP;
		align_shift = NTFS_MAX_LOG2_OF_CLUMP;
	} else {
		align_shift = NTFS_MIN_LOG2_OF_CLUMP - 1 +
			      __ffs(size >> (8 + NTFS_MIN_LOG2_OF_CLUMP));
		clump = 1u << align_shift;
	}

	ret = (((size + clump - 1) >> align_shift)) << align_shift;

	return ret;
}

/*
 * attr_load_runs - Load all runs stored in @attr.
 */
static int attr_load_runs(struct ATTRIB *attr, struct ntfs_inode *ni,
			  struct runs_tree *run, const CLST *vcn)
{
	int err;
	CLST svcn = le64_to_cpu(attr->nres.svcn);
	CLST evcn = le64_to_cpu(attr->nres.evcn);
	u32 asize;
	u16 run_off;

	if (svcn >= evcn + 1 || run_is_mapped_full(run, svcn, evcn))
		return 0;

	if (vcn && (evcn < *vcn || *vcn < svcn))
		return -EINVAL;

	asize = le32_to_cpu(attr->size);
	run_off = le16_to_cpu(attr->nres.run_off);

	if (run_off > asize)
		return -EINVAL;

	err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn,
			    vcn ? *vcn : svcn, Add2Ptr(attr, run_off),
			    asize - run_off);
	if (err < 0)
		return err;

	return 0;
}

/*
 * run_deallocate_ex - Deallocate clusters.
*/ static int run_deallocate_ex(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST len, CLST *done, bool trim) { int err = 0; CLST vcn_next, vcn0 = vcn, lcn, clen, dn = 0; size_t idx; if (!len) goto out; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { failed: run_truncate(run, vcn0); err = -EINVAL; goto out; } for (;;) { if (clen > len) clen = len; if (!clen) { err = -EINVAL; goto out; } if (lcn != SPARSE_LCN) { if (sbi) { /* mark bitmap range [lcn + clen) as free and trim clusters. */ mark_as_free_ex(sbi, lcn, clen, trim); } dn += clen; } len -= clen; if (!len) break; vcn_next = vcn + clen; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn != vcn_next) { /* Save memory - don't load entire run. */ goto failed; } } out: if (done) *done += dn; return err; } /* * attr_allocate_clusters - Find free space, mark it as used and store in @run. */ int attr_allocate_clusters(struct ntfs_sb_info *sbi, struct runs_tree *run, CLST vcn, CLST lcn, CLST len, CLST *pre_alloc, enum ALLOCATE_OPT opt, CLST *alen, const size_t fr, CLST *new_lcn, CLST *new_len) { int err; CLST flen, vcn0 = vcn, pre = pre_alloc ? *pre_alloc : 0; size_t cnt = run->count; for (;;) { err = ntfs_look_for_free_space(sbi, lcn, len + pre, &lcn, &flen, opt); if (err == -ENOSPC && pre) { pre = 0; if (*pre_alloc) *pre_alloc = 0; continue; } if (err) goto out; if (vcn == vcn0) { /* Return the first fragment. */ if (new_lcn) *new_lcn = lcn; if (new_len) *new_len = flen; } /* Add new fragment into run storage. */ if (!run_add_entry(run, vcn, lcn, flen, opt & ALLOCATE_MFT)) { /* Undo last 'ntfs_look_for_free_space' */ mark_as_free_ex(sbi, lcn, len, false); err = -ENOMEM; goto out; } if (opt & ALLOCATE_ZERO) { u8 shift = sbi->cluster_bits - SECTOR_SHIFT; err = blkdev_issue_zeroout(sbi->sb->s_bdev, (sector_t)lcn << shift, (sector_t)flen << shift, GFP_NOFS, 0); if (err) goto out; } vcn += flen; if (flen >= len || (opt & ALLOCATE_MFT) || (fr && run->count - cnt >= fr)) { *alen = vcn - vcn0; return 0; } len -= flen; } out: /* Undo 'ntfs_look_for_free_space' */ if (vcn - vcn0) { run_deallocate_ex(sbi, run, vcn0, vcn - vcn0, NULL, false); run_truncate(run, vcn0); } return err; } /* * attr_make_nonresident * * If page is not NULL - it is already contains resident data * and locked (called from ni_write_frame()). */ int attr_make_nonresident(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, struct ATTRIB **ins_attr, struct page *page) { struct ntfs_sb_info *sbi; struct ATTRIB *attr_s; struct MFT_REC *rec; u32 used, asize, rsize, aoff, align; bool is_data; CLST len, alen; char *next; int err; if (attr->non_res) { *ins_attr = attr; return 0; } sbi = mi->sbi; rec = mi->mrec; attr_s = NULL; used = le32_to_cpu(rec->used); asize = le32_to_cpu(attr->size); next = Add2Ptr(attr, asize); aoff = PtrOffset(rec, attr); rsize = le32_to_cpu(attr->res.data_size); is_data = attr->type == ATTR_DATA && !attr->name_len; align = sbi->cluster_size; if (is_attr_compressed(attr)) align <<= COMPRESSION_UNIT; len = (rsize + align - 1) >> sbi->cluster_bits; run_init(run); /* Make a copy of original attribute. */ attr_s = kmemdup(attr, asize, GFP_NOFS); if (!attr_s) { err = -ENOMEM; goto out; } if (!len) { /* Empty resident -> Empty nonresident. */ alen = 0; } else { const char *data = resident_data(attr); err = attr_allocate_clusters(sbi, run, 0, 0, len, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out1; if (!rsize) { /* Empty resident -> Non empty nonresident. 
*/ } else if (!is_data) { err = ntfs_sb_write_run(sbi, run, 0, data, rsize, 0); if (err) goto out2; } else if (!page) { char *kaddr; page = grab_cache_page(ni->vfs_inode.i_mapping, 0); if (!page) { err = -ENOMEM; goto out2; } kaddr = kmap_atomic(page); memcpy(kaddr, data, rsize); memset(kaddr + rsize, 0, PAGE_SIZE - rsize); kunmap_atomic(kaddr); flush_dcache_page(page); SetPageUptodate(page); set_page_dirty(page); unlock_page(page); put_page(page); } } /* Remove original attribute. */ used -= asize; memmove(attr, Add2Ptr(attr, asize), used - aoff); rec->used = cpu_to_le32(used); mi->dirty = true; if (le) al_remove_le(ni, le); err = ni_insert_nonresident(ni, attr_s->type, attr_name(attr_s), attr_s->name_len, run, 0, alen, attr_s->flags, &attr, NULL, NULL); if (err) goto out3; kfree(attr_s); attr->nres.data_size = cpu_to_le64(rsize); attr->nres.valid_size = attr->nres.data_size; *ins_attr = attr; if (is_data) ni->ni_flags &= ~NI_FLAG_RESIDENT; /* Resident attribute becomes non resident. */ return 0; out3: attr = Add2Ptr(rec, aoff); memmove(next, attr, used - aoff); memcpy(attr, attr_s, asize); rec->used = cpu_to_le32(used + asize); mi->dirty = true; out2: /* Undo: do not trim new allocated clusters. */ run_deallocate(sbi, run, false); run_close(run); out1: kfree(attr_s); out: return err; } /* * attr_set_size_res - Helper for attr_set_size(). */ static int attr_set_size_res(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY *le, struct mft_inode *mi, u64 new_size, struct runs_tree *run, struct ATTRIB **ins_attr) { struct ntfs_sb_info *sbi = mi->sbi; struct MFT_REC *rec = mi->mrec; u32 used = le32_to_cpu(rec->used); u32 asize = le32_to_cpu(attr->size); u32 aoff = PtrOffset(rec, attr); u32 rsize = le32_to_cpu(attr->res.data_size); u32 tail = used - aoff - asize; char *next = Add2Ptr(attr, asize); s64 dsize = ALIGN(new_size, 8) - ALIGN(rsize, 8); if (dsize < 0) { memmove(next + dsize, next, tail); } else if (dsize > 0) { if (used + dsize > sbi->max_bytes_per_attr) return attr_make_nonresident(ni, attr, le, mi, new_size, run, ins_attr, NULL); memmove(next + dsize, next, tail); memset(next, 0, dsize); } if (new_size > rsize) memset(Add2Ptr(resident_data(attr), rsize), 0, new_size - rsize); rec->used = cpu_to_le32(used + dsize); attr->size = cpu_to_le32(asize + dsize); attr->res.data_size = cpu_to_le32(new_size); mi->dirty = true; *ins_attr = attr; return 0; } /* * attr_set_size - Change the size of attribute. * * Extend: * - Sparse/compressed: No allocated clusters. * - Normal: Append allocated and preallocated new clusters. * Shrink: * - No deallocate if @keep_prealloc is set. 
*/ int attr_set_size(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 new_size, const u64 *new_valid, bool keep_prealloc, struct ATTRIB **ret) { int err = 0; struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; bool is_mft = ni->mi.rno == MFT_REC_MFT && type == ATTR_DATA && !name_len; u64 old_valid, old_size, old_alloc, new_alloc, new_alloc_tmp; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST alen, vcn, lcn, new_alen, old_alen, svcn, evcn; CLST next_svcn, pre_alloc = -1, done = 0; bool is_ext, is_bad = false; bool dirty = false; u32 align; struct MFT_REC *rec; again: alen = 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto bad_inode; } if (!attr_b->non_res) { err = attr_set_size_res(ni, attr_b, le_b, mi_b, new_size, run, &attr_b); if (err) return err; /* Return if file is still resident. */ if (!attr_b->non_res) { dirty = true; goto ok1; } /* Layout of records may be changed, so do a full search. */ goto again; } is_ext = is_attr_ext(attr_b); align = sbi->cluster_size; if (is_ext) align <<= attr_b->nres.c_unit; old_valid = le64_to_cpu(attr_b->nres.valid_size); old_size = le64_to_cpu(attr_b->nres.data_size); old_alloc = le64_to_cpu(attr_b->nres.alloc_size); again_1: old_alen = old_alloc >> cluster_bits; new_alloc = (new_size + align - 1) & ~(u64)(align - 1); new_alen = new_alloc >> cluster_bits; if (keep_prealloc && new_size < old_size) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = dirty = true; goto ok; } vcn = old_alen - 1; svcn = le64_to_cpu(attr_b->nres.svcn); evcn = le64_to_cpu(attr_b->nres.evcn); if (svcn <= vcn && vcn <= evcn) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &vcn, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } next_le_1: svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } /* * Here we have: * attr,mi,le - last attribute segment (containing 'vcn'). * attr_b,mi_b,le_b - base (primary) attribute segment. */ next_le: rec = mi->mrec; err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; if (new_size > old_size) { CLST to_allocate; size_t free; if (new_alloc <= old_alloc) { attr_b->nres.data_size = cpu_to_le64(new_size); mi_b->dirty = dirty = true; goto ok; } /* * Add clusters. In simple case we have to: * - allocate space (vcn, lcn, len) * - update packed run in 'mi' * - update attr->nres.evcn * - update attr_b->nres.data_size/attr_b->nres.alloc_size */ to_allocate = new_alen - old_alen; add_alloc_in_same_attr_seg: lcn = 0; if (is_mft) { /* MFT allocates clusters from MFT zone. */ pre_alloc = 0; } else if (is_ext) { /* No preallocate for sparse/compress. */ pre_alloc = 0; } else if (pre_alloc == -1) { pre_alloc = 0; if (type == ATTR_DATA && !name_len && sbi->options->prealloc) { pre_alloc = bytes_to_cluster( sbi, get_pre_allocated( new_size)) - new_alen; } /* Get the last LCN to allocate from. 
*/ if (old_alen && !run_lookup_entry(run, vcn, &lcn, NULL, NULL)) { lcn = SPARSE_LCN; } if (lcn == SPARSE_LCN) lcn = 0; else if (lcn) lcn += 1; free = wnd_zeroes(&sbi->used.bitmap); if (to_allocate > free) { err = -ENOSPC; goto out; } if (pre_alloc && to_allocate + pre_alloc > free) pre_alloc = 0; } vcn = old_alen; if (is_ext) { if (!run_add_entry(run, vcn, SPARSE_LCN, to_allocate, false)) { err = -ENOMEM; goto out; } alen = to_allocate; } else { /* ~3 bytes per fragment. */ err = attr_allocate_clusters( sbi, run, vcn, lcn, to_allocate, &pre_alloc, is_mft ? ALLOCATE_MFT : ALLOCATE_DEF, &alen, is_mft ? 0 : (sbi->record_size - le32_to_cpu(rec->used) + 8) / 3 + 1, NULL, NULL); if (err) goto out; } done += alen; vcn += alen; if (to_allocate > alen) to_allocate -= alen; else to_allocate = 0; pack_runs: err = mi_pack_runs(mi, attr, run, vcn - svcn); if (err) goto undo_1; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; new_alloc_tmp = (u64)next_svcn << cluster_bits; attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); mi_b->dirty = dirty = true; if (next_svcn >= vcn && !to_allocate) { /* Normal way. Update attribute and exit. */ attr_b->nres.data_size = cpu_to_le64(new_size); goto ok; } /* At least two MFT to avoid recursive loop. */ if (is_mft && next_svcn == vcn && ((u64)done << sbi->cluster_bits) >= 2 * sbi->record_size) { new_size = new_alloc_tmp; attr_b->nres.data_size = attr_b->nres.alloc_size; goto ok; } if (le32_to_cpu(rec->used) < sbi->record_size) { old_alen = next_svcn; evcn = old_alen - 1; goto add_alloc_in_same_attr_seg; } attr_b->nres.data_size = attr_b->nres.alloc_size; if (new_alloc_tmp < old_valid) attr_b->nres.valid_size = attr_b->nres.data_size; if (type == ATTR_LIST) { err = ni_expand_list(ni); if (err) goto undo_2; if (next_svcn < vcn) goto pack_runs; /* Layout of records is changed. */ goto again; } if (!ni->attr_list.size) { err = ni_create_attr_list(ni); /* In case of error layout of records is not changed. */ if (err) goto undo_2; /* Layout of records is changed. */ } if (next_svcn >= vcn) { /* This is MFT data, repeat. */ goto again; } /* Insert new attribute segment. */ err = ni_insert_nonresident(ni, type, name, name_len, run, next_svcn, vcn - next_svcn, attr_b->flags, &attr, &mi, NULL); /* * Layout of records maybe changed. * Find base attribute to update. */ le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, type, name, name_len, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } if (err) { /* ni_insert_nonresident failed. */ attr = NULL; goto undo_2; } if (!is_mft) run_truncate_head(run, evcn + 1); svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); /* * Attribute is in consistency state. * Save this point to restore to if next steps fail. */ old_valid = old_size = old_alloc = (u64)vcn << cluster_bits; attr_b->nres.valid_size = attr_b->nres.data_size = attr_b->nres.alloc_size = cpu_to_le64(old_size); mi_b->dirty = dirty = true; goto again_1; } if (new_size != old_size || (new_alloc != old_alloc && !keep_prealloc)) { /* * Truncate clusters. In simple case we have to: * - update packed run in 'mi' * - update attr->nres.evcn * - update attr_b->nres.data_size/attr_b->nres.alloc_size * - mark and trim clusters as free (vcn, lcn, len) */ CLST dlen = 0; vcn = max(svcn, new_alen); new_alloc_tmp = (u64)vcn << cluster_bits; if (vcn > svcn) { err = mi_pack_runs(mi, attr, run, vcn - svcn); if (err) goto out; } else if (le && le->vcn) { u16 le_sz = le16_to_cpu(le->size); /* * NOTE: List entries for one attribute are always * the same size. 
We deal with last entry (vcn==0) * and it is not first in entries array * (list entry for std attribute always first). * So it is safe to step back. */ mi_remove_attr(NULL, mi, attr); if (!al_remove_le(ni, le)) { err = -EINVAL; goto bad_inode; } le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); } else { attr->nres.evcn = cpu_to_le64((u64)vcn - 1); mi->dirty = true; } attr_b->nres.alloc_size = cpu_to_le64(new_alloc_tmp); if (vcn == new_alen) { attr_b->nres.data_size = cpu_to_le64(new_size); if (new_size < old_valid) attr_b->nres.valid_size = attr_b->nres.data_size; } else { if (new_alloc_tmp <= le64_to_cpu(attr_b->nres.data_size)) attr_b->nres.data_size = attr_b->nres.alloc_size; if (new_alloc_tmp < le64_to_cpu(attr_b->nres.valid_size)) attr_b->nres.valid_size = attr_b->nres.alloc_size; } mi_b->dirty = dirty = true; err = run_deallocate_ex(sbi, run, vcn, evcn - vcn + 1, &dlen, true); if (err) goto out; if (is_ext) { /* dlen - really deallocated clusters. */ le64_sub_cpu(&attr_b->nres.total_size, ((u64)dlen << cluster_bits)); } run_truncate(run, vcn); if (new_alloc_tmp <= new_alloc) goto ok; old_size = new_alloc_tmp; vcn = svcn - 1; if (le == le_b) { attr = attr_b; mi = mi_b; evcn = svcn - 1; svcn = 0; goto next_le; } if (le->type != type || le->name_len != name_len || memcmp(le_name(le), name, name_len * sizeof(short))) { err = -EINVAL; goto bad_inode; } err = ni_load_mi(ni, le, &mi); if (err) goto out; attr = mi_find_attr(mi, NULL, type, name, name_len, &le->id); if (!attr) { err = -EINVAL; goto bad_inode; } goto next_le_1; } ok: if (new_valid) { __le64 valid = cpu_to_le64(min(*new_valid, new_size)); if (attr_b->nres.valid_size != valid) { attr_b->nres.valid_size = valid; mi_b->dirty = true; } } ok1: if (ret) *ret = attr_b; if (((type == ATTR_DATA && !name_len) || (type == ATTR_ALLOC && name == I30_NAME))) { /* Update inode_set_bytes. */ if (attr_b->non_res) { new_alloc = le64_to_cpu(attr_b->nres.alloc_size); if (inode_get_bytes(&ni->vfs_inode) != new_alloc) { inode_set_bytes(&ni->vfs_inode, new_alloc); dirty = true; } } /* Don't forget to update duplicate information in parent. */ if (dirty) { ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); } } return 0; undo_2: vcn -= alen; attr_b->nres.data_size = cpu_to_le64(old_size); attr_b->nres.valid_size = cpu_to_le64(old_valid); attr_b->nres.alloc_size = cpu_to_le64(old_alloc); /* Restore 'attr' and 'mi'. */ if (attr) goto restore_run; if (le64_to_cpu(attr_b->nres.svcn) <= svcn && svcn <= le64_to_cpu(attr_b->nres.evcn)) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, type, name, name_len, &svcn, &mi); if (!attr) goto bad_inode; } restore_run: if (mi_pack_runs(mi, attr, run, evcn - svcn + 1)) is_bad = true; undo_1: run_deallocate_ex(sbi, run, vcn, alen, NULL, false); run_truncate(run, vcn); out: if (is_bad) { bad_inode: _ntfs_bad_inode(&ni->vfs_inode); } return err; } /* * attr_data_get_block - Returns 'lcn' and 'len' for given 'vcn'. * * @new == NULL means just to get current mapping for 'vcn' * @new != NULL means allocate real cluster if 'vcn' maps to hole * @zero - zeroout new allocated clusters * * NOTE: * - @new != NULL is called only for sparsed or compressed attributes. * - new allocated clusters are zeroed via blkdev_issue_zeroout. 
*/ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn, CLST *len, bool *new, bool zero) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi; u8 cluster_bits; struct ATTRIB *attr, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen; CLST alloc, evcn; unsigned fr; u64 total_size, total_size0; int step = 0; if (new) *new = false; /* Try to find in cache. */ down_read(&ni->file.run_lock); if (!run_lookup_entry(run, vcn, lcn, len, NULL)) *len = 0; up_read(&ni->file.run_lock); if (*len && (*lcn != SPARSE_LCN || !new)) return 0; /* Fast normal way without allocation. */ /* No cluster in cache or we need to allocate cluster in hole. */ sbi = ni->mi.sbi; cluster_bits = sbi->cluster_bits; ni_lock(ni); down_write(&ni->file.run_lock); /* Repeat the code above (under write lock). */ if (!run_lookup_entry(run, vcn, lcn, len, NULL)) *len = 0; if (*len) { if (*lcn != SPARSE_LCN || !new) goto out; /* normal way without allocation. */ if (clen > *len) clen = *len; } le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } if (!attr_b->non_res) { *lcn = RESIDENT_LCN; *len = 1; goto out; } asize = le64_to_cpu(attr_b->nres.alloc_size) >> cluster_bits; if (vcn >= asize) { if (new) { err = -EINVAL; } else { *len = 1; *lcn = SPARSE_LCN; } goto out; } svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; attr = attr_b; le = le_b; mi = mi_b; if (le_b && (vcn < svcn || evcn1 <= vcn)) { attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } /* Load in cache actual information. */ err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; if (!*len) { if (run_lookup_entry(run, vcn, lcn, len, NULL)) { if (*lcn != SPARSE_LCN || !new) goto ok; /* Slow normal way without allocation. */ if (clen > *len) clen = *len; } else if (!new) { /* Here we may return -ENOENT. * In any case caller gets zero length. */ goto ok; } } if (!is_attr_ext(attr_b)) { /* The code below only for sparsed or compressed attributes. */ err = -EINVAL; goto out; } vcn0 = vcn; to_alloc = clen; fr = (sbi->record_size - le32_to_cpu(mi->mrec->used) + 8) / 3 + 1; /* Allocate frame aligned clusters. * ntfs.sys usually uses 16 clusters per frame for sparsed or compressed. * ntfs3 uses 1 cluster per frame for new created sparsed files. */ if (attr_b->nres.c_unit) { CLST clst_per_frame = 1u << attr_b->nres.c_unit; CLST cmask = ~(clst_per_frame - 1); /* Get frame aligned vcn and to_alloc. */ vcn = vcn0 & cmask; to_alloc = ((vcn0 + clen + clst_per_frame - 1) & cmask) - vcn; if (fr < clst_per_frame) fr = clst_per_frame; zero = true; /* Check if 'vcn' and 'vcn0' in different attribute segments. */ if (vcn < svcn || evcn1 <= vcn) { /* Load attribute for truncated vcn. */ attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; } } if (vcn + to_alloc > asize) to_alloc = asize - vcn; /* Get the last LCN to allocate from. 
*/ hint = 0; if (vcn > evcn1) { if (!run_add_entry(run, evcn1, SPARSE_LCN, vcn - evcn1, false)) { err = -ENOMEM; goto out; } } else if (vcn && !run_lookup_entry(run, vcn - 1, &hint, NULL, NULL)) { hint = -1; } /* Allocate and zeroout new clusters. */ err = attr_allocate_clusters(sbi, run, vcn, hint + 1, to_alloc, NULL, zero ? ALLOCATE_ZERO : ALLOCATE_DEF, &alen, fr, lcn, len); if (err) goto out; *new = true; step = 1; end = vcn + alen; /* Save 'total_size0' to restore if error. */ total_size0 = le64_to_cpu(attr_b->nres.total_size); total_size = total_size0 + ((u64)alen << cluster_bits); if (vcn != vcn0) { if (!run_lookup_entry(run, vcn0, lcn, len, NULL)) { err = -EINVAL; goto out; } if (*lcn == SPARSE_LCN) { /* Internal error. Should not happened. */ WARN_ON(1); err = -EINVAL; goto out; } /* Check case when vcn0 + len overlaps new allocated clusters. */ if (vcn0 + *len > end) *len = end - vcn0; } repack: err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); if (err) goto out; attr_b->nres.total_size = cpu_to_le64(total_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mi_b->dirty = true; mark_inode_dirty(&ni->vfs_inode); /* Stored [vcn : next_svcn) from [vcn : end). */ next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (end <= evcn1) { if (next_svcn == evcn1) { /* Normal way. Update attribute and exit. */ goto ok; } /* Add new segment [next_svcn : evcn1 - next_svcn). */ if (!ni->attr_list.size) { err = ni_create_attr_list(ni); if (err) goto undo1; /* Layout of records is changed. */ le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } attr = attr_b; le = le_b; mi = mi_b; goto repack; } } /* * The code below may require additional cluster (to extend attribute list) * and / or one MFT record * It is too complex to undo operations if -ENOSPC occurs deep inside * in 'ni_insert_nonresident'. * Return in advance -ENOSPC here if there are no free cluster and no free MFT. */ if (!ntfs_check_for_free_space(sbi, 1, 1)) { /* Undo step 1. */ err = -ENOSPC; goto undo1; } step = 2; svcn = evcn1; /* Estimate next attribute. */ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); if (!attr) { /* Insert new attribute segment. */ goto ins_ext; } /* Try to update existed attribute segment. */ alloc = bytes_to_cluster(sbi, le64_to_cpu(attr_b->nres.alloc_size)); evcn = le64_to_cpu(attr->nres.evcn); if (end < next_svcn) end = next_svcn; while (end > evcn) { /* Remove segment [svcn : evcn). */ mi_remove_attr(NULL, mi, attr); if (!al_remove_le(ni, le)) { err = -EINVAL; goto out; } if (evcn + 1 >= alloc) { /* Last attribute segment. 
*/ evcn1 = evcn + 1; goto ins_ext; } if (ni_load_mi(ni, le, &mi)) { attr = NULL; goto out; } attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } if (end < svcn) end = svcn; err = attr_load_runs(attr, ni, run, &end); if (err) goto out; evcn1 = evcn + 1; attr->nres.svcn = cpu_to_le64(next_svcn); err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); if (err) goto out; le->vcn = cpu_to_le64(next_svcn); ni->attr_list.dirty = true; mi->dirty = true; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, attr_b->flags, &attr, &mi, NULL); if (err) goto out; } ok: run_truncate_around(run, vcn); out: if (err && step > 1) { /* Too complex to restore. */ _ntfs_bad_inode(&ni->vfs_inode); } up_write(&ni->file.run_lock); ni_unlock(ni); return err; undo1: /* Undo step1. */ attr_b->nres.total_size = cpu_to_le64(total_size0); inode_set_bytes(&ni->vfs_inode, total_size0); if (run_deallocate_ex(sbi, run, vcn, alen, NULL, false) || !run_add_entry(run, vcn, SPARSE_LCN, alen, false) || mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn)) { _ntfs_bad_inode(&ni->vfs_inode); } goto out; } int attr_data_read_resident(struct ntfs_inode *ni, struct page *page) { u64 vbo; struct ATTRIB *attr; u32 data_size; attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, NULL); if (!attr) return -EINVAL; if (attr->non_res) return E_NTFS_NONRESIDENT; vbo = page->index << PAGE_SHIFT; data_size = le32_to_cpu(attr->res.data_size); if (vbo < data_size) { const char *data = resident_data(attr); char *kaddr = kmap_atomic(page); u32 use = data_size - vbo; if (use > PAGE_SIZE) use = PAGE_SIZE; memcpy(kaddr, data + vbo, use); memset(kaddr + use, 0, PAGE_SIZE - use); kunmap_atomic(kaddr); flush_dcache_page(page); SetPageUptodate(page); } else if (!PageUptodate(page)) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } return 0; } int attr_data_write_resident(struct ntfs_inode *ni, struct page *page) { u64 vbo; struct mft_inode *mi; struct ATTRIB *attr; u32 data_size; attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); if (!attr) return -EINVAL; if (attr->non_res) { /* Return special error code to check this case. */ return E_NTFS_NONRESIDENT; } vbo = page->index << PAGE_SHIFT; data_size = le32_to_cpu(attr->res.data_size); if (vbo < data_size) { char *data = resident_data(attr); char *kaddr = kmap_atomic(page); u32 use = data_size - vbo; if (use > PAGE_SIZE) use = PAGE_SIZE; memcpy(data + vbo, kaddr, use); kunmap_atomic(kaddr); mi->dirty = true; } ni->i_valid = data_size; return 0; } /* * attr_load_runs_vcn - Load runs with VCN. */ int attr_load_runs_vcn(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, CLST vcn) { struct ATTRIB *attr; int err; CLST svcn, evcn; u16 ro; if (!ni) { /* Is record corrupted? */ return -ENOENT; } attr = ni_find_attr(ni, NULL, NULL, type, name, name_len, &vcn, NULL); if (!attr) { /* Is record corrupted? */ return -ENOENT; } svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); if (evcn < vcn || vcn < svcn) { /* Is record corrupted? 
*/ return -EINVAL; } ro = le16_to_cpu(attr->nres.run_off); if (ro > le32_to_cpu(attr->size)) return -EINVAL; err = run_unpack_ex(run, ni->mi.sbi, ni->mi.rno, svcn, evcn, svcn, Add2Ptr(attr, ro), le32_to_cpu(attr->size) - ro); if (err < 0) return err; return 0; } /* * attr_load_runs_range - Load runs for given range [from to). */ int attr_load_runs_range(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name, u8 name_len, struct runs_tree *run, u64 from, u64 to) { struct ntfs_sb_info *sbi = ni->mi.sbi; u8 cluster_bits = sbi->cluster_bits; CLST vcn; CLST vcn_last = (to - 1) >> cluster_bits; CLST lcn, clen; int err; for (vcn = from >> cluster_bits; vcn <= vcn_last; vcn += clen) { if (!run_lookup_entry(run, vcn, &lcn, &clen, NULL)) { err = attr_load_runs_vcn(ni, type, name, name_len, run, vcn); if (err) return err; clen = 0; /* Next run_lookup_entry(vcn) must be success. */ } } return 0; } #ifdef CONFIG_NTFS3_LZX_XPRESS /* * attr_wof_frame_info * * Read header of Xpress/LZX file to get info about frame. */ int attr_wof_frame_info(struct ntfs_inode *ni, struct ATTRIB *attr, struct runs_tree *run, u64 frame, u64 frames, u8 frame_bits, u32 *ondisk_size, u64 *vbo_data) { struct ntfs_sb_info *sbi = ni->mi.sbi; u64 vbo[2], off[2], wof_size; u32 voff; u8 bytes_per_off; char *addr; struct page *page; int i, err; __le32 *off32; __le64 *off64; if (ni->vfs_inode.i_size < 0x100000000ull) { /* File starts with array of 32 bit offsets. */ bytes_per_off = sizeof(__le32); vbo[1] = frame << 2; *vbo_data = frames << 2; } else { /* File starts with array of 64 bit offsets. */ bytes_per_off = sizeof(__le64); vbo[1] = frame << 3; *vbo_data = frames << 3; } /* * Read 4/8 bytes at [vbo - 4(8)] == offset where compressed frame starts. * Read 4/8 bytes at [vbo] == offset where compressed frame ends. */ if (!attr->non_res) { if (vbo[1] + bytes_per_off > le32_to_cpu(attr->res.data_size)) { ntfs_inode_err(&ni->vfs_inode, "is corrupted"); return -EINVAL; } addr = resident_data(attr); if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, vbo[1]); off[0] = vbo[1] ? le32_to_cpu(off32[-1]) : 0; off[1] = le32_to_cpu(off32[0]); } else { off64 = Add2Ptr(addr, vbo[1]); off[0] = vbo[1] ? 
le64_to_cpu(off64[-1]) : 0; off[1] = le64_to_cpu(off64[0]); } *vbo_data += off[0]; *ondisk_size = off[1] - off[0]; return 0; } wof_size = le64_to_cpu(attr->nres.data_size); down_write(&ni->file.run_lock); page = ni->file.offs_page; if (!page) { page = alloc_page(GFP_KERNEL); if (!page) { err = -ENOMEM; goto out; } page->index = -1; ni->file.offs_page = page; } lock_page(page); addr = page_address(page); if (vbo[1]) { voff = vbo[1] & (PAGE_SIZE - 1); vbo[0] = vbo[1] - bytes_per_off; i = 0; } else { voff = 0; vbo[0] = 0; off[0] = 0; i = 1; } do { pgoff_t index = vbo[i] >> PAGE_SHIFT; if (index != page->index) { u64 from = vbo[i] & ~(u64)(PAGE_SIZE - 1); u64 to = min(from + PAGE_SIZE, wof_size); err = attr_load_runs_range(ni, ATTR_DATA, WOF_NAME, ARRAY_SIZE(WOF_NAME), run, from, to); if (err) goto out1; err = ntfs_bio_pages(sbi, run, &page, 1, from, to - from, REQ_OP_READ); if (err) { page->index = -1; goto out1; } page->index = index; } if (i) { if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, voff); off[1] = le32_to_cpu(*off32); } else { off64 = Add2Ptr(addr, voff); off[1] = le64_to_cpu(*off64); } } else if (!voff) { if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, PAGE_SIZE - sizeof(u32)); off[0] = le32_to_cpu(*off32); } else { off64 = Add2Ptr(addr, PAGE_SIZE - sizeof(u64)); off[0] = le64_to_cpu(*off64); } } else { /* Two values in one page. */ if (bytes_per_off == sizeof(__le32)) { off32 = Add2Ptr(addr, voff); off[0] = le32_to_cpu(off32[-1]); off[1] = le32_to_cpu(off32[0]); } else { off64 = Add2Ptr(addr, voff); off[0] = le64_to_cpu(off64[-1]); off[1] = le64_to_cpu(off64[0]); } break; } } while (++i < 2); *vbo_data += off[0]; *ondisk_size = off[1] - off[0]; out1: unlock_page(page); out: up_write(&ni->file.run_lock); return err; } #endif /* * attr_is_frame_compressed - Used to detect compressed frame. */ int attr_is_frame_compressed(struct ntfs_inode *ni, struct ATTRIB *attr, CLST frame, CLST *clst_data) { int err; u32 clst_frame; CLST clen, lcn, vcn, alen, slen, vcn_next; size_t idx; struct runs_tree *run; *clst_data = 0; if (!is_attr_compressed(attr)) return 0; if (!attr->non_res) return 0; clst_frame = 1u << attr->nres.c_unit; vcn = frame * clst_frame; run = &ni->file.run; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) { err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), attr->name_len, run, vcn); if (err) return err; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) return -EINVAL; } if (lcn == SPARSE_LCN) { /* Sparsed frame. */ return 0; } if (clen >= clst_frame) { /* * The frame is not compressed 'cause * it does not contain any sparse clusters. */ *clst_data = clst_frame; return 0; } alen = bytes_to_cluster(ni->mi.sbi, le64_to_cpu(attr->nres.alloc_size)); slen = 0; *clst_data = clen; /* * The frame is compressed if *clst_data + slen >= clst_frame. * Check next fragments. */ while ((vcn += clen) < alen) { vcn_next = vcn; if (!run_get_entry(run, ++idx, &vcn, &lcn, &clen) || vcn_next != vcn) { err = attr_load_runs_vcn(ni, attr->type, attr_name(attr), attr->name_len, run, vcn_next); if (err) return err; vcn = vcn_next; if (!run_lookup_entry(run, vcn, &lcn, &clen, &idx)) return -EINVAL; } if (lcn == SPARSE_LCN) { slen += clen; } else { if (slen) { /* * Data_clusters + sparse_clusters = * not enough for frame. */ return -EINVAL; } *clst_data += clen; } if (*clst_data + slen >= clst_frame) { if (!slen) { /* * There is no sparsed clusters in this frame * so it is not compressed. */ *clst_data = clst_frame; } else { /* Frame is compressed. 
*/ } break; } } return 0; } /* * attr_allocate_frame - Allocate/free clusters for @frame. * * Assumed: down_write(&ni->file.run_lock); */ int attr_allocate_frame(struct ntfs_inode *ni, CLST frame, size_t compr_size, u64 new_valid) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST svcn, evcn1, next_svcn, len; CLST vcn, end, clst_data; u64 total_size, valid_size, data_size; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!is_attr_ext(attr_b)) return -EINVAL; vcn = frame << NTFS_LZNT_CUNIT; total_size = le64_to_cpu(attr_b->nres.total_size); svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; data_size = le64_to_cpu(attr_b->nres.data_size); if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto out; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; err = attr_is_frame_compressed(ni, attr_b, frame, &clst_data); if (err) goto out; total_size -= (u64)clst_data << sbi->cluster_bits; len = bytes_to_cluster(sbi, compr_size); if (len == clst_data) goto out; if (len < clst_data) { err = run_deallocate_ex(sbi, run, vcn + len, clst_data - len, NULL, true); if (err) goto out; if (!run_add_entry(run, vcn + len, SPARSE_LCN, clst_data - len, false)) { err = -ENOMEM; goto out; } end = vcn + clst_data; /* Run contains updated range [vcn + len : end). */ } else { CLST alen, hint = 0; /* Get the last LCN to allocate from. */ if (vcn + clst_data && !run_lookup_entry(run, vcn + clst_data - 1, &hint, NULL, NULL)) { hint = -1; } err = attr_allocate_clusters(sbi, run, vcn + clst_data, hint + 1, len - clst_data, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out; end = vcn + len; /* Run contains updated range [vcn + clst_data : end). */ } total_size += (u64)len << sbi->cluster_bits; repack: err = mi_pack_runs(mi, attr, run, max(end, evcn1) - svcn); if (err) goto out; attr_b->nres.total_size = cpu_to_le64(total_size); inode_set_bytes(&ni->vfs_inode, total_size); mi_b->dirty = true; mark_inode_dirty(&ni->vfs_inode); /* Stored [vcn : next_svcn) from [vcn : end). */ next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (end <= evcn1) { if (next_svcn == evcn1) { /* Normal way. Update attribute and exit. */ goto ok; } /* Add new segment [next_svcn : evcn1 - next_svcn). */ if (!ni->attr_list.size) { err = ni_create_attr_list(ni); if (err) goto out; /* Layout of records is changed. */ le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } attr = attr_b; le = le_b; mi = mi_b; goto repack; } } svcn = evcn1; /* Estimate next attribute. */ attr = ni_find_attr(ni, attr, &le, ATTR_DATA, NULL, 0, &svcn, &mi); if (attr) { CLST alloc = bytes_to_cluster( sbi, le64_to_cpu(attr_b->nres.alloc_size)); CLST evcn = le64_to_cpu(attr->nres.evcn); if (end < next_svcn) end = next_svcn; while (end > evcn) { /* Remove segment [svcn : evcn). */ mi_remove_attr(NULL, mi, attr); if (!al_remove_le(ni, le)) { err = -EINVAL; goto out; } if (evcn + 1 >= alloc) { /* Last attribute segment. 
*/ evcn1 = evcn + 1; goto ins_ext; } if (ni_load_mi(ni, le, &mi)) { attr = NULL; goto out; } attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn = le64_to_cpu(attr->nres.evcn); } if (end < svcn) end = svcn; err = attr_load_runs(attr, ni, run, &end); if (err) goto out; evcn1 = evcn + 1; attr->nres.svcn = cpu_to_le64(next_svcn); err = mi_pack_runs(mi, attr, run, evcn1 - next_svcn); if (err) goto out; le->vcn = cpu_to_le64(next_svcn); ni->attr_list.dirty = true; mi->dirty = true; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; } ins_ext: if (evcn1 > next_svcn) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, attr_b->flags, &attr, &mi, NULL); if (err) goto out; } ok: run_truncate_around(run, vcn); out: if (attr_b) { if (new_valid > data_size) new_valid = data_size; valid_size = le64_to_cpu(attr_b->nres.valid_size); if (new_valid != valid_size) { attr_b->nres.valid_size = cpu_to_le64(valid_size); mi_b->dirty = true; } } return err; } /* * attr_collapse_range - Collapse range in file. */ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST svcn, evcn1, len, dealloc, alen; CLST vcn, end; u64 valid_size, data_size, alloc_size, total_size; u32 mask; __le16 a_flags; if (!bytes) return 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!attr_b->non_res) { /* Attribute is resident. Nothing to do? */ return 0; } data_size = le64_to_cpu(attr_b->nres.data_size); alloc_size = le64_to_cpu(attr_b->nres.alloc_size); a_flags = attr_b->flags; if (is_attr_ext(attr_b)) { total_size = le64_to_cpu(attr_b->nres.total_size); mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; } else { total_size = alloc_size; mask = sbi->cluster_mask; } if ((vbo & mask) || (bytes & mask)) { /* Allow to collapse only cluster aligned ranges. */ return -EINVAL; } if (vbo > data_size) return -EINVAL; down_write(&ni->file.run_lock); if (vbo + bytes >= data_size) { u64 new_valid = min(ni->i_valid, vbo); /* Simple truncate file at 'vbo'. */ truncate_setsize(&ni->vfs_inode, vbo); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, vbo, &new_valid, true, NULL); if (!err && new_valid < ni->i_valid) ni->i_valid = new_valid; goto out; } /* * Enumerate all attribute segments and collapse. */ alen = alloc_size >> sbi->cluster_bits; vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; end = vcn + len; dealloc = 0; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto out; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto out; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } for (;;) { if (svcn >= end) { /* Shift VCN- */ attr->nres.svcn = cpu_to_le64(svcn - len); attr->nres.evcn = cpu_to_le64(evcn1 - 1 - len); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } mi->dirty = true; } else if (svcn < vcn || end < evcn1) { CLST vcn1, eat, next_svcn; /* Collapse a part of this attribute segment. 
*/ err = attr_load_runs(attr, ni, run, &svcn); if (err) goto out; vcn1 = max(vcn, svcn); eat = min(end, evcn1) - vcn1; err = run_deallocate_ex(sbi, run, vcn1, eat, &dealloc, true); if (err) goto out; if (!run_collapse_range(run, vcn1, eat)) { err = -ENOMEM; goto out; } if (svcn >= vcn) { /* Shift VCN */ attr->nres.svcn = cpu_to_le64(vcn); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } } err = mi_pack_runs(mi, attr, run, evcn1 - svcn - eat); if (err) goto out; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (next_svcn + eat < evcn1) { err = ni_insert_nonresident( ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - eat - next_svcn, a_flags, &attr, &mi, &le); if (err) goto out; /* Layout of records maybe changed. */ attr_b = NULL; } /* Free all allocated memory. */ run_truncate(run, 0); } else { u16 le_sz; u16 roff = le16_to_cpu(attr->nres.run_off); if (roff > le32_to_cpu(attr->size)) { err = -EINVAL; goto out; } run_unpack_ex(RUN_DEALLOCATE, sbi, ni->mi.rno, svcn, evcn1 - 1, svcn, Add2Ptr(attr, roff), le32_to_cpu(attr->size) - roff); /* Delete this attribute segment. */ mi_remove_attr(NULL, mi, attr); if (!le) break; le_sz = le16_to_cpu(le->size); if (!al_remove_le(ni, le)) { err = -EINVAL; goto out; } if (evcn1 >= alen) break; if (!svcn) { /* Load next record that contains this attribute. */ if (ni_load_mi(ni, le, &mi)) { err = -EINVAL; goto out; } /* Look for required attribute. */ attr = mi_find_attr(mi, NULL, ATTR_DATA, NULL, 0, &le->id); if (!attr) { err = -EINVAL; goto out; } goto next_attr; } le = (struct ATTR_LIST_ENTRY *)((u8 *)le - le_sz); } if (evcn1 >= alen) break; attr = ni_enum_attr_ex(ni, attr, &le, &mi); if (!attr) { err = -EINVAL; goto out; } next_attr: svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } if (!attr_b) { le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -ENOENT; goto out; } } data_size -= bytes; valid_size = ni->i_valid; if (vbo + bytes <= valid_size) valid_size -= bytes; else if (vbo < valid_size) valid_size = vbo; attr_b->nres.alloc_size = cpu_to_le64(alloc_size - bytes); attr_b->nres.data_size = cpu_to_le64(data_size); attr_b->nres.valid_size = cpu_to_le64(min(valid_size, data_size)); total_size -= (u64)dealloc << sbi->cluster_bits; if (is_attr_ext(attr_b)) attr_b->nres.total_size = cpu_to_le64(total_size); mi_b->dirty = true; /* Update inode size. */ ni->i_valid = valid_size; i_size_write(&ni->vfs_inode, data_size); inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); out: up_write(&ni->file.run_lock); if (err) _ntfs_bad_inode(&ni->vfs_inode); return err; } /* * attr_punch_hole * * Not for normal files. 
*/ int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST svcn, evcn1, vcn, len, end, alen, hole, next_svcn; u64 total_size, alloc_size; u32 mask; __le16 a_flags; struct runs_tree run2; if (!bytes) return 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!attr_b->non_res) { u32 data_size = le32_to_cpu(attr_b->res.data_size); u32 from, to; if (vbo > data_size) return 0; from = vbo; to = min_t(u64, vbo + bytes, data_size); memset(Add2Ptr(resident_data(attr_b), from), 0, to - from); return 0; } if (!is_attr_ext(attr_b)) return -EOPNOTSUPP; alloc_size = le64_to_cpu(attr_b->nres.alloc_size); total_size = le64_to_cpu(attr_b->nres.total_size); if (vbo >= alloc_size) { /* NOTE: It is allowed. */ return 0; } mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; bytes += vbo; if (bytes > alloc_size) bytes = alloc_size; bytes -= vbo; if ((vbo & mask) || (bytes & mask)) { /* We have to zero a range(s). */ if (frame_size == NULL) { /* Caller insists range is aligned. */ return -EINVAL; } *frame_size = mask + 1; return E_NTFS_NOTALIGNED; } down_write(&ni->file.run_lock); run_init(&run2); run_truncate(run, 0); /* * Enumerate all attribute segments and punch hole where necessary. */ alen = alloc_size >> sbi->cluster_bits; vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; end = vcn + len; hole = 0; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; a_flags = attr_b->flags; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } while (svcn < end) { CLST vcn1, zero, hole2 = hole; err = attr_load_runs(attr, ni, run, &svcn); if (err) goto done; vcn1 = max(vcn, svcn); zero = min(end, evcn1) - vcn1; /* * Check range [vcn1 + zero). * Calculate how many clusters there are. * Don't do any destructive actions. */ err = run_deallocate_ex(NULL, run, vcn1, zero, &hole2, false); if (err) goto done; /* Check if required range is already hole. */ if (hole2 == hole) goto next_attr; /* Make a clone of run to undo. */ err = run_clone(run, &run2); if (err) goto done; /* Make a hole range (sparse) [vcn1 + zero). */ if (!run_add_entry(run, vcn1, SPARSE_LCN, zero, false)) { err = -ENOMEM; goto done; } /* Update run in attribute segment. */ err = mi_pack_runs(mi, attr, run, evcn1 - svcn); if (err) goto done; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; if (next_svcn < evcn1) { /* Insert new attribute segment. */ err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 - next_svcn, a_flags, &attr, &mi, &le); if (err) goto undo_punch; /* Layout of records maybe changed. */ attr_b = NULL; } /* Real deallocate. Should not fail. */ run_deallocate_ex(sbi, &run2, vcn1, zero, &hole, true); next_attr: /* Free all allocated memory. */ run_truncate(run, 0); if (evcn1 >= alen) break; /* Get next attribute segment. 
*/ attr = ni_enum_attr_ex(ni, attr, &le, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } done: if (!hole) goto out; if (!attr_b) { attr_b = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } } total_size -= (u64)hole << sbi->cluster_bits; attr_b->nres.total_size = cpu_to_le64(total_size); mi_b->dirty = true; /* Update inode size. */ inode_set_bytes(&ni->vfs_inode, total_size); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); out: run_close(&run2); up_write(&ni->file.run_lock); return err; bad_inode: _ntfs_bad_inode(&ni->vfs_inode); goto out; undo_punch: /* * Restore packed runs. * 'mi_pack_runs' should not fail, cause we restore original. */ if (mi_pack_runs(mi, attr, &run2, evcn1 - svcn)) goto bad_inode; goto done; } /* * attr_insert_range - Insert range (hole) in file. * Not for normal files. */ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes) { int err = 0; struct runs_tree *run = &ni->file.run; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *attr = NULL, *attr_b; struct ATTR_LIST_ENTRY *le, *le_b; struct mft_inode *mi, *mi_b; CLST vcn, svcn, evcn1, len, next_svcn; u64 data_size, alloc_size; u32 mask; __le16 a_flags; if (!bytes) return 0; le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) return -ENOENT; if (!is_attr_ext(attr_b)) { /* It was checked above. See fallocate. */ return -EOPNOTSUPP; } if (!attr_b->non_res) { data_size = le32_to_cpu(attr_b->res.data_size); alloc_size = data_size; mask = sbi->cluster_mask; /* cluster_size - 1 */ } else { data_size = le64_to_cpu(attr_b->nres.data_size); alloc_size = le64_to_cpu(attr_b->nres.alloc_size); mask = (sbi->cluster_size << attr_b->nres.c_unit) - 1; } if (vbo > data_size) { /* Insert range after the file size is not allowed. */ return -EINVAL; } if ((vbo & mask) || (bytes & mask)) { /* Allow to insert only frame aligned ranges. */ return -EINVAL; } /* * valid_size <= data_size <= alloc_size * Check alloc_size for maximum possible. */ if (bytes > sbi->maxbytes_sparse - alloc_size) return -EFBIG; vcn = vbo >> sbi->cluster_bits; len = bytes >> sbi->cluster_bits; down_write(&ni->file.run_lock); if (!attr_b->non_res) { err = attr_set_size(ni, ATTR_DATA, NULL, 0, run, data_size + bytes, NULL, false, NULL); le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } if (err) goto out; if (!attr_b->non_res) { /* Still resident. */ char *data = Add2Ptr(attr_b, le16_to_cpu(attr_b->res.data_off)); memmove(data + bytes, data, bytes); memset(data, 0, bytes); goto done; } /* Resident files becomes nonresident. */ data_size = le64_to_cpu(attr_b->nres.data_size); alloc_size = le64_to_cpu(attr_b->nres.alloc_size); } /* * Enumerate all attribute segments and shift start vcn. */ a_flags = attr_b->flags; svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { err = -EINVAL; goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { err = -EINVAL; goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } run_truncate(run, 0); /* clear cached values. 
*/ err = attr_load_runs(attr, ni, run, NULL); if (err) goto out; if (!run_insert_range(run, vcn, len)) { err = -ENOMEM; goto out; } /* Try to pack in current record as much as possible. */ err = mi_pack_runs(mi, attr, run, evcn1 + len - svcn); if (err) goto out; next_svcn = le64_to_cpu(attr->nres.evcn) + 1; while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && attr->type == ATTR_DATA && !attr->name_len) { le64_add_cpu(&attr->nres.svcn, len); le64_add_cpu(&attr->nres.evcn, len); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } mi->dirty = true; } if (next_svcn < evcn1 + len) { err = ni_insert_nonresident(ni, ATTR_DATA, NULL, 0, run, next_svcn, evcn1 + len - next_svcn, a_flags, NULL, NULL, NULL); le_b = NULL; attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b); if (!attr_b) { err = -EINVAL; goto bad_inode; } if (err) { /* ni_insert_nonresident failed. Try to undo. */ goto undo_insert_range; } } /* * Update primary attribute segment. */ if (vbo <= ni->i_valid) ni->i_valid += bytes; attr_b->nres.data_size = cpu_to_le64(data_size + bytes); attr_b->nres.alloc_size = cpu_to_le64(alloc_size + bytes); /* ni->valid may be not equal valid_size (temporary). */ if (ni->i_valid > data_size + bytes) attr_b->nres.valid_size = attr_b->nres.data_size; else attr_b->nres.valid_size = cpu_to_le64(ni->i_valid); mi_b->dirty = true; done: i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes); ni->ni_flags |= NI_FLAG_UPDATE_PARENT; mark_inode_dirty(&ni->vfs_inode); out: run_truncate(run, 0); /* clear cached values. */ up_write(&ni->file.run_lock); return err; bad_inode: _ntfs_bad_inode(&ni->vfs_inode); goto out; undo_insert_range: svcn = le64_to_cpu(attr_b->nres.svcn); evcn1 = le64_to_cpu(attr_b->nres.evcn) + 1; if (svcn <= vcn && vcn < evcn1) { attr = attr_b; le = le_b; mi = mi_b; } else if (!le_b) { goto bad_inode; } else { le = le_b; attr = ni_find_attr(ni, attr_b, &le, ATTR_DATA, NULL, 0, &vcn, &mi); if (!attr) { goto bad_inode; } svcn = le64_to_cpu(attr->nres.svcn); evcn1 = le64_to_cpu(attr->nres.evcn) + 1; } if (attr_load_runs(attr, ni, run, NULL)) goto bad_inode; if (!run_collapse_range(run, vcn, len)) goto bad_inode; if (mi_pack_runs(mi, attr, run, evcn1 + len - svcn)) goto bad_inode; while ((attr = ni_enum_attr_ex(ni, attr, &le, &mi)) && attr->type == ATTR_DATA && !attr->name_len) { le64_sub_cpu(&attr->nres.svcn, len); le64_sub_cpu(&attr->nres.evcn, len); if (le) { le->vcn = attr->nres.svcn; ni->attr_list.dirty = true; } mi->dirty = true; } goto out; }
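/*
 * Illustrative sketch (not part of the ntfs3 sources above): the helpers in
 * this file all operate on "runs" - extents mapping a range of virtual
 * cluster numbers (VCN) of an attribute to logical cluster numbers (LCN) on
 * disk, with SPARSE_LCN marking holes. The standalone userspace sketch below
 * uses made-up names (ex_run, ex_run_lookup, EX_SPARSE_LCN) and a plain
 * array instead of the kernel's runs_tree, but shows the lookup idea that
 * run_lookup_entry() provides to callers such as attr_data_get_block():
 * given a VCN, find the extent covering it and report the corresponding LCN
 * and how many clusters remain in that extent.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define EX_SPARSE_LCN UINT64_MAX	/* hole marker, similar in spirit to SPARSE_LCN */

struct ex_run {		/* one extent: [vcn, vcn + len) maps to lcn */
	uint64_t vcn;
	uint64_t lcn;
	uint64_t len;
};

/* Map @vcn to an LCN; also report how many clusters the mapping still covers. */
static bool ex_run_lookup(const struct ex_run *runs, size_t nr, uint64_t vcn,
			  uint64_t *lcn, uint64_t *clen)
{
	for (size_t i = 0; i < nr; i++) {
		if (vcn >= runs[i].vcn && vcn < runs[i].vcn + runs[i].len) {
			uint64_t off = vcn - runs[i].vcn;

			*lcn = runs[i].lcn == EX_SPARSE_LCN ?
				       EX_SPARSE_LCN : runs[i].lcn + off;
			*clen = runs[i].len - off;
			return true;
		}
	}
	return false;	/* VCN not mapped/cached */
}

int main(void)
{
	/* Clusters 0..3 at LCN 100, 4..9 a sparse hole, 10..11 at LCN 50. */
	const struct ex_run runs[] = {
		{ 0, 100, 4 },
		{ 4, EX_SPARSE_LCN, 6 },
		{ 10, 50, 2 },
	};
	uint64_t lcn, clen;

	if (ex_run_lookup(runs, 3, 6, &lcn, &clen))
		printf("vcn 6 -> %s, %llu clusters left in this extent\n",
		       lcn == EX_SPARSE_LCN ? "hole" : "allocated",
		       (unsigned long long)clen);
	return 0;
}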
// SPDX-License-Identifier: GPL-2.0-only /* tunnel4.c: Generic IP tunnel transformer. * * Copyright (C) 2003 David S. Miller (davem@redhat.com) */ #include <linux/init.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/mpls.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/icmp.h> #include <net/ip.h> #include <net/protocol.h> #include <net/xfrm.h> static struct xfrm_tunnel __rcu *tunnel4_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnel64_handlers __read_mostly; static struct xfrm_tunnel __rcu *tunnelmpls4_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel __rcu **fam_handlers(unsigned short family) { return (family == AF_INET) ? &tunnel4_handlers : (family == AF_INET6) ?
&tunnel64_handlers : &tunnelmpls4_handlers; } int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -EEXIST; int priority = handler->priority; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t->priority > priority) break; if (t->priority == priority) goto err; } handler->next = *pprev; rcu_assign_pointer(*pprev, handler); ret = 0; err: mutex_unlock(&tunnel4_mutex); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_register); int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) { struct xfrm_tunnel __rcu **pprev; struct xfrm_tunnel *t; int ret = -ENOENT; mutex_lock(&tunnel4_mutex); for (pprev = fam_handlers(family); (t = rcu_dereference_protected(*pprev, lockdep_is_held(&tunnel4_mutex))) != NULL; pprev = &t->next) { if (t == handler) { *pprev = handler->next; ret = 0; break; } } mutex_unlock(&tunnel4_mutex); synchronize_net(); return ret; } EXPORT_SYMBOL(xfrm4_tunnel_deregister); #define for_each_tunnel_rcu(head, handler) \ for (handler = rcu_dereference(head); \ handler != NULL; \ handler = rcu_dereference(handler->next)) \ static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) static int tunnel4_rcv_cb(struct sk_buff *skb, u8 proto, int err) { struct xfrm_tunnel __rcu *head; struct xfrm_tunnel *handler; int ret; head = (proto == IPPROTO_IPIP) ? 
tunnel4_handlers : tunnel64_handlers; for_each_tunnel_rcu(head, handler) { if (handler->cb_handler) { ret = handler->cb_handler(skb, err); if (ret <= 0) return ret; } } return 0; } static const struct xfrm_input_afinfo tunnel4_input_afinfo = { .family = AF_INET, .is_ipip = true, .callback = tunnel4_rcv_cb, }; #endif #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; if (!pskb_may_pull(skb, sizeof(struct mpls_label))) goto drop; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->handler(skb)) return 0; icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); drop: kfree_skb(skb); return 0; } #endif static int tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #if IS_ENABLED(CONFIG_IPV6) static int tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif #if IS_ENABLED(CONFIG_MPLS) static int tunnelmpls4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; for_each_tunnel_rcu(tunnelmpls4_handlers, handler) if (!handler->err_handler(skb, info)) return 0; return -ENOENT; } #endif static const struct net_protocol tunnel4_protocol = { .handler = tunnel4_rcv, .err_handler = tunnel4_err, .no_policy = 1, }; #if IS_ENABLED(CONFIG_IPV6) static const struct net_protocol tunnel64_protocol = { .handler = tunnel64_rcv, .err_handler = tunnel64_err, .no_policy = 1, }; #endif #if IS_ENABLED(CONFIG_MPLS) static const struct net_protocol tunnelmpls4_protocol = { .handler = tunnelmpls4_rcv, .err_handler = tunnelmpls4_err, .no_policy = 1, }; #endif static int __init tunnel4_init(void) { if (inet_add_protocol(&tunnel4_protocol, IPPROTO_IPIP)) goto err; #if IS_ENABLED(CONFIG_IPV6) if (inet_add_protocol(&tunnel64_protocol, IPPROTO_IPV6)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); goto err; } #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_add_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif goto err; } #endif #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_register_afinfo(&tunnel4_input_afinfo)) { inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP); #if IS_ENABLED(CONFIG_IPV6) inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6); #endif #if IS_ENABLED(CONFIG_MPLS) inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS); #endif goto err; } #endif return 0; err: pr_err("%s: can't add protocol\n", __func__); return -EAGAIN; } static void __exit tunnel4_fini(void) { #if IS_ENABLED(CONFIG_INET_XFRM_TUNNEL) if (xfrm_input_unregister_afinfo(&tunnel4_input_afinfo)) pr_err("tunnel4 close: can't remove input afinfo\n"); #endif #if IS_ENABLED(CONFIG_MPLS) if (inet_del_protocol(&tunnelmpls4_protocol, IPPROTO_MPLS)) pr_err("tunnelmpls4 close: can't remove protocol\n"); #endif #if IS_ENABLED(CONFIG_IPV6) if (inet_del_protocol(&tunnel64_protocol, IPPROTO_IPV6)) pr_err("tunnel64 close: 
can't remove protocol\n"); #endif if (inet_del_protocol(&tunnel4_protocol, IPPROTO_IPIP)) pr_err("tunnel4 close: can't remove protocol\n"); } module_init(tunnel4_init); module_exit(tunnel4_fini); MODULE_DESCRIPTION("IPv4 XFRM tunnel library"); MODULE_LICENSE("GPL");
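/*
 * Illustrative sketch (not part of tunnel4.c above): xfrm4_tunnel_register()
 * keeps each handler list sorted by ->priority and walks it with a
 * pointer-to-pointer, so inserting at the head needs no special case and a
 * duplicate priority is rejected. The minimal userspace rendering below uses
 * made-up names (toy_handler, toy_register) and leaves out the mutex and the
 * RCU publication that the kernel code needs.
 */
#include <stdio.h>

struct toy_handler {
	int priority;		/* lower value is tried first on receive */
	const char *name;
	struct toy_handler *next;
};

/* Insert @h keeping the list sorted; refuse a duplicate priority. */
static int toy_register(struct toy_handler **head, struct toy_handler *h)
{
	struct toy_handler **pprev;
	struct toy_handler *t;

	for (pprev = head; (t = *pprev) != NULL; pprev = &t->next) {
		if (t->priority > h->priority)
			break;
		if (t->priority == h->priority)
			return -1;	/* already registered at this priority */
	}
	h->next = *pprev;	/* splice in before the first larger priority */
	*pprev = h;
	return 0;
}

int main(void)
{
	struct toy_handler a = { .priority = 2, .name = "ipip" };
	struct toy_handler b = { .priority = 1, .name = "first" };
	struct toy_handler *head = NULL;

	toy_register(&head, &a);
	toy_register(&head, &b);
	for (struct toy_handler *t = head; t; t = t->next)
		printf("%d %s\n", t->priority, t->name);	/* "1 first" then "2 ipip" */
	return 0;
}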
// SPDX-License-Identifier: GPL-2.0-only /* * IPv6 library code, needed by static components when full IPv6 support is * not configured or static. */ #include <linux/export.h> #include <net/ipv6.h> /* * find out if nexthdr is a well-known extension header or a protocol */ bool ipv6_ext_hdr(u8 nexthdr) { /* * find out if nexthdr is an extension header or a protocol */ return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || (nexthdr == NEXTHDR_DEST); } EXPORT_SYMBOL(ipv6_ext_hdr); /* * Skip any extension headers. This is used by the ICMP module. * * Note that strictly speaking this conflicts with RFC 2460 4.0: * ...The contents and semantics of each extension header determine whether * or not to proceed to the next header. Therefore, extension headers must * be processed strictly in the order they appear in the packet; a * receiver must not, for example, scan through a packet looking for a * particular kind of extension header and process that header prior to * processing all preceding ones. * * We do exactly this. This is a protocol bug. We can't decide, after seeing * an unknown discard-with-error flavour TLV option, if it's an ICMP error * message or not (errors should never be sent in reply to ICMP error * messages). * * But I see no other way to do this. This might need to be reexamined * when Linux implements ESP (and maybe AUTH) headers. * --AK * * This function parses a (possibly truncated) exthdr set "hdr". * "nexthdrp" initially points to the place where the type of the first * header can be found. * * It skips all well-known exthdrs and returns a pointer to the start * of the unparsable area, i.e. the first header with unknown type. * If it is not NULL, *nexthdr is updated with the type/protocol of this * header. * * NOTES: - if the packet is terminated with NEXTHDR_NONE it returns NULL. * - it may return a pointer pointing beyond the end of the packet, * if the last recognized header is truncated in the middle. * - if the packet is truncated so that all parsed headers are skipped, * it returns NULL. * - The first fragment header is skipped; non-first ones * are considered unparsable. * - Reports the offset field of the final fragment header so it is * possible to tell whether this is a first fragment, later fragment, * or not fragmented.
* - ESP is unparsable for now and considered like * normal payload protocol. * - Note also special handling of AUTH header. Thanks to IPsec wizards. * * --ANK (980726) */ int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, __be16 *frag_offp) { u8 nexthdr = *nexthdrp; *frag_offp = 0; while (ipv6_ext_hdr(nexthdr)) { struct ipv6_opt_hdr _hdr, *hp; int hdrlen; if (nexthdr == NEXTHDR_NONE) return -1; hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); if (!hp) return -1; if (nexthdr == NEXTHDR_FRAGMENT) { __be16 _frag_off, *fp; fp = skb_header_pointer(skb, start+offsetof(struct frag_hdr, frag_off), sizeof(_frag_off), &_frag_off); if (!fp) return -1; *frag_offp = *fp; if (ntohs(*frag_offp) & ~0x7) break; hdrlen = 8; } else if (nexthdr == NEXTHDR_AUTH) hdrlen = ipv6_authlen(hp); else hdrlen = ipv6_optlen(hp); nexthdr = hp->nexthdr; start += hdrlen; } *nexthdrp = nexthdr; return start; } EXPORT_SYMBOL(ipv6_skip_exthdr); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type) { const unsigned char *nh = skb_network_header(skb); int packet_len = skb_tail_pointer(skb) - skb_network_header(skb); struct ipv6_opt_hdr *hdr; int len; if (offset + 2 > packet_len) goto bad; hdr = (struct ipv6_opt_hdr *)(nh + offset); len = ((hdr->hdrlen + 1) << 3); if (offset + len > packet_len) goto bad; offset += 2; len -= 2; while (len > 0) { int opttype = nh[offset]; int optlen; if (opttype == type) return offset; switch (opttype) { case IPV6_TLV_PAD1: optlen = 1; break; default: if (len < 2) goto bad; optlen = nh[offset + 1] + 2; if (optlen > len) goto bad; break; } offset += optlen; len -= optlen; } /* not_found */ bad: return -1; } EXPORT_SYMBOL_GPL(ipv6_find_tlv); /* * find the offset to specified header or the protocol number of last header * if target < 0. "last header" is transport protocol header, ESP, or * "No next header". * * Note that *offset is used as input/output parameter, and if it is not zero, * then it must be a valid offset to an inner IPv6 header. This can be used * to explore inner IPv6 header, eg. ICMPv6 error messages. * * If target header is found, its offset is set in *offset and return protocol * number. Otherwise, return -1. * * If the first fragment doesn't contain the final protocol header or * NEXTHDR_NONE it is considered invalid. * * Note that non-1st fragment is special case that "the protocol number * of last header" is "next header" field in Fragment header. In this case, * *offset is meaningless and fragment offset is stored in *fragoff if fragoff * isn't NULL. * * if flags is not NULL and it's a fragment, then the frag flag * IP6_FH_F_FRAG will be set. If it's an AH header, the * IP6_FH_F_AUTH flag is set and target < 0, then this function will * stop at the AH header. If IP6_FH_F_SKIP_RH flag was passed, then this * function will skip all those routing headers, where segements_left was 0. 
/*
 * find the offset to specified header or the protocol number of last header
 * if target < 0. "last header" is transport protocol header, ESP, or
 * "No next header".
 *
 * Note that *offset is used as input/output parameter, and if it is not zero,
 * then it must be a valid offset to an inner IPv6 header. This can be used
 * to explore inner IPv6 headers, e.g. in ICMPv6 error messages.
 *
 * If the target header is found, its offset is set in *offset and the
 * protocol number is returned. Otherwise, -1 is returned.
 *
 * If the first fragment doesn't contain the final protocol header or
 * NEXTHDR_NONE it is considered invalid.
 *
 * Note that a non-1st fragment is a special case: "the protocol number
 * of last header" is the "next header" field in the Fragment header. In this
 * case, *offset is meaningless and the fragment offset is stored in *fragoff
 * if fragoff isn't NULL.
 *
 * If flags is not NULL and it's a fragment, then the frag flag
 * IP6_FH_F_FRAG will be set. If it's an AH header, the
 * IP6_FH_F_AUTH flag is set and target < 0, then this function will
 * stop at the AH header. If the IP6_FH_F_SKIP_RH flag was passed, then this
 * function will skip all those routing headers where segments_left was 0.
 */
int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
                  int target, unsigned short *fragoff, int *flags)
{
        unsigned int start = skb_network_offset(skb) + sizeof(struct ipv6hdr);
        u8 nexthdr = ipv6_hdr(skb)->nexthdr;
        bool found;

        if (fragoff)
                *fragoff = 0;

        if (*offset) {
                struct ipv6hdr _ip6, *ip6;

                ip6 = skb_header_pointer(skb, *offset, sizeof(_ip6), &_ip6);
                if (!ip6 || (ip6->version != 6))
                        return -EBADMSG;
                start = *offset + sizeof(struct ipv6hdr);
                nexthdr = ip6->nexthdr;
        }

        do {
                struct ipv6_opt_hdr _hdr, *hp;
                unsigned int hdrlen;

                found = (nexthdr == target);

                if ((!ipv6_ext_hdr(nexthdr)) || nexthdr == NEXTHDR_NONE) {
                        if (target < 0 || found)
                                break;
                        return -ENOENT;
                }

                hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr);
                if (!hp)
                        return -EBADMSG;

                if (nexthdr == NEXTHDR_ROUTING) {
                        struct ipv6_rt_hdr _rh, *rh;

                        rh = skb_header_pointer(skb, start, sizeof(_rh), &_rh);
                        if (!rh)
                                return -EBADMSG;

                        if (flags && (*flags & IP6_FH_F_SKIP_RH) &&
                            rh->segments_left == 0)
                                found = false;
                }

                if (nexthdr == NEXTHDR_FRAGMENT) {
                        unsigned short _frag_off;
                        __be16 *fp;

                        if (flags)      /* Indicate that this is a fragment */
                                *flags |= IP6_FH_F_FRAG;
                        fp = skb_header_pointer(skb,
                                                start + offsetof(struct frag_hdr, frag_off),
                                                sizeof(_frag_off),
                                                &_frag_off);
                        if (!fp)
                                return -EBADMSG;

                        _frag_off = ntohs(*fp) & ~0x7;
                        if (_frag_off) {
                                if (target < 0 &&
                                    ((!ipv6_ext_hdr(hp->nexthdr)) ||
                                     hp->nexthdr == NEXTHDR_NONE)) {
                                        if (fragoff)
                                                *fragoff = _frag_off;
                                        return hp->nexthdr;
                                }
                                if (!found)
                                        return -ENOENT;
                                if (fragoff)
                                        *fragoff = _frag_off;
                                break;
                        }
                        hdrlen = 8;
                } else if (nexthdr == NEXTHDR_AUTH) {
                        if (flags && (*flags & IP6_FH_F_AUTH) && (target < 0))
                                break;
                        hdrlen = ipv6_authlen(hp);
                } else
                        hdrlen = ipv6_optlen(hp);

                if (!found) {
                        nexthdr = hp->nexthdr;
                        start += hdrlen;
                }
        } while (!found);

        *offset = start;
        return nexthdr;
}
EXPORT_SYMBOL(ipv6_find_hdr);
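/*
 * [Editorial example -- not part of the original file.]  A sketch of calling
 * ipv6_find_hdr() above to locate a TCP header; the wrapper name is an
 * illustrative assumption.
 */
static int example_tcp_offset(const struct sk_buff *skb)
{
        unsigned int offset = 0;        /* 0: start at the outer IPv6 header */
        unsigned short frag_off = 0;
        int flags = 0;
        int proto;

        proto = ipv6_find_hdr(skb, &offset, IPPROTO_TCP, &frag_off, &flags);
        if (proto < 0)
                return proto;           /* -ENOENT if absent, -EBADMSG if malformed */

        /* proto == IPPROTO_TCP here; IP6_FH_F_FRAG in flags means the TCP
         * header sits inside a first fragment. */
        return offset;                  /* skb offset of the TCP header */
}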
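/*
 * [Editorial example -- not part of the original file.]  A sketch of using
 * ipv6_find_tlv(), defined earlier in this listing, to test whether a
 * hop-by-hop options header starting at "optoff" carries a router-alert
 * option.  IPV6_TLV_ROUTERALERT comes from the kernel's uapi in6.h; the
 * wrapper name is an illustrative assumption.
 */
static bool example_has_router_alert(const struct sk_buff *skb, int optoff)
{
        return ipv6_find_tlv(skb, optoff, IPV6_TLV_ROUTERALERT) >= 0;
}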
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2008-2011, Intel Corporation.
 *
 * Description: Data Center Bridging netlink interface
 * Author: Lucy Liu <lucy.liu@intel.com>
 */

#include <linux/netdevice.h>
#include <linux/netlink.h>
#include <linux/slab.h>
#include <net/netlink.h>
#include <net/rtnetlink.h>
#include <linux/dcbnl.h>
#include <net/dcbevent.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <net/sock.h>

/* Data Center Bridging (DCB) is a collection of Ethernet enhancements
 * intended to allow network traffic with differing requirements
 * (highly reliable, no drops vs. best effort vs. low latency) to operate
 * and co-exist on Ethernet.  Current DCB features are:
 *
 * Enhanced Transmission Selection (aka Priority Grouping [PG]) - provides a
 *   framework for assigning bandwidth guarantees to traffic classes.
 *
 * Priority-based Flow Control (PFC) - provides a flow control mechanism which
 *   can work independently for each 802.1p priority.
 *
 * Congestion Notification - provides a mechanism for end-to-end congestion
 *   control for protocols which do not have built-in congestion management.
 *
 * More information about the emerging standards for these Ethernet features
 * can be found at: http://www.ieee802.org/1/pages/dcbridges.html
 *
 * This file implements an rtnetlink interface to allow configuration of DCB
 * features for capable devices.
 */

/**************** DCB attribute policies *************************************/

/* DCB netlink attributes policy */
static const struct nla_policy dcbnl_rtnl_policy[DCB_ATTR_MAX + 1] = {
        [DCB_ATTR_IFNAME]      = {.type = NLA_NUL_STRING, .len = IFNAMSIZ - 1},
        [DCB_ATTR_STATE]       = {.type = NLA_U8},
        [DCB_ATTR_PFC_CFG]     = {.type = NLA_NESTED},
        [DCB_ATTR_PG_CFG]      = {.type = NLA_NESTED},
        [DCB_ATTR_SET_ALL]     = {.type = NLA_U8},
        [DCB_ATTR_PERM_HWADDR] = {.type = NLA_FLAG},
        [DCB_ATTR_CAP]         = {.type = NLA_NESTED},
        [DCB_ATTR_PFC_STATE]   = {.type = NLA_U8},
        [DCB_ATTR_BCN]         = {.type = NLA_NESTED},
        [DCB_ATTR_APP]         = {.type = NLA_NESTED},
        [DCB_ATTR_IEEE]        = {.type = NLA_NESTED},
        [DCB_ATTR_DCBX]        = {.type = NLA_U8},
        [DCB_ATTR_FEATCFG]     = {.type = NLA_NESTED},
};

/* DCB priority flow control to User Priority nested attributes */
static const struct nla_policy dcbnl_pfc_up_nest[DCB_PFC_UP_ATTR_MAX + 1] = {
        [DCB_PFC_UP_ATTR_0]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_1]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_2]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_3]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_4]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_5]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_6]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_7]   = {.type = NLA_U8},
        [DCB_PFC_UP_ATTR_ALL] = {.type = NLA_FLAG},
};

/* DCB priority grouping nested attributes */
static const struct nla_policy dcbnl_pg_nest[DCB_PG_ATTR_MAX + 1] = {
        [DCB_PG_ATTR_TC_0]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_1]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_2]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_3]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_4]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_5]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_6]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_7]   = {.type = NLA_NESTED},
        [DCB_PG_ATTR_TC_ALL] = {.type = NLA_NESTED},
[DCB_PG_ATTR_BW_ID_0] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_1] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_2] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_3] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_4] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_5] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_6] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_7] = {.type = NLA_U8}, [DCB_PG_ATTR_BW_ID_ALL] = {.type = NLA_FLAG}, }; /* DCB traffic class nested attributes. */ static const struct nla_policy dcbnl_tc_param_nest[DCB_TC_ATTR_PARAM_MAX + 1] = { [DCB_TC_ATTR_PARAM_PGID] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_UP_MAPPING] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_STRICT_PRIO] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_BW_PCT] = {.type = NLA_U8}, [DCB_TC_ATTR_PARAM_ALL] = {.type = NLA_FLAG}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_cap_nest[DCB_CAP_ATTR_MAX + 1] = { [DCB_CAP_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_CAP_ATTR_PG] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC] = {.type = NLA_U8}, [DCB_CAP_ATTR_UP2TC] = {.type = NLA_U8}, [DCB_CAP_ATTR_PG_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_PFC_TCS] = {.type = NLA_U8}, [DCB_CAP_ATTR_GSP] = {.type = NLA_U8}, [DCB_CAP_ATTR_BCN] = {.type = NLA_U8}, [DCB_CAP_ATTR_DCBX] = {.type = NLA_U8}, }; /* DCB capabilities nested attributes. */ static const struct nla_policy dcbnl_numtcs_nest[DCB_NUMTCS_ATTR_MAX + 1] = { [DCB_NUMTCS_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_NUMTCS_ATTR_PG] = {.type = NLA_U8}, [DCB_NUMTCS_ATTR_PFC] = {.type = NLA_U8}, }; /* DCB BCN nested attributes. */ static const struct nla_policy dcbnl_bcn_nest[DCB_BCN_ATTR_MAX + 1] = { [DCB_BCN_ATTR_RP_0] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_1] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_2] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_3] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_4] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_5] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_6] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_7] = {.type = NLA_U8}, [DCB_BCN_ATTR_RP_ALL] = {.type = NLA_FLAG}, [DCB_BCN_ATTR_BCNA_0] = {.type = NLA_U32}, [DCB_BCN_ATTR_BCNA_1] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALPHA] = {.type = NLA_U32}, [DCB_BCN_ATTR_BETA] = {.type = NLA_U32}, [DCB_BCN_ATTR_GD] = {.type = NLA_U32}, [DCB_BCN_ATTR_GI] = {.type = NLA_U32}, [DCB_BCN_ATTR_TMAX] = {.type = NLA_U32}, [DCB_BCN_ATTR_TD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RMIN] = {.type = NLA_U32}, [DCB_BCN_ATTR_W] = {.type = NLA_U32}, [DCB_BCN_ATTR_RD] = {.type = NLA_U32}, [DCB_BCN_ATTR_RU] = {.type = NLA_U32}, [DCB_BCN_ATTR_WRTT] = {.type = NLA_U32}, [DCB_BCN_ATTR_RI] = {.type = NLA_U32}, [DCB_BCN_ATTR_C] = {.type = NLA_U32}, [DCB_BCN_ATTR_ALL] = {.type = NLA_FLAG}, }; /* DCB APP nested attributes. */ static const struct nla_policy dcbnl_app_nest[DCB_APP_ATTR_MAX + 1] = { [DCB_APP_ATTR_IDTYPE] = {.type = NLA_U8}, [DCB_APP_ATTR_ID] = {.type = NLA_U16}, [DCB_APP_ATTR_PRIORITY] = {.type = NLA_U8}, }; /* IEEE 802.1Qaz nested attributes. */ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_ETS] = {.len = sizeof(struct ieee_ets)}, [DCB_ATTR_IEEE_PFC] = {.len = sizeof(struct ieee_pfc)}, [DCB_ATTR_IEEE_APP_TABLE] = {.type = NLA_NESTED}, [DCB_ATTR_IEEE_MAXRATE] = {.len = sizeof(struct ieee_maxrate)}, [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. 
*/ static const struct nla_policy dcbnl_featcfg_nest[DCB_FEATCFG_ATTR_MAX + 1] = { [DCB_FEATCFG_ATTR_ALL] = {.type = NLA_FLAG}, [DCB_FEATCFG_ATTR_PG] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_PFC] = {.type = NLA_U8}, [DCB_FEATCFG_ATTR_APP] = {.type = NLA_U8}, }; static LIST_HEAD(dcb_app_list); static LIST_HEAD(dcb_rewr_list); static DEFINE_SPINLOCK(dcb_lock); static enum ieee_attrs_app dcbnl_app_attr_type_get(u8 selector) { switch (selector) { case IEEE_8021QAZ_APP_SEL_ETHERTYPE: case IEEE_8021QAZ_APP_SEL_STREAM: case IEEE_8021QAZ_APP_SEL_DGRAM: case IEEE_8021QAZ_APP_SEL_ANY: case IEEE_8021QAZ_APP_SEL_DSCP: return DCB_ATTR_IEEE_APP; case DCB_APP_SEL_PCP: return DCB_ATTR_DCB_APP; default: return DCB_ATTR_IEEE_APP_UNSPEC; } } static bool dcbnl_app_attr_type_validate(enum ieee_attrs_app type) { switch (type) { case DCB_ATTR_IEEE_APP: case DCB_ATTR_DCB_APP: return true; default: return false; } } static bool dcbnl_app_selector_validate(enum ieee_attrs_app type, u8 selector) { return dcbnl_app_attr_type_get(selector) == type; } static struct sk_buff *dcbnl_newmsg(int type, u8 cmd, u32 port, u32 seq, u32 flags, struct nlmsghdr **nlhp) { struct sk_buff *skb; struct dcbmsg *dcb; struct nlmsghdr *nlh; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return NULL; nlh = nlmsg_put(skb, port, seq, type, sizeof(*dcb), flags); BUG_ON(!nlh); dcb = nlmsg_data(nlh); dcb->dcb_family = AF_UNSPEC; dcb->cmd = cmd; dcb->dcb_pad = 0; if (nlhp) *nlhp = nlh; return skb; } static int dcbnl_getstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { /* if (!tb[DCB_ATTR_STATE] || !netdev->dcbnl_ops->getstate) */ if (!netdev->dcbnl_ops->getstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->getstate(netdev)); } static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_PFC_CFG); if (!nest) return -EMSGSIZE; if (data[DCB_PFC_UP_ATTR_ALL]) getall = 1; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (!getall && !data[i]) continue; netdev->dcbnl_ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getperm_hwaddr(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 perm_addr[MAX_ADDR_LEN]; if (!netdev->dcbnl_ops->getpermhwaddr) return -EOPNOTSUPP; memset(perm_addr, 0, sizeof(perm_addr)); netdev->dcbnl_ops->getpermhwaddr(netdev, perm_addr); return nla_put(skb, DCB_ATTR_PERM_HWADDR, sizeof(perm_addr), perm_addr); } static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_CAP_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_CAP]) return -EINVAL; if (!netdev->dcbnl_ops->getcap) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], dcbnl_cap_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_CAP); if (!nest) return 
-EMSGSIZE; if (data[DCB_CAP_ATTR_ALL]) getall = 1; for (i = DCB_CAP_ATTR_ALL+1; i <= DCB_CAP_ATTR_MAX; i++) { if (!getall && !data[i]) continue; if (!netdev->dcbnl_ops->getcap(netdev, i, &value)) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } } nla_nest_end(skb, nest); return 0; } static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1], *nest; u8 value; int ret; int i; int getall = 0; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->getnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_NUMTCS); if (!nest) return -EMSGSIZE; if (data[DCB_NUMTCS_ATTR_ALL]) getall = 1; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getnumtcs(netdev, i, &value); if (!ret) { ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); return ret; } } else return -EINVAL; } nla_nest_end(skb, nest); return 0; } static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_NUMTCS_ATTR_MAX + 1]; int ret; u8 value; int i; if (!tb[DCB_ATTR_NUMTCS]) return -EINVAL; if (!netdev->dcbnl_ops->setnumtcs) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], dcbnl_numtcs_nest, NULL); if (ret) return ret; for (i = DCB_NUMTCS_ATTR_ALL+1; i <= DCB_NUMTCS_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setnumtcs(netdev, i, value); if (ret) break; } return nla_put_u8(skb, DCB_ATTR_NUMTCS, !!ret); } static int dcbnl_getpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getpfcstate) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_PFC_STATE, netdev->dcbnl_ops->getpfcstate(netdev)); } static int dcbnl_setpfcstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_PFC_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setpfcstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_PFC_STATE]); netdev->dcbnl_ops->setpfcstate(netdev, value); return nla_put_u8(skb, DCB_ATTR_PFC_STATE, 0); } static int dcbnl_getapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *app_nest; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; u16 id; u8 up, idtype; int ret; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); if (netdev->dcbnl_ops->getapp) { ret = netdev->dcbnl_ops->getapp(netdev, idtype, id); if (ret < 0) return ret; else up = ret; } else { struct dcb_app app = { .selector = idtype, .protocol = id, }; up = dcb_getapp(netdev, &app); } app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); 
if (!app_nest) return -EMSGSIZE; ret = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, idtype); if (ret) goto out_cancel; ret = nla_put_u16(skb, DCB_APP_ATTR_ID, id); if (ret) goto out_cancel; ret = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, up); if (ret) goto out_cancel; nla_nest_end(skb, app_nest); return 0; out_cancel: nla_nest_cancel(skb, app_nest); return ret; } static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; u16 id; u8 up, idtype; struct nlattr *app_tb[DCB_APP_ATTR_MAX + 1]; if (!tb[DCB_ATTR_APP]) return -EINVAL; ret = nla_parse_nested_deprecated(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], dcbnl_app_nest, NULL); if (ret) return ret; /* all must be non-null */ if ((!app_tb[DCB_APP_ATTR_IDTYPE]) || (!app_tb[DCB_APP_ATTR_ID]) || (!app_tb[DCB_APP_ATTR_PRIORITY])) return -EINVAL; /* either by eth type or by socket number */ idtype = nla_get_u8(app_tb[DCB_APP_ATTR_IDTYPE]); if ((idtype != DCB_APP_IDTYPE_ETHTYPE) && (idtype != DCB_APP_IDTYPE_PORTNUM)) return -EINVAL; id = nla_get_u16(app_tb[DCB_APP_ATTR_ID]); up = nla_get_u8(app_tb[DCB_APP_ATTR_PRIORITY]); if (netdev->dcbnl_ops->setapp) { ret = netdev->dcbnl_ops->setapp(netdev, idtype, id, up); if (ret < 0) return ret; } else { struct dcb_app app; app.selector = idtype; app.protocol = id; app.priority = up; ret = dcb_setapp(netdev, &app); } ret = nla_put_u8(skb, DCB_ATTR_APP, ret); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SAPP, seq, 0); return ret; } static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_nest, *param_nest, *data; struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; u8 prio, pgid, tc_pct, up_map; int ret; int getall = 0; int i; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->getpgtccfgtx || !netdev->dcbnl_ops->getpgtccfgrx || !netdev->dcbnl_ops->getpgbwgcfgtx || !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; pg_nest = nla_nest_start_noflag(skb, DCB_ATTR_PG_CFG); if (!pg_nest) return -EMSGSIZE; if (pg_tb[DCB_PG_ATTR_TC_ALL]) getall = 1; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!getall && !pg_tb[i]) continue; if (pg_tb[DCB_PG_ATTR_TC_ALL]) data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, data, dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; param_nest = nla_nest_start_noflag(skb, i); if (!param_nest) goto err_pg; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } else { /* Tx */ netdev->dcbnl_ops->getpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); } if (param_tb[DCB_TC_ATTR_PARAM_PGID] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map); if (ret) goto err_param; } if (param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio); if (ret) goto err_param; } if 
(param_tb[DCB_TC_ATTR_PARAM_BW_PCT] || param_tb[DCB_TC_ATTR_PARAM_ALL]) { ret = nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct); if (ret) goto err_param; } nla_nest_end(skb, param_nest); } if (pg_tb[DCB_PG_ATTR_BW_ID_ALL]) getall = 1; else getall = 0; for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!getall && !pg_tb[i]) continue; tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (dir) { /* Rx */ netdev->dcbnl_ops->getpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } else { /* Tx */ netdev->dcbnl_ops->getpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); } ret = nla_put_u8(skb, i, tc_pct); if (ret) goto err_pg; } nla_nest_end(skb, pg_nest); return 0; err_param: nla_nest_cancel(skb, param_nest); err_pg: nla_nest_cancel(skb, pg_nest); return -EMSGSIZE; } static int dcbnl_pgtx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 0); } static int dcbnl_pgrx_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_getcfg(netdev, nlh, tb, skb, 1); } static int dcbnl_setstate(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!tb[DCB_ATTR_STATE]) return -EINVAL; if (!netdev->dcbnl_ops->setstate) return -EOPNOTSUPP; value = nla_get_u8(tb[DCB_ATTR_STATE]); return nla_put_u8(skb, DCB_ATTR_STATE, netdev->dcbnl_ops->setstate(netdev, value)); } static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_PFC_UP_ATTR_MAX + 1]; int i; int ret; u8 value; if (!tb[DCB_ATTR_PFC_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpfccfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_PFC_UP_ATTR_MAX, tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { if (data[i] == NULL) continue; value = nla_get_u8(data[i]); netdev->dcbnl_ops->setpfccfg(netdev, data[i]->nla_type - DCB_PFC_UP_ATTR_0, value); } return nla_put_u8(skb, DCB_ATTR_PFC_CFG, 0); } static int dcbnl_setall(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { int ret; if (!tb[DCB_ATTR_SET_ALL]) return -EINVAL; if (!netdev->dcbnl_ops->setall) return -EOPNOTSUPP; ret = nla_put_u8(skb, DCB_ATTR_SET_ALL, netdev->dcbnl_ops->setall(netdev)); dcbnl_cee_notify(netdev, RTM_SETDCB, DCB_CMD_SET_ALL, seq, 0); return ret; } static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb, int dir) { struct nlattr *pg_tb[DCB_PG_ATTR_MAX + 1]; struct nlattr *param_tb[DCB_TC_ATTR_PARAM_MAX + 1]; int ret; int i; u8 pgid; u8 up_map; u8 prio; u8 tc_pct; if (!tb[DCB_ATTR_PG_CFG]) return -EINVAL; if (!netdev->dcbnl_ops->setpgtccfgtx || !netdev->dcbnl_ops->setpgtccfgrx || !netdev->dcbnl_ops->setpgbwgcfgtx || !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest, NULL); if (ret) return ret; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { if (!pg_tb[i]) continue; ret = nla_parse_nested_deprecated(param_tb, DCB_TC_ATTR_PARAM_MAX, pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if 
(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]) prio = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_STRICT_PRIO]); if (param_tb[DCB_TC_ATTR_PARAM_PGID]) pgid = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_PGID]); if (param_tb[DCB_TC_ATTR_PARAM_BW_PCT]) tc_pct = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_BW_PCT]); if (param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]) up_map = nla_get_u8(param_tb[DCB_TC_ATTR_PARAM_UP_MAPPING]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgtccfgrx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } else { /* Tx */ netdev->dcbnl_ops->setpgtccfgtx(netdev, i - DCB_PG_ATTR_TC_0, prio, pgid, tc_pct, up_map); } } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { if (!pg_tb[i]) continue; tc_pct = nla_get_u8(pg_tb[i]); /* dir: Tx = 0, Rx = 1 */ if (dir) { /* Rx */ netdev->dcbnl_ops->setpgbwgcfgrx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } else { /* Tx */ netdev->dcbnl_ops->setpgbwgcfgtx(netdev, i - DCB_PG_ATTR_BW_ID_0, tc_pct); } } return nla_put_u8(skb, DCB_ATTR_PG_CFG, 0); } static int dcbnl_pgtx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 0); } static int dcbnl_pgrx_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { return __dcbnl_pg_setcfg(netdev, nlh, seq, tb, skb, 1); } static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *bcn_nest; struct nlattr *bcn_tb[DCB_BCN_ATTR_MAX + 1]; u8 value_byte; u32 value_integer; int ret; bool getall = false; int i; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->getbcnrp || !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; bcn_nest = nla_nest_start_noflag(skb, DCB_ATTR_BCN); if (!bcn_nest) return -EMSGSIZE; if (bcn_tb[DCB_BCN_ATTR_ALL]) getall = true; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcnrp(netdev, i - DCB_BCN_ATTR_RP_0, &value_byte); ret = nla_put_u8(skb, i, value_byte); if (ret) goto err_bcn; } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (!getall && !bcn_tb[i]) continue; netdev->dcbnl_ops->getbcncfg(netdev, i, &value_integer); ret = nla_put_u32(skb, i, value_integer); if (ret) goto err_bcn; } nla_nest_end(skb, bcn_nest); return 0; err_bcn: nla_nest_cancel(skb, bcn_nest); return ret; } static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_BCN_ATTR_MAX + 1]; int i; int ret; u8 value_byte; u32 value_int; if (!tb[DCB_ATTR_BCN]) return -EINVAL; if (!netdev->dcbnl_ops->setbcncfg || !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; ret = nla_parse_nested_deprecated(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], dcbnl_bcn_nest, NULL); if (ret) return ret; for (i = DCB_BCN_ATTR_RP_0; i <= DCB_BCN_ATTR_RP_7; i++) { if (data[i] == NULL) continue; value_byte = nla_get_u8(data[i]); netdev->dcbnl_ops->setbcnrp(netdev, data[i]->nla_type - DCB_BCN_ATTR_RP_0, value_byte); } for (i = DCB_BCN_ATTR_BCNA_0; i <= DCB_BCN_ATTR_RI; i++) { if (data[i] == NULL) continue; value_int = nla_get_u32(data[i]); netdev->dcbnl_ops->setbcncfg(netdev, i, value_int); } return nla_put_u8(skb, DCB_ATTR_BCN, 0); } static int dcbnl_build_peer_app(struct net_device 
*netdev, struct sk_buff* skb, int app_nested_type, int app_info_type, int app_entry_type) { struct dcb_peer_app_info info; struct dcb_app *table = NULL; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; u16 app_count; int err; /** * retrieve the peer app configuration form the driver. If the driver * handlers fail exit without doing anything */ err = ops->peer_getappinfo(netdev, &info, &app_count); if (!err && app_count) { table = kmalloc_array(app_count, sizeof(struct dcb_app), GFP_KERNEL); if (!table) return -ENOMEM; err = ops->peer_getapptable(netdev, table); } if (!err) { u16 i; struct nlattr *app; /** * build the message, from here on the only possible failure * is due to the skb size */ err = -EMSGSIZE; app = nla_nest_start_noflag(skb, app_nested_type); if (!app) goto nla_put_failure; if (app_info_type && nla_put(skb, app_info_type, sizeof(info), &info)) goto nla_put_failure; for (i = 0; i < app_count; i++) { if (nla_put(skb, app_entry_type, sizeof(struct dcb_app), &table[i])) goto nla_put_failure; } nla_nest_end(skb, app); } err = 0; nla_put_failure: kfree(table); return err; } static int dcbnl_getapptrust(struct net_device *netdev, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; enum ieee_attrs_app type; struct nlattr *apptrust; int nselectors, err, i; u8 *selectors; selectors = kzalloc(IEEE_8021QAZ_APP_SEL_MAX + 1, GFP_KERNEL); if (!selectors) return -ENOMEM; err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); if (err) { err = 0; goto out; } apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); if (!apptrust) { err = -EMSGSIZE; goto out; } for (i = 0; i < nselectors; i++) { type = dcbnl_app_attr_type_get(selectors[i]); err = nla_put_u8(skb, type, selectors[i]); if (err) { nla_nest_cancel(skb, apptrust); goto out; } } nla_nest_end(skb, apptrust); out: kfree(selectors); return err; } /* Set or delete APP table or rewrite table entries. The APP struct is validated * and the appropriate callback function is called. */ static int dcbnl_app_table_setdel(struct nlattr *attr, struct net_device *netdev, int (*setdel)(struct net_device *dev, struct dcb_app *app)) { struct dcb_app *app_data; enum ieee_attrs_app type; struct nlattr *attr_itr; int rem, err; nla_for_each_nested(attr_itr, attr, rem) { type = nla_type(attr_itr); if (!dcbnl_app_attr_type_validate(type)) continue; if (nla_len(attr_itr) < sizeof(struct dcb_app)) return -ERANGE; app_data = nla_data(attr_itr); if (!dcbnl_app_selector_validate(type, app_data->selector)) return -EINVAL; err = setdel(netdev, app_data); if (err) return err; } return 0; } /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. 
*/ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee, *app, *rewr; struct dcb_app_type *itr; int dcbx; int err; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) return -EMSGSIZE; ieee = nla_nest_start_noflag(skb, DCB_ATTR_IEEE); if (!ieee) return -EMSGSIZE; if (ops->ieee_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_getmaxrate) { struct ieee_maxrate maxrate; memset(&maxrate, 0, sizeof(maxrate)); err = ops->ieee_getmaxrate(netdev, &maxrate); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_MAXRATE, sizeof(maxrate), &maxrate); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcn) { struct ieee_qcn qcn; memset(&qcn, 0, sizeof(qcn)); err = ops->ieee_getqcn(netdev, &qcn); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN, sizeof(qcn), &qcn); if (err) return -EMSGSIZE; } } if (ops->ieee_getqcnstats) { struct ieee_qcn_stats qcn_stats; memset(&qcn_stats, 0, sizeof(qcn_stats)); err = ops->ieee_getqcnstats(netdev, &qcn_stats); if (!err) { err = nla_put(skb, DCB_ATTR_IEEE_QCN_STATS, sizeof(qcn_stats), &qcn_stats); if (err) return -EMSGSIZE; } } if (ops->ieee_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->dcbnl_getbuffer) { struct dcbnl_buffer buffer; memset(&buffer, 0, sizeof(buffer)); err = ops->dcbnl_getbuffer(netdev, &buffer); if (!err && nla_put(skb, DCB_ATTR_DCB_BUFFER, sizeof(buffer), &buffer)) return -EMSGSIZE; } app = nla_nest_start_noflag(skb, DCB_ATTR_IEEE_APP_TABLE); if (!app) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); return -EMSGSIZE; } } } if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); rewr = nla_nest_start(skb, DCB_ATTR_DCB_REWR_TABLE); if (!rewr) return -EMSGSIZE; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == netdev->ifindex) { enum ieee_attrs_app type = dcbnl_app_attr_type_get(itr->app.selector); err = nla_put(skb, type, sizeof(itr->app), &itr->app); if (err) { spin_unlock_bh(&dcb_lock); nla_nest_cancel(skb, rewr); return -EMSGSIZE; } } } spin_unlock_bh(&dcb_lock); nla_nest_end(skb, rewr); if (ops->dcbnl_getapptrust) { err = dcbnl_getapptrust(netdev, skb); if (err) return err; } /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; memset(&ets, 0, sizeof(ets)); err = ops->ieee_peer_getets(netdev, &ets); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_ETS, sizeof(ets), &ets)) return -EMSGSIZE; } if (ops->ieee_peer_getpfc) { struct ieee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->ieee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_IEEE_PEER_PFC, sizeof(pfc), &pfc)) return -EMSGSIZE; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_IEEE_PEER_APP, DCB_ATTR_IEEE_APP_UNSPEC, DCB_ATTR_IEEE_APP); if (err) return -EMSGSIZE; } nla_nest_end(skb, ieee); if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, 
dcbx); if (err) return -EMSGSIZE; } return 0; } static int dcbnl_cee_pg_fill(struct sk_buff *skb, struct net_device *dev, int dir) { u8 pgid, up_map, prio, tc_pct; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int i = dir ? DCB_ATTR_CEE_TX_PG : DCB_ATTR_CEE_RX_PG; struct nlattr *pg = nla_nest_start_noflag(skb, i); if (!pg) return -EMSGSIZE; for (i = DCB_PG_ATTR_TC_0; i <= DCB_PG_ATTR_TC_7; i++) { struct nlattr *tc_nest = nla_nest_start_noflag(skb, i); if (!tc_nest) return -EMSGSIZE; pgid = DCB_ATTR_VALUE_UNDEFINED; prio = DCB_ATTR_VALUE_UNDEFINED; tc_pct = DCB_ATTR_VALUE_UNDEFINED; up_map = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgtccfgrx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); else ops->getpgtccfgtx(dev, i - DCB_PG_ATTR_TC_0, &prio, &pgid, &tc_pct, &up_map); if (nla_put_u8(skb, DCB_TC_ATTR_PARAM_PGID, pgid) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_UP_MAPPING, up_map) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_STRICT_PRIO, prio) || nla_put_u8(skb, DCB_TC_ATTR_PARAM_BW_PCT, tc_pct)) return -EMSGSIZE; nla_nest_end(skb, tc_nest); } for (i = DCB_PG_ATTR_BW_ID_0; i <= DCB_PG_ATTR_BW_ID_7; i++) { tc_pct = DCB_ATTR_VALUE_UNDEFINED; if (!dir) ops->getpgbwgcfgrx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); else ops->getpgbwgcfgtx(dev, i - DCB_PG_ATTR_BW_ID_0, &tc_pct); if (nla_put_u8(skb, i, tc_pct)) return -EMSGSIZE; } nla_nest_end(skb, pg); return 0; } static int dcbnl_cee_fill(struct sk_buff *skb, struct net_device *netdev) { struct nlattr *cee, *app; struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; int dcbx, i, err = -EMSGSIZE; u8 value; if (nla_put_string(skb, DCB_ATTR_IFNAME, netdev->name)) goto nla_put_failure; cee = nla_nest_start_noflag(skb, DCB_ATTR_CEE); if (!cee) goto nla_put_failure; /* local pg */ if (ops->getpgtccfgtx && ops->getpgbwgcfgtx) { err = dcbnl_cee_pg_fill(skb, netdev, 1); if (err) goto nla_put_failure; } if (ops->getpgtccfgrx && ops->getpgbwgcfgrx) { err = dcbnl_cee_pg_fill(skb, netdev, 0); if (err) goto nla_put_failure; } /* local pfc */ if (ops->getpfccfg) { struct nlattr *pfc_nest = nla_nest_start_noflag(skb, DCB_ATTR_CEE_PFC); if (!pfc_nest) goto nla_put_failure; for (i = DCB_PFC_UP_ATTR_0; i <= DCB_PFC_UP_ATTR_7; i++) { ops->getpfccfg(netdev, i - DCB_PFC_UP_ATTR_0, &value); if (nla_put_u8(skb, i, value)) goto nla_put_failure; } nla_nest_end(skb, pfc_nest); } /* local app */ spin_lock_bh(&dcb_lock); app = nla_nest_start_noflag(skb, DCB_ATTR_CEE_APP_TABLE); if (!app) goto dcb_unlock; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == netdev->ifindex) { struct nlattr *app_nest = nla_nest_start_noflag(skb, DCB_ATTR_APP); if (!app_nest) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_IDTYPE, itr->app.selector); if (err) goto dcb_unlock; err = nla_put_u16(skb, DCB_APP_ATTR_ID, itr->app.protocol); if (err) goto dcb_unlock; err = nla_put_u8(skb, DCB_APP_ATTR_PRIORITY, itr->app.priority); if (err) goto dcb_unlock; nla_nest_end(skb, app_nest); } } nla_nest_end(skb, app); if (netdev->dcbnl_ops->getdcbx) dcbx = netdev->dcbnl_ops->getdcbx(netdev); else dcbx = -EOPNOTSUPP; spin_unlock_bh(&dcb_lock); /* features flags */ if (ops->getfeatcfg) { struct nlattr *feat = nla_nest_start_noflag(skb, DCB_ATTR_CEE_FEAT); if (!feat) goto nla_put_failure; for (i = DCB_FEATCFG_ATTR_ALL + 1; i <= DCB_FEATCFG_ATTR_MAX; i++) if (!ops->getfeatcfg(netdev, i, &value) && nla_put_u8(skb, i, value)) goto nla_put_failure; nla_nest_end(skb, feat); } /* peer info if available */ if (ops->cee_peer_getpg) { struct cee_pg pg; memset(&pg, 0, 
sizeof(pg)); err = ops->cee_peer_getpg(netdev, &pg); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PG, sizeof(pg), &pg)) goto nla_put_failure; } if (ops->cee_peer_getpfc) { struct cee_pfc pfc; memset(&pfc, 0, sizeof(pfc)); err = ops->cee_peer_getpfc(netdev, &pfc); if (!err && nla_put(skb, DCB_ATTR_CEE_PEER_PFC, sizeof(pfc), &pfc)) goto nla_put_failure; } if (ops->peer_getappinfo && ops->peer_getapptable) { err = dcbnl_build_peer_app(netdev, skb, DCB_ATTR_CEE_PEER_APP_TABLE, DCB_ATTR_CEE_PEER_APP_INFO, DCB_ATTR_CEE_PEER_APP); if (err) goto nla_put_failure; } nla_nest_end(skb, cee); /* DCBX state */ if (dcbx >= 0) { err = nla_put_u8(skb, DCB_ATTR_DCBX, dcbx); if (err) goto nla_put_failure; } return 0; dcb_unlock: spin_unlock_bh(&dcb_lock); nla_put_failure: err = -EMSGSIZE; return err; } static int dcbnl_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid, int dcbx_ver) { struct net *net = dev_net(dev); struct sk_buff *skb; struct nlmsghdr *nlh; const struct dcbnl_rtnl_ops *ops = dev->dcbnl_ops; int err; if (!ops) return -EOPNOTSUPP; skb = dcbnl_newmsg(event, cmd, portid, seq, 0, &nlh); if (!skb) return -ENOMEM; if (dcbx_ver == DCB_CAP_DCBX_VER_IEEE) err = dcbnl_ieee_fill(skb, dev); else err = dcbnl_cee_fill(skb, dev); if (err < 0) { /* Report error to broadcast listeners */ nlmsg_free(skb); rtnl_set_sk_err(net, RTNLGRP_DCB, err); } else { /* End nlmsg and notify broadcast listeners */ nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_DCB, NULL, GFP_KERNEL); } return err; } int dcbnl_ieee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_IEEE); } EXPORT_SYMBOL(dcbnl_ieee_notify); int dcbnl_cee_notify(struct net_device *dev, int event, int cmd, u32 seq, u32 portid) { return dcbnl_notify(dev, event, cmd, seq, portid, DCB_CAP_DCBX_VER_CEE); } EXPORT_SYMBOL(dcbnl_cee_notify); /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb SET commands. * If any requested operation can not be completed * the entire msg is aborted and error value is returned. * No attempt is made to reconcile the case where only part of the * cmd can be completed. 
*/ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int prio; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_ETS] && ops->ieee_setets) { struct ieee_ets *ets = nla_data(ieee[DCB_ATTR_IEEE_ETS]); err = ops->ieee_setets(netdev, ets); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_MAXRATE] && ops->ieee_setmaxrate) { struct ieee_maxrate *maxrate = nla_data(ieee[DCB_ATTR_IEEE_MAXRATE]); err = ops->ieee_setmaxrate(netdev, maxrate); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_QCN] && ops->ieee_setqcn) { struct ieee_qcn *qcn = nla_data(ieee[DCB_ATTR_IEEE_QCN]); err = ops->ieee_setqcn(netdev, qcn); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_PFC] && ops->ieee_setpfc) { struct ieee_pfc *pfc = nla_data(ieee[DCB_ATTR_IEEE_PFC]); err = ops->ieee_setpfc(netdev, pfc); if (err) goto err; } if (ieee[DCB_ATTR_DCB_BUFFER] && ops->dcbnl_setbuffer) { struct dcbnl_buffer *buffer = nla_data(ieee[DCB_ATTR_DCB_BUFFER]); for (prio = 0; prio < ARRAY_SIZE(buffer->prio2buffer); prio++) { if (buffer->prio2buffer[prio] >= DCBX_MAX_BUFFERS) { err = -EINVAL; goto err; } } err = ops->dcbnl_setbuffer(netdev, buffer); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_setrewr ?: dcb_setrewr); if (err) goto err; } if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_setapp ?: dcb_ieee_setapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; struct nlattr *attr; int nselectors = 0; int rem; if (!ops->dcbnl_setapptrust) { err = -EOPNOTSUPP; goto err; } nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], rem) { enum ieee_attrs_app type = nla_type(attr); u8 selector; int i; if (!dcbnl_app_attr_type_validate(type) || nla_len(attr) != 1 || nselectors >= sizeof(selectors)) { err = -EINVAL; goto err; } selector = nla_get_u8(attr); if (!dcbnl_app_selector_validate(type, selector)) { err = -EINVAL; goto err; } /* Duplicate selector ? 
*/ for (i = 0; i < nselectors; i++) { if (selectors[i] == selector) { err = -EINVAL; goto err; } } selectors[nselectors++] = selector; } err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); return err; } static int dcbnl_ieee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_ieee_fill(skb, netdev); } static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; struct nlattr *ieee[DCB_ATTR_IEEE_MAX + 1]; int err; if (!ops) return -EOPNOTSUPP; if (!tb[DCB_ATTR_IEEE]) return -EINVAL; err = nla_parse_nested_deprecated(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], dcbnl_ieee_policy, NULL); if (err) return err; if (ieee[DCB_ATTR_IEEE_APP_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_IEEE_APP_TABLE], netdev, ops->ieee_delapp ?: dcb_ieee_delapp); if (err) goto err; } if (ieee[DCB_ATTR_DCB_REWR_TABLE]) { err = dcbnl_app_table_setdel(ieee[DCB_ATTR_DCB_REWR_TABLE], netdev, ops->dcbnl_delrewr ?: dcb_delrewr); if (err) goto err; } err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_DEL, seq, 0); return err; } /* DCBX configuration */ static int dcbnl_getdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { if (!netdev->dcbnl_ops->getdcbx) return -EOPNOTSUPP; return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->getdcbx(netdev)); } static int dcbnl_setdcbx(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { u8 value; if (!netdev->dcbnl_ops->setdcbx) return -EOPNOTSUPP; if (!tb[DCB_ATTR_DCBX]) return -EINVAL; value = nla_get_u8(tb[DCB_ATTR_DCBX]); return nla_put_u8(skb, DCB_ATTR_DCBX, netdev->dcbnl_ops->setdcbx(netdev, value)); } static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1], *nest; u8 value; int ret, i; int getall = 0; if (!netdev->dcbnl_ops->getfeatcfg) return -EOPNOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; nest = nla_nest_start_noflag(skb, DCB_ATTR_FEATCFG); if (!nest) return -EMSGSIZE; if (data[DCB_FEATCFG_ATTR_ALL]) getall = 1; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (!getall && !data[i]) continue; ret = netdev->dcbnl_ops->getfeatcfg(netdev, i, &value); if (!ret) ret = nla_put_u8(skb, i, value); if (ret) { nla_nest_cancel(skb, nest); goto nla_put_failure; } } nla_nest_end(skb, nest); nla_put_failure: return ret; } static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { struct nlattr *data[DCB_FEATCFG_ATTR_MAX + 1]; int ret, i; u8 value; if (!netdev->dcbnl_ops->setfeatcfg) return -ENOTSUPP; if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; ret = nla_parse_nested_deprecated(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; for (i = DCB_FEATCFG_ATTR_ALL+1; i <= DCB_FEATCFG_ATTR_MAX; i++) { if (data[i] == NULL) continue; value = 
nla_get_u8(data[i]); ret = netdev->dcbnl_ops->setfeatcfg(netdev, i, value); if (ret) goto err; } err: ret = nla_put_u8(skb, DCB_ATTR_FEATCFG, ret); return ret; } /* Handle CEE DCBX GET commands. */ static int dcbnl_cee_get(struct net_device *netdev, struct nlmsghdr *nlh, u32 seq, struct nlattr **tb, struct sk_buff *skb) { const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; if (!ops) return -EOPNOTSUPP; return dcbnl_cee_fill(skb, netdev); } struct reply_func { /* reply netlink message type */ int type; /* function to fill message contents */ int (*cb)(struct net_device *, struct nlmsghdr *, u32, struct nlattr **, struct sk_buff *); }; static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_GSTATE] = { RTM_GETDCB, dcbnl_getstate }, [DCB_CMD_SSTATE] = { RTM_SETDCB, dcbnl_setstate }, [DCB_CMD_PFC_GCFG] = { RTM_GETDCB, dcbnl_getpfccfg }, [DCB_CMD_PFC_SCFG] = { RTM_SETDCB, dcbnl_setpfccfg }, [DCB_CMD_GPERM_HWADDR] = { RTM_GETDCB, dcbnl_getperm_hwaddr }, [DCB_CMD_GCAP] = { RTM_GETDCB, dcbnl_getcap }, [DCB_CMD_GNUMTCS] = { RTM_GETDCB, dcbnl_getnumtcs }, [DCB_CMD_SNUMTCS] = { RTM_SETDCB, dcbnl_setnumtcs }, [DCB_CMD_PFC_GSTATE] = { RTM_GETDCB, dcbnl_getpfcstate }, [DCB_CMD_PFC_SSTATE] = { RTM_SETDCB, dcbnl_setpfcstate }, [DCB_CMD_GAPP] = { RTM_GETDCB, dcbnl_getapp }, [DCB_CMD_SAPP] = { RTM_SETDCB, dcbnl_setapp }, [DCB_CMD_PGTX_GCFG] = { RTM_GETDCB, dcbnl_pgtx_getcfg }, [DCB_CMD_PGTX_SCFG] = { RTM_SETDCB, dcbnl_pgtx_setcfg }, [DCB_CMD_PGRX_GCFG] = { RTM_GETDCB, dcbnl_pgrx_getcfg }, [DCB_CMD_PGRX_SCFG] = { RTM_SETDCB, dcbnl_pgrx_setcfg }, [DCB_CMD_SET_ALL] = { RTM_SETDCB, dcbnl_setall }, [DCB_CMD_BCN_GCFG] = { RTM_GETDCB, dcbnl_bcn_getcfg }, [DCB_CMD_BCN_SCFG] = { RTM_SETDCB, dcbnl_bcn_setcfg }, [DCB_CMD_IEEE_GET] = { RTM_GETDCB, dcbnl_ieee_get }, [DCB_CMD_IEEE_SET] = { RTM_SETDCB, dcbnl_ieee_set }, [DCB_CMD_IEEE_DEL] = { RTM_SETDCB, dcbnl_ieee_del }, [DCB_CMD_GDCBX] = { RTM_GETDCB, dcbnl_getdcbx }, [DCB_CMD_SDCBX] = { RTM_SETDCB, dcbnl_setdcbx }, [DCB_CMD_GFEATCFG] = { RTM_GETDCB, dcbnl_getfeatcfg }, [DCB_CMD_SFEATCFG] = { RTM_SETDCB, dcbnl_setfeatcfg }, [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; struct dcbmsg *dcb = nlmsg_data(nlh); struct nlattr *tb[DCB_ATTR_MAX + 1]; u32 portid = NETLINK_CB(skb).portid; int ret = -EINVAL; struct sk_buff *reply_skb; struct nlmsghdr *reply_nlh = NULL; const struct reply_func *fn; if ((nlh->nlmsg_type == RTM_SETDCB) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nlmsg_parse_deprecated(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, dcbnl_rtnl_policy, extack); if (ret < 0) return ret; if (dcb->cmd > DCB_CMD_MAX) return -EINVAL; /* check if a reply function has been defined for the command */ fn = &reply_funcs[dcb->cmd]; if (!fn->cb) return -EOPNOTSUPP; if (fn->type == RTM_SETDCB && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; if (!tb[DCB_ATTR_IFNAME]) return -EINVAL; netdev = __dev_get_by_name(net, nla_data(tb[DCB_ATTR_IFNAME])); if (!netdev) return -ENODEV; if (!netdev->dcbnl_ops) return -EOPNOTSUPP; reply_skb = dcbnl_newmsg(fn->type, dcb->cmd, portid, nlh->nlmsg_seq, nlh->nlmsg_flags, &reply_nlh); if (!reply_skb) return -ENOMEM; ret = fn->cb(netdev, nlh, nlh->nlmsg_seq, tb, reply_skb); if (ret < 0) { nlmsg_free(reply_skb); goto out; } nlmsg_end(reply_skb, reply_nlh); ret = rtnl_unicast(reply_skb, net, portid); out: return ret; } static struct dcb_app_type 
*dcb_rewr_lookup(const struct dcb_app *app, int ifindex, int proto) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->app.selector == app->selector && itr->app.priority == app->priority && itr->ifindex == ifindex && ((proto == -1) || itr->app.protocol == proto)) return itr; } return NULL; } static struct dcb_app_type *dcb_app_lookup(const struct dcb_app *app, int ifindex, int prio) { struct dcb_app_type *itr; list_for_each_entry(itr, &dcb_app_list, list) { if (itr->app.selector == app->selector && itr->app.protocol == app->protocol && itr->ifindex == ifindex && ((prio == -1) || itr->app.priority == prio)) return itr; } return NULL; } static int dcb_app_add(struct list_head *list, const struct dcb_app *app, int ifindex) { struct dcb_app_type *entry; entry = kmalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; memcpy(&entry->app, app, sizeof(*app)); entry->ifindex = ifindex; list_add(&entry->list, list); return 0; } /** * dcb_getapp - retrieve the DCBX application user priority * @dev: network interface * @app: application to get user priority of * * On success returns a non-zero 802.1p user priority bitmap * otherwise returns 0 as the invalid user priority bitmap to * indicate an error. */ u8 dcb_getapp(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio = itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_getapp); /** * dcb_setapp - add CEE dcb application data to app list * @dev: network interface * @new: application data to add * * Priority 0 is an invalid priority in CEE spec. This routine * removes applications from the app list if the priority is * set to zero. Priority is expected to be 8-bit 802.1p user priority bitmap */ int dcb_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type *itr; struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and replace */ itr = dcb_app_lookup(new, dev->ifindex, -1); if (itr) { if (new->priority) itr->app.priority = new->priority; else { list_del(&itr->list); kfree(itr); } goto out; } /* App type does not exist add new application type */ if (new->priority) err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_setapp); /** * dcb_ieee_getapp_mask - retrieve the IEEE DCB application priority * @dev: network interface * @app: where to store the retrieve application data * * Helper routine which on success returns a non-zero 802.1Qaz user * priority bitmap otherwise returns 0 to indicate the dcb_app was * not found in APP list. */ u8 dcb_ieee_getapp_mask(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u8 prio = 0; spin_lock_bh(&dcb_lock); itr = dcb_app_lookup(app, dev->ifindex, -1); if (itr) prio |= 1 << itr->app.priority; spin_unlock_bh(&dcb_lock); return prio; } EXPORT_SYMBOL(dcb_ieee_getapp_mask); /* Get protocol value from rewrite entry. 
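 * Returns 0 if no rewrite entry matches the given selector and priority on this device.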
*/ u16 dcb_getrewr(struct net_device *dev, struct dcb_app *app) { struct dcb_app_type *itr; u16 proto = 0; spin_lock_bh(&dcb_lock); itr = dcb_rewr_lookup(app, dev->ifindex, -1); if (itr) proto = itr->app.protocol; spin_unlock_bh(&dcb_lock); return proto; } EXPORT_SYMBOL(dcb_getrewr); /* Add rewrite entry to the rewrite list. */ int dcb_setrewr(struct net_device *dev, struct dcb_app *new) { int err; spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found. */ if (dcb_rewr_lookup(new, dev->ifindex, new->protocol)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_rewr_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_setrewr); /* Delete rewrite entry from the rewrite list. */ int dcb_delrewr(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; int err = -ENOENT; spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ itr = dcb_rewr_lookup(del, dev->ifindex, del->protocol); if (itr) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); return err; } EXPORT_SYMBOL(dcb_delrewr); /** * dcb_ieee_setapp - add IEEE dcb application data to app list * @dev: network interface * @new: application data to add * * This adds Application data to the list. Multiple application * entries may exists for the same selector and protocol as long * as the priorities are different. Priority is expected to be a * 3-bit unsigned integer */ int dcb_ieee_setapp(struct net_device *dev, struct dcb_app *new) { struct dcb_app_type event; int err = 0; event.ifindex = dev->ifindex; memcpy(&event.app, new, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and abort if found */ if (dcb_app_lookup(new, dev->ifindex, new->priority)) { err = -EEXIST; goto out; } err = dcb_app_add(&dcb_app_list, new, dev->ifindex); out: spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_setapp); /** * dcb_ieee_delapp - delete IEEE dcb application data from list * @dev: network interface * @del: application data to delete * * This removes a matching APP data from the APP list */ int dcb_ieee_delapp(struct net_device *dev, struct dcb_app *del) { struct dcb_app_type *itr; struct dcb_app_type event; int err = -ENOENT; event.ifindex = dev->ifindex; memcpy(&event.app, del, sizeof(event.app)); if (dev->dcbnl_ops->getdcbx) event.dcbx = dev->dcbnl_ops->getdcbx(dev); spin_lock_bh(&dcb_lock); /* Search for existing match and remove it. */ if ((itr = dcb_app_lookup(del, dev->ifindex, del->priority))) { list_del(&itr->list); kfree(itr); err = 0; } spin_unlock_bh(&dcb_lock); if (!err) call_dcbevent_notifiers(DCB_APP_EVENT, &event); return err; } EXPORT_SYMBOL(dcb_ieee_delapp); /* dcb_getrewr_prio_pcp_mask_map - For a given device, find mapping from * priorities to the PCP and DEI values assigned to that priority. 
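 * Each p_map->map[priority] ends up holding a bit mask of the PCP/DEI protocol values (0..15) that rewrite entries assign to that priority.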
*/ void dcb_getrewr_prio_pcp_mask_map(const struct net_device *dev, struct dcb_rewr_prio_pcp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == DCB_APP_SEL_PCP && itr->app.protocol < 16 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1 << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_pcp_mask_map); /* dcb_getrewr_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. */ void dcb_getrewr_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_rewr_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_getrewr_prio_dscp_mask_map); /* * dcb_ieee_getapp_prio_dscp_mask_map - For a given device, find mapping from * priorities to the DSCP values assigned to that priority. Initialize p_map * such that each map element holds a bit mask of DSCP values configured for * that priority by APP entries. */ void dcb_ieee_getapp_prio_dscp_mask_map(const struct net_device *dev, struct dcb_ieee_app_prio_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 prio; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) { prio = itr->app.priority; p_map->map[prio] |= 1ULL << itr->app.protocol; } } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_prio_dscp_mask_map); /* * dcb_ieee_getapp_dscp_prio_mask_map - For a given device, find mapping from * DSCP values to the priorities assigned to that DSCP value. Initialize p_map * such that each map element holds a bit mask of priorities configured for a * given DSCP value by APP entries. */ void dcb_ieee_getapp_dscp_prio_mask_map(const struct net_device *dev, struct dcb_ieee_app_dscp_map *p_map) { int ifindex = dev->ifindex; struct dcb_app_type *itr; memset(p_map->map, 0, sizeof(p_map->map)); spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_DSCP && itr->app.protocol < 64 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) p_map->map[itr->app.protocol] |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); } EXPORT_SYMBOL(dcb_ieee_getapp_dscp_prio_mask_map); /* * Per 802.1Q-2014, the selector value of 1 is used for matching on Ethernet * type, with valid PID values >= 1536. A special meaning is then assigned to * protocol value of 0: "default priority. For use when priority is not * otherwise specified". * * dcb_ieee_getapp_default_prio_mask - For a given device, find all APP entries * of the form {$PRIO, ETHERTYPE, 0} and construct a bit mask of all default * priorities set by these entries. 
*/ u8 dcb_ieee_getapp_default_prio_mask(const struct net_device *dev) { int ifindex = dev->ifindex; struct dcb_app_type *itr; u8 mask = 0; spin_lock_bh(&dcb_lock); list_for_each_entry(itr, &dcb_app_list, list) { if (itr->ifindex == ifindex && itr->app.selector == IEEE_8021QAZ_APP_SEL_ETHERTYPE && itr->app.protocol == 0 && itr->app.priority < IEEE_8021QAZ_MAX_TCS) mask |= 1 << itr->app.priority; } spin_unlock_bh(&dcb_lock); return mask; } EXPORT_SYMBOL(dcb_ieee_getapp_default_prio_mask); static void dcbnl_flush_dev(struct net_device *dev) { struct dcb_app_type *itr, *tmp; spin_lock_bh(&dcb_lock); list_for_each_entry_safe(itr, tmp, &dcb_app_list, list) { if (itr->ifindex == dev->ifindex) { list_del(&itr->list); kfree(itr); } } spin_unlock_bh(&dcb_lock); } static int dcbnl_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_UNREGISTER: if (!dev->dcbnl_ops) return NOTIFY_DONE; dcbnl_flush_dev(dev); return NOTIFY_OK; default: return NOTIFY_DONE; } } static struct notifier_block dcbnl_nb __read_mostly = { .notifier_call = dcbnl_netdevice_event, }; static int __init dcbnl_init(void) { int err; err = register_netdevice_notifier(&dcbnl_nb); if (err) return err; rtnl_register(PF_UNSPEC, RTM_GETDCB, dcb_doit, NULL, 0); rtnl_register(PF_UNSPEC, RTM_SETDCB, dcb_doit, NULL, 0); return 0; } device_initcall(dcbnl_init);
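/*
 * A minimal usage sketch, not part of the file above: how a hypothetical
 * driver or management path might use the exported APP-table helpers
 * defined in this file. dcb_ieee_setapp() records a {selector, protocol,
 * priority} triple in the shared APP list, and dcb_ieee_getapp_mask()
 * then reports a priority bitmap for that selector/protocol pair. The
 * function name below is made up for the illustration.
 */
static u8 example_set_and_read_default_prio(struct net_device *dev, u8 prio)
{
	/* Per 802.1Q-2014, the Ethertype selector with protocol 0 denotes
	 * the port default priority (see
	 * dcb_ieee_getapp_default_prio_mask() above).
	 */
	struct dcb_app app = {
		.selector = IEEE_8021QAZ_APP_SEL_ETHERTYPE,
		.protocol = 0,
		.priority = prio,
	};

	if (dcb_ieee_setapp(dev, &app))
		return 0;

	/* Returns a non-zero priority bitmap if a matching APP entry exists. */
	return dcb_ieee_getapp_mask(dev, &app);
}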
// SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Implementation of the Transmission Control Protocol(TCP). * * Authors: Ross Biro * Fred N.
van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Mark Evans, <evansmp@uhura.aston.ac.uk> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche, <flla@stud.uni-sb.de> * Charles Hedrick, <hedrick@klinzhai.rutgers.edu> * Linus Torvalds, <torvalds@cs.helsinki.fi> * Alan Cox, <gw4pts@gw4pts.ampr.org> * Matthew Dillon, <dillon@apollo.west.oic.com> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Jorge Cwik, <jorge@laser.satlink.net> * * Fixes: * Alan Cox : Numerous verify_area() calls * Alan Cox : Set the ACK bit on a reset * Alan Cox : Stopped it crashing if it closed while * sk->inuse=1 and was trying to connect * (tcp_err()). * Alan Cox : All icmp error handling was broken * pointers passed where wrong and the * socket was looked up backwards. Nobody * tested any icmp error code obviously. * Alan Cox : tcp_err() now handled properly. It * wakes people on errors. poll * behaves and the icmp error race * has gone by moving it into sock.c * Alan Cox : tcp_send_reset() fixed to work for * everything not just packets for * unknown sockets. * Alan Cox : tcp option processing. * Alan Cox : Reset tweaked (still not 100%) [Had * syn rule wrong] * Herp Rosmanith : More reset fixes * Alan Cox : No longer acks invalid rst frames. * Acking any kind of RST is right out. * Alan Cox : Sets an ignore me flag on an rst * receive otherwise odd bits of prattle * escape still * Alan Cox : Fixed another acking RST frame bug. * Should stop LAN workplace lockups. * Alan Cox : Some tidyups using the new skb list * facilities * Alan Cox : sk->keepopen now seems to work * Alan Cox : Pulls options out correctly on accepts * Alan Cox : Fixed assorted sk->rqueue->next errors * Alan Cox : PSH doesn't end a TCP read. Switched a * bit to skb ops. * Alan Cox : Tidied tcp_data to avoid a potential * nasty. * Alan Cox : Added some better commenting, as the * tcp is hard to follow * Alan Cox : Removed incorrect check for 20 * psh * Michael O'Reilly : ack < copied bug fix. * Johannes Stille : Misc tcp fixes (not all in yet). * Alan Cox : FIN with no memory -> CRASH * Alan Cox : Added socket option proto entries. * Also added awareness of them to accept. * Alan Cox : Added TCP options (SOL_TCP) * Alan Cox : Switched wakeup calls to callbacks, * so the kernel can layer network * sockets. * Alan Cox : Use ip_tos/ip_ttl settings. * Alan Cox : Handle FIN (more) properly (we hope). * Alan Cox : RST frames sent on unsynchronised * state ack error. * Alan Cox : Put in missing check for SYN bit. * Alan Cox : Added tcp_select_window() aka NET2E * window non shrink trick. * Alan Cox : Added a couple of small NET2E timer * fixes * Charles Hedrick : TCP fixes * Toomas Tamm : TCP window fixes * Alan Cox : Small URG fix to rlogin ^C ack fight * Charles Hedrick : Rewrote most of it to actually work * Linus : Rewrote tcp_read() and URG handling * completely * Gerhard Koerting: Fixed some missing timer handling * Matthew Dillon : Reworked TCP machine states as per RFC * Gerhard Koerting: PC/TCP workarounds * Adam Caldwell : Assorted timer/timing errors * Matthew Dillon : Fixed another RST bug * Alan Cox : Move to kernel side addressing changes. * Alan Cox : Beginning work on TCP fastpathing * (not yet usable) * Arnt Gulbrandsen: Turbocharged tcp_check() routine. 
* Alan Cox : TCP fast path debugging * Alan Cox : Window clamping * Michael Riepe : Bug in tcp_check() * Matt Dillon : More TCP improvements and RST bug fixes * Matt Dillon : Yet more small nasties remove from the * TCP code (Be very nice to this man if * tcp finally works 100%) 8) * Alan Cox : BSD accept semantics. * Alan Cox : Reset on closedown bug. * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). * Michael Pall : Handle poll() after URG properly in * all cases. * Michael Pall : Undo the last fix in tcp_read_urg() * (multi URG PUSH broke rlogin). * Michael Pall : Fix the multi URG PUSH problem in * tcp_readable(), poll() after URG * works now. * Michael Pall : recv(...,MSG_OOB) never blocks in the * BSD api. * Alan Cox : Changed the semantics of sk->socket to * fix a race and a signal problem with * accept() and async I/O. * Alan Cox : Relaxed the rules on tcp_sendto(). * Yury Shevchuk : Really fixed accept() blocking problem. * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for * clients/servers which listen in on * fixed ports. * Alan Cox : Cleaned the above up and shrank it to * a sensible code size. * Alan Cox : Self connect lockup fix. * Alan Cox : No connect to multicast. * Ross Biro : Close unaccepted children on master * socket close. * Alan Cox : Reset tracing code. * Alan Cox : Spurious resets on shutdown. * Alan Cox : Giant 15 minute/60 second timer error * Alan Cox : Small whoops in polling before an * accept. * Alan Cox : Kept the state trace facility since * it's handy for debugging. * Alan Cox : More reset handler fixes. * Alan Cox : Started rewriting the code based on * the RFC's for other useful protocol * references see: Comer, KA9Q NOS, and * for a reference on the difference * between specifications and how BSD * works see the 4.4lite source. * A.N.Kuznetsov : Don't time wait on completion of tidy * close. * Linus Torvalds : Fin/Shutdown & copied_seq changes. * Linus Torvalds : Fixed BSD port reuse to work first syn * Alan Cox : Reimplemented timers as per the RFC * and using multiple timers for sanity. * Alan Cox : Small bug fixes, and a lot of new * comments. * Alan Cox : Fixed dual reader crash by locking * the buffers (much like datagram.c) * Alan Cox : Fixed stuck sockets in probe. A probe * now gets fed up of retrying without * (even a no space) answer. * Alan Cox : Extracted closing code better * Alan Cox : Fixed the closing state machine to * resemble the RFC. * Alan Cox : More 'per spec' fixes. * Jorge Cwik : Even faster checksumming. * Alan Cox : tcp_data() doesn't ack illegal PSH * only frames. At least one pc tcp stack * generates them. * Alan Cox : Cache last socket. * Alan Cox : Per route irtt. * Matt Day : poll()->select() match BSD precisely on error * Alan Cox : New buffers * Marc Tamsky : Various sk->prot->retransmits and * sk->retransmits misupdating fixed. * Fixed tcp_write_timeout: stuck close, * and TCP syn retries gets used now. * Mark Yarvis : In tcp_read_wakeup(), don't send an * ack if state is TCP_CLOSED. * Alan Cox : Look up device on a retransmit - routes may * change. Doesn't yet cope with MSS shrink right * but it's a start! * Marc Tamsky : Closing in closing fixes. * Mike Shaver : RFC1122 verifications. * Alan Cox : rcv_saddr errors. * Alan Cox : Block double connect(). * Alan Cox : Small hooks for enSKIP. * Alexey Kuznetsov: Path MTU discovery. * Alan Cox : Support soft errors. * Alan Cox : Fix MTU discovery pathological case * when the remote claims no mtu! * Marc Tamsky : TCP_CLOSE fix. 
* Colin (G3TNE) : Send a reset on syn ack replies in * window but wrong (fixes NT lpd problems) * Pedro Roque : Better TCP window handling, delayed ack. * Joerg Reuter : No modification of locked buffers in * tcp_do_retransmit() * Eric Schenk : Changed receiver side silly window * avoidance algorithm to BSD style * algorithm. This doubles throughput * against machines running Solaris, * and seems to result in general * improvement. * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD * Willy Konynenberg : Transparent proxying support. * Mike McLagan : Routing by source * Keith Owens : Do proper merging with partial SKB's in * tcp_do_sendmsg to avoid burstiness. * Eric Schenk : Fix fast close down bug with * shutdown() followed by close(). * Andi Kleen : Make poll agree with SIGIO * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and * lingertime == 0 (RFC 793 ABORT Call) * Hirokazu Takahashi : Use copy_from_user() instead of * csum_and_copy_from_user() if possible. * * Description of States: * * TCP_SYN_SENT sent a connection request, waiting for ack * * TCP_SYN_RECV received a connection request, sent ack, * waiting for final ack in three-way handshake. * * TCP_ESTABLISHED connection established * * TCP_FIN_WAIT1 our side has shutdown, waiting to complete * transmission of remaining buffered data * * TCP_FIN_WAIT2 all buffered data sent, waiting for remote * to shutdown * * TCP_CLOSING both sides have shutdown but we still have * data we have to finish sending * * TCP_TIME_WAIT timeout to catch resent junk before entering * closed, can only be entered from FIN_WAIT2 * or CLOSING. Required because the other end * may not have gotten our last ACK causing it * to retransmit the data packet (which we ignore) * * TCP_CLOSE_WAIT remote side has shutdown and is waiting for * us to finish writing our data and to shutdown * (we have to close() to move on to LAST_ACK) * * TCP_LAST_ACK out side has shutdown after remote has * shutdown. There may still be data in our * buffer that we have to finish sending * * TCP_CLOSE socket is finished */ #define pr_fmt(fmt) "TCP: " fmt #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> #include <linux/inet_diag.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/skbuff.h> #include <linux/scatterlist.h> #include <linux/splice.h> #include <linux/net.h> #include <linux/socket.h> #include <linux/random.h> #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/cache.h> #include <linux/err.h> #include <linux/time.h> #include <linux/slab.h> #include <linux/errqueue.h> #include <linux/static_key.h> #include <linux/btf.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/tcp.h> #include <net/mptcp.h> #include <net/xfrm.h> #include <net/ip.h> #include <net/sock.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <net/busy_poll.h> #include <net/rps.h> /* Track pending CMSGs. */ enum { TCP_CMSG_INQ = 1, TCP_CMSG_TS = 2 }; DEFINE_PER_CPU(unsigned int, tcp_orphan_count); EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count); long sysctl_tcp_mem[3] __read_mostly; EXPORT_SYMBOL(sysctl_tcp_mem); atomic_long_t tcp_memory_allocated ____cacheline_aligned_in_smp; /* Current allocated memory. 
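 * Accounted in pages and compared against the sysctl_tcp_mem[] limits declared above.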
*/ EXPORT_SYMBOL(tcp_memory_allocated); DEFINE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(tcp_memory_per_cpu_fw_alloc); #if IS_ENABLED(CONFIG_SMC) DEFINE_STATIC_KEY_FALSE(tcp_have_smc); EXPORT_SYMBOL(tcp_have_smc); #endif /* * Current number of TCP sockets. */ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_SYMBOL(tcp_sockets_allocated); /* * TCP splice context */ struct tcp_splice_state { struct pipe_inode_info *pipe; size_t len; unsigned int flags; }; /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long tcp_memory_pressure __read_mostly; EXPORT_SYMBOL_GPL(tcp_memory_pressure); void tcp_enter_memory_pressure(struct sock *sk) { unsigned long val; if (READ_ONCE(tcp_memory_pressure)) return; val = jiffies; if (!val) val--; if (!cmpxchg(&tcp_memory_pressure, 0, val)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); } EXPORT_SYMBOL_GPL(tcp_enter_memory_pressure); void tcp_leave_memory_pressure(struct sock *sk) { unsigned long val; if (!READ_ONCE(tcp_memory_pressure)) return; val = xchg(&tcp_memory_pressure, 0); if (val) NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURESCHRONO, jiffies_to_msecs(jiffies - val)); } EXPORT_SYMBOL_GPL(tcp_leave_memory_pressure); /* Convert seconds to retransmits based on initial and max timeout */ static u8 secs_to_retrans(int seconds, int timeout, int rto_max) { u8 res = 0; if (seconds > 0) { int period = timeout; res = 1; while (seconds > period && res < 255) { res++; timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return res; } /* Convert retransmits to seconds based on initial and max timeout */ static int retrans_to_secs(u8 retrans, int timeout, int rto_max) { int period = 0; if (retrans > 0) { period = timeout; while (--retrans) { timeout <<= 1; if (timeout > rto_max) timeout = rto_max; period += timeout; } } return period; } static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) { u32 rate = READ_ONCE(tp->rate_delivered); u32 intv = READ_ONCE(tp->rate_interval_us); u64 rate64 = 0; if (rate && intv) { rate64 = (u64)rate * tp->mss_cache * USEC_PER_SEC; do_div(rate64, intv); } return rate64; } /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ void tcp_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); tp->out_of_order_queue = RB_ROOT; sk->tcp_rtx_queue = RB_ROOT; tcp_init_xmit_timers(sk); INIT_LIST_HEAD(&tp->tsq_node); INIT_LIST_HEAD(&tp->tsorted_sent_queue); icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); minmax_reset(&tp->rtt_min, tcp_jiffies32, ~0U); /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control * algorithms that we must have the following bandaid to talk * efficiently to them. -DaveM */ tcp_snd_cwnd_set(tp, TCP_INIT_CWND); /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rate_app_limited = 1; /* See draft-stevens-tcpca-spec-01 for discussion of the * initialization of these values. 
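 * Here snd_ssthresh starts effectively infinite and snd_cwnd_clamp is left unbounded.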
*/ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tp->snd_cwnd_clamp = ~0; tp->mss_cache = TCP_MSS_DEFAULT; tp->reordering = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reordering); tcp_assign_congestion_control(sk); tp->tsoffset = 0; tp->rack.reo_wnd_steps = 1; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_sync_mss = tcp_sync_mss; WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); sock_tx_timestamp(sk, tsflags, &shinfo->tx_flags); if (tsflags & SOF_TIMESTAMPING_TX_ACK) tcb->txstamp_ack = 1; if (tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) shinfo->tskey = TCP_SKB_CB(skb)->seq + skb->len - 1; } } static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) return true; return sk_is_readable(sk); } /* * Wait for a TCP event. * * Note that we don't need to lock the socket, as the upper poll layers * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) { __poll_t mask; struct sock *sk = sock->sk; const struct tcp_sock *tp = tcp_sk(sk); u8 shutdown; int state; sock_poll_wait(file, sock, wait); state = inet_sk_state_load(sk); if (state == TCP_LISTEN) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events * by poll logic and correct handling of state changes * made by other threads is impossible in any case. */ mask = 0; /* * EPOLLHUP is certainly not done right. But poll() doesn't * have a notion of HUP in just one direction, and for a * socket the read side is more interesting. * * Some poll() documentation says that EPOLLHUP is incompatible * with the EPOLLOUT/POLLWR flags, so somebody should check this * all. But careful, it tends to be safer to return too many * bits than too few, and you can easily break real applications * if you don't tell them that something has hung up! * * Check-me. * * Check number 1. EPOLLHUP is _UNMASKABLE_ event (see UNIX98 and * our fs/select.c). It means that after we received EOF, * poll always returns immediately, making impossible poll() on write() * in state CLOSE_WAIT. One solution is evident --- to set EPOLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX * solve this dilemma. I would prefer, if EPOLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains * why EPOLLHUP is incompatible with EPOLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK */ shutdown = READ_ONCE(sk->sk_shutdown); if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; /* Connected or passive Fast Open socket? 
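 * A TCP_SYN_RECV socket that already carries a fastopen_rsk is a passive Fast Open child and is treated as connected for polling purposes.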
*/ if (state != TCP_SYN_SENT && (state != TCP_SYN_RECV || rcu_access_pointer(tp->fastopen_rsk))) { int target = sock_rcvlowat(sk, 0, INT_MAX); u16 urg_data = READ_ONCE(tp->urg_data); if (unlikely(urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq) && !sock_flag(sk, SOCK_URGINLINE)) target++; if (tcp_stream_is_readable(sk, target)) mask |= EPOLLIN | EPOLLRDNORM; if (!(shutdown & SEND_SHUTDOWN)) { if (__sk_stream_is_writeable(sk, 1)) { mask |= EPOLLOUT | EPOLLWRNORM; } else { /* send SIGIO later */ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); /* Race breaker. If space is freed after * wspace test but before the flags are set, * IO signal will be lost. Memory barrier * pairs with the input side. */ smp_mb__after_atomic(); if (__sk_stream_is_writeable(sk, 1)) mask |= EPOLLOUT | EPOLLWRNORM; } } else mask |= EPOLLOUT | EPOLLWRNORM; if (urg_data & TCP_URG_VALID) mask |= EPOLLPRI; } else if (state == TCP_SYN_SENT && inet_test_bit(DEFER_CONNECT, sk)) { /* Active TCP fastopen socket with defer_connect * Return EPOLLOUT so application can call write() * in order for kernel to generate SYN+data */ mask |= EPOLLOUT | EPOLLWRNORM; } /* This barrier is coupled with smp_wmb() in tcp_reset() */ smp_rmb(); if (READ_ONCE(sk->sk_err) || !skb_queue_empty_lockless(&sk->sk_error_queue)) mask |= EPOLLERR; return mask; } EXPORT_SYMBOL(tcp_poll); int tcp_ioctl(struct sock *sk, int cmd, int *karg) { struct tcp_sock *tp = tcp_sk(sk); int answ; bool slow; switch (cmd) { case SIOCINQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; slow = lock_sock_fast(sk); answ = tcp_inq(sk); unlock_sock_fast(sk, slow); break; case SIOCATMARK: answ = READ_ONCE(tp->urg_data) && READ_ONCE(tp->urg_seq) == READ_ONCE(tp->copied_seq); break; case SIOCOUTQ: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - tp->snd_una; break; case SIOCOUTQNSD: if (sk->sk_state == TCP_LISTEN) return -EINVAL; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) answ = 0; else answ = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); break; default: return -ENOIOCTLCMD; } *karg = answ; return 0; } EXPORT_SYMBOL(tcp_ioctl); void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_PSH; tp->pushed_seq = tp->write_seq; } static inline bool forced_push(const struct tcp_sock *tp) { return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; tcp_slow_start_after_idle_check(sk); } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags) { if (flags & MSG_OOB) tp->snd_up = tp->write_seq; } /* If a not yet filled skb is pushed, do not send it if * we have data packets in Qdisc or NIC queues : * Because TX completion will happen shortly, it gives a chance * to coalesce future sendmsg() payload into this skb, without * need for a timer, and with no latency trade off. 
* As packets containing data payload have a bigger truesize * than pure acks (dataless) packets, the last checks prevent * autocorking if we only have an ACK in Qdisc/NIC queues, * or if TX completion was delayed after we processed ACK packet. */ static bool tcp_should_autocork(struct sock *sk, struct sk_buff *skb, int size_goal) { return skb->len < size_goal && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_autocorking) && !tcp_rtx_queue_empty(sk) && refcount_read(&sk->sk_wmem_alloc) > skb->truesize && tcp_skb_can_collapse_to(skb); } void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; skb = tcp_write_queue_tail(sk); if (!skb) return; if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags); if (tcp_should_autocork(sk, skb, size_goal)) { /* avoid atomic op if TSQ_THROTTLED bit is already set */ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); smp_mb__after_atomic(); } /* It is possible TX completion already happened * before we set TSQ_THROTTLED. */ if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize) return; } if (flags & MSG_MORE) nonagle = TCP_NAGLE_CORK; __tcp_push_pending_frames(sk, mss_now, nonagle); } static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; ret = skb_splice_bits(skb, skb->sk, offset, tss->pipe, min(rd_desc->count, len), tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) { /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); } /** * tcp_splice_read - splice data from TCP socket to a pipe * @sock: socket to splice from * @ppos: position (not valid) * @pipe: pipe to splice to * @len: number of bytes to splice * @flags: splice modifier flags * * Description: * Will read pages from given socket and fill them into a pipe. * **/ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { struct sock *sk = sock->sk; struct tcp_splice_state tss = { .pipe = pipe, .len = len, .flags = flags, }; long timeo; ssize_t spliced; int ret; sock_rps_record_flow(sk); /* * We can't seek on a socket input */ if (unlikely(*ppos)) return -ESPIPE; ret = spliced = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); while (tss.len) { ret = __tcp_splice_read(sk, &tss); if (ret < 0) break; else if (!ret) { if (spliced) break; if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { ret = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* * This occurs when user tries to read * from never connected socket. */ ret = -ENOTCONN; break; } if (!timeo) { ret = -EAGAIN; break; } /* if __tcp_splice_read() got nothing while we have * an skb in receive queue, we do not want to loop. * This might happen with URG data. 
*/ if (!skb_queue_empty(&sk->sk_receive_queue)) break; ret = sk_wait_data(sk, &timeo, NULL); if (ret < 0) break; if (signal_pending(current)) { ret = sock_intr_errno(timeo); break; } continue; } tss.len -= ret; spliced += ret; if (!tss.len || !timeo) break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } release_sock(sk); if (spliced) return spliced; return ret; } EXPORT_SYMBOL(tcp_splice_read); struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { struct sk_buff *skb; skb = alloc_skb_fclone(MAX_TCP_HEADER, gfp); if (likely(skb)) { bool mem_scheduled; skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); if (force_schedule) { mem_scheduled = true; sk_forced_mem_schedule(sk, skb->truesize); } else { mem_scheduled = sk_wmem_schedule(sk, skb->truesize); } if (likely(mem_scheduled)) { skb_reserve(skb, MAX_TCP_HEADER); skb->ip_summed = CHECKSUM_PARTIAL; INIT_LIST_HEAD(&skb->tcp_tsorted_anchor); return skb; } __kfree_skb(skb); } else { sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; } static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, int large_allowed) { struct tcp_sock *tp = tcp_sk(sk); u32 new_size_goal, size_goal; if (!large_allowed) return mss_now; /* Note : tcp_tso_autosize() will eventually split this later */ new_size_goal = tcp_bound_to_half_wnd(tp, sk->sk_gso_max_size); /* We try hard to avoid divides here */ size_goal = tp->gso_segs * mss_now; if (unlikely(new_size_goal < size_goal || new_size_goal >= size_goal + mss_now)) { tp->gso_segs = min_t(u16, new_size_goal / mss_now, sk->sk_gso_max_segs); size_goal = tp->gso_segs * mss_now; } return max(size_goal, mss_now); } int tcp_send_mss(struct sock *sk, int *size_goal, int flags) { int mss_now; mss_now = tcp_current_mss(sk); *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); return mss_now; } /* In some cases, sendmsg() could have added an skb to the write queue, * but failed adding payload on it. We need to remove it to consume less * memory, but more importantly be able to generate EPOLLOUT for Edge Trigger * epoll() users. Another reason is that tcp_write_xmit() does not like * finding an empty skb in the write queue. */ void tcp_remove_empty_skb(struct sock *sk) { struct sk_buff *skb = tcp_write_queue_tail(sk); if (skb && TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { tcp_unlink_write_queue(skb, sk); if (tcp_write_queue_empty(sk)) tcp_chrono_stop(sk, TCP_CHRONO_BUSY); tcp_wmem_free_skb(sk, skb); } } /* skb changing from pure zc to mixed, must charge zc */ static int tcp_downgrade_zcopy_pure(struct sock *sk, struct sk_buff *skb) { if (unlikely(skb_zcopy_pure(skb))) { u32 extra = skb->truesize - SKB_TRUESIZE(skb_end_offset(skb)); if (!sk_wmem_schedule(sk, extra)) return -ENOMEM; sk_mem_charge(sk, extra); skb_shinfo(skb)->flags &= ~SKBFL_PURE_ZEROCOPY; } return 0; } int tcp_wmem_schedule(struct sock *sk, int copy) { int left; if (likely(sk_wmem_schedule(sk, copy))) return copy; /* We could be in trouble if we have nothing queued. * Use whatever is left in sk->sk_forward_alloc and tcp_wmem[0] * to guarantee some progress. 
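* For example, with tcp_wmem[0] at 4KB and 1KB already queued, up to 3KB is
* force-scheduled here so a sender under memory pressure can still queue at
* least part of the requested copy.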
*/ left = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[0]) - sk->sk_wmem_queued; if (left > 0) sk_forced_mem_schedule(sk, min(left, copy)); return min(copy, sk->sk_forward_alloc); } void tcp_free_fastopen_req(struct tcp_sock *tp) { if (tp->fastopen_req) { kfree(tp->fastopen_req); tp->fastopen_req = NULL; } } int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); struct sockaddr *uaddr = msg->msg_name; int err, flags; if (!(READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) || (uaddr && msg->msg_namelen >= sizeof(uaddr->sa_family) && uaddr->sa_family == AF_UNSPEC)) return -EOPNOTSUPP; if (tp->fastopen_req) return -EALREADY; /* Another Fast Open is in progress */ tp->fastopen_req = kzalloc(sizeof(struct tcp_fastopen_request), sk->sk_allocation); if (unlikely(!tp->fastopen_req)) return -ENOBUFS; tp->fastopen_req->data = msg; tp->fastopen_req->size = size; tp->fastopen_req->uarg = uarg; if (inet_test_bit(DEFER_CONNECT, sk)) { err = tcp_connect(sk); /* Same failure procedure as in tcp_v4/6_connect */ if (err) { tcp_set_state(sk, TCP_CLOSE); inet->inet_dport = 0; sk->sk_route_caps = 0; } } flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0; err = __inet_stream_connect(sk->sk_socket, uaddr, msg->msg_namelen, flags, 1); /* fastopen_req could already be freed in __inet_stream_connect * if the connection times out or gets rst */ if (tp->fastopen_req) { *copied = tp->fastopen_req->copied; tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); } return err; } int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct tcp_sock *tp = tcp_sk(sk); struct ubuf_info *uarg = NULL; struct sk_buff *skb; struct sockcm_cookie sockc; int flags, err, copied = 0; int mss_now = 0, size_goal, copied_syn = 0; int process_backlog = 0; int zc = 0; long timeo; flags = msg->msg_flags; if ((flags & MSG_ZEROCOPY) && size) { if (msg->msg_ubuf) { uarg = msg->msg_ubuf; if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; } else if (sock_flag(sk, SOCK_ZEROCOPY)) { skb = tcp_write_queue_tail(sk); uarg = msg_zerocopy_realloc(sk, size, skb_zcopy(skb)); if (!uarg) { err = -ENOBUFS; goto out_err; } if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_ZEROCOPY; else uarg_to_msgzc(uarg)->zerocopy = 0; } } else if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES) && size) { if (sk->sk_route_caps & NETIF_F_SG) zc = MSG_SPLICE_PAGES; } if (unlikely(flags & MSG_FASTOPEN || inet_test_bit(DEFER_CONNECT, sk)) && !tp->repair) { err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size, uarg); if (err == -EINPROGRESS && copied_syn > 0) goto out; else if (err) goto out_err; } timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); tcp_rate_check_app_limited(sk); /* is sending application-limited? */ /* Wait for a connection to finish. One exception is TCP Fast Open * (passive side) where data is allowed to be sent before a connection * is fully established. 
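* (A passive Fast Open socket is still in TCP_SYN_RECV with a non-NULL
* fastopen_rsk, so tcp_passive_fastopen() lets the server queue data that can
* ride on the SYN-ACK.)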
*/ if (((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) && !tcp_passive_fastopen(sk)) { err = sk_stream_wait_connect(sk, &timeo); if (err != 0) goto do_error; } if (unlikely(tp->repair)) { if (tp->repair_queue == TCP_RECV_QUEUE) { copied = tcp_send_rcvq(sk, msg, size); goto out_nopush; } err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out_err; /* 'common' sending to sendq */ } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) { err = -EINVAL; goto out_err; } } /* This should be in poll */ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); /* Ok commence sending. */ copied = 0; restart: mss_now = tcp_send_mss(sk, &size_goal, flags); err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) goto do_error; while (msg_data_left(msg)) { ssize_t copy = 0; skb = tcp_write_queue_tail(sk); if (skb) copy = size_goal - skb->len; if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) { bool first_skb; new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_space; if (unlikely(process_backlog >= 16)) { process_backlog = 0; if (sk_flush_backlog(sk)) goto restart; } first_skb = tcp_rtx_and_write_queues_empty(sk); skb = tcp_stream_alloc_skb(sk, sk->sk_allocation, first_skb); if (!skb) goto wait_for_space; process_backlog++; tcp_skb_entail(sk, skb); copy = size_goal; /* All packets are restored as if they have * already been sent. skb_mstamp_ns isn't set to * avoid wrong rtt estimation. */ if (tp->repair) TCP_SKB_CB(skb)->sacked |= TCPCB_REPAIRED; } /* Try to append data to the end of skb. */ if (copy > msg_data_left(msg)) copy = msg_data_left(msg); if (zc == 0) { bool merge = true; int i = skb_shinfo(skb)->nr_frags; struct page_frag *pfrag = sk_page_frag(sk); if (!sk_page_frag_refill(sk, pfrag)) goto wait_for_space; if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { if (i >= READ_ONCE(sysctl_max_skb_frags)) { tcp_mark_push(tp, skb); goto new_segment; } merge = false; } copy = min_t(int, copy, pfrag->size - pfrag->offset); if (unlikely(skb_zcopy_pure(skb) || skb_zcopy_managed(skb))) { if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; skb_zcopy_downgrade_managed(skb); } copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb, pfrag->page, pfrag->offset, copy); if (err) goto do_error; /* Update the skb. */ if (merge) { skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { skb_fill_page_desc(skb, i, pfrag->page, pfrag->offset, copy); page_ref_inc(pfrag->page); } pfrag->offset += copy; } else if (zc == MSG_ZEROCOPY) { /* First append to a fragless skb builds initial * pure zerocopy skb */ if (!skb->len) skb_shinfo(skb)->flags |= SKBFL_PURE_ZEROCOPY; if (!skb_zcopy_pure(skb)) { copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; } err = skb_zerocopy_iter_stream(sk, skb, msg, copy, uarg); if (err == -EMSGSIZE || err == -EEXIST) { tcp_mark_push(tp, skb); goto new_segment; } if (err < 0) goto do_error; copy = err; } else if (zc == MSG_SPLICE_PAGES) { /* Splice in data if we can; copy if we can't. 
*/ if (tcp_downgrade_zcopy_pure(sk, skb)) goto wait_for_space; copy = tcp_wmem_schedule(sk, copy); if (!copy) goto wait_for_space; err = skb_splice_from_iter(skb, &msg->msg_iter, copy, sk->sk_allocation); if (err < 0) { if (err == -EMSGSIZE) { tcp_mark_push(tp, skb); goto new_segment; } goto do_error; } copy = err; if (!(flags & MSG_NO_SHARED_FRAGS)) skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); } if (!copied) TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_PSH; WRITE_ONCE(tp->write_seq, tp->write_seq + copy); TCP_SKB_CB(skb)->end_seq += copy; tcp_skb_pcount_set(skb, 0); copied += copy; if (!msg_data_left(msg)) { if (unlikely(flags & MSG_EOR)) TCP_SKB_CB(skb)->eor = 1; goto out; } if (skb->len < size_goal || (flags & MSG_OOB) || unlikely(tp->repair)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; wait_for_space: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); tcp_remove_empty_skb(sk); if (copied) tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH, size_goal); err = sk_stream_wait_memory(sk, &timeo); if (err != 0) goto do_error; mss_now = tcp_send_mss(sk, &size_goal, flags); } out: if (copied) { tcp_tx_timestamp(sk, sockc.tsflags); tcp_push(sk, flags, mss_now, tp->nonagle, size_goal); } out_nopush: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put(uarg); return copied + copied_syn; do_error: tcp_remove_empty_skb(sk); if (copied + copied_syn) goto out; out_err: /* msg->msg_ubuf is pinned by the caller so we don't take extra refs */ if (uarg && !msg->msg_ubuf) net_zcopy_put_abort(uarg, true); err = sk_stream_error(sk, flags, err); /* make sure we wake any epoll edge trigger waiter */ if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) { sk->sk_write_space(sk); tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } return err; } EXPORT_SYMBOL_GPL(tcp_sendmsg_locked); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int ret; lock_sock(sk); ret = tcp_sendmsg_locked(sk, msg, size); release_sock(sk); return ret; } EXPORT_SYMBOL(tcp_sendmsg); void tcp_splice_eof(struct socket *sock) { struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); int mss_now, size_goal; if (!tcp_write_queue_tail(sk)) return; lock_sock(sk); mss_now = tcp_send_mss(sk, &size_goal, 0); tcp_push(sk, 0, mss_now, tp->nonagle, size_goal); release_sock(sk); } EXPORT_SYMBOL_GPL(tcp_splice_eof); /* * Handle reading urgent data. BSD has very simple semantics for * this, no blocking and very strange errors 8) */ static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); /* No URG data to read. */ if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || tp->urg_data == TCP_URG_READ) return -EINVAL; /* Yes this is right ! */ if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) return -ENOTCONN; if (tp->urg_data & TCP_URG_VALID) { int err = 0; char c = tp->urg_data; if (!(flags & MSG_PEEK)) WRITE_ONCE(tp->urg_data, TCP_URG_READ); /* Read urgent data. */ msg->msg_flags |= MSG_OOB; if (len > 0) { if (!(flags & MSG_TRUNC)) err = memcpy_to_msg(msg, &c, 1); len = 1; } else msg->msg_flags |= MSG_TRUNC; return err ? -EFAULT : len; } if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) return 0; /* Fixed the recv(..., MSG_OOB) behaviour. 
BSD docs and * the available implementations agree in this case: * this call should never block, independent of the * blocking state of the socket. * Mike <pall@rz.uni-karlsruhe.de> */ return -EAGAIN; } static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) { struct sk_buff *skb; int copied = 0, err = 0; /* XXX -- need to support SO_PEEK_OFF */ skb_rbtree_walk(skb, &sk->tcp_rtx_queue) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) return err; copied += skb->len; } skb_queue_walk(&sk->sk_write_queue, skb) { err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; copied += skb->len; } return err ?: copied; } /* Clean up the receive buffer for full frames taken by the user, * then send an ACK if necessary. COPIED is the number of bytes * tcp_recvmsg has given to the user so far, it speeds up the * calculation of whether or not we must ACK for the sake of * a window update. */ void __tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); bool time_to_ack = false; if (inet_csk_ack_scheduled(sk)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (/* Once-per-two-segments ACK was not sent by tcp_input.c */ tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss || /* * If this read emptied read buffer, we send ACK, if * connection is not bidirectional, user drained * receive buffer and there was a small segment * in queue. */ (copied > 0 && ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) || ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) && !inet_csk_in_pingpong_mode(sk))) && !atomic_read(&sk->sk_rmem_alloc))) time_to_ack = true; } /* We send an ACK if we can now advertise a non-zero window * which has been raised "significantly". * * Even if window raised up to infinity, do not send window open ACK * in states, where we will not receive more. It is useless. */ if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { __u32 rcv_window_now = tcp_receive_window(tp); /* Optimize, __tcp_select_window() is not cheap. */ if (2*rcv_window_now <= tp->window_clamp) { __u32 new_window = __tcp_select_window(sk); /* Send ACK now, if this read freed lots of space * in our buffer. Certainly, new_window is new window. * We can advertise it now, if it is not less than current one. * "Lots" means "at least twice" here. 
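* For example, with a 64KB window clamp and the advertised window down to
* 10KB, a read that lets __tcp_select_window() offer at least 20KB makes us
* ACK immediately rather than wait for the delayed-ACK timer.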
*/ if (new_window && new_window >= 2 * rcv_window_now) time_to_ack = true; } } if (time_to_ack) tcp_send_ack(sk); } void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); struct tcp_sock *tp = tcp_sk(sk); WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq), "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n", tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt); __tcp_cleanup_rbuf(sk, copied); } static void tcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); if (likely(skb->destructor == sock_rfree)) { sock_rfree(skb); skb->destructor = NULL; skb->sk = NULL; return skb_attempt_defer_free(skb); } __kfree_skb(skb); } struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) { struct sk_buff *skb; u32 offset; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { offset = seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) { *off = offset; return skb; } /* This looks weird, but this can happen if TCP collapsing * splitted a fat GRO packet, while we released socket lock * in skb_splice_bits() */ tcp_eat_recv_skb(sk, skb); } return NULL; } EXPORT_SYMBOL(tcp_recv_skb); /* * This routine provides an alternative to tcp_recvmsg() for routines * that would like to handle copying from skbuffs directly in 'sendfile' * fashion. * Note: * - It is assumed that the socket was locked by the caller. * - The routine does not block. * - At present, there is no support for reading OOB data * or for 'peeking' the socket using this routine * (although both would be easy to implement). */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor) { struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; u32 offset; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { int used; size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - seq; if (urg_offset < len) len = urg_offset; if (!len) break; } used = recv_actor(desc, skb, offset, len); if (used <= 0) { if (!copied) copied = used; break; } if (WARN_ON_ONCE(used > len)) used = len; seq += used; copied += used; offset += used; /* If recv_actor drops the lock (e.g. TCP splice * receive) the skb pointer might be invalid when * getting here: tcp_collapse might have deleted it * while aggregating skbs from the socket queue. */ skb = tcp_recv_skb(sk, seq - 1, &offset); if (!skb) break; /* TCP coalescing might have appended data to the skb. * Try to splice more frags */ if (offset + 1 != skb->len) continue; } if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); if (!desc->count) break; WRITE_ONCE(tp->copied_seq, seq); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
*/ if (copied > 0) { tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, copied); } return copied; } EXPORT_SYMBOL(tcp_read_sock); int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { struct sk_buff *skb; int copied = 0; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { u8 tcp_flags; int used; __skb_unlink(skb, &sk->sk_receive_queue); WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); tcp_flags = TCP_SKB_CB(skb)->tcp_flags; used = recv_actor(sk, skb); if (used < 0) { if (!copied) copied = used; break; } copied += used; if (tcp_flags & TCPHDR_FIN) break; } return copied; } EXPORT_SYMBOL(tcp_read_skb); void tcp_read_done(struct sock *sk, size_t len) { struct tcp_sock *tp = tcp_sk(sk); u32 seq = tp->copied_seq; struct sk_buff *skb; size_t left; u32 offset; if (sk->sk_state == TCP_LISTEN) return; left = len; while (left && (skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { int used; used = min_t(size_t, skb->len - offset, left); seq += used; left -= used; if (skb->len > offset + used) break; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { tcp_eat_recv_skb(sk, skb); ++seq; break; } tcp_eat_recv_skb(sk, skb); } WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. */ if (left != len) tcp_cleanup_rbuf(sk, len - left); } EXPORT_SYMBOL(tcp_read_done); int tcp_peek_len(struct socket *sock) { return tcp_inq(sock->sk); } EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; else cap = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1; val = min(val, cap); WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); /* Check if we need to signal EPOLLIN right now */ tcp_data_ready(sk); if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; space = tcp_space_from_win(sk, val); if (space > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, space); tcp_sk(sk)->window_clamp = val; } return 0; } EXPORT_SYMBOL(tcp_set_rcvlowat); void tcp_update_recv_tstamps(struct sk_buff *skb, struct scm_timestamping_internal *tss) { if (skb->tstamp) tss->ts[0] = ktime_to_timespec64(skb->tstamp); else tss->ts[0] = (struct timespec64) {0}; if (skb_hwtstamps(skb)->hwtstamp) tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); else tss->ts[2] = (struct timespec64) {0}; } #ifdef CONFIG_MMU static const struct vm_operations_struct tcp_vm_ops = { }; int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) { if (vma->vm_flags & (VM_WRITE | VM_EXEC)) return -EPERM; vm_flags_clear(vma, VM_MAYWRITE | VM_MAYEXEC); /* Instruct vm_insert_page() to not mmap_read_lock(mm) */ vm_flags_set(vma, VM_MIXEDMAP); vma->vm_ops = &tcp_vm_ops; return 0; } EXPORT_SYMBOL(tcp_mmap); static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb, u32 *offset_frag) { skb_frag_t *frag; if (unlikely(offset_skb >= skb->len)) return NULL; offset_skb -= skb_headlen(skb); if ((int)offset_skb < 0 || skb_has_frag_list(skb)) return NULL; frag = skb_shinfo(skb)->frags; while (offset_skb) { if (skb_frag_size(frag) > offset_skb) { *offset_frag = offset_skb; return frag; } offset_skb -= skb_frag_size(frag); ++frag; } *offset_frag = 0; return frag; } static bool can_map_frag(const skb_frag_t *frag) { struct page *page; if (skb_frag_size(frag) != PAGE_SIZE || skb_frag_off(frag)) return false; page = skb_frag_page(frag); if (PageCompound(page) || page->mapping) return false; return true; } static int find_next_mappable_frag(const skb_frag_t *frag, int remaining_in_skb) { int offset = 0; if (likely(can_map_frag(frag))) return 0; while (offset < remaining_in_skb && !can_map_frag(frag)) { offset += skb_frag_size(frag); ++frag; } return offset; } static void tcp_zerocopy_set_hint_for_skb(struct sock *sk, struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 offset) { u32 frag_offset, partial_frag_remainder = 0; int mappable_offset; skb_frag_t *frag; /* worst case: skip to next skb. try to improve on this case below */ zc->recv_skip_hint = skb->len - offset; /* Find the frag containing this offset (and how far into that frag) */ frag = skb_advance_to_frag(skb, offset, &frag_offset); if (!frag) return; if (frag_offset) { struct skb_shared_info *info = skb_shinfo(skb); /* We read part of the last frag, must recvmsg() rest of skb. */ if (frag == &info->frags[info->nr_frags - 1]) return; /* Else, we must at least read the remainder in this frag. */ partial_frag_remainder = skb_frag_size(frag) - frag_offset; zc->recv_skip_hint -= partial_frag_remainder; ++frag; } /* partial_frag_remainder: If part way through a frag, must read rest. * mappable_offset: Bytes till next mappable frag, *not* counting bytes * in partial_frag_remainder. 
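* For example, if 1KB of a 4KB frag was already consumed, the remaining 3KB
* plus any bytes before the next page-aligned frag must be read with
* recvmsg() before mapping can resume; recv_skip_hint reports that total.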
*/ mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint); zc->recv_skip_hint = mappable_offset + partial_frag_remainder; } static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags); static int receive_fallback_to_copy(struct sock *sk, struct tcp_zerocopy_receive *zc, int inq, struct scm_timestamping_internal *tss) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; zc->length = 0; zc->recv_skip_hint = 0; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, inq, &msg.msg_iter); if (err) return err; err = tcp_recvmsg_locked(sk, &msg, inq, MSG_DONTWAIT, tss, &zc->msg_flags); if (err < 0) return err; zc->copybuf_len = err; if (likely(zc->copybuf_len)) { struct sk_buff *skb; u32 offset; skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset); if (skb) tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset); } return 0; } static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc, struct sk_buff *skb, u32 copylen, u32 *offset, u32 *seq) { unsigned long copy_address = (unsigned long)zc->copybuf_address; struct msghdr msg = {}; int err; if (copy_address != zc->copybuf_address) return -EINVAL; err = import_ubuf(ITER_DEST, (void __user *)copy_address, copylen, &msg.msg_iter); if (err) return err; err = skb_copy_datagram_msg(skb, *offset, &msg, copylen); if (err) return err; zc->recv_skip_hint -= copylen; *offset += copylen; *seq += copylen; return (__s32)copylen; } static int tcp_zc_handle_leftover(struct tcp_zerocopy_receive *zc, struct sock *sk, struct sk_buff *skb, u32 *seq, s32 copybuf_len, struct scm_timestamping_internal *tss) { u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint); if (!copylen) return 0; /* skb is null if inq < PAGE_SIZE. */ if (skb) { offset = *seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, *seq, &offset); if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } } zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset, seq); return zc->copybuf_len < 0 ? 0 : copylen; } static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma, struct page **pending_pages, unsigned long pages_remaining, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map, int err) { /* At least one page did not map. Try zapping if we skipped earlier. */ if (err == -EBUSY && zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) { u32 maybe_zap_len; maybe_zap_len = total_bytes_to_map - /* All bytes to map */ *length + /* Mapped or pending */ (pages_remaining * PAGE_SIZE); /* Failed map. */ zap_page_range_single(vma, *address, maybe_zap_len, NULL); err = 0; } if (!err) { unsigned long leftover_pages = pages_remaining; int bytes_mapped; /* We called zap_page_range_single, try to reinsert. */ err = vm_insert_pages(vma, *address, pending_pages, &pages_remaining); bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining); *seq += bytes_mapped; *address += bytes_mapped; } if (err) { /* Either we were unable to zap, OR we zapped, retried an * insert, and still had an issue. Either ways, pages_remaining * is the number of pages we were unable to map, and we unroll * some state we speculatively touched before. 
*/ const int bytes_not_mapped = PAGE_SIZE * pages_remaining; *length -= bytes_not_mapped; zc->recv_skip_hint += bytes_not_mapped; } return err; } static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma, struct page **pages, unsigned int pages_to_map, unsigned long *address, u32 *length, u32 *seq, struct tcp_zerocopy_receive *zc, u32 total_bytes_to_map) { unsigned long pages_remaining = pages_to_map; unsigned int pages_mapped; unsigned int bytes_mapped; int err; err = vm_insert_pages(vma, *address, pages, &pages_remaining); pages_mapped = pages_to_map - (unsigned int)pages_remaining; bytes_mapped = PAGE_SIZE * pages_mapped; /* Even if vm_insert_pages fails, it may have partially succeeded in * mapping (some but not all of the pages). */ *seq += bytes_mapped; *address += bytes_mapped; if (likely(!err)) return 0; /* Error: maybe zap and retry + rollback state for failed inserts. */ return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped, pages_remaining, address, length, seq, zc, total_bytes_to_map, err); } #define TCP_VALID_ZC_MSG_FLAGS (TCP_CMSG_TS) static void tcp_zc_finalize_rx_tstamp(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { unsigned long msg_control_addr; struct msghdr cmsg_dummy; msg_control_addr = (unsigned long)zc->msg_control; cmsg_dummy.msg_control_user = (void __user *)msg_control_addr; cmsg_dummy.msg_controllen = (__kernel_size_t)zc->msg_controllen; cmsg_dummy.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; cmsg_dummy.msg_control_is_user = true; zc->msg_flags = 0; if (zc->msg_control == msg_control_addr && zc->msg_controllen == cmsg_dummy.msg_controllen) { tcp_recv_timestamp(&cmsg_dummy, sk, tss); zc->msg_control = (__u64) ((uintptr_t)cmsg_dummy.msg_control_user); zc->msg_controllen = (__u64)cmsg_dummy.msg_controllen; zc->msg_flags = (__u32)cmsg_dummy.msg_flags; } } static struct vm_area_struct *find_tcp_vma(struct mm_struct *mm, unsigned long address, bool *mmap_locked) { struct vm_area_struct *vma = lock_vma_under_rcu(mm, address); if (vma) { if (vma->vm_ops != &tcp_vm_ops) { vma_end_read(vma); return NULL; } *mmap_locked = false; return vma; } mmap_read_lock(mm); vma = vma_lookup(mm, address); if (!vma || vma->vm_ops != &tcp_vm_ops) { mmap_read_unlock(mm); return NULL; } *mmap_locked = true; return vma; } #define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32 static int tcp_zerocopy_receive(struct sock *sk, struct tcp_zerocopy_receive *zc, struct scm_timestamping_internal *tss) { u32 length = 0, offset, vma_len, avail_len, copylen = 0; unsigned long address = (unsigned long)zc->address; struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE]; s32 copybuf_len = zc->copybuf_len; struct tcp_sock *tp = tcp_sk(sk); const skb_frag_t *frags = NULL; unsigned int pages_to_map = 0; struct vm_area_struct *vma; struct sk_buff *skb = NULL; u32 seq = tp->copied_seq; u32 total_bytes_to_map; int inq = tcp_inq(sk); bool mmap_locked; int ret; zc->copybuf_len = 0; zc->msg_flags = 0; if (address & (PAGE_SIZE - 1) || address != zc->address) return -EINVAL; if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; sock_rps_record_flow(sk); if (inq && inq <= copybuf_len) return receive_fallback_to_copy(sk, zc, inq, tss); if (inq < PAGE_SIZE) { zc->length = 0; zc->recv_skip_hint = inq; if (!inq && sock_flag(sk, SOCK_DONE)) return -EIO; return 0; } vma = find_tcp_vma(current->mm, address, &mmap_locked); if (!vma) return -EINVAL; vma_len = min_t(unsigned long, zc->length, vma->vm_end - address); avail_len = min_t(u32, vma_len, inq); total_bytes_to_map = 
avail_len & ~(PAGE_SIZE - 1); if (total_bytes_to_map) { if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT)) zap_page_range_single(vma, address, total_bytes_to_map, NULL); zc->length = total_bytes_to_map; zc->recv_skip_hint = 0; } else { zc->length = avail_len; zc->recv_skip_hint = avail_len; } ret = 0; while (length + PAGE_SIZE <= zc->length) { int mappable_offset; struct page *page; if (zc->recv_skip_hint < PAGE_SIZE) { u32 offset_frag; if (skb) { if (zc->recv_skip_hint > 0) break; skb = skb->next; offset = seq - TCP_SKB_CB(skb)->seq; } else { skb = tcp_recv_skb(sk, seq, &offset); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); zc->msg_flags |= TCP_CMSG_TS; } zc->recv_skip_hint = skb->len - offset; frags = skb_advance_to_frag(skb, offset, &offset_frag); if (!frags || offset_frag) break; } mappable_offset = find_next_mappable_frag(frags, zc->recv_skip_hint); if (mappable_offset) { zc->recv_skip_hint = mappable_offset; break; } page = skb_frag_page(frags); prefetchw(page); pages[pages_to_map++] = page; length += PAGE_SIZE; zc->recv_skip_hint -= PAGE_SIZE; frags++; if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE || zc->recv_skip_hint < PAGE_SIZE) { /* Either full batch, or we're about to go to next skb * (and we cannot unroll failed ops across skbs). */ ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); if (ret) goto out; pages_to_map = 0; } } if (pages_to_map) { ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map, &address, &length, &seq, zc, total_bytes_to_map); } out: if (mmap_locked) mmap_read_unlock(current->mm); else vma_end_read(vma); /* Try to copy straggler data. */ if (!ret) copylen = tcp_zc_handle_leftover(zc, sk, skb, &seq, copybuf_len, tss); if (length + copylen) { WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
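* (copied_seq was advanced over both the mapped and the copied bytes, so the
* freed receive space can be advertised back to the peer.)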
*/ tcp_recv_skb(sk, seq, &offset); tcp_cleanup_rbuf(sk, length + copylen); ret = 0; if (length == zc->length) zc->recv_skip_hint = 0; } else { if (!zc->recv_skip_hint && sock_flag(sk, SOCK_DONE)) ret = -EIO; } zc->length = length; return ret; } #endif /* Similar to __sock_recv_timestamp, but does not require an skb */ void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; if (tss->ts[0].tv_sec || tss->ts[0].tv_nsec) { if (sock_flag(sk, SOCK_RCVTSTAMP)) { if (sock_flag(sk, SOCK_RCVTSTAMPNS)) { if (new_tstamp) { struct __kernel_timespec kts = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { struct __kernel_old_timespec ts_old = { .tv_sec = tss->ts[0].tv_sec, .tv_nsec = tss->ts[0].tv_nsec, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { struct __kernel_sock_timeval stv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_NEW, sizeof(stv), &stv); } else { struct __kernel_old_timeval tv = { .tv_sec = tss->ts[0].tv_sec, .tv_usec = tss->ts[0].tv_nsec / 1000, }; put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP_OLD, sizeof(tv), &tv); } } } if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { if (READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else tss->ts[2] = (struct timespec64) {0}; } if (has_timestamping) { tss->ts[1] = (struct timespec64) {0}; if (sock_flag(sk, SOCK_TSTAMP_NEW)) put_cmsg_scm_timestamping64(msg, tss); else put_cmsg_scm_timestamping(msg, tss); } } static int tcp_inq_hint(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); u32 copied_seq = READ_ONCE(tp->copied_seq); u32 rcv_nxt = READ_ONCE(tp->rcv_nxt); int inq; inq = rcv_nxt - copied_seq; if (unlikely(inq < 0 || copied_seq != READ_ONCE(tp->copied_seq))) { lock_sock(sk); inq = tp->rcv_nxt - tp->copied_seq; release_sock(sk); } /* After receiving a FIN, tell the user-space to continue reading * by returning a non-zero inq. */ if (inq == 0 && sock_flag(sk, SOCK_DONE)) inq = 1; return inq; } /* * This routine copies from a sock struct into the user buffer. * * Technical note: in 2.3 we work on _locked_ socket, so that * tricks with *seq access order and skb->users are not required. * Probably, code can be easily improved even more. */ static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len, int flags, struct scm_timestamping_internal *tss, int *cmsg_flags) { struct tcp_sock *tp = tcp_sk(sk); int copied = 0; u32 peek_seq; u32 *seq; unsigned long used; int err; int target; /* Read at least this many bytes */ long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; err = -ENOTCONN; if (sk->sk_state == TCP_LISTEN) goto out; if (tp->recvmsg_inq) { *cmsg_flags = TCP_CMSG_INQ; msg->msg_get_inq = 1; } timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); /* Urgent data needs to be handled specially. 
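* (A recv() call with MSG_OOB takes the recv_urg path below and is served
* from tp->urg_data without consuming anything from the receive queue.)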
*/ if (flags & MSG_OOB) goto recv_urg; if (unlikely(tp->repair)) { err = -EPERM; if (!(flags & MSG_PEEK)) goto out; if (tp->repair_queue == TCP_SEND_QUEUE) goto recv_sndq; err = -EINVAL; if (tp->repair_queue == TCP_NO_QUEUE) goto out; /* 'common' recv queue MSG_PEEK-ing */ } seq = &tp->copied_seq; if (flags & MSG_PEEK) { peek_seq = tp->copied_seq; seq = &peek_seq; } target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); do { u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ if (unlikely(tp->urg_data) && tp->urg_seq == *seq) { if (copied) break; if (signal_pending(current)) { copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; break; } } /* Next get a buffer. */ last = skb_peek_tail(&sk->sk_receive_queue); skb_queue_walk(&sk->sk_receive_queue, skb) { last = skb; /* Now that we have two receive queues this * shouldn't happen. */ if (WARN(before(*seq, TCP_SKB_CB(skb)->seq), "TCP recvmsg seq # bug: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags)) break; offset = *seq - TCP_SKB_CB(skb)->seq; if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { pr_err_once("%s: found a SYN, please report !\n", __func__); offset--; } if (offset < skb->len) goto found_ok_skb; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; WARN(!(flags & MSG_PEEK), "TCP recvmsg seq # bug 2: copied %X, seq %X, rcvnxt %X, fl %X\n", *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags); } /* Well, if we have backlog, try to process it now yet. */ if (copied >= target && !READ_ONCE(sk->sk_backlog.tail)) break; if (copied) { if (!timeo || sk->sk_err || sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { copied = sock_error(sk); break; } if (sk->sk_shutdown & RCV_SHUTDOWN) break; if (sk->sk_state == TCP_CLOSE) { /* This occurs when user tries to read * from never connected socket. */ copied = -ENOTCONN; break; } if (!timeo) { copied = -EAGAIN; break; } if (signal_pending(current)) { copied = sock_intr_errno(timeo); break; } } if (copied >= target) { /* Do not sleep, just process backlog. */ __sk_flush_backlog(sk); } else { tcp_cleanup_rbuf(sk, copied); err = sk_wait_data(sk, &timeo, last); if (err < 0) { err = copied ? : err; goto out; } } if ((flags & MSG_PEEK) && (peek_seq - copied - urg_hole != tp->copied_seq)) { net_dbg_ratelimited("TCP(%s:%d): Application bug, race in MSG_PEEK\n", current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; found_ok_skb: /* Ok so how much can we use? */ used = skb->len - offset; if (len < used) used = len; /* Do we have urgent data here? */ if (unlikely(tp->urg_data)) { u32 urg_offset = tp->urg_seq - *seq; if (urg_offset < used) { if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { WRITE_ONCE(*seq, *seq + 1); urg_hole++; offset++; used--; if (!used) goto skip_copy; } } else used = urg_offset; } } if (!(flags & MSG_TRUNC)) { err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! 
*/ if (!copied) copied = -EFAULT; break; } } WRITE_ONCE(*seq, *seq + used); copied += used; len -= used; tcp_rcv_space_adjust(sk); skip_copy: if (unlikely(tp->urg_data) && after(tp->copied_seq, tp->urg_seq)) { WRITE_ONCE(tp->urg_data, 0); tcp_fast_path_check(sk); } if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, tss); *cmsg_flags |= TCP_CMSG_TS; } if (used + offset < skb->len) continue; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); continue; found_fin_ok: /* Process the FIN. */ WRITE_ONCE(*seq, *seq + 1); if (!(flags & MSG_PEEK)) tcp_eat_recv_skb(sk, skb); break; } while (len > 0); /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ /* Clean up data we have read: This will do ACK frames. */ tcp_cleanup_rbuf(sk, copied); return copied; out: return err; recv_urg: err = tcp_recv_urg(sk, msg, len, flags); goto out; recv_sndq: err = tcp_peek_sndq(sk, msg, len); goto out; } int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { int cmsg_flags = 0, ret; struct scm_timestamping_internal tss; if (unlikely(flags & MSG_ERRQUEUE)) return inet_recv_error(sk, msg, len, addr_len); if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) && sk->sk_state == TCP_ESTABLISHED) sk_busy_loop(sk, flags & MSG_DONTWAIT); lock_sock(sk); ret = tcp_recvmsg_locked(sk, msg, len, flags, &tss, &cmsg_flags); release_sock(sk); if ((cmsg_flags || msg->msg_get_inq) && ret >= 0) { if (cmsg_flags & TCP_CMSG_TS) tcp_recv_timestamp(msg, sk, &tss); if (msg->msg_get_inq) { msg->msg_inq = tcp_inq_hint(sk); if (cmsg_flags & TCP_CMSG_INQ) put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(msg->msg_inq), &msg->msg_inq); } } return ret; } EXPORT_SYMBOL(tcp_recvmsg); void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; /* We defined a new enum for TCP states that are exported in BPF * so as not force the internal TCP states to be frozen. The * following checks will detect if an internal state value ever * differs from the BPF value. If this ever happens, then we will * need to remap the internal value to the BPF value before calling * tcp_call_bpf_2arg. */ BUILD_BUG_ON((int)BPF_TCP_ESTABLISHED != (int)TCP_ESTABLISHED); BUILD_BUG_ON((int)BPF_TCP_SYN_SENT != (int)TCP_SYN_SENT); BUILD_BUG_ON((int)BPF_TCP_SYN_RECV != (int)TCP_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT1 != (int)TCP_FIN_WAIT1); BUILD_BUG_ON((int)BPF_TCP_FIN_WAIT2 != (int)TCP_FIN_WAIT2); BUILD_BUG_ON((int)BPF_TCP_TIME_WAIT != (int)TCP_TIME_WAIT); BUILD_BUG_ON((int)BPF_TCP_CLOSE != (int)TCP_CLOSE); BUILD_BUG_ON((int)BPF_TCP_CLOSE_WAIT != (int)TCP_CLOSE_WAIT); BUILD_BUG_ON((int)BPF_TCP_LAST_ACK != (int)TCP_LAST_ACK); BUILD_BUG_ON((int)BPF_TCP_LISTEN != (int)TCP_LISTEN); BUILD_BUG_ON((int)BPF_TCP_CLOSING != (int)TCP_CLOSING); BUILD_BUG_ON((int)BPF_TCP_NEW_SYN_RECV != (int)TCP_NEW_SYN_RECV); BUILD_BUG_ON((int)BPF_TCP_BOUND_INACTIVE != (int)TCP_BOUND_INACTIVE); BUILD_BUG_ON((int)BPF_TCP_MAX_STATES != (int)TCP_MAX_STATES); /* bpf uapi header bpf.h defines an anonymous enum with values * BPF_TCP_* used by bpf programs. Currently gcc built vmlinux * is able to emit this enum in DWARF due to the above BUILD_BUG_ON. * But clang built vmlinux does not have this enum in DWARF * since clang removes the above code before generating IR/debuginfo. 
* Let us explicitly emit the type debuginfo to ensure the * above-mentioned anonymous enum in the vmlinux DWARF and hence BTF * regardless of which compiler is used. */ BTF_TYPE_EMIT_ENUM(BPF_TCP_ESTABLISHED); if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_STATE_CB_FLAG)) tcp_call_bpf_2arg(sk, BPF_SOCK_OPS_STATE_CB, oldstate, state); switch (state) { case TCP_ESTABLISHED: if (oldstate != TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); break; case TCP_CLOSE: if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); sk->sk_prot->unhash(sk); if (inet_csk(sk)->icsk_bind_hash && !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) inet_put_port(sk); fallthrough; default: if (oldstate == TCP_ESTABLISHED) TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); } /* Change state AFTER socket is unhashed to avoid closed * socket sitting in hash tables. */ inet_sk_state_store(sk, state); } EXPORT_SYMBOL_GPL(tcp_set_state); /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some * states. A shutdown() may have already sent the FIN, or we may be * closed. */ static const unsigned char new_state[16] = { /* current state: new state: action: */ [0 /* (Invalid) */] = TCP_CLOSE, [TCP_ESTABLISHED] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_SYN_SENT] = TCP_CLOSE, [TCP_SYN_RECV] = TCP_FIN_WAIT1 | TCP_ACTION_FIN, [TCP_FIN_WAIT1] = TCP_FIN_WAIT1, [TCP_FIN_WAIT2] = TCP_FIN_WAIT2, [TCP_TIME_WAIT] = TCP_CLOSE, [TCP_CLOSE] = TCP_CLOSE, [TCP_CLOSE_WAIT] = TCP_LAST_ACK | TCP_ACTION_FIN, [TCP_LAST_ACK] = TCP_LAST_ACK, [TCP_LISTEN] = TCP_CLOSE, [TCP_CLOSING] = TCP_CLOSING, [TCP_NEW_SYN_RECV] = TCP_CLOSE, /* should not happen ! */ }; static int tcp_close_state(struct sock *sk) { int next = (int)new_state[sk->sk_state]; int ns = next & TCP_STATE_MASK; tcp_set_state(sk, ns); return next & TCP_ACTION_FIN; } /* * Shutdown the sending side of a connection. Much like close except * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) { /* We need to grab some memory, and put together a FIN, * and then put it into the queue to be sent. * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. */ if (!(how & SEND_SHUTDOWN)) return; /* If we've already sent a FIN, or it's a closed state, skip this. */ if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { /* Clear out any half completed packets. FIN if needed. 
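* For example, shutdown(fd, SHUT_WR) on an established connection lands here:
* tcp_close_state() moves the socket to FIN_WAIT1 (or LAST_ACK from
* CLOSE_WAIT) and tcp_send_fin() queues the FIN behind any pending data.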
*/ if (tcp_close_state(sk)) tcp_send_fin(sk); } } EXPORT_SYMBOL(tcp_shutdown); int tcp_orphan_count_sum(void) { int i, total = 0; for_each_possible_cpu(i) total += per_cpu(tcp_orphan_count, i); return max(total, 0); } static int tcp_orphan_cache; static struct timer_list tcp_orphan_timer; #define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100) static void tcp_orphan_update(struct timer_list *unused) { WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum()); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); } static bool tcp_too_many_orphans(int shift) { return READ_ONCE(tcp_orphan_cache) << shift > READ_ONCE(sysctl_tcp_max_orphans); } bool tcp_check_oom(struct sock *sk, int shift) { bool too_many_orphans, out_of_socket_memory; too_many_orphans = tcp_too_many_orphans(shift); out_of_socket_memory = tcp_out_of_memory(sk); if (too_many_orphans) net_info_ratelimited("too many orphaned sockets\n"); if (out_of_socket_memory) net_info_ratelimited("out of memory -- consider tuning tcp_mem\n"); return too_many_orphans || out_of_socket_memory; } void __tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; int data_was_unread = 0; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); /* Special case. */ inet_csk_listen_stop(sk); goto adjudge_to_death; } /* We need to flush the recv. buffs. We do this only on the * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) len--; data_was_unread += len; __kfree_skb(skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ if (sk->sk_state == TCP_CLOSE) goto adjudge_to_death; /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk * GET in an FTP client, suspend the process, wait for the client to * advertise a zero window, then kill -9 the FTP client, wheee... * Note: timeout is always zero in such a case. */ if (unlikely(tcp_sk(sk)->repair)) { sk->sk_prot->disconnect(sk, 0); } else if (data_was_unread) { /* Unread data was tossed, zap the connection. */ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, sk->sk_allocation); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. */ /* RED-PEN. Formally speaking, we have broken TCP state * machine. State transitions: * * TCP_ESTABLISHED -> TCP_FIN_WAIT1 * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) * TCP_CLOSE_WAIT -> TCP_LAST_ACK * * are legal only when FIN has been sent (i.e. in window), * rather than queued out of window. Purists blame. * * F.e. "RFC state" is ESTABLISHED, * if Linux state is FIN-WAIT-1, but FIN is still not sent. 
* * The visible declinations are that sometimes * we enter time-wait state, when it is not required really * (harmless), do not send active resets, when they are * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when * they look as CLOSING or LAST_ACK for Linux) * Probably, I missed some more holelets. * --ANK * XXX (TFO) - To start off we don't support SYN+ACK+FIN * in a single packet! (May consider it later but will * probably need API support or TCP_CORK SYN-ACK until * data is written and socket is closed.) */ tcp_send_fin(sk); } sk_stream_wait_close(sk, timeout); adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); local_bh_disable(); bh_lock_sock(sk); /* remove backlog if any, without releasing ownership. */ __release_sock(sk); this_cpu_inc(tcp_orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) goto out; /* This is a (useful) BSD violating of the RFC. There is a * problem with TCP as specified in that the other end could * keep a socket open forever with no application left this end. * We use a 1 minute timeout (about the same as BSD) then kill * our end. If they send after that then tough - BUT: long enough * that we won't make the old 4*rto = almost no time - whoops * reset mistake. * * Nope, it was not mistake. It is really desired behaviour * f.e. on http servers, when such sockets are useless, but * consume significant resources. Let's do it with special * linger2 option. --ANK */ if (sk->sk_state == TCP_FIN_WAIT2) { struct tcp_sock *tp = tcp_sk(sk); if (READ_ONCE(tp->linger2) < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); if (tmo > TCP_TIMEWAIT_LEN) { inet_csk_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); } else { tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); goto out; } } } if (sk->sk_state != TCP_CLOSE) { if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONMEMORY); } else if (!check_net(sock_net(sk))) { /* Not possible to send reset; just close */ tcp_set_state(sk, TCP_CLOSE); } } if (sk->sk_state == TCP_CLOSE) { struct request_sock *req; req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, lockdep_sock_is_held(sk)); /* We could get here with a non-NULL req if the socket is * aborted (e.g., closed with unread data) before 3WHS * finishes. */ if (req) reqsk_fastopen_remove(sk, req, false); inet_csk_destroy_sock(sk); } /* Otherwise, socket is reprieved until protocol close. 
*/ out: bh_unlock_sock(sk); local_bh_enable(); } void tcp_close(struct sock *sk, long timeout) { lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); if (!sk->sk_net_refcnt) inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); /* These states need RST on ABORT according to RFC793 */ static inline bool tcp_need_reset(int state) { return (1 << state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_SYN_RECV); } static void tcp_rtx_queue_purge(struct sock *sk) { struct rb_node *p = rb_first(&sk->tcp_rtx_queue); tcp_sk(sk)->highest_sack = NULL; while (p) { struct sk_buff *skb = rb_to_skb(p); p = rb_next(p); /* Since we are deleting whole queue, no need to * list_del(&skb->tcp_tsorted_anchor) */ tcp_rtx_queue_unlink(skb, sk); tcp_wmem_free_skb(sk, skb); } } void tcp_write_queue_purge(struct sock *sk) { struct sk_buff *skb; tcp_chrono_stop(sk, TCP_CHRONO_BUSY); while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { tcp_skb_tsorted_anchor_cleanup(skb); tcp_wmem_free_skb(sk, skb); } tcp_rtx_queue_purge(sk); INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue); tcp_clear_all_retrans_hints(tcp_sk(sk)); tcp_sk(sk)->packets_out = 0; inet_csk(sk)->icsk_backoff = 0; } int tcp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int old_state = sk->sk_state; u32 seq; if (old_state != TCP_CLOSE) tcp_set_state(sk, TCP_CLOSE); /* ABORT function of RFC793 */ if (old_state == TCP_LISTEN) { inet_csk_listen_stop(sk); } else if (unlikely(tp->repair)) { WRITE_ONCE(sk->sk_err, ECONNABORTED); } else if (tcp_need_reset(old_state) || (tp->snd_nxt != tp->write_seq && (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { /* The last check adjusts for discrepancy of Linux wrt. 
RFC * states */ tcp_send_active_reset(sk, gfp_any()); WRITE_ONCE(sk->sk_err, ECONNRESET); } else if (old_state == TCP_SYN_SENT) WRITE_ONCE(sk->sk_err, ECONNRESET); tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); WRITE_ONCE(tp->urg_data, 0); tcp_write_queue_purge(sk); tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; inet_bhash2_reset_saddr(sk); WRITE_ONCE(sk->sk_shutdown, 0); sock_reset_flag(sk, SOCK_DONE); tp->srtt_us = 0; tp->mdev_us = jiffies_to_usecs(TCP_TIMEOUT_INIT); tp->rcv_rtt_last_tsecr = 0; seq = tp->write_seq + tp->max_window + 2; if (!seq) seq = 1; WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; tp->is_cwnd_limited = 0; tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; if (icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); icsk->icsk_ca_initialized = 0; tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; tcp_clear_retrans(tp); tp->total_retrans = 0; inet_csk_delack_init(sk); /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 * issue in __tcp_select_window() */ icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); dst_release(xchg((__force struct dst_entry **)&sk->sk_rx_dst, NULL)); tcp_saved_syn_free(tp); tp->compressed_ack = 0; tp->segs_in = 0; tp->segs_out = 0; tp->bytes_sent = 0; tp->bytes_acked = 0; tp->bytes_received = 0; tp->bytes_retrans = 0; tp->data_segs_in = 0; tp->data_segs_out = 0; tp->duplicate_sack[0].start_seq = 0; tp->duplicate_sack[0].end_seq = 0; tp->dsack_dups = 0; tp->reord_seen = 0; tp->retrans_out = 0; tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. 
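* (app_limited is reset to ~0U so rate samples on the next connection are
* flagged application-limited until the first ACK delivering data clears it.)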
*/ tp->app_limited = ~0U; tp->rate_app_limited = 1; tp->rack.mstamp = 0; tp->rack.advanced = 0; tp->rack.reo_wnd_steps = 1; tp->rack.last_delivered = 0; tp->rack.reo_wnd_persist = 0; tp->rack.dsack_seen = 0; tp->syn_data_acked = 0; tp->rx_opt.saw_tstamp = 0; tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; /* Clean up fastopen related fields */ tcp_free_fastopen_req(tp); inet_clear_bit(DEFER_CONNECT, sk); tp->fastopen_client_fail = 0; WARN_ON(inet->inet_num && !icsk->icsk_bind_hash); if (sk->sk_frag.page) { put_page(sk->sk_frag.page); sk->sk_frag.page = NULL; sk->sk_frag.offset = 0; } sk_error_report(sk); return 0; } EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } static int tcp_repair_set_window(struct tcp_sock *tp, sockptr_t optbuf, int len) { struct tcp_repair_window opt; if (!tp->repair) return -EPERM; if (len != sizeof(opt)) return -EINVAL; if (copy_from_sockptr(&opt, optbuf, sizeof(opt))) return -EFAULT; if (opt.max_window < opt.snd_wnd) return -EINVAL; if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd)) return -EINVAL; if (after(opt.rcv_wup, tp->rcv_nxt)) return -EINVAL; tp->snd_wl1 = opt.snd_wl1; tp->snd_wnd = opt.snd_wnd; tp->max_window = opt.max_window; tp->rcv_wnd = opt.rcv_wnd; tp->rcv_wup = opt.rcv_wup; return 0; } static int tcp_repair_options_est(struct sock *sk, sockptr_t optbuf, unsigned int len) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_repair_opt opt; size_t offset = 0; while (len >= sizeof(opt)) { if (copy_from_sockptr_offset(&opt, optbuf, offset, sizeof(opt))) return -EFAULT; offset += sizeof(opt); len -= sizeof(opt); switch (opt.opt_code) { case TCPOPT_MSS: tp->rx_opt.mss_clamp = opt.opt_val; tcp_mtup_init(sk); break; case TCPOPT_WINDOW: { u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; tp->rx_opt.rcv_wscale = rcv_wscale; tp->rx_opt.wscale_ok = 1; } break; case TCPOPT_SACK_PERM: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.sack_ok |= TCP_SACK_SEEN; break; case TCPOPT_TIMESTAMP: if (opt.opt_val != 0) return -EINVAL; tp->rx_opt.tstamp_ok = 1; break; } } return 0; } DEFINE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); EXPORT_SYMBOL(tcp_tx_delay_enabled); static void tcp_enable_tx_delay(void) { if (!static_branch_unlikely(&tcp_tx_delay_enabled)) { static int __tcp_tx_delay_enabled = 0; if (cmpxchg(&__tcp_tx_delay_enabled, 0, 1) == 0) { static_branch_enable(&tcp_tx_delay_enabled); pr_info("TCP_TX_DELAY enabled\n"); } } } /* When set indicates to always queue non-full frames. Later the user clears * this option and we transmit any pending partial frames in the queue. This is * meant to be used alongside sendfile() to get properly filled frames when the * user (for example) must write out headers with a write() call first and then * use sendfile to send out the data parts. * * TCP_CORK can be set together with TCP_NODELAY and it is stronger than * TCP_NODELAY. 
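* A typical, purely illustrative userspace sequence, assuming a connected
* socket fd and a payload file file_fd (names are examples only):
*
*   setsockopt(fd, IPPROTO_TCP, TCP_CORK, &(int){1}, sizeof(int));
*   write(fd, hdr, hdr_len);            // headers are queued, not pushed
*   sendfile(fd, file_fd, NULL, count); // payload fills the same segments
*   setsockopt(fd, IPPROTO_TCP, TCP_CORK, &(int){0}, sizeof(int)); // flush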
 */
void __tcp_sock_set_cork(struct sock *sk, bool on)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (on) {
		tp->nonagle |= TCP_NAGLE_CORK;
	} else {
		tp->nonagle &= ~TCP_NAGLE_CORK;
		if (tp->nonagle & TCP_NAGLE_OFF)
			tp->nonagle |= TCP_NAGLE_PUSH;
		tcp_push_pending_frames(sk);
	}
}

void tcp_sock_set_cork(struct sock *sk, bool on)
{
	lock_sock(sk);
	__tcp_sock_set_cork(sk, on);
	release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_cork);

/* TCP_NODELAY is weaker than TCP_CORK, so that this option on corked socket is
 * remembered, but it is not activated until cork is cleared.
 *
 * However, when TCP_NODELAY is set we make an explicit push, which overrides
 * even TCP_CORK for currently queued segments.
 */
void __tcp_sock_set_nodelay(struct sock *sk, bool on)
{
	if (on) {
		tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;
		tcp_push_pending_frames(sk);
	} else {
		tcp_sk(sk)->nonagle &= ~TCP_NAGLE_OFF;
	}
}

void tcp_sock_set_nodelay(struct sock *sk)
{
	lock_sock(sk);
	__tcp_sock_set_nodelay(sk, true);
	release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_nodelay);

static void __tcp_sock_set_quickack(struct sock *sk, int val)
{
	if (!val) {
		inet_csk_enter_pingpong_mode(sk);
		return;
	}

	inet_csk_exit_pingpong_mode(sk);
	if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) &&
	    inet_csk_ack_scheduled(sk)) {
		inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_PUSHED;
		tcp_cleanup_rbuf(sk, 1);
		if (!(val & 1))
			inet_csk_enter_pingpong_mode(sk);
	}
}

void tcp_sock_set_quickack(struct sock *sk, int val)
{
	lock_sock(sk);
	__tcp_sock_set_quickack(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(tcp_sock_set_quickack);

int tcp_sock_set_syncnt(struct sock *sk, int val)
{
	if (val < 1 || val > MAX_TCP_SYNCNT)
		return -EINVAL;

	WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
	return 0;
}
EXPORT_SYMBOL(tcp_sock_set_syncnt);

int tcp_sock_set_user_timeout(struct sock *sk, int val)
{
	/* Cap the max time in ms TCP will retry or probe the window
	 * before giving up and aborting (ETIMEDOUT) a connection.
*/ if (val < 0) return -EINVAL; WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_user_timeout); int tcp_sock_set_keepidle_locked(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; /* Paired with WRITE_ONCE() in keepalive_time_when() */ WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); if (tp->keepalive_time > elapsed) elapsed = tp->keepalive_time - elapsed; else elapsed = 0; inet_csk_reset_keepalive_timer(sk, elapsed); } return 0; } int tcp_sock_set_keepidle(struct sock *sk, int val) { int err; lock_sock(sk); err = tcp_sock_set_keepidle_locked(sk, val); release_sock(sk); return err; } EXPORT_SYMBOL(tcp_sock_set_keepidle); int tcp_sock_set_keepintvl(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPINTVL) return -EINVAL; WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepintvl); int tcp_sock_set_keepcnt(struct sock *sk, int val) { if (val < 1 || val > MAX_TCP_KEEPCNT) return -EINVAL; /* Paired with READ_ONCE() in keepalive_probes() */ WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); return 0; } EXPORT_SYMBOL(tcp_sock_set_keepcnt); int tcp_set_window_clamp(struct sock *sk, int val) { struct tcp_sock *tp = tcp_sk(sk); if (!val) { if (sk->sk_state != TCP_CLOSE) return -EINVAL; tp->window_clamp = 0; } else { u32 new_rcv_ssthresh, old_window_clamp = tp->window_clamp; u32 new_window_clamp = val < SOCK_MIN_RCVBUF / 2 ? SOCK_MIN_RCVBUF / 2 : val; if (new_window_clamp == old_window_clamp) return 0; tp->window_clamp = new_window_clamp; if (new_window_clamp < old_window_clamp) { /* need to apply the reserved mem provisioning only * when shrinking the window clamp */ __tcp_adjust_rcv_ssthresh(sk, tp->window_clamp); } else { new_rcv_ssthresh = min(tp->rcv_wnd, tp->window_clamp); tp->rcv_ssthresh = max(new_rcv_ssthresh, tp->rcv_ssthresh); } } return 0; } /* * Socket option code for TCP. */ int do_tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); struct net *net = sock_net(sk); int val; int err = 0; /* These are data/string values, all the others are ints */ switch (optname) { case TCP_CONGESTION: { char name[TCP_CA_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_CA_NAME_MAX-1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); sockopt_release_sock(sk); return err; } case TCP_ULP: { char name[TCP_ULP_NAME_MAX]; if (optlen < 1) return -EINVAL; val = strncpy_from_sockptr(name, optval, min_t(long, TCP_ULP_NAME_MAX - 1, optlen)); if (val < 0) return -EFAULT; name[val] = 0; sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { __u8 key[TCP_FASTOPEN_KEY_BUF_LENGTH]; __u8 *backup_key = NULL; /* Allow a backup key as well to facilitate key rotation * First key is the active one. 
*/ if (optlen != TCP_FASTOPEN_KEY_LENGTH && optlen != TCP_FASTOPEN_KEY_BUF_LENGTH) return -EINVAL; if (copy_from_sockptr(key, optval, optlen)) return -EFAULT; if (optlen == TCP_FASTOPEN_KEY_BUF_LENGTH) backup_key = key + TCP_FASTOPEN_KEY_LENGTH; return tcp_fastopen_reset_cipher(net, sk, key, backup_key); } default: /* fallthru */ break; } if (optlen < sizeof(int)) return -EINVAL; if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; /* Handle options that can be set without locking the socket. */ switch (optname) { case TCP_SYNCNT: return tcp_sock_set_syncnt(sk, val); case TCP_USER_TIMEOUT: return tcp_sock_set_user_timeout(sk, val); case TCP_KEEPINTVL: return tcp_sock_set_keepintvl(sk, val); case TCP_KEEPCNT: return tcp_sock_set_keepcnt(sk, val); case TCP_LINGER2: if (val < 0) WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else WRITE_ONCE(tp->linger2, val * HZ); return 0; case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ)); return 0; } sockopt_lock_sock(sk); switch (optname) { case TCP_MAXSEG: /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; } tp->rx_opt.user_mss = val; break; case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; case TCP_THIN_LINEAR_TIMEOUTS: if (val < 0 || val > 1) err = -EINVAL; else tp->thin_lto = val; break; case TCP_THIN_DUPACK: if (val < 0 || val > 1) err = -EINVAL; break; case TCP_REPAIR: if (!tcp_can_repair_sock(sk)) err = -EPERM; else if (val == TCP_REPAIR_ON) { tp->repair = 1; sk->sk_reuse = SK_FORCE_REUSE; tp->repair_queue = TCP_NO_QUEUE; } else if (val == TCP_REPAIR_OFF) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; tcp_send_window_probe(sk); } else if (val == TCP_REPAIR_OFF_NO_WP) { tp->repair = 0; sk->sk_reuse = SK_NO_REUSE; } else err = -EINVAL; break; case TCP_REPAIR_QUEUE: if (!tp->repair) err = -EPERM; else if ((unsigned int)val < TCP_QUEUES_NR) tp->repair_queue = val; else err = -EINVAL; break; case TCP_QUEUE_SEQ: if (sk->sk_state != TCP_CLOSE) { err = -EPERM; } else if (tp->repair_queue == TCP_SEND_QUEUE) { if (!tcp_rtx_queue_empty(sk)) err = -EPERM; else WRITE_ONCE(tp->write_seq, val); } else if (tp->repair_queue == TCP_RECV_QUEUE) { if (tp->rcv_nxt != tp->copied_seq) { err = -EPERM; } else { WRITE_ONCE(tp->rcv_nxt, val); WRITE_ONCE(tp->copied_seq, val); } } else { err = -EINVAL; } break; case TCP_REPAIR_OPTIONS: if (!tp->repair) err = -EINVAL; else if (sk->sk_state == TCP_ESTABLISHED && !tp->bytes_sent) err = tcp_repair_options_est(sk, optval, optlen); else err = -EPERM; break; case TCP_CORK: __tcp_sock_set_cork(sk, val); break; case TCP_KEEPIDLE: err = tcp_sock_set_keepidle_locked(sk, val); break; case TCP_SAVE_SYN: /* 0: disable, 1: enable, 2: start from ether_header */ if (val < 0 || val > 2) err = -EINVAL; else tp->save_syn = val; break; case TCP_WINDOW_CLAMP: err = tcp_set_window_clamp(sk, val); break; case TCP_QUICKACK: __tcp_sock_set_quickack(sk, val); break; case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) { err = -EPERM; break; } err = tcp_ao_set_repair(sk, optval, optlen); break; #ifdef CONFIG_TCP_AO case TCP_AO_ADD_KEY: case TCP_AO_DEL_KEY: case TCP_AO_INFO: { /* If this is the first TCP-AO setsockopt() on the 
socket, * sk_state has to be LISTEN or CLOSE. Allow TCP_REPAIR * in any state. */ if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) goto ao_parse; if (rcu_dereference_protected(tcp_sk(sk)->ao_info, lockdep_sock_is_held(sk))) goto ao_parse; if (tp->repair) goto ao_parse; err = -EISCONN; break; ao_parse: err = tp->af_specific->ao_parse(sk, optname, optval, optlen); break; } #endif #ifdef CONFIG_TCP_MD5SIG case TCP_MD5SIG: case TCP_MD5SIG_EXT: err = tp->af_specific->md5_parse(sk, optname, optval, optlen); break; #endif case TCP_FASTOPEN: if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { tcp_fastopen_init_key_once(net); fastopen_queue_tune(sk, val); } else { err = -EINVAL; } break; case TCP_FASTOPEN_CONNECT: if (val > 1 || val < 0) { err = -EINVAL; } else if (READ_ONCE(net->ipv4.sysctl_tcp_fastopen) & TFO_CLIENT_ENABLE) { if (sk->sk_state == TCP_CLOSE) tp->fastopen_connect = val; else err = -EINVAL; } else { err = -EOPNOTSUPP; } break; case TCP_FASTOPEN_NO_COOKIE: if (val > 1 || val < 0) err = -EINVAL; else if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) err = -EINVAL; else tp->fastopen_no_cookie = val; break; case TCP_TIMESTAMP: if (!tp->repair) { err = -EPERM; break; } /* val is an opaque field, * and low order bit contains usec_ts enable bit. * Its a best effort, and we do not care if user makes an error. */ tp->tcp_usec_ts = val & 1; WRITE_ONCE(tp->tsoffset, val - tcp_clock_ts(tp->tcp_usec_ts)); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: if (val > 1 || val < 0) err = -EINVAL; else tp->recvmsg_inq = val; break; case TCP_TX_DELAY: if (val) tcp_enable_tx_delay(); WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; break; } sockopt_release_sock(sk); return err; } int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); static void tcp_get_info_chrono_stats(const struct tcp_sock *tp, struct tcp_info *info) { u64 stats[__TCP_CHRONO_MAX], total = 0; enum tcp_chrono i; for (i = TCP_CHRONO_BUSY; i < __TCP_CHRONO_MAX; ++i) { stats[i] = tp->chrono_stat[i - 1]; if (i == tp->chrono_type) stats[i] += tcp_jiffies32 - tp->chrono_start; stats[i] *= USEC_PER_SEC / HZ; total += stats[i]; } info->tcpi_busy_time = total; info->tcpi_rwnd_limited = stats[TCP_CHRONO_RWND_LIMITED]; info->tcpi_sndbuf_limited = stats[TCP_CHRONO_SNDBUF_LIMITED]; } /* Return information about state of tcp endpoint in API format. */ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); unsigned long rate; u32 now; u64 rate64; bool slow; memset(info, 0, sizeof(*info)); if (sk->sk_type != SOCK_STREAM) return; info->tcpi_state = inet_sk_state_load(sk); /* Report meaningful fields for all TCP states, including listeners */ rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; info->tcpi_pacing_rate = rate64; rate = READ_ONCE(sk->sk_max_pacing_rate); rate64 = (rate != ~0UL) ? 
rate : ~0ULL; info->tcpi_max_pacing_rate = rate64; info->tcpi_reordering = tp->reordering; info->tcpi_snd_cwnd = tcp_snd_cwnd(tp); if (info->tcpi_state == TCP_LISTEN) { /* listeners aliased fields : * tcpi_unacked -> Number of children ready for accept() * tcpi_sacked -> max backlog */ info->tcpi_unacked = READ_ONCE(sk->sk_ack_backlog); info->tcpi_sacked = READ_ONCE(sk->sk_max_ack_backlog); return; } slow = lock_sock_fast(sk); info->tcpi_ca_state = icsk->icsk_ca_state; info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; } if (tp->ecn_flags & TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; if (tp->syn_data_acked) info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) info->tcpi_options |= TCPI_OPT_USEC_TS; info->tcpi_rto = jiffies_to_usecs(icsk->icsk_rto); info->tcpi_ato = jiffies_to_usecs(min_t(u32, icsk->icsk_ack.ato, tcp_delack_max(sk))); info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; info->tcpi_unacked = tp->packets_out; info->tcpi_sacked = tp->sacked_out; info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; now = tcp_jiffies32; info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = tp->srtt_us >> 3; info->tcpi_rttvar = tp->mdev_us >> 2; info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; info->tcpi_bytes_acked = tp->bytes_acked; info->tcpi_bytes_received = tp->bytes_received; info->tcpi_notsent_bytes = max_t(int, 0, tp->write_seq - tp->snd_nxt); tcp_get_info_chrono_stats(tp, info); info->tcpi_segs_out = tp->segs_out; /* segs_in and data_segs_in can be updated from tcp_segs_in() from BH */ info->tcpi_segs_in = READ_ONCE(tp->segs_in); info->tcpi_data_segs_in = READ_ONCE(tp->data_segs_in); info->tcpi_min_rtt = tcp_min_rtt(tp); info->tcpi_data_segs_out = tp->data_segs_out; info->tcpi_delivery_rate_app_limited = tp->rate_app_limited ? 
1 : 0; rate64 = tcp_compute_delivery_rate(tp); if (rate64) info->tcpi_delivery_rate = rate64; info->tcpi_delivered = tp->delivered; info->tcpi_delivered_ce = tp->delivered_ce; info->tcpi_bytes_sent = tp->bytes_sent; info->tcpi_bytes_retrans = tp->bytes_retrans; info->tcpi_dsack_dups = tp->dsack_dups; info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; info->tcpi_rcv_wnd = tp->rcv_wnd; info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; info->tcpi_total_rto = tp->total_rto; info->tcpi_total_rto_recoveries = tp->total_rto_recoveries; info->tcpi_total_rto_time = tp->total_rto_time; if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); static size_t tcp_opt_stats_get_size(void) { return nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BUSY */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_RWND_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_SNDBUF_LIMITED */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DATA_SEGS_OUT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_TOTAL_RETRANS */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_PACING_RATE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_DELIVERY_RATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_CWND */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORDERING */ nla_total_size(sizeof(u32)) + /* TCP_NLA_MIN_RTT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_RECUR_RETRANS */ nla_total_size(sizeof(u8)) + /* TCP_NLA_DELIVERY_RATE_APP_LMT */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SNDQ_SIZE */ nla_total_size(sizeof(u8)) + /* TCP_NLA_CA_STATE */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SND_SSTHRESH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DELIVERED_CE */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_SENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_BYTES_RETRANS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_DSACK_DUPS */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REORD_SEEN */ nla_total_size(sizeof(u32)) + /* TCP_NLA_SRTT */ nla_total_size(sizeof(u16)) + /* TCP_NLA_TIMEOUT_REHASH */ nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } /* Returns TTL or hop limit of an incoming packet from skb. 
*/ static u8 tcp_skb_ttl_or_hop_limit(const struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return ip_hdr(skb)->ttl; else if (skb->protocol == htons(ETH_P_IPV6)) return ipv6_hdr(skb)->hop_limit; else return 0; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb, const struct sk_buff *ack_skb) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; unsigned long rate; u64 rate64; stats = alloc_skb(tcp_opt_stats_get_size(), GFP_ATOMIC); if (!stats) return NULL; tcp_get_info_chrono_stats(tp, &info); nla_put_u64_64bit(stats, TCP_NLA_BUSY, info.tcpi_busy_time, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_RWND_LIMITED, info.tcpi_rwnd_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_SNDBUF_LIMITED, info.tcpi_sndbuf_limited, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_DATA_SEGS_OUT, tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); rate = READ_ONCE(sk->sk_pacing_rate); rate64 = (rate != ~0UL) ? rate : ~0ULL; nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); rate64 = tcp_compute_delivery_rate(tp); nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_SND_CWND, tcp_snd_cwnd(tp)); nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); nla_put_u32(stats, TCP_NLA_DELIVERED_CE, tp->delivered_ce); nla_put_u32(stats, TCP_NLA_SNDQ_SIZE, tp->write_seq - tp->snd_una); nla_put_u8(stats, TCP_NLA_CA_STATE, inet_csk(sk)->icsk_ca_state); nla_put_u64_64bit(stats, TCP_NLA_BYTES_SENT, tp->bytes_sent, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_BYTES_RETRANS, tp->bytes_retrans, TCP_NLA_PAD); nla_put_u32(stats, TCP_NLA_DSACK_DUPS, tp->dsack_dups); nla_put_u32(stats, TCP_NLA_REORD_SEEN, tp->reord_seen); nla_put_u32(stats, TCP_NLA_SRTT, tp->srtt_us >> 3); nla_put_u16(stats, TCP_NLA_TIMEOUT_REHASH, tp->timeout_rehash); nla_put_u32(stats, TCP_NLA_BYTES_NOTSENT, max_t(int, 0, tp->write_seq - tp->snd_nxt)); nla_put_u64_64bit(stats, TCP_NLA_EDT, orig_skb->skb_mstamp_ns, TCP_NLA_PAD); if (ack_skb) nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } int do_tcp_getsockopt(struct sock *sk, int level, int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; len = min_t(unsigned int, len, sizeof(int)); switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; if (tp->rx_opt.user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) val = tp->rx_opt.user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; case TCP_NODELAY: val = !!(tp->nonagle&TCP_NAGLE_OFF); break; case TCP_CORK: val = !!(tp->nonagle&TCP_NAGLE_CORK); break; case TCP_KEEPIDLE: val = keepalive_time_when(tp) / HZ; break; case TCP_KEEPINTVL: val = keepalive_intvl_when(tp) / HZ; break; case TCP_KEEPCNT: val = keepalive_probes(tp); break; case TCP_SYNCNT: val = READ_ONCE(icsk->icsk_syn_retries) ? 
: READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; break; case TCP_INFO: { struct tcp_info info; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_CC_INFO: { const struct tcp_congestion_ops *ca_ops; union tcp_cc_info info; size_t sz = 0; int attr; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; if (ca_ops && ca_ops->get_info) sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } case TCP_QUICKACK: val = !inet_csk_in_pingpong_mode(sk); break; case TCP_CONGESTION: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; case TCP_FASTOPEN_KEY: { u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; case TCP_THIN_DUPACK: val = 0; break; case TCP_REPAIR: val = tp->repair; break; case TCP_REPAIR_QUEUE: if (tp->repair) val = tp->repair_queue; else return -EINVAL; break; case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) return -EINVAL; if (!tp->repair) return -EPERM; opt.snd_wl1 = tp->snd_wl1; opt.snd_wnd = tp->snd_wnd; opt.max_window = tp->max_window; opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } case TCP_QUEUE_SEQ: if (tp->repair_queue == TCP_SEND_QUEUE) val = tp->write_seq; else if (tp->repair_queue == TCP_RECV_QUEUE) val = tp->rcv_nxt; else return -EINVAL; break; case TCP_USER_TIMEOUT: val = READ_ONCE(icsk->icsk_user_timeout); break; case TCP_FASTOPEN: val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: val = tp->fastopen_connect; break; case TCP_FASTOPEN_NO_COOKIE: val = tp->fastopen_no_cookie; break; case TCP_TX_DELAY: val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: val = tcp_clock_ts(tp->tcp_usec_ts) + READ_ONCE(tp->tsoffset); 
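		/* Mirror the TCP_TIMESTAMP handling in do_tcp_setsockopt():
		 * the low-order bit of the reported value carries the
		 * usec-timestamps enable flag.
		 */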
if (tp->tcp_usec_ts) val |= 1; else val &= ~1; break; case TCP_NOTSENT_LOWAT: val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; break; case TCP_SAVE_SYN: val = tp->save_syn; break; case TCP_SAVED_SYN: { if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); if (copy_to_sockptr(optlen, &len, sizeof(int))) { sockopt_release_sock(sk); return -EFAULT; } if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); sockopt_release_sock(sk); } else { sockopt_release_sock(sk); len = 0; if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; } #ifdef CONFIG_MMU case TCP_ZEROCOPY_RECEIVE: { struct scm_timestamping_internal tss; struct tcp_zerocopy_receive zc = {}; int err; if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { err = check_zeroed_sockptr(optval, sizeof(zc), len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { case offsetofend(struct tcp_zerocopy_receive, msg_flags): goto zerocopy_rcv_cmsg; case offsetofend(struct tcp_zerocopy_receive, msg_controllen): case offsetofend(struct tcp_zerocopy_receive, msg_control): case offsetofend(struct tcp_zerocopy_receive, flags): case offsetofend(struct tcp_zerocopy_receive, copybuf_len): case offsetofend(struct tcp_zerocopy_receive, copybuf_address): case offsetofend(struct tcp_zerocopy_receive, err): goto zerocopy_rcv_sk_err; case offsetofend(struct tcp_zerocopy_receive, inq): goto zerocopy_rcv_inq; case offsetofend(struct tcp_zerocopy_receive, length): default: goto zerocopy_rcv_out; } zerocopy_rcv_cmsg: if (zc.msg_flags & TCP_CMSG_TS) tcp_zc_finalize_rx_tstamp(sk, &zc, &tss); else zc.msg_flags = 0; zerocopy_rcv_sk_err: if (!err) zc.err = sock_error(sk); zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } #endif case TCP_AO_REPAIR: if (!tcp_can_repair_sock(sk)) return -EPERM; return tcp_ao_get_repair(sk, optval, optlen); case TCP_AO_GET_KEYS: case TCP_AO_INFO: { int err; sockopt_lock_sock(sk); if (optname == TCP_AO_GET_KEYS) err = tcp_ao_get_mkts(sk, optval, optlen); else err = tcp_ao_get_sock_info(sk, optval, optlen); sockopt_release_sock(sk); return err; } default: return -ENOPROTOOPT; } if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } bool tcp_bpf_bypass_getsockopt(int level, int optname) { /* TCP do_tcp_getsockopt has optimized getsockopt implementation * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. 
*/ if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) return true; return false; } EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, optval, optlen); return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); } EXPORT_SYMBOL(tcp_getsockopt); #ifdef CONFIG_TCP_MD5SIG int tcp_md5_sigpool_id = -1; EXPORT_SYMBOL_GPL(tcp_md5_sigpool_id); int tcp_md5_alloc_sigpool(void) { size_t scratch_size; int ret; scratch_size = sizeof(union tcp_md5sum_block) + sizeof(struct tcphdr); ret = tcp_sigpool_alloc_ahash("md5", scratch_size); if (ret >= 0) { /* As long as any md5 sigpool was allocated, the return * id would stay the same. Re-write the id only for the case * when previously all MD5 keys were deleted and this call * allocates the first MD5 key, which may return a different * sigpool id than was used previously. */ WRITE_ONCE(tcp_md5_sigpool_id, ret); /* Avoids the compiler potentially being smart here */ return 0; } return ret; } void tcp_md5_release_sigpool(void) { tcp_sigpool_release(READ_ONCE(tcp_md5_sigpool_id)); } void tcp_md5_add_sigpool(void) { tcp_sigpool_get(READ_ONCE(tcp_md5_sigpool_id)); } int tcp_md5_hash_key(struct tcp_sigpool *hp, const struct tcp_md5sig_key *key) { u8 keylen = READ_ONCE(key->keylen); /* paired with WRITE_ONCE() in tcp_md5_do_add */ struct scatterlist sg; sg_init_one(&sg, key->key, keylen); ahash_request_set_crypt(hp->req, &sg, NULL, keylen); /* We use data_race() because tcp_md5_do_add() might change * key->key under us */ return data_race(crypto_ahash_update(hp->req)); } EXPORT_SYMBOL(tcp_md5_hash_key); /* Called with rcu_read_lock() */ enum skb_drop_reason tcp_inbound_md5_hash(const struct sock *sk, const struct sk_buff *skb, const void *saddr, const void *daddr, int family, int l3index, const __u8 *hash_location) { /* This gets called for each TCP segment that has TCP-MD5 option. * We have 3 drop cases: * o No MD5 hash and one expected. * o MD5 hash and we're not expecting one. * o MD5 hash and its wrong. */ const struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_key *key; u8 newhash[16]; int genhash; key = tcp_md5_do_lookup(sk, l3index, saddr, family); if (!key && hash_location) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED); tcp_hash_fail("Unexpected MD5 Hash found", family, skb, ""); return SKB_DROP_REASON_TCP_MD5UNEXPECTED; } /* Check the signature. * To support dual stack listeners, we need to handle * IPv4-mapped case. */ if (family == AF_INET) genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb); else genhash = tp->af_specific->calc_md5_hash(newhash, key, NULL, skb); if (genhash || memcmp(hash_location, newhash, 16) != 0) { NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5FAILURE); if (family == AF_INET) { tcp_hash_fail("MD5 Hash failed", AF_INET, skb, "%s L3 index %d", genhash ? 
"tcp_v4_calc_md5_hash failed" : "", l3index); } else { if (genhash) { tcp_hash_fail("MD5 Hash failed", AF_INET6, skb, "L3 index %d", l3index); } else { tcp_hash_fail("MD5 Hash mismatch", AF_INET6, skb, "L3 index %d", l3index); } } return SKB_DROP_REASON_TCP_MD5FAILURE; } return SKB_NOT_DROPPED_YET; } EXPORT_SYMBOL(tcp_inbound_md5_hash); #endif void tcp_done(struct sock *sk) { struct request_sock *req; /* We might be called with a new socket, after * inet_csk_prepare_forced_close() has been called * so we can not use lockdep_sock_is_held(sk) */ req = rcu_dereference_protected(tcp_sk(sk)->fastopen_rsk, 1); if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); if (req) reqsk_fastopen_remove(sk, req, false); WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_state_change(sk); else inet_csk_destroy_sock(sk); } EXPORT_SYMBOL_GPL(tcp_done); int tcp_abort(struct sock *sk, int err) { int state = inet_sk_state_load(sk); if (state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); local_bh_disable(); inet_csk_reqsk_queue_drop(req->rsk_listener, req); local_bh_enable(); return 0; } if (state == TCP_TIME_WAIT) { struct inet_timewait_sock *tw = inet_twsk(sk); refcount_inc(&tw->tw_refcnt); local_bh_disable(); inet_twsk_deschedule_put(tw); local_bh_enable(); return 0; } /* BPF context ensures sock locking. */ if (!has_current_bpf_ctx()) /* Don't race with userspace socket closes such as tcp_close. */ lock_sock(sk); if (sk->sk_state == TCP_LISTEN) { tcp_set_state(sk, TCP_CLOSE); inet_csk_listen_stop(sk); } /* Don't race with BH socket closes such as inet_csk_listen_stop. */ local_bh_disable(); bh_lock_sock(sk); if (!sock_flag(sk, SOCK_DEAD)) { WRITE_ONCE(sk->sk_err, err); /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); sk_error_report(sk); if (tcp_need_reset(sk->sk_state)) tcp_send_active_reset(sk, GFP_ATOMIC); tcp_done(sk); } bh_unlock_sock(sk); local_bh_enable(); tcp_write_queue_purge(sk); if (!has_current_bpf_ctx()) release_sock(sk); return 0; } EXPORT_SYMBOL_GPL(tcp_abort); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; static int __init set_thash_entries(char *str) { ssize_t ret; if (!str) return 0; ret = kstrtoul(str, 0, &thash_entries); if (ret) return 0; return 1; } __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { unsigned long limit = nr_free_buffer_pages() / 16; limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ sysctl_tcp_mem[1] = limit; /* 6.25 % */ sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } static void __init tcp_struct_check(void) { /* TX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, max_window); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, rcv_ssthresh); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, reordering); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, lost_skb_hint); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 40); /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, 
tcp_sock_read_txrx, tsoffset); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, mss_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, snd_cwnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, prr_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, retrans_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, advmss); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, urg_data); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, lost); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, data_segs_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, bytes_sent); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, snd_sml); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_start); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, chrono_stat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, write_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, pushed_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_clock_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 105); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_nxt); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_una); CACHELINE_ASSERT_GROUP_MEMBER(struct 
tcp_sock, tcp_sock_write_txrx, window_clamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, srtt_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 76); /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, data_segs_in); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_wup); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, max_packets_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, cwnd_usage_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); } void __init tcp_init(void) { int max_rshare, max_wshare, cnt; unsigned long limit; unsigned int i; BUILD_BUG_ON(TCP_MIN_SND_MSS <= MAX_TCP_OPTION_SPACE); BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof_field(struct sk_buff, cb)); tcp_struct_check(); percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE); mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD); inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash", thash_entries, 21, /* one slot per 2 MB*/ 0, 64 * 1024); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); tcp_hashinfo.bind2_bucket_cachep = kmem_cache_create("tcp_bind2_bucket", sizeof(struct inet_bind2_bucket), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); /* Size and allocate the main established and bind bucket * hash tables. * * The methodology is similar to that of the buffer cache. */ tcp_hashinfo.ehash = alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, 17, /* one slot per 128 KB of memory */ 0, NULL, &tcp_hashinfo.ehash_mask, 0, thash_entries ? 
					0 : 512 * 1024);
	for (i = 0; i <= tcp_hashinfo.ehash_mask; i++)
		INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i);

	if (inet_ehash_locks_alloc(&tcp_hashinfo))
		panic("TCP: failed to alloc ehash_locks");
	tcp_hashinfo.bhash =
		alloc_large_system_hash("TCP bind",
					2 * sizeof(struct inet_bind_hashbucket),
					tcp_hashinfo.ehash_mask + 1,
					17, /* one slot per 128 KB of memory */
					0,
					&tcp_hashinfo.bhash_size,
					NULL,
					0,
					64 * 1024);
	tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size;
	tcp_hashinfo.bhash2 = tcp_hashinfo.bhash + tcp_hashinfo.bhash_size;
	for (i = 0; i < tcp_hashinfo.bhash_size; i++) {
		spin_lock_init(&tcp_hashinfo.bhash[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain);
		spin_lock_init(&tcp_hashinfo.bhash2[i].lock);
		INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain);
	}

	tcp_hashinfo.pernet = false;

	cnt = tcp_hashinfo.ehash_mask + 1;
	sysctl_tcp_max_orphans = cnt / 2;

	tcp_init_mem();
	/* Set per-socket limits to no more than 1/128 the pressure threshold */
	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
	max_wshare = min(4UL*1024*1024, limit);
	max_rshare = min(6UL*1024*1024, limit);

	init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE;
	init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
	init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);

	init_net.ipv4.sysctl_tcp_rmem[0] = PAGE_SIZE;
	init_net.ipv4.sysctl_tcp_rmem[1] = 131072;
	init_net.ipv4.sysctl_tcp_rmem[2] = max(131072, max_rshare);

	pr_info("Hash tables configured (established %u bind %u)\n",
		tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);

	tcp_v4_init();
	tcp_metrics_init();
	BUG_ON(tcp_register_congestion_control(&tcp_reno) != 0);
	tcp_tasklet_init();
	mptcp_init();
}
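
/*
 * Illustrative userspace sketch (not part of this kernel file): how the
 * lockless setters above (tcp_sock_set_keepidle(), tcp_sock_set_keepintvl(),
 * tcp_sock_set_keepcnt(), tcp_sock_set_user_timeout()) and tcp_get_info()
 * are typically exercised through the setsockopt()/getsockopt() UAPI.
 * The helper names tune_tcp_socket()/dump_tcp_info() and the numeric values
 * are arbitrary examples, not kernel interfaces.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Enable keepalive probing and bound retransmissions on a connected socket. */
static int tune_tcp_socket(int fd)
{
	int on = 1;
	int idle = 60;			/* TCP_KEEPIDLE: idle seconds before first probe */
	int intvl = 10;			/* TCP_KEEPINTVL: seconds between probes */
	int cnt = 5;			/* TCP_KEEPCNT: unanswered probes allowed */
	unsigned int timeout_ms = 30000; /* TCP_USER_TIMEOUT is in milliseconds */

	if (setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
		       &timeout_ms, sizeof(timeout_ms)))
		return -1;
	return 0;
}

/* Read back a few of the fields that tcp_get_info() fills in for TCP_INFO. */
static void dump_tcp_info(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
		printf("state %u rtt %uus cwnd %u total_retrans %u\n",
		       (unsigned int)info.tcpi_state, info.tcpi_rtt,
		       info.tcpi_snd_cwnd, info.tcpi_total_retrans);
}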
// SPDX-License-Identifier: GPL-2.0
/* Generic nexthop implementation
 *
 * Copyright (c) 2017-19 Cumulus Networks
 * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com>
 */

#include <linux/nexthop.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <net/arp.h>
#include <net/ipv6_stubs.h>
#include <net/lwtunnel.h>
#include <net/ndisc.h>
#include <net/nexthop.h>
#include <net/route.h>
#include <net/sock.h>

#define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
#define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */

static void remove_nexthop(struct net *net, struct nexthop *nh,
			   struct nl_info *nlinfo);

#define NH_DEV_HASHBITS  8
#define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)

#define NHA_OP_FLAGS_DUMP_ALL (NHA_OP_FLAG_DUMP_STATS |		\
			       NHA_OP_FLAG_DUMP_HW_STATS)

static const struct nla_policy rtm_nh_policy_new[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_GROUP]		= { .type = NLA_BINARY },
	[NHA_GROUP_TYPE]	= { .type = NLA_U16 },
	[NHA_BLACKHOLE]		= { .type = NLA_FLAG },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GATEWAY]		= { .type = NLA_BINARY },
	[NHA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[NHA_ENCAP]		= { .type = NLA_NESTED },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_RES_GROUP]		= { .type = NLA_NESTED },
	[NHA_HW_STATS_ENABLE]	= NLA_POLICY_MAX(NLA_U32, true),
};

static const struct nla_policy rtm_nh_policy_get[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
						  NHA_OP_FLAGS_DUMP_ALL),
};

static const struct nla_policy rtm_nh_policy_del[] = {
	[NHA_ID]		= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump[] = {
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_GROUPS]		= { .type = NLA_FLAG },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_FDB]		= { .type = NLA_FLAG },
	[NHA_OP_FLAGS]		= NLA_POLICY_MASK(NLA_U32,
						  NHA_OP_FLAGS_DUMP_ALL),
};

static const struct nla_policy rtm_nh_res_policy_new[] = {
	[NHA_RES_GROUP_BUCKETS]			= { .type = NLA_U16 },
	[NHA_RES_GROUP_IDLE_TIMER]		= { .type = NLA_U32 },
	[NHA_RES_GROUP_UNBALANCED_TIMER]	= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_dump_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_OIF]		= { .type = NLA_U32 },
	[NHA_MASTER]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_dump[] = {
	[NHA_RES_BUCKET_NH_ID]	= { .type = NLA_U32 },
};

static const struct nla_policy rtm_nh_policy_get_bucket[] = {
	[NHA_ID]		= { .type = NLA_U32 },
	[NHA_RES_BUCKET]	= { .type = NLA_NESTED },
};

static const struct nla_policy rtm_nh_res_bucket_policy_get[] = {
	[NHA_RES_BUCKET_INDEX]	= { .type = NLA_U16 },
};

static bool nexthop_notifiers_is_empty(struct net *net)
{
	return !net->nexthop.notifier_chain.head;
}

static void
__nh_notifier_single_info_init(struct nh_notifier_single_info *nh_info,
			       const struct nh_info *nhi)
{
	nh_info->dev = nhi->fib_nhc.nhc_dev;
	nh_info->gw_family = nhi->fib_nhc.nhc_gw_family;
	if (nh_info->gw_family == AF_INET)
		nh_info->ipv4 = nhi->fib_nhc.nhc_gw.ipv4;
	else if (nh_info->gw_family == AF_INET6)
		nh_info->ipv6 = nhi->fib_nhc.nhc_gw.ipv6;

	nh_info->id = nhi->nh_parent->id;
	nh_info->is_reject = nhi->reject_nh;
	nh_info->is_fdb = nhi->fdb_nh;
	nh_info->has_encap = !!nhi->fib_nhc.nhc_lwtstate;
}

static int nh_notifier_single_info_init(struct nh_notifier_info *info,
					const struct nexthop *nh)
{
	struct nh_info *nhi = rtnl_dereference(nh->nh_info);

	info->type = NH_NOTIFIER_INFO_TYPE_SINGLE;
	info->nh = kzalloc(sizeof(*info->nh), GFP_KERNEL);
	if (!info->nh)
		return -ENOMEM;

	__nh_notifier_single_info_init(info->nh, nhi);

	return 0;
}

static void nh_notifier_single_info_fini(struct nh_notifier_info *info)
{
	kfree(info->nh);
}

static int nh_notifier_mpath_info_init(struct nh_notifier_info *info,
				       struct nh_group *nhg)
{
	u16 num_nh = nhg->num_nh;
	int i;

	info->type = NH_NOTIFIER_INFO_TYPE_GRP;
	info->nh_grp = kzalloc(struct_size(info->nh_grp, nh_entries, num_nh),
			       GFP_KERNEL);
	if (!info->nh_grp)
		return -ENOMEM;

	info->nh_grp->num_nh = num_nh;
	info->nh_grp->is_fdb = nhg->fdb_nh;
	info->nh_grp->hw_stats = nhg->hw_stats;

	for (i = 0; i < num_nh; i++) {
		struct nh_grp_entry *nhge =
&nhg->nh_entries[i]; struct nh_info *nhi; nhi = rtnl_dereference(nhge->nh->nh_info); info->nh_grp->nh_entries[i].weight = nhge->weight; __nh_notifier_single_info_init(&info->nh_grp->nh_entries[i].nh, nhi); } return 0; } static int nh_notifier_res_table_info_init(struct nh_notifier_info *info, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); u16 num_nh_buckets = res_table->num_nh_buckets; unsigned long size; u16 i; info->type = NH_NOTIFIER_INFO_TYPE_RES_TABLE; size = struct_size(info->nh_res_table, nhs, num_nh_buckets); info->nh_res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!info->nh_res_table) return -ENOMEM; info->nh_res_table->num_nh_buckets = num_nh_buckets; info->nh_res_table->hw_stats = nhg->hw_stats; for (i = 0; i < num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; struct nh_info *nhi; nhge = rtnl_dereference(bucket->nh_entry); nhi = rtnl_dereference(nhge->nh->nh_info); __nh_notifier_single_info_init(&info->nh_res_table->nhs[i], nhi); } return 0; } static int nh_notifier_grp_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) return nh_notifier_mpath_info_init(info, nhg); else if (nhg->resilient) return nh_notifier_res_table_info_init(info, nhg); return -EINVAL; } static void nh_notifier_grp_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->hash_threshold) kfree(info->nh_grp); else if (nhg->resilient) vfree(info->nh_res_table); } static int nh_notifier_info_init(struct nh_notifier_info *info, const struct nexthop *nh) { info->id = nh->id; if (nh->is_group) return nh_notifier_grp_info_init(info, nh); else return nh_notifier_single_info_init(info, nh); } static void nh_notifier_info_fini(struct nh_notifier_info *info, const struct nexthop *nh) { if (nh->is_group) nh_notifier_grp_info_fini(info, nh); else nh_notifier_single_info_fini(info); } static int call_nexthop_notifiers(struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_info_init(&info, nh); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static int nh_notifier_res_bucket_idle_timer_get(const struct nh_notifier_info *info, bool force, unsigned int *p_idle_timer_ms) { struct nh_res_table *res_table; struct nh_group *nhg; struct nexthop *nh; int err = 0; /* When 'force' is false, nexthop bucket replacement is performed * because the bucket was deemed to be idle. In this case, capable * listeners can choose to perform an atomic replacement: The bucket is * only replaced if it is inactive. However, if the idle timer interval * is smaller than the interval in which a listener is querying * buckets' activity from the device, then atomic replacement should * not be tried. Pass the idle timer value to listeners, so that they * could determine which type of replacement to perform. 
*/ if (force) { *p_idle_timer_ms = 0; return 0; } rcu_read_lock(); nh = nexthop_find_by_id(info->net, info->id); if (!nh) { err = -EINVAL; goto out; } nhg = rcu_dereference(nh->nh_grp); res_table = rcu_dereference(nhg->res_table); *p_idle_timer_ms = jiffies_to_msecs(res_table->idle_timer); out: rcu_read_unlock(); return err; } static int nh_notifier_res_bucket_info_init(struct nh_notifier_info *info, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi) { unsigned int idle_timer_ms; int err; err = nh_notifier_res_bucket_idle_timer_get(info, force, &idle_timer_ms); if (err) return err; info->type = NH_NOTIFIER_INFO_TYPE_RES_BUCKET; info->nh_res_bucket = kzalloc(sizeof(*info->nh_res_bucket), GFP_KERNEL); if (!info->nh_res_bucket) return -ENOMEM; info->nh_res_bucket->bucket_index = bucket_index; info->nh_res_bucket->idle_timer_ms = idle_timer_ms; info->nh_res_bucket->force = force; __nh_notifier_single_info_init(&info->nh_res_bucket->old_nh, oldi); __nh_notifier_single_info_init(&info->nh_res_bucket->new_nh, newi); return 0; } static void nh_notifier_res_bucket_info_fini(struct nh_notifier_info *info) { kfree(info->nh_res_bucket); } static int __call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nhg_id, }; int err; if (nexthop_notifiers_is_empty(net)) return 0; err = nh_notifier_res_bucket_info_init(&info, bucket_index, force, oldi, newi); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_BUCKET_REPLACE, &info); nh_notifier_res_bucket_info_fini(&info); return notifier_to_errno(err); } /* There are three users of RES_TABLE, and NHs etc. referenced from there: * * 1) a collection of callbacks for NH maintenance. This operates under * RTNL, * 2) the delayed work that gradually balances the resilient table, * 3) and nexthop_select_path(), operating under RCU. * * Both the delayed work and the RTNL block are writers, and need to * maintain mutual exclusion. Since there are only two and well-known * writers for each table, the RTNL code can make sure it has exclusive * access thus: * * - Have the DW operate without locking; * - synchronously cancel the DW; * - do the writing; * - if the write was not actually a delete, call upkeep, which schedules * DW again if necessary. * * The functions that are always called from the RTNL context use * rtnl_dereference(). The functions that can also be called from the DW do * a raw dereference and rely on the above mutual exclusion scheme. */ #define nh_res_dereference(p) (rcu_dereference_raw(p)) static int call_nexthop_res_bucket_notifiers(struct net *net, u32 nhg_id, u16 bucket_index, bool force, struct nexthop *old_nh, struct nexthop *new_nh, struct netlink_ext_ack *extack) { struct nh_info *oldi = nh_res_dereference(old_nh->nh_info); struct nh_info *newi = nh_res_dereference(new_nh->nh_info); return __call_nexthop_res_bucket_notifiers(net, nhg_id, bucket_index, force, oldi, newi, extack); } static int call_nexthop_res_table_notifiers(struct net *net, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, .id = nh->id, }; struct nh_group *nhg; int err; ASSERT_RTNL(); if (nexthop_notifiers_is_empty(net)) return 0; /* At this point, the nexthop buckets are still not populated. 
Only * emit a notification with the logical nexthops, so that a listener * could potentially veto it in case of unsupported configuration. */ nhg = rtnl_dereference(nh->nh_grp); err = nh_notifier_mpath_info_init(&info, nhg); if (err) { NL_SET_ERR_MSG(extack, "Failed to initialize nexthop notifier info"); return err; } err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE, &info); kfree(info.nh_grp); return notifier_to_errno(err); } static int call_nexthop_notifier(struct notifier_block *nb, struct net *net, enum nexthop_event_type event_type, struct nexthop *nh, struct netlink_ext_ack *extack) { struct nh_notifier_info info = { .net = net, .extack = extack, }; int err; err = nh_notifier_info_init(&info, nh); if (err) return err; err = nb->notifier_call(nb, event_type, &info); nh_notifier_info_fini(&info, nh); return notifier_to_errno(err); } static unsigned int nh_dev_hashfn(unsigned int val) { unsigned int mask = NH_DEV_HASHSIZE - 1; return (val ^ (val >> NH_DEV_HASHBITS) ^ (val >> (NH_DEV_HASHBITS * 2))) & mask; } static void nexthop_devhash_add(struct net *net, struct nh_info *nhi) { struct net_device *dev = nhi->fib_nhc.nhc_dev; struct hlist_head *head; unsigned int hash; WARN_ON(!dev); hash = nh_dev_hashfn(dev->ifindex); head = &net->nexthop.devhash[hash]; hlist_add_head(&nhi->dev_hash, head); } static void nexthop_free_group(struct nexthop *nh) { struct nh_group *nhg; int i; nhg = rcu_dereference_raw(nh->nh_grp); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; WARN_ON(!list_empty(&nhge->nh_list)); free_percpu(nhge->stats); nexthop_put(nhge->nh); } WARN_ON(nhg->spare == nhg); if (nhg->resilient) vfree(rcu_dereference_raw(nhg->res_table)); kfree(nhg->spare); kfree(nhg); } static void nexthop_free_single(struct nexthop *nh) { struct nh_info *nhi; nhi = rcu_dereference_raw(nh->nh_info); switch (nhi->family) { case AF_INET: fib_nh_release(nh->net, &nhi->fib_nh); break; case AF_INET6: ipv6_stub->fib6_nh_release(&nhi->fib6_nh); break; } kfree(nhi); } void nexthop_free_rcu(struct rcu_head *head) { struct nexthop *nh = container_of(head, struct nexthop, rcu); if (nh->is_group) nexthop_free_group(nh); else nexthop_free_single(nh); kfree(nh); } EXPORT_SYMBOL_GPL(nexthop_free_rcu); static struct nexthop *nexthop_alloc(void) { struct nexthop *nh; nh = kzalloc(sizeof(struct nexthop), GFP_KERNEL); if (nh) { INIT_LIST_HEAD(&nh->fi_list); INIT_LIST_HEAD(&nh->f6i_list); INIT_LIST_HEAD(&nh->grp_list); INIT_LIST_HEAD(&nh->fdb_list); } return nh; } static struct nh_group *nexthop_grp_alloc(u16 num_nh) { struct nh_group *nhg; nhg = kzalloc(struct_size(nhg, nh_entries, num_nh), GFP_KERNEL); if (nhg) nhg->num_nh = num_nh; return nhg; } static void nh_res_table_upkeep_dw(struct work_struct *work); static struct nh_res_table * nexthop_res_table_alloc(struct net *net, u32 nhg_id, struct nh_config *cfg) { const u16 num_nh_buckets = cfg->nh_grp_res_num_buckets; struct nh_res_table *res_table; unsigned long size; size = struct_size(res_table, nh_buckets, num_nh_buckets); res_table = __vmalloc(size, GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); if (!res_table) return NULL; res_table->net = net; res_table->nhg_id = nhg_id; INIT_DELAYED_WORK(&res_table->upkeep_dw, &nh_res_table_upkeep_dw); INIT_LIST_HEAD(&res_table->uw_nh_entries); res_table->idle_timer = cfg->nh_grp_res_idle_timer; res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; res_table->num_nh_buckets = num_nh_buckets; return res_table; } static void 
nh_base_seq_inc(struct net *net) { while (++net->nexthop.seq == 0) ; } /* no reference taken; rcu lock or rtnl must be held */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id) { struct rb_node **pp, *parent = NULL, *next; pp = &net->nexthop.rb_root.rb_node; while (1) { struct nexthop *nh; next = rcu_dereference_raw(*pp); if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (id < nh->id) pp = &next->rb_left; else if (id > nh->id) pp = &next->rb_right; else return nh; } return NULL; } EXPORT_SYMBOL_GPL(nexthop_find_by_id); /* used for auto id allocation; called with rtnl held */ static u32 nh_find_unused_id(struct net *net) { u32 id_start = net->nexthop.last_id_allocated; while (1) { net->nexthop.last_id_allocated++; if (net->nexthop.last_id_allocated == id_start) break; if (!nexthop_find_by_id(net, net->nexthop.last_id_allocated)) return net->nexthop.last_id_allocated; } return 0; } static void nh_res_time_set_deadline(unsigned long next_time, unsigned long *deadline) { if (time_before(next_time, *deadline)) *deadline = next_time; } static clock_t nh_res_table_unbalanced_time(struct nh_res_table *res_table) { if (list_empty(&res_table->uw_nh_entries)) return 0; return jiffies_delta_to_clock_t(jiffies - res_table->unbalanced_since); } static int nla_put_nh_group_res(struct sk_buff *skb, struct nh_group *nhg) { struct nh_res_table *res_table = rtnl_dereference(nhg->res_table); struct nlattr *nest; nest = nla_nest_start(skb, NHA_RES_GROUP); if (!nest) return -EMSGSIZE; if (nla_put_u16(skb, NHA_RES_GROUP_BUCKETS, res_table->num_nh_buckets) || nla_put_u32(skb, NHA_RES_GROUP_IDLE_TIMER, jiffies_to_clock_t(res_table->idle_timer)) || nla_put_u32(skb, NHA_RES_GROUP_UNBALANCED_TIMER, jiffies_to_clock_t(res_table->unbalanced_timer)) || nla_put_u64_64bit(skb, NHA_RES_GROUP_UNBALANCED_TIME, nh_res_table_unbalanced_time(res_table), NHA_RES_GROUP_PAD)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static void nh_grp_entry_stats_inc(struct nh_grp_entry *nhge) { struct nh_grp_entry_stats *cpu_stats; cpu_stats = get_cpu_ptr(nhge->stats); u64_stats_update_begin(&cpu_stats->syncp); u64_stats_inc(&cpu_stats->packets); u64_stats_update_end(&cpu_stats->syncp); put_cpu_ptr(cpu_stats); } static void nh_grp_entry_stats_read(struct nh_grp_entry *nhge, u64 *ret_packets) { int i; *ret_packets = 0; for_each_possible_cpu(i) { struct nh_grp_entry_stats *cpu_stats; unsigned int start; u64 packets; cpu_stats = per_cpu_ptr(nhge->stats, i); do { start = u64_stats_fetch_begin(&cpu_stats->syncp); packets = u64_stats_read(&cpu_stats->packets); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); *ret_packets += packets; } } static int nh_notifier_grp_hw_stats_init(struct nh_notifier_info *info, const struct nexthop *nh) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); info->id = nh->id; info->type = NH_NOTIFIER_INFO_TYPE_GRP_HW_STATS; info->nh_grp_hw_stats = kzalloc(struct_size(info->nh_grp_hw_stats, stats, nhg->num_nh), GFP_KERNEL); if (!info->nh_grp_hw_stats) return -ENOMEM; info->nh_grp_hw_stats->num_nh = nhg->num_nh; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; info->nh_grp_hw_stats->stats[i].id = nhge->nh->id; } return 0; } static void nh_notifier_grp_hw_stats_fini(struct nh_notifier_info *info) { kfree(info->nh_grp_hw_stats); } void nh_grp_hw_stats_report_delta(struct nh_notifier_grp_hw_stats_info *info, unsigned int nh_idx, u64 
delta_packets) { info->hw_stats_used = true; info->stats[nh_idx].packets += delta_packets; } EXPORT_SYMBOL(nh_grp_hw_stats_report_delta); static void nh_grp_hw_stats_apply_update(struct nexthop *nh, struct nh_notifier_info *info) { struct nh_group *nhg; int i; ASSERT_RTNL(); nhg = rtnl_dereference(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhge->packets_hw += info->nh_grp_hw_stats->stats[i].packets; } } static int nh_grp_hw_stats_update(struct nexthop *nh, bool *hw_stats_used) { struct nh_notifier_info info = { .net = nh->net, }; struct net *net = nh->net; int err; if (nexthop_notifiers_is_empty(net)) { *hw_stats_used = false; return 0; } err = nh_notifier_grp_hw_stats_init(&info, nh); if (err) return err; err = blocking_notifier_call_chain(&net->nexthop.notifier_chain, NEXTHOP_EVENT_HW_STATS_REPORT_DELTA, &info); /* Cache whatever we got, even if there was an error, otherwise the * successful stats retrievals would get lost. */ nh_grp_hw_stats_apply_update(nh, &info); *hw_stats_used = info.nh_grp_hw_stats->hw_stats_used; nh_notifier_grp_hw_stats_fini(&info); return notifier_to_errno(err); } static int nla_put_nh_group_stats_entry(struct sk_buff *skb, struct nh_grp_entry *nhge, u32 op_flags) { struct nlattr *nest; u64 packets; nh_grp_entry_stats_read(nhge, &packets); nest = nla_nest_start(skb, NHA_GROUP_STATS_ENTRY); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, NHA_GROUP_STATS_ENTRY_ID, nhge->nh->id) || nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS, packets + nhge->packets_hw)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nla_put_uint(skb, NHA_GROUP_STATS_ENTRY_PACKETS_HW, nhge->packets_hw)) goto nla_put_failure; nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } static int nla_put_nh_group_stats(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nlattr *nest; bool hw_stats_used; int err; int i; if (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats)) goto err_out; if (op_flags & NHA_OP_FLAG_DUMP_HW_STATS && nhg->hw_stats) { err = nh_grp_hw_stats_update(nh, &hw_stats_used); if (err) goto out; if (nla_put_u32(skb, NHA_HW_STATS_USED, hw_stats_used)) goto err_out; } nest = nla_nest_start(skb, NHA_GROUP_STATS); if (!nest) goto err_out; for (i = 0; i < nhg->num_nh; i++) if (nla_put_nh_group_stats_entry(skb, &nhg->nh_entries[i], op_flags)) goto cancel_out; nla_nest_end(skb, nest); return 0; cancel_out: nla_nest_cancel(skb, nest); err_out: err = -EMSGSIZE; out: return err; } static int nla_put_nh_group(struct sk_buff *skb, struct nexthop *nh, u32 op_flags) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); struct nexthop_grp *p; size_t len = nhg->num_nh * sizeof(*p); struct nlattr *nla; u16 group_type = 0; int i; if (nhg->hash_threshold) group_type = NEXTHOP_GRP_TYPE_MPATH; else if (nhg->resilient) group_type = NEXTHOP_GRP_TYPE_RES; if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type)) goto nla_put_failure; nla = nla_reserve(skb, NHA_GROUP, len); if (!nla) goto nla_put_failure; p = nla_data(nla); for (i = 0; i < nhg->num_nh; ++i) { p->id = nhg->nh_entries[i].nh->id; p->weight = nhg->nh_entries[i].weight - 1; p += 1; } if (nhg->resilient && nla_put_nh_group_res(skb, nhg)) goto nla_put_failure; if (op_flags & NHA_OP_FLAG_DUMP_STATS && (nla_put_u32(skb, NHA_HW_STATS_ENABLE, nhg->hw_stats) || nla_put_nh_group_stats(skb, nh, op_flags))) goto nla_put_failure; return 0; nla_put_failure: return -EMSGSIZE; } 
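/* Illustrative usage sketch (an assumption for context, not part of the
 * original source): the RTM_NEWNEXTHOP attributes filled in below are
 * what ends up being dumped for configurations created with iproute2
 * commands along these lines (exact flags depend on the iproute2
 * version; -s requests the group statistics attributes):
 *
 *   ip nexthop add id 1 via 192.0.2.1 dev eth0
 *   ip nexthop add id 2 via 192.0.2.2 dev eth0
 *   ip nexthop add id 10 group 1/2
 *   ip nexthop add id 11 group 1/2 type resilient buckets 32
 *   ip -d -s nexthop show id 10
 */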
static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh, int event, u32 portid, u32 seq, unsigned int nlflags, u32 op_flags) { struct fib6_nh *fib6_nh; struct fib_nh *fib_nh; struct nlmsghdr *nlh; struct nh_info *nhi; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = nh->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh && nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; if (nla_put_nh_group(skb, nh, op_flags)) goto nla_put_failure; goto out; } nhi = rtnl_dereference(nh->nh_info); nhm->nh_family = nhi->family; if (nhi->reject_nh) { if (nla_put_flag(skb, NHA_BLACKHOLE)) goto nla_put_failure; goto out; } else if (nhi->fdb_nh) { if (nla_put_flag(skb, NHA_FDB)) goto nla_put_failure; } else { const struct net_device *dev; dev = nhi->fib_nhc.nhc_dev; if (dev && nla_put_u32(skb, NHA_OIF, dev->ifindex)) goto nla_put_failure; } nhm->nh_scope = nhi->fib_nhc.nhc_scope; switch (nhi->family) { case AF_INET: fib_nh = &nhi->fib_nh; if (fib_nh->fib_nh_gw_family && nla_put_be32(skb, NHA_GATEWAY, fib_nh->fib_nh_gw4)) goto nla_put_failure; break; case AF_INET6: fib6_nh = &nhi->fib6_nh; if (fib6_nh->fib_nh_gw_family && nla_put_in6_addr(skb, NHA_GATEWAY, &fib6_nh->fib_nh_gw6)) goto nla_put_failure; break; } if (nhi->fib_nhc.nhc_lwtstate && lwtunnel_fill_encap(skb, nhi->fib_nhc.nhc_lwtstate, NHA_ENCAP, NHA_ENCAP_TYPE) < 0) goto nla_put_failure; out: nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static size_t nh_nlmsg_size_grp_res(struct nh_group *nhg) { return nla_total_size(0) + /* NHA_RES_GROUP */ nla_total_size(2) + /* NHA_RES_GROUP_BUCKETS */ nla_total_size(4) + /* NHA_RES_GROUP_IDLE_TIMER */ nla_total_size(4) + /* NHA_RES_GROUP_UNBALANCED_TIMER */ nla_total_size_64bit(8);/* NHA_RES_GROUP_UNBALANCED_TIME */ } static size_t nh_nlmsg_size_grp(struct nexthop *nh) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); size_t sz = sizeof(struct nexthop_grp) * nhg->num_nh; size_t tot = nla_total_size(sz) + nla_total_size(2); /* NHA_GROUP_TYPE */ if (nhg->resilient) tot += nh_nlmsg_size_grp_res(nhg); return tot; } static size_t nh_nlmsg_size_single(struct nexthop *nh) { struct nh_info *nhi = rtnl_dereference(nh->nh_info); size_t sz; /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE * are mutually exclusive */ sz = nla_total_size(4); /* NHA_OIF */ switch (nhi->family) { case AF_INET: if (nhi->fib_nh.fib_nh_gw_family) sz += nla_total_size(4); /* NHA_GATEWAY */ break; case AF_INET6: /* NHA_GATEWAY */ if (nhi->fib6_nh.fib_nh_gw_family) sz += nla_total_size(sizeof(const struct in6_addr)); break; } if (nhi->fib_nhc.nhc_lwtstate) { sz += lwtunnel_get_encap_size(nhi->fib_nhc.nhc_lwtstate); sz += nla_total_size(2); /* NHA_ENCAP_TYPE */ } return sz; } static size_t nh_nlmsg_size(struct nexthop *nh) { size_t sz = NLMSG_ALIGN(sizeof(struct nhmsg)); sz += nla_total_size(4); /* NHA_ID */ if (nh->is_group) sz += nh_nlmsg_size_grp(nh); else sz += nh_nlmsg_size_single(nh); return sz; } static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info) { unsigned int nlflags = info->nlh ? info->nlh->nlmsg_flags : 0; u32 seq = info->nlh ? 
info->nlh->nlmsg_seq : 0; struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(nh_nlmsg_size(nh), gfp_any()); if (!skb) goto errout; err = nh_fill_node(skb, nh, event, info->portid, seq, nlflags, 0); if (err < 0) { /* -EMSGSIZE implies BUG in nh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); kfree_skb(skb); goto errout; } rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_NEXTHOP, info->nlh, gfp_any()); return; errout: if (err < 0) rtnl_set_sk_err(info->nl_net, RTNLGRP_NEXTHOP, err); } static unsigned long nh_res_bucket_used_time(const struct nh_res_bucket *bucket) { return (unsigned long)atomic_long_read(&bucket->used_time); } static unsigned long nh_res_bucket_idle_point(const struct nh_res_table *res_table, const struct nh_res_bucket *bucket, unsigned long now) { unsigned long time = nh_res_bucket_used_time(bucket); /* Bucket was not used since it was migrated. The idle time is now. */ if (time == bucket->migrated_time) return now; return time + res_table->idle_timer; } static unsigned long nh_res_table_unb_point(const struct nh_res_table *res_table) { return res_table->unbalanced_since + res_table->unbalanced_timer; } static void nh_res_bucket_set_idle(const struct nh_res_table *res_table, struct nh_res_bucket *bucket) { unsigned long now = jiffies; atomic_long_set(&bucket->used_time, (long)now); bucket->migrated_time = now; } static void nh_res_bucket_set_busy(struct nh_res_bucket *bucket) { atomic_long_set(&bucket->used_time, (long)jiffies); } static clock_t nh_res_bucket_idle_time(const struct nh_res_bucket *bucket) { unsigned long used_time = nh_res_bucket_used_time(bucket); return jiffies_delta_to_clock_t(jiffies - used_time); } static int nh_fill_res_bucket(struct sk_buff *skb, struct nexthop *nh, struct nh_res_bucket *bucket, u16 bucket_index, int event, u32 portid, u32 seq, unsigned int nlflags, struct netlink_ext_ack *extack) { struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nlmsghdr *nlh; struct nlattr *nest; struct nhmsg *nhm; nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nhm), nlflags); if (!nlh) return -EMSGSIZE; nhm = nlmsg_data(nlh); nhm->nh_family = AF_UNSPEC; nhm->nh_flags = bucket->nh_flags; nhm->nh_protocol = nh->protocol; nhm->nh_scope = 0; nhm->resvd = 0; if (nla_put_u32(skb, NHA_ID, nh->id)) goto nla_put_failure; nest = nla_nest_start(skb, NHA_RES_BUCKET); if (!nest) goto nla_put_failure; if (nla_put_u16(skb, NHA_RES_BUCKET_INDEX, bucket_index) || nla_put_u32(skb, NHA_RES_BUCKET_NH_ID, nhge->nh->id) || nla_put_u64_64bit(skb, NHA_RES_BUCKET_IDLE_TIME, nh_res_bucket_idle_time(bucket), NHA_RES_BUCKET_PAD)) goto nla_put_failure_nest; nla_nest_end(skb, nest); nlmsg_end(skb, nlh); return 0; nla_put_failure_nest: nla_nest_cancel(skb, nest); nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static void nexthop_bucket_notify(struct nh_res_table *res_table, u16 bucket_index) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *nhge = nh_res_dereference(bucket->nh_entry); struct nexthop *nh = nhge->nh_parent; struct sk_buff *skb; int err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto errout; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, 0, 0, NLM_F_REPLACE, NULL); if (err < 0) { kfree_skb(skb); goto errout; } rtnl_notify(skb, nh->net, 0, RTNLGRP_NEXTHOP, NULL, GFP_KERNEL); return; errout: if (err < 0) rtnl_set_sk_err(nh->net, RTNLGRP_NEXTHOP, err); } static bool valid_group_nh(struct nexthop *nh, unsigned int npaths, bool *is_fdb, struct 
netlink_ext_ack *extack) { if (nh->is_group) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); /* Nesting groups within groups is not supported. */ if (nhg->hash_threshold) { NL_SET_ERR_MSG(extack, "Hash-threshold group can not be a nexthop within a group"); return false; } if (nhg->resilient) { NL_SET_ERR_MSG(extack, "Resilient group can not be a nexthop within a group"); return false; } *is_fdb = nhg->fdb_nh; } else { struct nh_info *nhi = rtnl_dereference(nh->nh_info); if (nhi->reject_nh && npaths > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be used in a group with more than 1 path"); return false; } *is_fdb = nhi->fdb_nh; } return true; } static int nh_check_attr_fdb_group(struct nexthop *nh, u8 *nh_family, struct netlink_ext_ack *extack) { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (!nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "FDB nexthop group can only have fdb nexthops"); return -EINVAL; } if (*nh_family == AF_UNSPEC) { *nh_family = nhi->family; } else if (*nh_family != nhi->family) { NL_SET_ERR_MSG(extack, "FDB nexthop group cannot have mixed family nexthops"); return -EINVAL; } return 0; } static int nh_check_attr_group(struct net *net, struct nlattr *tb[], size_t tb_size, u16 nh_grp_type, struct netlink_ext_ack *extack) { unsigned int len = nla_len(tb[NHA_GROUP]); u8 nh_family = AF_UNSPEC; struct nexthop_grp *nhg; unsigned int i, j; u8 nhg_fdb = 0; if (!len || len & (sizeof(struct nexthop_grp) - 1)) { NL_SET_ERR_MSG(extack, "Invalid length for nexthop group attribute"); return -EINVAL; } /* convert len to number of nexthop ids */ len /= sizeof(*nhg); nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { if (nhg[i].resvd1 || nhg[i].resvd2) { NL_SET_ERR_MSG(extack, "Reserved fields in nexthop_grp must be 0"); return -EINVAL; } if (nhg[i].weight > 254) { NL_SET_ERR_MSG(extack, "Invalid value for weight"); return -EINVAL; } for (j = i + 1; j < len; ++j) { if (nhg[i].id == nhg[j].id) { NL_SET_ERR_MSG(extack, "Nexthop id can not be used twice in a group"); return -EINVAL; } } } if (tb[NHA_FDB]) nhg_fdb = 1; nhg = nla_data(tb[NHA_GROUP]); for (i = 0; i < len; ++i) { struct nexthop *nh; bool is_fdb_nh; nh = nexthop_find_by_id(net, nhg[i].id); if (!nh) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (!valid_group_nh(nh, len, &is_fdb_nh, extack)) return -EINVAL; if (nhg_fdb && nh_check_attr_fdb_group(nh, &nh_family, extack)) return -EINVAL; if (!nhg_fdb && is_fdb_nh) { NL_SET_ERR_MSG(extack, "Non FDB nexthop group cannot have fdb nexthops"); return -EINVAL; } } for (i = NHA_GROUP_TYPE + 1; i < tb_size; ++i) { if (!tb[i]) continue; switch (i) { case NHA_HW_STATS_ENABLE: case NHA_FDB: continue; case NHA_RES_GROUP: if (nh_grp_type == NEXTHOP_GRP_TYPE_RES) continue; break; } NL_SET_ERR_MSG(extack, "No other attributes can be set in nexthop groups"); return -EINVAL; } return 0; } static bool ipv6_good_nh(const struct fib6_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv6_neigh_lookup_noref_stub(nh->fib_nh_dev, &nh->fib_nh_gw6); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool ipv4_good_nh(const struct fib_nh *nh) { int state = NUD_REACHABLE; struct neighbour *n; rcu_read_lock(); n = __ipv4_neigh_lookup_noref(nh->fib_nh_dev, (__force u32)nh->fib_nh_gw4); if (n) state = READ_ONCE(n->nud_state); rcu_read_unlock(); return !!(state & NUD_VALID); } static bool nexthop_is_good_nh(const struct nexthop *nh) { struct nh_info *nhi = rcu_dereference(nh->nh_info); 
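	/* "Good" here means the gateway's neighbour entry is in a NUD_VALID
	 * state; families without a check below are treated as not good.
	 */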
switch (nhi->family) { case AF_INET: return ipv4_good_nh(&nhi->fib_nh); case AF_INET6: return ipv6_good_nh(&nhi->fib6_nh); } return false; } static struct nexthop *nexthop_select_path_fdb(struct nh_group *nhg, int hash) { int i; for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } WARN_ON_ONCE(1); return NULL; } static struct nexthop *nexthop_select_path_hthr(struct nh_group *nhg, int hash) { struct nh_grp_entry *nhge0 = NULL; int i; if (nhg->fdb_nh) return nexthop_select_path_fdb(nhg, hash); for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; /* nexthops always check if it is good and does * not rely on a sysctl for this behavior */ if (!nexthop_is_good_nh(nhge->nh)) continue; if (!nhge0) nhge0 = nhge; if (hash > atomic_read(&nhge->hthr.upper_bound)) continue; nh_grp_entry_stats_inc(nhge); return nhge->nh; } if (!nhge0) nhge0 = &nhg->nh_entries[0]; nh_grp_entry_stats_inc(nhge0); return nhge0->nh; } static struct nexthop *nexthop_select_path_res(struct nh_group *nhg, int hash) { struct nh_res_table *res_table = rcu_dereference(nhg->res_table); u16 bucket_index = hash % res_table->num_nh_buckets; struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; /* nexthop_select_path() is expected to return a non-NULL value, so * skip protocol validation and just hand out whatever there is. */ bucket = &res_table->nh_buckets[bucket_index]; nh_res_bucket_set_busy(bucket); nhge = rcu_dereference(bucket->nh_entry); nh_grp_entry_stats_inc(nhge); return nhge->nh; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash) { struct nh_group *nhg; if (!nh->is_group) return nh; nhg = rcu_dereference(nh->nh_grp); if (nhg->hash_threshold) return nexthop_select_path_hthr(nhg, hash); else if (nhg->resilient) return nexthop_select_path_res(nhg, hash); /* Unreachable. */ return NULL; } EXPORT_SYMBOL_GPL(nexthop_select_path); int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg) { struct nh_info *nhi; int err; if (nh->is_group) { struct nh_group *nhg; int i; nhg = rcu_dereference_rtnl(nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; nhi = rcu_dereference_rtnl(nhge->nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } } else { nhi = rcu_dereference_rtnl(nh->nh_info); err = cb(&nhi->fib6_nh, arg); if (err) return err; } return 0; } EXPORT_SYMBOL_GPL(nexthop_for_each_fib6_nh); static int check_src_addr(const struct in6_addr *saddr, struct netlink_ext_ack *extack) { if (!ipv6_addr_any(saddr)) { NL_SET_ERR_MSG(extack, "IPv6 routes using source address can not use nexthop objects"); return -EINVAL; } return 0; } int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; bool is_fdb_nh; /* fib6_src is unique to a fib6_info and limits the ability to cache * routes in fib6_nh within a nexthop that is potentially shared * across multiple fib entries. If the config wants to use source * routing it can not use nexthop objects. mlxsw also does not allow * fib6_src on routes. 
*/ if (cfg && check_src_addr(&cfg->fc_src, extack) < 0) return -EINVAL; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->has_v4) goto no_v4_nh; is_fdb_nh = nhg->fdb_nh; } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->family == AF_INET) goto no_v4_nh; is_fdb_nh = nhi->fdb_nh; } if (is_fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); return -EINVAL; } return 0; no_v4_nh: NL_SET_ERR_MSG(extack, "IPv6 routes can not use an IPv4 nexthop"); return -EINVAL; } EXPORT_SYMBOL_GPL(fib6_check_nexthop); /* if existing nexthop has ipv6 routes linked to it, need * to verify this new spec works with ipv6 */ static int fib6_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib6_info *f6i; if (list_empty(&old->f6i_list)) return 0; list_for_each_entry(f6i, &old->f6i_list, nh_list) { if (check_src_addr(&f6i->fib6_src.addr, extack) < 0) return -EINVAL; } return fib6_check_nexthop(new, NULL, extack); } static int nexthop_check_scope(struct nh_info *nhi, u8 scope, struct netlink_ext_ack *extack) { if (scope == RT_SCOPE_HOST && nhi->fib_nhc.nhc_gw_family) { NL_SET_ERR_MSG(extack, "Route with host scope can not have a gateway"); return -EINVAL; } if (nhi->fib_nhc.nhc_flags & RTNH_F_ONLINK && scope >= RT_SCOPE_LINK) { NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop"); return -EINVAL; } return 0; } /* Invoked by fib add code to verify nexthop by id is ok with * config for prefix; parts of fib_check_nh not done when nexthop * object is used. */ int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack) { struct nh_info *nhi; int err = 0; if (nh->is_group) { struct nh_group *nhg; nhg = rtnl_dereference(nh->nh_grp); if (nhg->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } if (scope == RT_SCOPE_HOST) { NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops"); err = -EINVAL; goto out; } /* all nexthops in a group have the same scope */ nhi = rtnl_dereference(nhg->nh_entries[0].nh->nh_info); err = nexthop_check_scope(nhi, scope, extack); } else { nhi = rtnl_dereference(nh->nh_info); if (nhi->fdb_nh) { NL_SET_ERR_MSG(extack, "Route cannot point to a fdb nexthop"); err = -EINVAL; goto out; } err = nexthop_check_scope(nhi, scope, extack); } out: return err; } static int fib_check_nh_list(struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { struct fib_info *fi; list_for_each_entry(fi, &old->fi_list, nh_list) { int err; err = fib_check_nexthop(new, fi->fib_scope, extack); if (err) return err; } return 0; } static bool nh_res_nhge_is_balanced(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets == nhge->res.wants_buckets; } static bool nh_res_nhge_is_ow(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets > nhge->res.wants_buckets; } static bool nh_res_nhge_is_uw(const struct nh_grp_entry *nhge) { return nhge->res.count_buckets < nhge->res.wants_buckets; } static bool nh_res_table_is_balanced(const struct nh_res_table *res_table) { return list_empty(&res_table->uw_nh_entries); } static void nh_res_bucket_unset_nh(struct nh_res_bucket *bucket) { struct nh_grp_entry *nhge; if (bucket->occupied) { nhge = nh_res_dereference(bucket->nh_entry); nhge->res.count_buckets--; bucket->occupied = false; } } static void nh_res_bucket_set_nh(struct nh_res_bucket *bucket, struct nh_grp_entry *nhge) { nh_res_bucket_unset_nh(bucket); bucket->occupied = true; 
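	/* The previous entry's bucket count (if any) was dropped above; now
	 * bind the bucket to the new group entry and account for it.
	 */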
rcu_assign_pointer(bucket->nh_entry, nhge); nhge->res.count_buckets++; } static bool nh_res_bucket_should_migrate(struct nh_res_table *res_table, struct nh_res_bucket *bucket, unsigned long *deadline, bool *force) { unsigned long now = jiffies; struct nh_grp_entry *nhge; unsigned long idle_point; if (!bucket->occupied) { /* The bucket is not occupied, its NHGE pointer is either * NULL or obsolete. We _have to_ migrate: set force. */ *force = true; return true; } nhge = nh_res_dereference(bucket->nh_entry); /* If the bucket is populated by an underweight or balanced * nexthop, do not migrate. */ if (!nh_res_nhge_is_ow(nhge)) return false; /* At this point we know that the bucket is populated with an * overweight nexthop. It needs to be migrated to a new nexthop if * the idle timer of unbalanced timer expired. */ idle_point = nh_res_bucket_idle_point(res_table, bucket, now); if (time_after_eq(now, idle_point)) { /* The bucket is idle. We _can_ migrate: unset force. */ *force = false; return true; } /* Unbalanced timer of 0 means "never force". */ if (res_table->unbalanced_timer) { unsigned long unb_point; unb_point = nh_res_table_unb_point(res_table); if (time_after(now, unb_point)) { /* The bucket is not idle, but the unbalanced timer * expired. We _can_ migrate, but set force anyway, * so that drivers know to ignore activity reports * from the HW. */ *force = true; return true; } nh_res_time_set_deadline(unb_point, deadline); } nh_res_time_set_deadline(idle_point, deadline); return false; } static bool nh_res_bucket_migrate(struct nh_res_table *res_table, u16 bucket_index, bool notify, bool notify_nl, bool force) { struct nh_res_bucket *bucket = &res_table->nh_buckets[bucket_index]; struct nh_grp_entry *new_nhge; struct netlink_ext_ack extack; int err; new_nhge = list_first_entry_or_null(&res_table->uw_nh_entries, struct nh_grp_entry, res.uw_nh_entry); if (WARN_ON_ONCE(!new_nhge)) /* If this function is called, "bucket" is either not * occupied, or it belongs to a next hop that is * overweight. In either case, there ought to be a * corresponding underweight next hop. */ return false; if (notify) { struct nh_grp_entry *old_nhge; old_nhge = nh_res_dereference(bucket->nh_entry); err = call_nexthop_res_bucket_notifiers(res_table->net, res_table->nhg_id, bucket_index, force, old_nhge->nh, new_nhge->nh, &extack); if (err) { pr_err_ratelimited("%s\n", extack._msg); if (!force) return false; /* It is not possible to veto a forced replacement, so * just clear the hardware flags from the nexthop * bucket to indicate to user space that this bucket is * not correctly populated in hardware. */ bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); } } nh_res_bucket_set_nh(bucket, new_nhge); nh_res_bucket_set_idle(res_table, bucket); if (notify_nl) nexthop_bucket_notify(res_table, bucket_index); if (nh_res_nhge_is_balanced(new_nhge)) list_del(&new_nhge->res.uw_nh_entry); return true; } #define NH_RES_UPKEEP_DW_MINIMUM_INTERVAL (HZ / 2) static void nh_res_table_upkeep(struct nh_res_table *res_table, bool notify, bool notify_nl) { unsigned long now = jiffies; unsigned long deadline; u16 i; /* Deadline is the next time that upkeep should be run. It is the * earliest time at which one of the buckets might be migrated. * Start at the most pessimistic estimate: either unbalanced_timer * from now, or if there is none, idle_timer from now. For each * encountered time point, call nh_res_time_set_deadline() to * refine the estimate. 
*/ if (res_table->unbalanced_timer) deadline = now + res_table->unbalanced_timer; else deadline = now + res_table->idle_timer; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; bool force; if (nh_res_bucket_should_migrate(res_table, bucket, &deadline, &force)) { if (!nh_res_bucket_migrate(res_table, i, notify, notify_nl, force)) { unsigned long idle_point; /* A driver can override the migration * decision if the HW reports that the * bucket is actually not idle. Therefore * remark the bucket as busy again and * update the deadline. */ nh_res_bucket_set_busy(bucket); idle_point = nh_res_bucket_idle_point(res_table, bucket, now); nh_res_time_set_deadline(idle_point, &deadline); } } } /* If the group is still unbalanced, schedule the next upkeep to * either the deadline computed above, or the minimum deadline, * whichever comes later. */ if (!nh_res_table_is_balanced(res_table)) { unsigned long now = jiffies; unsigned long min_deadline; min_deadline = now + NH_RES_UPKEEP_DW_MINIMUM_INTERVAL; if (time_before(deadline, min_deadline)) deadline = min_deadline; queue_delayed_work(system_power_efficient_wq, &res_table->upkeep_dw, deadline - now); } } static void nh_res_table_upkeep_dw(struct work_struct *work) { struct delayed_work *dw = to_delayed_work(work); struct nh_res_table *res_table; res_table = container_of(dw, struct nh_res_table, upkeep_dw); nh_res_table_upkeep(res_table, true, true); } static void nh_res_table_cancel_upkeep(struct nh_res_table *res_table) { cancel_delayed_work_sync(&res_table->upkeep_dw); } static void nh_res_group_rebalance(struct nh_group *nhg, struct nh_res_table *res_table) { int prev_upper_bound = 0; int total = 0; int w = 0; int i; INIT_LIST_HEAD(&res_table->uw_nh_entries); for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST(res_table->num_nh_buckets * w, total); nhge->res.wants_buckets = upper_bound - prev_upper_bound; prev_upper_bound = upper_bound; if (nh_res_nhge_is_uw(nhge)) { if (list_empty(&res_table->uw_nh_entries)) res_table->unbalanced_since = jiffies; list_add(&nhge->res.uw_nh_entry, &res_table->uw_nh_entries); } } } /* Migrate buckets in res_table so that they reference NHGE's from NHG with * the right NH ID. Set those buckets that do not have a corresponding NHGE * entry in NHG as not occupied. */ static void nh_res_table_migrate_buckets(struct nh_res_table *res_table, struct nh_group *nhg) { u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; u32 id = rtnl_dereference(bucket->nh_entry)->nh->id; bool found = false; int j; for (j = 0; j < nhg->num_nh; j++) { struct nh_grp_entry *nhge = &nhg->nh_entries[j]; if (nhge->nh->id == id) { nh_res_bucket_set_nh(bucket, nhge); found = true; break; } } if (!found) nh_res_bucket_unset_nh(bucket); } } static void replace_nexthop_grp_res(struct nh_group *oldg, struct nh_group *newg) { /* For NH group replacement, the new NHG might only have a stub * hash table with 0 buckets, because the number of buckets was not * specified. For NH removal, oldg and newg both reference the same * res_table. So in any case, in the following, we want to work * with oldg->res_table. 
*/ struct nh_res_table *old_res_table = rtnl_dereference(oldg->res_table); unsigned long prev_unbalanced_since = old_res_table->unbalanced_since; bool prev_has_uw = !list_empty(&old_res_table->uw_nh_entries); nh_res_table_cancel_upkeep(old_res_table); nh_res_table_migrate_buckets(old_res_table, newg); nh_res_group_rebalance(newg, old_res_table); if (prev_has_uw && !list_empty(&old_res_table->uw_nh_entries)) old_res_table->unbalanced_since = prev_unbalanced_since; nh_res_table_upkeep(old_res_table, true, false); } static void nh_hthr_group_rebalance(struct nh_group *nhg) { int total = 0; int w = 0; int i; for (i = 0; i < nhg->num_nh; ++i) total += nhg->nh_entries[i].weight; for (i = 0; i < nhg->num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; int upper_bound; w += nhge->weight; upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1; atomic_set(&nhge->hthr.upper_bound, upper_bound); } } static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge, struct nl_info *nlinfo) { struct nh_grp_entry *nhges, *new_nhges; struct nexthop *nhp = nhge->nh_parent; struct netlink_ext_ack extack; struct nexthop *nh = nhge->nh; struct nh_group *nhg, *newg; int i, j, err; WARN_ON(!nh); nhg = rtnl_dereference(nhp->nh_grp); newg = nhg->spare; /* last entry, keep it visible and remove the parent */ if (nhg->num_nh == 1) { remove_nexthop(net, nhp, nlinfo); return; } newg->has_v4 = false; newg->is_multipath = nhg->is_multipath; newg->hash_threshold = nhg->hash_threshold; newg->resilient = nhg->resilient; newg->fdb_nh = nhg->fdb_nh; newg->num_nh = nhg->num_nh; /* copy old entries to new except the one getting removed */ nhges = nhg->nh_entries; new_nhges = newg->nh_entries; for (i = 0, j = 0; i < nhg->num_nh; ++i) { struct nh_info *nhi; /* current nexthop getting removed */ if (nhg->nh_entries[i].nh == nh) { newg->num_nh--; continue; } nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) newg->has_v4 = true; list_del(&nhges[i].nh_list); new_nhges[j].stats = nhges[i].stats; new_nhges[j].nh_parent = nhges[i].nh_parent; new_nhges[j].nh = nhges[i].nh; new_nhges[j].weight = nhges[i].weight; list_add(&new_nhges[j].nh_list, &new_nhges[j].nh->grp_list); j++; } if (newg->hash_threshold) nh_hthr_group_rebalance(newg); else if (newg->resilient) replace_nexthop_grp_res(nhg, newg); rcu_assign_pointer(nhp->nh_grp, newg); list_del(&nhge->nh_list); free_percpu(nhge->stats); nexthop_put(nhge->nh); /* Removal of a NH from a resilient group is notified through * bucket notifications. 
*/ if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, nhp, &extack); if (err) pr_err("%s\n", extack._msg); } if (nlinfo) nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo); } static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { struct nh_grp_entry *nhge, *tmp; list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); /* make sure all see the newly published array before releasing rtnl */ synchronize_net(); } static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo) { struct nh_group *nhg = rcu_dereference_rtnl(nh->nh_grp); struct nh_res_table *res_table; int i, num_nh = nhg->num_nh; for (i = 0; i < num_nh; ++i) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; if (WARN_ON(!nhge->nh)) continue; list_del_init(&nhge->nh_list); } if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); nh_res_table_cancel_upkeep(res_table); } } /* not called for nexthop replace */ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh) { struct fib6_info *f6i, *tmp; bool do_flush = false; struct fib_info *fi; list_for_each_entry(fi, &nh->fi_list, nh_list) { fi->fib_flags |= RTNH_F_DEAD; do_flush = true; } if (do_flush) fib_flush(net); /* ip6_del_rt removes the entry from this list hence the _safe */ list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) { /* __ip6_del_rt does a release, so do a hold here */ fib6_info_hold(f6i); ipv6_stub->ip6_del_rt(net, f6i, !READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)); } } static void __remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { __remove_nexthop_fib(net, nh); if (nh->is_group) { remove_nexthop_group(nh, nlinfo); } else { struct nh_info *nhi; nhi = rtnl_dereference(nh->nh_info); if (nhi->fib_nhc.nhc_dev) hlist_del(&nhi->dev_hash); remove_nexthop_from_groups(net, nh, nlinfo); } } static void remove_nexthop(struct net *net, struct nexthop *nh, struct nl_info *nlinfo) { call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL); /* remove from the tree */ rb_erase(&nh->rb_node, &net->nexthop.rb_root); if (nlinfo) nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo); __remove_nexthop(net, nh, nlinfo); nh_base_seq_inc(net); nexthop_put(nh); } /* if any FIB entries reference this nexthop, any dst entries * need to be regenerated */ static void nh_rt_cache_flush(struct net *net, struct nexthop *nh, struct nexthop *replaced_nh) { struct fib6_info *f6i; struct nh_group *nhg; int i; if (!list_empty(&nh->fi_list)) rt_cache_flush(net); list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_update_sernum(net, f6i); /* if an IPv6 group was replaced, we have to release all old * dsts to make sure all refcounts are released */ if (!replaced_nh->is_group) return; nhg = rtnl_dereference(replaced_nh->nh_grp); for (i = 0; i < nhg->num_nh; i++) { struct nh_grp_entry *nhge = &nhg->nh_entries[i]; struct nh_info *nhi = rtnl_dereference(nhge->nh->nh_info); if (nhi->family == AF_INET6) ipv6_stub->fib6_nh_release_dsts(&nhi->fib6_nh); } } static int replace_nexthop_grp(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_res_table *tmp_table = NULL; struct nh_res_table *new_res_table; struct nh_res_table *old_res_table; struct nh_group *oldg, *newg; int i, err; if (!new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop."); return -EINVAL; } oldg = rtnl_dereference(old->nh_grp); newg = 
rtnl_dereference(new->nh_grp); if (newg->hash_threshold != oldg->hash_threshold) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with one of a different type."); return -EINVAL; } if (newg->hash_threshold) { err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; } else if (newg->resilient) { new_res_table = rtnl_dereference(newg->res_table); old_res_table = rtnl_dereference(oldg->res_table); /* Accept if num_nh_buckets was not given, but if it was * given, demand that the value be correct. */ if (cfg->nh_grp_res_has_num_buckets && cfg->nh_grp_res_num_buckets != old_res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Can not change number of buckets of a resilient nexthop group."); return -EINVAL; } /* Emit a pre-replace notification so that listeners could veto * a potentially unsupported configuration. Otherwise, * individual bucket replacement notifications would need to be * vetoed, which is something that should only happen if the * bucket is currently active. */ err = call_nexthop_res_table_notifiers(net, new, extack); if (err) return err; if (cfg->nh_grp_res_has_idle_timer) old_res_table->idle_timer = cfg->nh_grp_res_idle_timer; if (cfg->nh_grp_res_has_unbalanced_timer) old_res_table->unbalanced_timer = cfg->nh_grp_res_unbalanced_timer; replace_nexthop_grp_res(oldg, newg); tmp_table = new_res_table; rcu_assign_pointer(newg->res_table, old_res_table); rcu_assign_pointer(newg->spare->res_table, old_res_table); } /* update parents - used by nexthop code for cleanup */ for (i = 0; i < newg->num_nh; i++) newg->nh_entries[i].nh_parent = old; rcu_assign_pointer(old->nh_grp, newg); /* Make sure concurrent readers are not using 'oldg' anymore. */ synchronize_net(); if (newg->resilient) { rcu_assign_pointer(oldg->res_table, tmp_table); rcu_assign_pointer(oldg->spare->res_table, tmp_table); } for (i = 0; i < oldg->num_nh; i++) oldg->nh_entries[i].nh_parent = new; rcu_assign_pointer(new->nh_grp, oldg); return 0; } static void nh_group_v4_update(struct nh_group *nhg) { struct nh_grp_entry *nhges; bool has_v4 = false; int i; nhges = nhg->nh_entries; for (i = 0; i < nhg->num_nh; i++) { struct nh_info *nhi; nhi = rtnl_dereference(nhges[i].nh->nh_info); if (nhi->family == AF_INET) has_v4 = true; } nhg->has_v4 = has_v4; } static int replace_nexthop_single_notify_res(struct net *net, struct nh_res_table *res_table, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { u32 nhg_id = res_table->nhg_id; int err; u16 i; for (i = 0; i < res_table->num_nh_buckets; i++) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) { err = __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, oldi, newi, extack); if (err) goto err_notify; } } return 0; err_notify: while (i-- > 0) { struct nh_res_bucket *bucket = &res_table->nh_buckets[i]; struct nh_grp_entry *nhge; nhge = rtnl_dereference(bucket->nh_entry); if (nhge->nh == old) __call_nexthop_res_bucket_notifiers(net, nhg_id, i, true, newi, oldi, extack); } return err; } static int replace_nexthop_single_notify(struct net *net, struct nexthop *group_nh, struct nexthop *old, struct nh_info *oldi, struct nh_info *newi, struct netlink_ext_ack *extack) { struct nh_group *nhg = rtnl_dereference(group_nh->nh_grp); struct nh_res_table *res_table; if (nhg->hash_threshold) { return call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, group_nh, extack); } else if (nhg->resilient) { 
res_table = rtnl_dereference(nhg->res_table); return replace_nexthop_single_notify_res(net, res_table, old, oldi, newi, extack); } return -EINVAL; } static int replace_nexthop_single(struct net *net, struct nexthop *old, struct nexthop *new, struct netlink_ext_ack *extack) { u8 old_protocol, old_nh_flags; struct nh_info *oldi, *newi; struct nh_grp_entry *nhge; int err; if (new->is_group) { NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group."); return -EINVAL; } err = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new, extack); if (err) return err; /* Hardware flags were set on 'old' as 'new' is not in the red-black * tree. Therefore, inherit the flags from 'old' to 'new'. */ new->nh_flags |= old->nh_flags & (RTNH_F_OFFLOAD | RTNH_F_TRAP); oldi = rtnl_dereference(old->nh_info); newi = rtnl_dereference(new->nh_info); newi->nh_parent = old; oldi->nh_parent = new; old_protocol = old->protocol; old_nh_flags = old->nh_flags; old->protocol = new->protocol; old->nh_flags = new->nh_flags; rcu_assign_pointer(old->nh_info, newi); rcu_assign_pointer(new->nh_info, oldi); /* Send a replace notification for all the groups using the nexthop. */ list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; err = replace_nexthop_single_notify(net, nhp, old, oldi, newi, extack); if (err) goto err_notify; } /* When replacing an IPv4 nexthop with an IPv6 nexthop, potentially * update IPv4 indication in all the groups using the nexthop. */ if (oldi->family == AF_INET && newi->family == AF_INET6) { list_for_each_entry(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; struct nh_group *nhg; nhg = rtnl_dereference(nhp->nh_grp); nh_group_v4_update(nhg); } } return 0; err_notify: rcu_assign_pointer(new->nh_info, newi); rcu_assign_pointer(old->nh_info, oldi); old->nh_flags = old_nh_flags; old->protocol = old_protocol; oldi->nh_parent = old; newi->nh_parent = new; list_for_each_entry_continue_reverse(nhge, &old->grp_list, nh_list) { struct nexthop *nhp = nhge->nh_parent; replace_nexthop_single_notify(net, nhp, old, newi, oldi, NULL); } call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, old, extack); return err; } static void __nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct fib6_info *f6i; if (!list_empty(&nh->fi_list)) { struct fib_info *fi; /* expectation is a few fib_info per nexthop and then * a lot of routes per fib_info. 
So mark the fib_info * and then walk the fib tables once */ list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = true; fib_info_notify_update(net, info); list_for_each_entry(fi, &nh->fi_list, nh_list) fi->nh_updated = false; } list_for_each_entry(f6i, &nh->f6i_list, nh_list) ipv6_stub->fib6_rt_update(net, f6i, info); } /* send RTM_NEWROUTE with REPLACE flag set for all FIB entries * linked to this nexthop and for all groups that the nexthop * is a member of */ static void nexthop_replace_notify(struct net *net, struct nexthop *nh, struct nl_info *info) { struct nh_grp_entry *nhge; __nexthop_replace_notify(net, nh, info); list_for_each_entry(nhge, &nh->grp_list, nh_list) __nexthop_replace_notify(net, nhge->nh_parent, info); } static int replace_nexthop(struct net *net, struct nexthop *old, struct nexthop *new, const struct nh_config *cfg, struct netlink_ext_ack *extack) { bool new_is_reject = false; struct nh_grp_entry *nhge; int err; /* check that existing FIB entries are ok with the * new nexthop definition */ err = fib_check_nh_list(old, new, extack); if (err) return err; err = fib6_check_nh_list(old, new, extack); if (err) return err; if (!new->is_group) { struct nh_info *nhi = rtnl_dereference(new->nh_info); new_is_reject = nhi->reject_nh; } list_for_each_entry(nhge, &old->grp_list, nh_list) { /* if new nexthop is a blackhole, any groups using this * nexthop cannot have more than 1 path */ if (new_is_reject && nexthop_num_path(nhge->nh_parent) > 1) { NL_SET_ERR_MSG(extack, "Blackhole nexthop can not be a member of a group with more than one path"); return -EINVAL; } err = fib_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; err = fib6_check_nh_list(nhge->nh_parent, new, extack); if (err) return err; } if (old->is_group) err = replace_nexthop_grp(net, old, new, cfg, extack); else err = replace_nexthop_single(net, old, new, extack); if (!err) { nh_rt_cache_flush(net, old, new); __remove_nexthop(net, new, NULL); nexthop_put(new); } return err; } /* called with rtnl_lock held */ static int insert_nexthop(struct net *net, struct nexthop *new_nh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct rb_node **pp, *parent = NULL, *next; struct rb_root *root = &net->nexthop.rb_root; bool replace = !!(cfg->nlflags & NLM_F_REPLACE); bool create = !!(cfg->nlflags & NLM_F_CREATE); u32 new_id = new_nh->id; int replace_notify = 0; int rc = -EEXIST; pp = &root->rb_node; while (1) { struct nexthop *nh; next = *pp; if (!next) break; parent = next; nh = rb_entry(parent, struct nexthop, rb_node); if (new_id < nh->id) { pp = &next->rb_left; } else if (new_id > nh->id) { pp = &next->rb_right; } else if (replace) { rc = replace_nexthop(net, nh, new_nh, cfg, extack); if (!rc) { new_nh = nh; /* send notification with old nh */ replace_notify = 1; } goto out; } else { /* id already exists and not a replace */ goto out; } } if (replace && !create) { NL_SET_ERR_MSG(extack, "Replace specified without create and no entry exists"); rc = -ENOENT; goto out; } if (new_nh->is_group) { struct nh_group *nhg = rtnl_dereference(new_nh->nh_grp); struct nh_res_table *res_table; if (nhg->resilient) { res_table = rtnl_dereference(nhg->res_table); /* Not passing the number of buckets is OK when * replacing, but not when creating a new group. 
*/ if (!cfg->nh_grp_res_has_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets not specified for nexthop group insertion"); rc = -EINVAL; goto out; } nh_res_group_rebalance(nhg, res_table); /* Do not send bucket notifications, we do full * notification below. */ nh_res_table_upkeep(res_table, false, false); } } rb_link_node_rcu(&new_nh->rb_node, parent, pp); rb_insert_color(&new_nh->rb_node, root); /* The initial insertion is a full notification for hash-threshold as * well as resilient groups. */ rc = call_nexthop_notifiers(net, NEXTHOP_EVENT_REPLACE, new_nh, extack); if (rc) rb_erase(&new_nh->rb_node, &net->nexthop.rb_root); out: if (!rc) { nh_base_seq_inc(net); nexthop_notify(RTM_NEWNEXTHOP, new_nh, &cfg->nlinfo); if (replace_notify && READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode)) nexthop_replace_notify(net, new_nh, &cfg->nlinfo); } return rc; } /* rtnl */ /* remove all nexthops tied to a device being deleted */ static void nexthop_flush_dev(struct net_device *dev, unsigned long event) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev != dev) continue; if (nhi->reject_nh && (event == NETDEV_DOWN || event == NETDEV_CHANGE)) continue; remove_nexthop(net, nhi->nh_parent, NULL); } } /* rtnl; called when net namespace is deleted */ static void flush_all_nexthops(struct net *net) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; struct nexthop *nh; while ((node = rb_first(root))) { nh = rb_entry(node, struct nexthop, rb_node); remove_nexthop(net, nh, NULL); cond_resched(); } } static struct nexthop *nexthop_create_group(struct net *net, struct nh_config *cfg) { struct nlattr *grps_attr = cfg->nh_grp; struct nexthop_grp *entry = nla_data(grps_attr); u16 num_nh = nla_len(grps_attr) / sizeof(*entry); struct nh_group *nhg; struct nexthop *nh; int err; int i; if (WARN_ON(!num_nh)) return ERR_PTR(-EINVAL); nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nh->is_group = 1; nhg = nexthop_grp_alloc(num_nh); if (!nhg) { kfree(nh); return ERR_PTR(-ENOMEM); } /* spare group used for removals */ nhg->spare = nexthop_grp_alloc(num_nh); if (!nhg->spare) { kfree(nhg); kfree(nh); return ERR_PTR(-ENOMEM); } nhg->spare->spare = nhg; for (i = 0; i < nhg->num_nh; ++i) { struct nexthop *nhe; struct nh_info *nhi; nhe = nexthop_find_by_id(net, entry[i].id); if (!nexthop_get(nhe)) { err = -ENOENT; goto out_no_nh; } nhi = rtnl_dereference(nhe->nh_info); if (nhi->family == AF_INET) nhg->has_v4 = true; nhg->nh_entries[i].stats = netdev_alloc_pcpu_stats(struct nh_grp_entry_stats); if (!nhg->nh_entries[i].stats) { err = -ENOMEM; nexthop_put(nhe); goto out_no_nh; } nhg->nh_entries[i].nh = nhe; nhg->nh_entries[i].weight = entry[i].weight + 1; list_add(&nhg->nh_entries[i].nh_list, &nhe->grp_list); nhg->nh_entries[i].nh_parent = nh; } if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) { nhg->hash_threshold = 1; nhg->is_multipath = true; } else if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) { struct nh_res_table *res_table; res_table = nexthop_res_table_alloc(net, cfg->nh_id, cfg); if (!res_table) { err = -ENOMEM; goto out_no_nh; } rcu_assign_pointer(nhg->spare->res_table, res_table); rcu_assign_pointer(nhg->res_table, res_table); nhg->resilient = true; nhg->is_multipath = true; } WARN_ON_ONCE(nhg->hash_threshold + nhg->resilient != 1); if (nhg->hash_threshold) nh_hthr_group_rebalance(nhg); 
if (cfg->nh_fdb) nhg->fdb_nh = 1; if (cfg->nh_hw_stats) nhg->hw_stats = true; rcu_assign_pointer(nh->nh_grp, nhg); return nh; out_no_nh: for (i--; i >= 0; --i) { list_del(&nhg->nh_entries[i].nh_list); free_percpu(nhg->nh_entries[i].stats); nexthop_put(nhg->nh_entries[i].nh); } kfree(nhg->spare); kfree(nhg); kfree(nh); return ERR_PTR(err); } static int nh_create_ipv4(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib_nh *fib_nh = &nhi->fib_nh; struct fib_config fib_cfg = { .fc_oif = cfg->nh_ifindex, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; u32 tb_id = (cfg->dev ? l3mdev_fib_table(cfg->dev) : RT_TABLE_MAIN); int err; err = fib_nh_init(net, fib_nh, &fib_cfg, 1, extack); if (err) { fib_nh_release(net, fib_nh); goto out; } if (nhi->fdb_nh) goto out; /* sets nh_dev if successful */ err = fib_check_nh(net, fib_nh, tb_id, 0, extack); if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } out: return err; } static int nh_create_ipv6(struct net *net, struct nexthop *nh, struct nh_info *nhi, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct fib6_nh *fib6_nh = &nhi->fib6_nh; struct fib6_config fib6_cfg = { .fc_table = l3mdev_fib_table(cfg->dev), .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, }; int err; if (!ipv6_addr_any(&cfg->gw.ipv6)) fib6_cfg.fc_flags |= RTF_GATEWAY; /* sets nh_dev if successful */ err = ipv6_stub->fib6_nh_init(net, fib6_nh, &fib6_cfg, GFP_KERNEL, extack); if (err) { /* IPv6 is not enabled, don't call fib6_nh_release */ if (err == -EAFNOSUPPORT) goto out; ipv6_stub->fib6_nh_release(fib6_nh); } else { nh->nh_flags = fib6_nh->fib_nh_flags; } out: return err; } static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nh_info *nhi; struct nexthop *nh; int err = 0; nh = nexthop_alloc(); if (!nh) return ERR_PTR(-ENOMEM); nhi = kzalloc(sizeof(*nhi), GFP_KERNEL); if (!nhi) { kfree(nh); return ERR_PTR(-ENOMEM); } nh->nh_flags = cfg->nh_flags; nh->net = net; nhi->nh_parent = nh; nhi->family = cfg->nh_family; nhi->fib_nhc.nhc_scope = RT_SCOPE_LINK; if (cfg->nh_fdb) nhi->fdb_nh = 1; if (cfg->nh_blackhole) { nhi->reject_nh = 1; cfg->nh_ifindex = net->loopback_dev->ifindex; } switch (cfg->nh_family) { case AF_INET: err = nh_create_ipv4(net, nh, nhi, cfg, extack); break; case AF_INET6: err = nh_create_ipv6(net, nh, nhi, cfg, extack); break; } if (err) { kfree(nhi); kfree(nh); return ERR_PTR(err); } /* add the entry to the device based hash */ if (!nhi->fdb_nh) nexthop_devhash_add(net, nhi); rcu_assign_pointer(nh->nh_info, nhi); return nh; } /* called with rtnl lock held */ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nexthop *nh; int err; if (cfg->nlflags & NLM_F_REPLACE && !cfg->nh_id) { NL_SET_ERR_MSG(extack, "Replace requires nexthop id"); return ERR_PTR(-EINVAL); } if (!cfg->nh_id) { cfg->nh_id = nh_find_unused_id(net); if (!cfg->nh_id) { NL_SET_ERR_MSG(extack, "No unused id"); return ERR_PTR(-EINVAL); } } if (cfg->nh_grp) nh = 
nexthop_create_group(net, cfg); else nh = nexthop_create(net, cfg, extack); if (IS_ERR(nh)) return nh; refcount_set(&nh->refcnt, 1); nh->id = cfg->nh_id; nh->protocol = cfg->nh_protocol; nh->net = net; err = insert_nexthop(net, nh, cfg, extack); if (err) { __remove_nexthop(net, nh, NULL); nexthop_put(nh); nh = ERR_PTR(err); } return nh; } static int rtm_nh_get_timer(struct nlattr *attr, unsigned long fallback, unsigned long *timer_p, bool *has_p, struct netlink_ext_ack *extack) { unsigned long timer; u32 value; if (!attr) { *timer_p = fallback; *has_p = false; return 0; } value = nla_get_u32(attr); timer = clock_t_to_jiffies(value); if (timer == ~0UL) { NL_SET_ERR_MSG(extack, "Timer value too large"); return -EINVAL; } *timer_p = timer; *has_p = true; return 0; } static int rtm_to_nh_config_grp_res(struct nlattr *res, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_policy_new)] = {}; int err; if (res) { err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_policy_new) - 1, res, rtm_nh_res_policy_new, extack); if (err < 0) return err; } if (tb[NHA_RES_GROUP_BUCKETS]) { cfg->nh_grp_res_num_buckets = nla_get_u16(tb[NHA_RES_GROUP_BUCKETS]); cfg->nh_grp_res_has_num_buckets = true; if (!cfg->nh_grp_res_num_buckets) { NL_SET_ERR_MSG(extack, "Number of buckets needs to be non-0"); return -EINVAL; } } err = rtm_nh_get_timer(tb[NHA_RES_GROUP_IDLE_TIMER], NH_RES_DEFAULT_IDLE_TIMER, &cfg->nh_grp_res_idle_timer, &cfg->nh_grp_res_has_idle_timer, extack); if (err) return err; return rtm_nh_get_timer(tb[NHA_RES_GROUP_UNBALANCED_TIMER], NH_RES_DEFAULT_UNBALANCED_TIMER, &cfg->nh_grp_res_unbalanced_timer, &cfg->nh_grp_res_has_unbalanced_timer, extack); } static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, struct nlmsghdr *nlh, struct nh_config *cfg, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_new)]; int err; err = nlmsg_parse(nlh, sizeof(*nhm), tb, ARRAY_SIZE(rtm_nh_policy_new) - 1, rtm_nh_policy_new, extack); if (err < 0) return err; err = -EINVAL; if (nhm->resvd || nhm->nh_scope) { NL_SET_ERR_MSG(extack, "Invalid values in ancillary header"); goto out; } if (nhm->nh_flags & ~NEXTHOP_VALID_USER_FLAGS) { NL_SET_ERR_MSG(extack, "Invalid nexthop flags in ancillary header"); goto out; } switch (nhm->nh_family) { case AF_INET: case AF_INET6: break; case AF_UNSPEC: if (tb[NHA_GROUP]) break; fallthrough; default: NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } memset(cfg, 0, sizeof(*cfg)); cfg->nlflags = nlh->nlmsg_flags; cfg->nlinfo.portid = NETLINK_CB(skb).portid; cfg->nlinfo.nlh = nlh; cfg->nlinfo.nl_net = net; cfg->nh_family = nhm->nh_family; cfg->nh_protocol = nhm->nh_protocol; cfg->nh_flags = nhm->nh_flags; if (tb[NHA_ID]) cfg->nh_id = nla_get_u32(tb[NHA_ID]); if (tb[NHA_FDB]) { if (tb[NHA_OIF] || tb[NHA_BLACKHOLE] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "Fdb attribute can not be used with encap, oif or blackhole"); goto out; } if (nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Unsupported nexthop flags in ancillary header"); goto out; } cfg->nh_fdb = nla_get_flag(tb[NHA_FDB]); } if (tb[NHA_GROUP]) { if (nhm->nh_family != AF_UNSPEC) { NL_SET_ERR_MSG(extack, "Invalid family for group"); goto out; } cfg->nh_grp = tb[NHA_GROUP]; cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH; if (tb[NHA_GROUP_TYPE]) cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]); if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) { NL_SET_ERR_MSG(extack, "Invalid group type"); goto out; } err 
= nh_check_attr_group(net, tb, ARRAY_SIZE(tb), cfg->nh_grp_type, extack); if (err) goto out; if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_RES) err = rtm_to_nh_config_grp_res(tb[NHA_RES_GROUP], cfg, extack); if (tb[NHA_HW_STATS_ENABLE]) cfg->nh_hw_stats = nla_get_u32(tb[NHA_HW_STATS_ENABLE]); /* no other attributes should be set */ goto out; } if (tb[NHA_BLACKHOLE]) { if (tb[NHA_GATEWAY] || tb[NHA_OIF] || tb[NHA_ENCAP] || tb[NHA_ENCAP_TYPE] || tb[NHA_FDB]) { NL_SET_ERR_MSG(extack, "Blackhole attribute can not be used with gateway, oif, encap or fdb"); goto out; } cfg->nh_blackhole = 1; err = 0; goto out; } if (!cfg->nh_fdb && !tb[NHA_OIF]) { NL_SET_ERR_MSG(extack, "Device attribute required for non-blackhole and non-fdb nexthops"); goto out; } if (!cfg->nh_fdb && tb[NHA_OIF]) { cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]); if (cfg->nh_ifindex) cfg->dev = __dev_get_by_index(net, cfg->nh_ifindex); if (!cfg->dev) { NL_SET_ERR_MSG(extack, "Invalid device index"); goto out; } else if (!(cfg->dev->flags & IFF_UP)) { NL_SET_ERR_MSG(extack, "Nexthop device is not up"); err = -ENETDOWN; goto out; } else if (!netif_carrier_ok(cfg->dev)) { NL_SET_ERR_MSG(extack, "Carrier for nexthop device is down"); err = -ENETDOWN; goto out; } } err = -EINVAL; if (tb[NHA_GATEWAY]) { struct nlattr *gwa = tb[NHA_GATEWAY]; switch (cfg->nh_family) { case AF_INET: if (nla_len(gwa) != sizeof(u32)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv4 = nla_get_be32(gwa); break; case AF_INET6: if (nla_len(gwa) != sizeof(struct in6_addr)) { NL_SET_ERR_MSG(extack, "Invalid gateway"); goto out; } cfg->gw.ipv6 = nla_get_in6_addr(gwa); break; default: NL_SET_ERR_MSG(extack, "Unknown address family for gateway"); goto out; } } else { /* device only nexthop (no gateway) */ if (cfg->nh_flags & RTNH_F_ONLINK) { NL_SET_ERR_MSG(extack, "ONLINK flag can not be set for nexthop without a gateway"); goto out; } } if (tb[NHA_ENCAP]) { cfg->nh_encap = tb[NHA_ENCAP]; if (!tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation type is missing"); goto out; } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; } else if (tb[NHA_ENCAP_TYPE]) { NL_SET_ERR_MSG(extack, "LWT encapsulation attribute is missing"); goto out; } if (tb[NHA_HW_STATS_ENABLE]) { NL_SET_ERR_MSG(extack, "Cannot enable nexthop hardware statistics for non-group nexthops"); goto out; } err = 0; out: return err; } /* rtnl */ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nh_config cfg; struct nexthop *nh; int err; err = rtm_to_nh_config(net, skb, nlh, &cfg, extack); if (!err) { nh = nexthop_add(net, &cfg, extack); if (IS_ERR(nh)) err = PTR_ERR(nh); } return err; } static int nh_valid_get_del_req(const struct nlmsghdr *nlh, struct nlattr **tb, u32 *id, u32 *op_flags, struct netlink_ext_ack *extack) { struct nhmsg *nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header"); return -EINVAL; } if (!tb[NHA_ID]) { NL_SET_ERR_MSG(extack, "Nexthop id is missing"); return -EINVAL; } *id = nla_get_u32(tb[NHA_ID]); if (!(*id)) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } if (op_flags) { if (tb[NHA_OP_FLAGS]) *op_flags = nla_get_u32(tb[NHA_OP_FLAGS]); else *op_flags = 0; } return 0; } /* rtnl */ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh, struct 
netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_del)]; struct net *net = sock_net(skb->sk); struct nl_info nlinfo = { .nlh = nlh, .nl_net = net, .portid = NETLINK_CB(skb).portid, }; struct nexthop *nh; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_del) - 1, rtm_nh_policy_del, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, NULL, extack); if (err) return err; nh = nexthop_find_by_id(net, id); if (!nh) return -ENOENT; remove_nexthop(net, nh, &nlinfo); return 0; } /* rtnl */ static int rtm_get_nexthop(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get)]; struct net *net = sock_net(in_skb->sk); struct sk_buff *skb = NULL; struct nexthop *nh; u32 op_flags; int err; u32 id; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get) - 1, rtm_nh_policy_get, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, &id, &op_flags, extack); if (err) return err; err = -ENOBUFS; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) goto out; err = -ENOENT; nh = nexthop_find_by_id(net, id); if (!nh) goto errout_free; err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, op_flags); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); out: return err; errout_free: kfree_skb(skb); goto out; } struct nh_dump_filter { u32 nh_id; int dev_idx; int master_idx; bool group_filter; bool fdb_filter; u32 res_bucket_nh_id; u32 op_flags; }; static bool nh_dump_filtered(struct nexthop *nh, struct nh_dump_filter *filter, u8 family) { const struct net_device *dev; const struct nh_info *nhi; if (filter->group_filter && !nh->is_group) return true; if (!filter->dev_idx && !filter->master_idx && !family) return false; if (nh->is_group) return true; nhi = rtnl_dereference(nh->nh_info); if (family && nhi->family != family) return true; dev = nhi->fib_nhc.nhc_dev; if (filter->dev_idx && (!dev || dev->ifindex != filter->dev_idx)) return true; if (filter->master_idx) { struct net_device *master; if (!dev) return true; master = netdev_master_upper_dev_get((struct net_device *)dev); if (!master || master->ifindex != filter->master_idx) return true; } return false; } static int __nh_valid_dump_req(const struct nlmsghdr *nlh, struct nlattr **tb, struct nh_dump_filter *filter, struct netlink_ext_ack *extack) { struct nhmsg *nhm; u32 idx; if (tb[NHA_OIF]) { idx = nla_get_u32(tb[NHA_OIF]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid device index"); return -EINVAL; } filter->dev_idx = idx; } if (tb[NHA_MASTER]) { idx = nla_get_u32(tb[NHA_MASTER]); if (idx > INT_MAX) { NL_SET_ERR_MSG(extack, "Invalid master device index"); return -EINVAL; } filter->master_idx = idx; } filter->group_filter = nla_get_flag(tb[NHA_GROUPS]); filter->fdb_filter = nla_get_flag(tb[NHA_FDB]); nhm = nlmsg_data(nlh); if (nhm->nh_protocol || nhm->resvd || nhm->nh_scope || nhm->nh_flags) { NL_SET_ERR_MSG(extack, "Invalid values in header for nexthop dump request"); return -EINVAL; } return 0; } static int nh_valid_dump_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump) - 1, rtm_nh_policy_dump, cb->extack); if (err < 0) return err; if (tb[NHA_OP_FLAGS]) filter->op_flags = 
nla_get_u32(tb[NHA_OP_FLAGS]); else filter->op_flags = 0; return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_nh_ctx { u32 idx; }; static struct rtm_dump_nh_ctx * rtm_dump_nh_ctx(struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } static int rtm_dump_walk_nexthops(struct sk_buff *skb, struct netlink_callback *cb, struct rb_root *root, struct rtm_dump_nh_ctx *ctx, int (*nh_cb)(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data), void *data) { struct rb_node *node; int s_idx; int err; s_idx = ctx->idx; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); if (nh->id < s_idx) continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); if (err) return err; } return 0; } static int rtm_dump_nexthop_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_dump_filter *filter = data; if (nh_dump_filtered(nh, filter, nhm->nh_family)) return 0; return nh_fill_node(skb, nh, RTM_NEWNEXTHOP, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, filter->op_flags); } /* rtnl */ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_nh_ctx *ctx = rtm_dump_nh_ctx(cb); struct net *net = sock_net(skb->sk); struct rb_root *root = &net->nexthop.rb_root; struct nh_dump_filter filter = {}; int err; err = nh_valid_dump_req(cb->nlh, &filter, cb); if (err < 0) return err; err = rtm_dump_walk_nexthops(skb, cb, root, ctx, &rtm_dump_nexthop_cb, &filter); cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static struct nexthop * nexthop_find_group_resilient(struct net *net, u32 id, struct netlink_ext_ack *extack) { struct nh_group *nhg; struct nexthop *nh; nh = nexthop_find_by_id(net, id); if (!nh) return ERR_PTR(-ENOENT); if (!nh->is_group) { NL_SET_ERR_MSG(extack, "Not a nexthop group"); return ERR_PTR(-EINVAL); } nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) { NL_SET_ERR_MSG(extack, "Nexthop group not of type resilient"); return ERR_PTR(-EINVAL); } return nh; } static int nh_valid_dump_nhid(struct nlattr *attr, u32 *nh_id_p, struct netlink_ext_ack *extack) { u32 idx; if (attr) { idx = nla_get_u32(attr); if (!idx) { NL_SET_ERR_MSG(extack, "Invalid nexthop id"); return -EINVAL; } *nh_id_p = idx; } else { *nh_id_p = 0; } return 0; } static int nh_valid_dump_bucket_req(const struct nlmsghdr *nlh, struct nh_dump_filter *filter, struct netlink_callback *cb) { struct nlattr *res_tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_dump)]; struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_dump_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_dump_bucket) - 1, rtm_nh_policy_dump_bucket, NULL); if (err < 0) return err; err = nh_valid_dump_nhid(tb[NHA_ID], &filter->nh_id, cb->extack); if (err) return err; if (tb[NHA_RES_BUCKET]) { size_t max = ARRAY_SIZE(rtm_nh_res_bucket_policy_dump) - 1; err = nla_parse_nested(res_tb, max, tb[NHA_RES_BUCKET], rtm_nh_res_bucket_policy_dump, cb->extack); if (err < 0) return err; err = nh_valid_dump_nhid(res_tb[NHA_RES_BUCKET_NH_ID], &filter->res_bucket_nh_id, cb->extack); if (err) return err; } return __nh_valid_dump_req(nlh, tb, filter, cb->extack); } struct rtm_dump_res_bucket_ctx { struct rtm_dump_nh_ctx nh; u16 bucket_index; }; static struct rtm_dump_res_bucket_ctx * 
rtm_dump_res_bucket_ctx(struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = (void *)cb->ctx; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); return ctx; } struct rtm_dump_nexthop_bucket_data { struct rtm_dump_res_bucket_ctx *ctx; struct nh_dump_filter filter; }; static int rtm_dump_nexthop_bucket_nh(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, struct rtm_dump_nexthop_bucket_data *dd) { u32 portid = NETLINK_CB(cb->skb).portid; struct nhmsg *nhm = nlmsg_data(cb->nlh); struct nh_res_table *res_table; struct nh_group *nhg; u16 bucket_index; int err; nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); for (bucket_index = dd->ctx->bucket_index; bucket_index < res_table->num_nh_buckets; bucket_index++) { struct nh_res_bucket *bucket; struct nh_grp_entry *nhge; bucket = &res_table->nh_buckets[bucket_index]; nhge = rtnl_dereference(bucket->nh_entry); if (nh_dump_filtered(nhge->nh, &dd->filter, nhm->nh_family)) continue; if (dd->filter.res_bucket_nh_id && dd->filter.res_bucket_nh_id != nhge->nh->id) continue; dd->ctx->bucket_index = bucket_index; err = nh_fill_res_bucket(skb, nh, bucket, bucket_index, RTM_NEWNEXTHOPBUCKET, portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->extack); if (err) return err; } dd->ctx->bucket_index = 0; return 0; } static int rtm_dump_nexthop_bucket_cb(struct sk_buff *skb, struct netlink_callback *cb, struct nexthop *nh, void *data) { struct rtm_dump_nexthop_bucket_data *dd = data; struct nh_group *nhg; if (!nh->is_group) return 0; nhg = rtnl_dereference(nh->nh_grp); if (!nhg->resilient) return 0; return rtm_dump_nexthop_bucket_nh(skb, cb, nh, dd); } /* rtnl */ static int rtm_dump_nexthop_bucket(struct sk_buff *skb, struct netlink_callback *cb) { struct rtm_dump_res_bucket_ctx *ctx = rtm_dump_res_bucket_ctx(cb); struct rtm_dump_nexthop_bucket_data dd = { .ctx = ctx }; struct net *net = sock_net(skb->sk); struct nexthop *nh; int err; err = nh_valid_dump_bucket_req(cb->nlh, &dd.filter, cb); if (err) return err; if (dd.filter.nh_id) { nh = nexthop_find_group_resilient(net, dd.filter.nh_id, cb->extack); if (IS_ERR(nh)) return PTR_ERR(nh); err = rtm_dump_nexthop_bucket_nh(skb, cb, nh, &dd); } else { struct rb_root *root = &net->nexthop.rb_root; err = rtm_dump_walk_nexthops(skb, cb, root, &ctx->nh, &rtm_dump_nexthop_bucket_cb, &dd); } cb->seq = net->nexthop.seq; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); return err; } static int nh_valid_get_bucket_req_res_bucket(struct nlattr *res, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_res_bucket_policy_get)]; int err; err = nla_parse_nested(tb, ARRAY_SIZE(rtm_nh_res_bucket_policy_get) - 1, res, rtm_nh_res_bucket_policy_get, extack); if (err < 0) return err; if (!tb[NHA_RES_BUCKET_INDEX]) { NL_SET_ERR_MSG(extack, "Bucket index is missing"); return -EINVAL; } *bucket_index = nla_get_u16(tb[NHA_RES_BUCKET_INDEX]); return 0; } static int nh_valid_get_bucket_req(const struct nlmsghdr *nlh, u32 *id, u16 *bucket_index, struct netlink_ext_ack *extack) { struct nlattr *tb[ARRAY_SIZE(rtm_nh_policy_get_bucket)]; int err; err = nlmsg_parse(nlh, sizeof(struct nhmsg), tb, ARRAY_SIZE(rtm_nh_policy_get_bucket) - 1, rtm_nh_policy_get_bucket, extack); if (err < 0) return err; err = nh_valid_get_del_req(nlh, tb, id, NULL, extack); if (err) return err; if (!tb[NHA_RES_BUCKET]) { NL_SET_ERR_MSG(extack, "Bucket information is missing"); return -EINVAL; } err = nh_valid_get_bucket_req_res_bucket(tb[NHA_RES_BUCKET], bucket_index, extack); if (err) 
return err; return 0; } /* rtnl */ static int rtm_get_nexthop_bucket(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nh_res_table *res_table; struct sk_buff *skb = NULL; struct nh_group *nhg; struct nexthop *nh; u16 bucket_index; int err; u32 id; err = nh_valid_get_bucket_req(nlh, &id, &bucket_index, extack); if (err) return err; nh = nexthop_find_group_resilient(net, id, extack); if (IS_ERR(nh)) return PTR_ERR(nh); nhg = rtnl_dereference(nh->nh_grp); res_table = rtnl_dereference(nhg->res_table); if (bucket_index >= res_table->num_nh_buckets) { NL_SET_ERR_MSG(extack, "Bucket index out of bounds"); return -ENOENT; } skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; err = nh_fill_res_bucket(skb, nh, &res_table->nh_buckets[bucket_index], bucket_index, RTM_NEWNEXTHOPBUCKET, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, 0, extack); if (err < 0) { WARN_ON(err == -EMSGSIZE); goto errout_free; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); errout_free: kfree_skb(skb); return err; } static void nexthop_sync_mtu(struct net_device *dev, u32 orig_mtu) { unsigned int hash = nh_dev_hashfn(dev->ifindex); struct net *net = dev_net(dev); struct hlist_head *head = &net->nexthop.devhash[hash]; struct hlist_node *n; struct nh_info *nhi; hlist_for_each_entry_safe(nhi, n, head, dev_hash) { if (nhi->fib_nhc.nhc_dev == dev) { if (nhi->family == AF_INET) fib_nhc_update_mtu(&nhi->fib_nhc, dev->mtu, orig_mtu); } } } /* rtnl */ static int nh_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct netdev_notifier_info_ext *info_ext; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: nexthop_flush_dev(dev, event); break; case NETDEV_CHANGE: if (!(dev_get_flags(dev) & (IFF_RUNNING | IFF_LOWER_UP))) nexthop_flush_dev(dev, event); break; case NETDEV_CHANGEMTU: info_ext = ptr; nexthop_sync_mtu(dev, info_ext->ext.mtu); rt_cache_flush(dev_net(dev)); break; } return NOTIFY_DONE; } static struct notifier_block nh_netdev_notifier = { .notifier_call = nh_netdev_event, }; static int nexthops_dump(struct net *net, struct notifier_block *nb, enum nexthop_event_type event_type, struct netlink_ext_ack *extack) { struct rb_root *root = &net->nexthop.rb_root; struct rb_node *node; int err = 0; for (node = rb_first(root); node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); err = call_nexthop_notifier(nb, net, event_type, nh, extack); if (err) break; } return err; } int register_nexthop_notifier(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { int err; rtnl_lock(); err = nexthops_dump(net, nb, NEXTHOP_EVENT_REPLACE, extack); if (err) goto unlock; err = blocking_notifier_chain_register(&net->nexthop.notifier_chain, nb); unlock: rtnl_unlock(); return err; } EXPORT_SYMBOL(register_nexthop_notifier); int __unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; err = blocking_notifier_chain_unregister(&net->nexthop.notifier_chain, nb); if (!err) nexthops_dump(net, nb, NEXTHOP_EVENT_DEL, NULL); return err; } EXPORT_SYMBOL(__unregister_nexthop_notifier); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb) { int err; rtnl_lock(); err = __unregister_nexthop_notifier(net, nb); rtnl_unlock(); return err; } EXPORT_SYMBOL(unregister_nexthop_notifier); void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap) 
{ struct nexthop *nexthop; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop) goto out; nexthop->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) nexthop->nh_flags |= RTNH_F_OFFLOAD; if (trap) nexthop->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_set_hw_flags); void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index, bool offload, bool trap) { struct nh_res_table *res_table; struct nh_res_bucket *bucket; struct nexthop *nexthop; struct nh_group *nhg; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; if (bucket_index >= nhg->res_table->num_nh_buckets) goto out; res_table = rcu_dereference(nhg->res_table); bucket = &res_table->nh_buckets[bucket_index]; bucket->nh_flags &= ~(RTNH_F_OFFLOAD | RTNH_F_TRAP); if (offload) bucket->nh_flags |= RTNH_F_OFFLOAD; if (trap) bucket->nh_flags |= RTNH_F_TRAP; out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_bucket_set_hw_flags); void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets, unsigned long *activity) { struct nh_res_table *res_table; struct nexthop *nexthop; struct nh_group *nhg; u16 i; rcu_read_lock(); nexthop = nexthop_find_by_id(net, id); if (!nexthop || !nexthop->is_group) goto out; nhg = rcu_dereference(nexthop->nh_grp); if (!nhg->resilient) goto out; /* Instead of silently ignoring some buckets, demand that the sizes * be the same. */ res_table = rcu_dereference(nhg->res_table); if (num_buckets != res_table->num_nh_buckets) goto out; for (i = 0; i < num_buckets; i++) { if (test_bit(i, activity)) nh_res_bucket_set_busy(&res_table->nh_buckets[i]); } out: rcu_read_unlock(); } EXPORT_SYMBOL(nexthop_res_grp_activity_update); static void __net_exit nexthop_net_exit_batch_rtnl(struct list_head *net_list, struct list_head *dev_to_kill) { struct net *net; ASSERT_RTNL(); list_for_each_entry(net, net_list, exit_list) flush_all_nexthops(net); } static void __net_exit nexthop_net_exit(struct net *net) { kfree(net->nexthop.devhash); net->nexthop.devhash = NULL; } static int __net_init nexthop_net_init(struct net *net) { size_t sz = sizeof(struct hlist_head) * NH_DEV_HASHSIZE; net->nexthop.rb_root = RB_ROOT; net->nexthop.devhash = kzalloc(sz, GFP_KERNEL); if (!net->nexthop.devhash) return -ENOMEM; BLOCKING_INIT_NOTIFIER_HEAD(&net->nexthop.notifier_chain); return 0; } static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, .exit = nexthop_net_exit, .exit_batch_rtnl = nexthop_net_exit_batch_rtnl, }; static int __init nexthop_init(void) { register_pernet_subsys(&nexthop_net_ops); register_netdevice_notifier(&nh_netdev_notifier); rtnl_register(PF_UNSPEC, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELNEXTHOP, rtm_del_nexthop, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOP, rtm_get_nexthop, rtm_dump_nexthop, 0); rtnl_register(PF_INET, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_INET6, RTM_NEWNEXTHOP, rtm_new_nexthop, NULL, 0); rtnl_register(PF_INET6, RTM_GETNEXTHOP, NULL, rtm_dump_nexthop, 0); rtnl_register(PF_UNSPEC, RTM_GETNEXTHOPBUCKET, rtm_get_nexthop_bucket, rtm_dump_nexthop_bucket, 0); return 0; } subsys_initcall(nexthop_init);
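/* Illustrative sketch (not part of nexthop.c): nh_hthr_group_rebalance()
 * above gives every hash-threshold group entry an upper bound of
 * DIV_ROUND_CLOSEST_ULL((u64)w << 31, total) - 1, where w is the running
 * sum of weights; a 31-bit flow hash then selects the first entry whose
 * upper bound is not below the hash, so heavier entries cover a
 * proportionally larger slice of the hash space. The userspace sketch
 * below only mirrors that arithmetic; demo_upper_bound() and the example
 * weights are illustrative, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>

static int64_t demo_upper_bound(uint64_t w, uint64_t total)
{
	/* DIV_ROUND_CLOSEST_ULL(w << 31, total) - 1, written out by hand */
	return (int64_t)(((w << 31) + total / 2) / total) - 1;
}

int main(void)
{
	const uint64_t weight[] = { 1, 2, 5 };	/* assumed example weights */
	uint64_t total = 0, w = 0;
	size_t i;

	for (i = 0; i < 3; i++)
		total += weight[i];

	for (i = 0; i < 3; i++) {
		w += weight[i];
		printf("entry %zu: upper_bound %lld\n",
		       i, (long long)demo_upper_bound(w, total));
	}
	return 0;
}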
// SPDX-License-Identifier: GPL-2.0-only

#include "netlink.h"
#include "common.h"

struct linkinfo_req_info {
	struct ethnl_req_info		base;
};

struct linkinfo_reply_data {
	struct ethnl_reply_data		base;
	struct ethtool_link_ksettings	ksettings;
	struct ethtool_link_settings	*lsettings;
};

#define LINKINFO_REPDATA(__reply_base) \
	container_of(__reply_base, struct linkinfo_reply_data, base)

const struct nla_policy ethnl_linkinfo_get_policy[] = {
	[ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
};

static int linkinfo_prepare_data(const struct ethnl_req_info *req_base,
				 struct ethnl_reply_data *reply_base,
				 const struct genl_info *info)
{
	struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base);
	struct net_device *dev = reply_base->dev;
	int ret;

	data->lsettings = &data->ksettings.base;

	ret = ethnl_ops_begin(dev);
	if (ret < 0)
		return ret;
	ret = __ethtool_get_link_ksettings(dev, &data->ksettings);
	if (ret < 0 && info)
		GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
	ethnl_ops_complete(dev);

	return ret;
}

static int linkinfo_reply_size(const struct ethnl_req_info *req_base,
			       const struct ethnl_reply_data *reply_base)
{
	return nla_total_size(sizeof(u8)) /* LINKINFO_PORT */
		+ nla_total_size(sizeof(u8)) /* LINKINFO_PHYADDR */
		+ nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX */
		+ nla_total_size(sizeof(u8)) /* LINKINFO_TP_MDIX_CTRL */
		+ nla_total_size(sizeof(u8)) /* LINKINFO_TRANSCEIVER */
		+ 0;
}

static int linkinfo_fill_reply(struct sk_buff *skb,
			       const struct ethnl_req_info *req_base,
			       const struct ethnl_reply_data *reply_base)
{
	const struct linkinfo_reply_data *data = LINKINFO_REPDATA(reply_base);

	if (nla_put_u8(skb, ETHTOOL_A_LINKINFO_PORT, data->lsettings->port) ||
	    nla_put_u8(skb, ETHTOOL_A_LINKINFO_PHYADDR,
		       data->lsettings->phy_address) ||
	    nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX,
		       data->lsettings->eth_tp_mdix) ||
	    nla_put_u8(skb, ETHTOOL_A_LINKINFO_TP_MDIX_CTRL,
		       data->lsettings->eth_tp_mdix_ctrl) ||
	    nla_put_u8(skb, ETHTOOL_A_LINKINFO_TRANSCEIVER,
		       data->lsettings->transceiver))
		return -EMSGSIZE;

	return 0;
}

/* LINKINFO_SET */

const struct nla_policy ethnl_linkinfo_set_policy[] = {
	[ETHTOOL_A_LINKINFO_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy),
	[ETHTOOL_A_LINKINFO_PORT]		= { .type = NLA_U8 },
	[ETHTOOL_A_LINKINFO_PHYADDR]		= { .type = NLA_U8 },
	[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL]	= { .type = NLA_U8 },
};

static int ethnl_set_linkinfo_validate(struct ethnl_req_info *req_info,
				       struct genl_info *info)
{
	const struct ethtool_ops *ops = req_info->dev->ethtool_ops;

	if (!ops->get_link_ksettings || !ops->set_link_ksettings)
		return -EOPNOTSUPP;
	return 1;
}

static int ethnl_set_linkinfo(struct ethnl_req_info *req_info,
			      struct genl_info *info)
{
	struct ethtool_link_ksettings ksettings = {};
	struct ethtool_link_settings *lsettings;
	struct net_device *dev = req_info->dev;
	struct nlattr **tb = info->attrs;
	bool mod = false;
	int ret;

	ret = __ethtool_get_link_ksettings(dev, &ksettings);
	if (ret < 0) {
		GENL_SET_ERR_MSG(info, "failed to retrieve link settings");
		return ret;
	}
	lsettings = &ksettings.base;

	ethnl_update_u8(&lsettings->port, tb[ETHTOOL_A_LINKINFO_PORT], &mod);
	ethnl_update_u8(&lsettings->phy_address,
			tb[ETHTOOL_A_LINKINFO_PHYADDR], &mod);
	ethnl_update_u8(&lsettings->eth_tp_mdix_ctrl,
			tb[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL], &mod);
	if (!mod)
		return 0;

	ret = dev->ethtool_ops->set_link_ksettings(dev, &ksettings);
	if (ret < 0) {
		GENL_SET_ERR_MSG(info, "link settings update failed");
		return ret;
	}

	return 1;
}

const struct ethnl_request_ops ethnl_linkinfo_request_ops = {
	.request_cmd		= ETHTOOL_MSG_LINKINFO_GET,
	.reply_cmd		= ETHTOOL_MSG_LINKINFO_GET_REPLY,
	.hdr_attr		= ETHTOOL_A_LINKINFO_HEADER,
	.req_info_size		= sizeof(struct linkinfo_req_info),
	.reply_data_size	= sizeof(struct linkinfo_reply_data),
	.prepare_data		= linkinfo_prepare_data,
	.reply_size		= linkinfo_reply_size,
	.fill_reply		= linkinfo_fill_reply,
	.set_validate		= ethnl_set_linkinfo_validate,
	.set			= ethnl_set_linkinfo,
	.set_ntf_cmd		= ETHTOOL_MSG_LINKINFO_NTF,
};
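/* Illustrative sketch (not part of linkinfo.c): ethnl_set_linkinfo() above
 * only calls set_link_ksettings() when one of the optional u8 attributes
 * actually changed a field, which ethnl_update_u8() tracks via the 'mod'
 * flag. The standalone sketch below models that pattern with a plain
 * pointer standing in for the netlink attribute (NULL == attribute
 * absent); demo_update_u8() is hypothetical, not the ethnl helper itself.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static void demo_update_u8(uint8_t *dst, const uint8_t *attr, bool *mod)
{
	/* attribute absent or value unchanged: leave 'mod' alone */
	if (!attr || *dst == *attr)
		return;
	*dst = *attr;
	*mod = true;
}

int main(void)
{
	uint8_t port = 0;		/* current setting */
	uint8_t requested = 1;		/* value carried by the request */
	bool mod = false;

	demo_update_u8(&port, &requested, &mod);	/* changes port, sets mod */
	demo_update_u8(&port, NULL, &mod);		/* absent attribute: no-op */

	/* Only when 'mod' ends up true would the handler go on to call
	 * ->set_link_ksettings(), mirroring the !mod early return above. */
	printf("port=%u mod=%d\n", port, mod);
	return 0;
}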
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM alarmtimer

#if !defined(_TRACE_ALARMTIMER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ALARMTIMER_H

#include <linux/alarmtimer.h>
#include <linux/rtc.h>
#include <linux/tracepoint.h>

TRACE_DEFINE_ENUM(ALARM_REALTIME);
TRACE_DEFINE_ENUM(ALARM_BOOTTIME);
TRACE_DEFINE_ENUM(ALARM_REALTIME_FREEZER);
TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER);

#define show_alarm_type(type)	__print_flags(type, " | ",	\
	{ 1 << ALARM_REALTIME, "REALTIME" },			\
	{ 1 << ALARM_BOOTTIME, "BOOTTIME" },			\
	{ 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" },	\
	{ 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" })

TRACE_EVENT(alarmtimer_suspend,

	TP_PROTO(ktime_t expires, int flag),

	TP_ARGS(expires, flag),

	TP_STRUCT__entry(
		__field(s64, expires)
		__field(unsigned char, alarm_type)
	),

	TP_fast_assign(
		__entry->expires = expires;
		__entry->alarm_type = flag;
	),

	TP_printk("alarmtimer type:%s expires:%llu",
		  show_alarm_type((1 << __entry->alarm_type)),
		  __entry->expires
	)
);

DECLARE_EVENT_CLASS(alarm_class,

	TP_PROTO(struct alarm *alarm, ktime_t now),

	TP_ARGS(alarm, now),

	TP_STRUCT__entry(
		__field(void *, alarm)
		__field(unsigned char, alarm_type)
		__field(s64, expires)
		__field(s64, now)
	),

	TP_fast_assign(
		__entry->alarm = alarm;
		__entry->alarm_type = alarm->type;
		__entry->expires = alarm->node.expires;
		__entry->now = now;
	),

	TP_printk("alarmtimer:%p type:%s expires:%llu now:%llu",
		  __entry->alarm,
		  show_alarm_type((1 << __entry->alarm_type)),
		  __entry->expires,
		  __entry->now
	)
);

DEFINE_EVENT(alarm_class, alarmtimer_fired,
	TP_PROTO(struct alarm *alarm, ktime_t now),
	TP_ARGS(alarm, now)
);

DEFINE_EVENT(alarm_class, alarmtimer_start,
	TP_PROTO(struct alarm *alarm, ktime_t now),
	TP_ARGS(alarm, now)
);

DEFINE_EVENT(alarm_class, alarmtimer_cancel,
	TP_PROTO(struct alarm *alarm, ktime_t now),
	TP_ARGS(alarm, now)
);

#endif /* _TRACE_ALARMTIMER_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
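/* Illustrative sketch (not from the kernel sources): each TRACE_EVENT and
 * DEFINE_EVENT above expands into a trace_<name>() function that callers
 * invoke at the matching point, after exactly one translation unit defines
 * CREATE_TRACE_POINTS before including the header. A minimal kernel-side
 * sketch, assuming the header sits at its usual trace/events/ include path
 * and with demo_alarm_start() as a hypothetical caller:
 */
#define CREATE_TRACE_POINTS
#include <trace/events/alarmtimer.h>

static void demo_alarm_start(struct alarm *alarm, ktime_t now)
{
	/* Emits the alarmtimer_start event (an instance of alarm_class)
	 * only when that tracepoint is enabled. */
	trace_alarmtimer_start(alarm, now);
}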
// SPDX-License-Identifier: GPL-2.0-or-later
/* Virtio ring implementation.
 *
 *  Copyright 2007 Rusty Russell IBM Corporation
 */
#include <linux/virtio.h>
#include <linux/virtio_ring.h>
#include <linux/virtio_config.h>
#include <linux/device.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/dma-mapping.h>
#include <linux/kmsan.h>
#include <linux/spinlock.h>
#include <xen/xen.h>

#ifdef DEBUG
/* For development, we want to crash whenever the ring is screwed. */
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&(_vq)->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		BUG();						\
	} while (0)
/* Caller is supposed to guarantee no reentry. */
#define START_USE(_vq)						\
	do {							\
		if ((_vq)->in_use)				\
			panic("%s:in_use = %i\n",		\
			      (_vq)->vq.name, (_vq)->in_use);	\
		(_vq)->in_use = __LINE__;			\
	} while (0)
#define END_USE(_vq) \
	do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
#define LAST_ADD_TIME_UPDATE(_vq)				\
	do {							\
		ktime_t now = ktime_get();			\
								\
		/* No kick or get, with .1 second between?  Warn. */ \
		if ((_vq)->last_add_time_valid)			\
			WARN_ON(ktime_to_ms(ktime_sub(now,	\
				(_vq)->last_add_time)) > 100);	\
		(_vq)->last_add_time = now;			\
		(_vq)->last_add_time_valid = true;		\
	} while (0)
#define LAST_ADD_TIME_CHECK(_vq)				\
	do {							\
		if ((_vq)->last_add_time_valid) {		\
			WARN_ON(ktime_to_ms(ktime_sub(ktime_get(), \
				(_vq)->last_add_time)) > 100);	\
		}						\
	} while (0)
#define LAST_ADD_TIME_INVALID(_vq)				\
	((_vq)->last_add_time_valid = false)
#else
#define BAD_RING(_vq, fmt, args...)				\
	do {							\
		dev_err(&_vq->vq.vdev->dev,			\
			"%s:"fmt, (_vq)->vq.name, ##args);	\
		(_vq)->broken = true;				\
	} while (0)
#define START_USE(vq)
#define END_USE(vq)
#define LAST_ADD_TIME_UPDATE(vq)
#define LAST_ADD_TIME_CHECK(vq)
#define LAST_ADD_TIME_INVALID(vq)
#endif

struct vring_desc_state_split {
	void *data;			/* Data for callback. */
	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
};

struct vring_desc_state_packed {
	void *data;			/* Data for callback. */
	struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
	u16 num;			/* Descriptor list length. */
	u16 last;			/* The last desc state in a list. */
};

struct vring_desc_extra {
	dma_addr_t addr;		/* Descriptor DMA addr. */
	u32 len;			/* Descriptor length. */
	u16 flags;			/* Descriptor flags. */
	u16 next;			/* The next desc state in a list. */
};

struct vring_virtqueue_split {
	/* Actual memory layout for this queue.
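	 * For a split ring this is the standard vring triple (descriptor
	 * table, available ring, used ring) as laid out by vring_init().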
*/ struct vring vring; /* Last written value to avail->flags */ u16 avail_flags_shadow; /* * Last written value to avail->idx in * guest byte order. */ u16 avail_idx_shadow; /* Per-descriptor state. */ struct vring_desc_state_split *desc_state; struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t queue_dma_addr; size_t queue_size_in_bytes; /* * The parameters for creating vrings are reserved for creating new * vring. */ u32 vring_align; bool may_reduce_num; }; struct vring_virtqueue_packed { /* Actual memory layout for this queue. */ struct { unsigned int num; struct vring_packed_desc *desc; struct vring_packed_desc_event *driver; struct vring_packed_desc_event *device; } vring; /* Driver ring wrap counter. */ bool avail_wrap_counter; /* Avail used flags. */ u16 avail_used_flags; /* Index of the next avail descriptor. */ u16 next_avail_idx; /* * Last written value to driver->flags in * guest byte order. */ u16 event_flags_shadow; /* Per-descriptor state. */ struct vring_desc_state_packed *desc_state; struct vring_desc_extra *desc_extra; /* DMA address and size information */ dma_addr_t ring_dma_addr; dma_addr_t driver_event_dma_addr; dma_addr_t device_event_dma_addr; size_t ring_size_in_bytes; size_t event_size_in_bytes; }; struct vring_virtqueue { struct virtqueue vq; /* Is this a packed ring? */ bool packed_ring; /* Is DMA API used? */ bool use_dma_api; /* Can we use weak barriers? */ bool weak_barriers; /* Other side has made a mess, don't try any more. */ bool broken; /* Host supports indirect buffers */ bool indirect; /* Host publishes avail event idx */ bool event; /* Do DMA mapping by driver */ bool premapped; /* Do unmap or not for desc. Just when premapped is False and * use_dma_api is true, this is true. */ bool do_unmap; /* Head of free buffer list. */ unsigned int free_head; /* Number we've added since last sync. */ unsigned int num_added; /* Last used index we've seen. * for split ring, it just contains last used index * for packed ring: * bits up to VRING_PACKED_EVENT_F_WRAP_CTR include the last used index. * bits from VRING_PACKED_EVENT_F_WRAP_CTR include the used wrap counter. */ u16 last_used_idx; /* Hint for event idx: already triggered no need to disable. */ bool event_triggered; union { /* Available for split ring */ struct vring_virtqueue_split split; /* Available for packed ring */ struct vring_virtqueue_packed packed; }; /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); /* DMA, allocation, and size information */ bool we_own_ring; /* Device used for doing DMA */ struct device *dma_dev; #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; /* Figure out if their kicks are too delayed. */ bool last_add_time_valid; ktime_t last_add_time; #endif }; static struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, bool weak_barriers, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev); static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num); static void vring_free(struct virtqueue *_vq); /* * Helpers. */ #define to_vvq(_vq) container_of_const(_vq, struct vring_virtqueue, vq) static bool virtqueue_use_indirect(const struct vring_virtqueue *vq, unsigned int total_sg) { /* * If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. 
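 * An indirect table occupies only a single slot in the ring, at the
 * cost of an extra allocation and, when the DMA API is in use, one
 * more mapping per request.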
FIXME: tune this threshold */ return (vq->indirect && total_sg > 1 && vq->vq.num_free); } /* * Modern virtio devices have feature bits to specify whether they need a * quirk and bypass the IOMMU. If not there, just use the DMA API. * * If there, the interaction between virtio and DMA API is messy. * * On most systems with virtio, physical addresses match bus addresses, * and it doesn't particularly matter whether we use the DMA API. * * On some systems, including Xen and any system with a physical device * that speaks virtio behind a physical IOMMU, we must use the DMA API * for virtio DMA to work at all. * * On other systems, including SPARC and PPC64, virtio-pci devices are * enumerated as though they are behind an IOMMU, but the virtio host * ignores the IOMMU, so we must either pretend that the IOMMU isn't * there or somehow map everything as the identity. * * For the time being, we preserve historic behavior and bypass the DMA * API. * * TODO: install a per-device DMA ops structure that does the right thing * taking into account all the above quirks, and use the DMA API * unconditionally on data path. */ static bool vring_use_dma_api(const struct virtio_device *vdev) { if (!virtio_has_dma_quirk(vdev)) return true; /* Otherwise, we are left to guess. */ /* * In theory, it's possible to have a buggy QEMU-supposed * emulated Q35 IOMMU and Xen enabled at the same time. On * such a configuration, virtio has never worked and will * not work without an even larger kludge. Instead, enable * the DMA API if we're a Xen guest, which at least allows * all of the sensible Xen configurations to work correctly. */ if (xen_domain()) return true; return false; } size_t virtio_max_dma_size(const struct virtio_device *vdev) { size_t max_segment_size = SIZE_MAX; if (vring_use_dma_api(vdev)) max_segment_size = dma_max_mapping_size(vdev->dev.parent); return max_segment_size; } EXPORT_SYMBOL_GPL(virtio_max_dma_size); static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, dma_addr_t *dma_handle, gfp_t flag, struct device *dma_dev) { if (vring_use_dma_api(vdev)) { return dma_alloc_coherent(dma_dev, size, dma_handle, flag); } else { void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); if (queue) { phys_addr_t phys_addr = virt_to_phys(queue); *dma_handle = (dma_addr_t)phys_addr; /* * Sanity check: make sure we dind't truncate * the address. The only arches I can find that * have 64-bit phys_addr_t but 32-bit dma_addr_t * are certain non-highmem MIPS and x86 * configurations, but these configurations * should never allocate physical pages above 32 * bits, so this is fine. Just in case, throw a * warning and abort if we end up with an * unrepresentable address. */ if (WARN_ON_ONCE(*dma_handle != phys_addr)) { free_pages_exact(queue, PAGE_ALIGN(size)); return NULL; } } return queue; } } static void vring_free_queue(struct virtio_device *vdev, size_t size, void *queue, dma_addr_t dma_handle, struct device *dma_dev) { if (vring_use_dma_api(vdev)) dma_free_coherent(dma_dev, size, queue, dma_handle); else free_pages_exact(queue, PAGE_ALIGN(size)); } /* * The DMA ops on various arches are rather gnarly right now, and * making all of the arch DMA ops work on the vring device itself * is a mess. */ static struct device *vring_dma_dev(const struct vring_virtqueue *vq) { return vq->dma_dev; } /* Map one sg entry. 
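 * Returns 0 on success with the bus address stored in *addr.  For
 * premapped queues the buffer was already mapped by the caller, so the
 * sg's existing DMA address is used as-is.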
*/ static int vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction, dma_addr_t *addr) { if (vq->premapped) { *addr = sg_dma_address(sg); return 0; } if (!vq->use_dma_api) { /* * If DMA is not used, KMSAN doesn't know that the scatterlist * is initialized by the hardware. Explicitly check/unpoison it * depending on the direction. */ kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); *addr = (dma_addr_t)sg_phys(sg); return 0; } /* * We can't use dma_map_sg, because we don't use scatterlists in * the way it expects (we don't guarantee that the scatterlist * will exist for the lifetime of the mapping). */ *addr = dma_map_page(vring_dma_dev(vq), sg_page(sg), sg->offset, sg->length, direction); if (dma_mapping_error(vring_dma_dev(vq), *addr)) return -ENOMEM; return 0; } static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, void *cpu_addr, size_t size, enum dma_data_direction direction) { if (!vq->use_dma_api) return (dma_addr_t)virt_to_phys(cpu_addr); return dma_map_single(vring_dma_dev(vq), cpu_addr, size, direction); } static int vring_mapping_error(const struct vring_virtqueue *vq, dma_addr_t addr) { if (!vq->use_dma_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); } static void virtqueue_init(struct vring_virtqueue *vq, u32 num) { vq->vq.num_free = num; if (vq->packed_ring) vq->last_used_idx = 0 | (1 << VRING_PACKED_EVENT_F_WRAP_CTR); else vq->last_used_idx = 0; vq->event_triggered = false; vq->num_added = 0; #ifdef DEBUG vq->in_use = false; vq->last_add_time_valid = false; #endif } /* * Split ring specific functions - *_split(). */ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq, const struct vring_desc *desc) { u16 flags; if (!vq->do_unmap) return; flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); dma_unmap_page(vring_dma_dev(vq), virtio64_to_cpu(vq->vq.vdev, desc->addr), virtio32_to_cpu(vq->vq.vdev, desc->len), (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq, unsigned int i) { struct vring_desc_extra *extra = vq->split.desc_extra; u16 flags; flags = extra[i].flags; if (flags & VRING_DESC_F_INDIRECT) { if (!vq->use_dma_api) goto out; dma_unmap_single(vring_dma_dev(vq), extra[i].addr, extra[i].len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { if (!vq->do_unmap) goto out; dma_unmap_page(vring_dma_dev(vq), extra[i].addr, extra[i].len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } out: return extra[i].next; } static struct vring_desc *alloc_indirect_split(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) { struct vring_desc *desc; unsigned int i; /* * We require lowmem mappings for the descriptors because * otherwise virt_to_phys will give us bogus addresses in the * virtqueue. 
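 * That is why __GFP_HIGHMEM is masked off before the allocation below.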
*/ gfp &= ~__GFP_HIGHMEM; desc = kmalloc_array(total_sg, sizeof(struct vring_desc), gfp); if (!desc) return NULL; for (i = 0; i < total_sg; i++) desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1); return desc; } static inline unsigned int virtqueue_add_desc_split(struct virtqueue *vq, struct vring_desc *desc, unsigned int i, dma_addr_t addr, unsigned int len, u16 flags, bool indirect) { struct vring_virtqueue *vring = to_vvq(vq); struct vring_desc_extra *extra = vring->split.desc_extra; u16 next; desc[i].flags = cpu_to_virtio16(vq->vdev, flags); desc[i].addr = cpu_to_virtio64(vq->vdev, addr); desc[i].len = cpu_to_virtio32(vq->vdev, len); if (!indirect) { next = extra[i].next; desc[i].next = cpu_to_virtio16(vq->vdev, next); extra[i].addr = addr; extra[i].len = len; extra[i].flags = flags; } else next = virtio16_to_cpu(vq->vdev, desc[i].next); return next; } static inline int virtqueue_add_split(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, void *ctx, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); struct scatterlist *sg; struct vring_desc *desc; unsigned int i, n, avail, descs_used, prev, err_idx; int head; bool indirect; START_USE(vq); BUG_ON(data == NULL); BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); return -EIO; } LAST_ADD_TIME_UPDATE(vq); BUG_ON(total_sg == 0); head = vq->free_head; if (virtqueue_use_indirect(vq, total_sg)) desc = alloc_indirect_split(_vq, total_sg, gfp); else { desc = NULL; WARN_ON_ONCE(total_sg > vq->split.vring.num && !vq->indirect); } if (desc) { /* Use a single buffer which doesn't continue */ indirect = true; /* Set up rest to use this indirect table. */ i = 0; descs_used = 1; } else { indirect = false; desc = vq->split.vring.desc; i = head; descs_used = total_sg; } if (unlikely(vq->vq.num_free < descs_used)) { pr_debug("Can't add buf len %i - avail = %i\n", descs_used, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ if (out_sgs) vq->notify(&vq->vq); if (indirect) kfree(desc); END_USE(vq); return -ENOSPC; } for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; if (vring_map_one_sg(vq, sg, DMA_TO_DEVICE, &addr)) goto unmap_release; prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, VRING_DESC_F_NEXT, indirect); } } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; if (vring_map_one_sg(vq, sg, DMA_FROM_DEVICE, &addr)) goto unmap_release; prev = i; /* Note that we trust indirect descriptor * table since it use stream DMA mapping. */ i = virtqueue_add_desc_split(_vq, desc, i, addr, sg->length, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE, indirect); } } /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); if (!indirect && vq->do_unmap) vq->split.desc_extra[prev & (vq->split.vring.num - 1)].flags &= ~VRING_DESC_F_NEXT; if (indirect) { /* Now that the indirect table is filled in, map it. 
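 * The table is mapped as a single DMA_TO_DEVICE buffer and hung off
 * the head descriptor with VRING_DESC_F_INDIRECT.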
*/ dma_addr_t addr = vring_map_single( vq, desc, total_sg * sizeof(struct vring_desc), DMA_TO_DEVICE); if (vring_mapping_error(vq, addr)) { if (vq->premapped) goto free_indirect; goto unmap_release; } virtqueue_add_desc_split(_vq, vq->split.vring.desc, head, addr, total_sg * sizeof(struct vring_desc), VRING_DESC_F_INDIRECT, false); } /* We're using some buffers from the free list. */ vq->vq.num_free -= descs_used; /* Update free pointer */ if (indirect) vq->free_head = vq->split.desc_extra[head].next; else vq->free_head = i; /* Store token and indirect buffer state. */ vq->split.desc_state[head].data = data; if (indirect) vq->split.desc_state[head].indir_desc = desc; else vq->split.desc_state[head].indir_desc = ctx; /* Put entry in available array (but don't update avail->idx until they * do sync). */ avail = vq->split.avail_idx_shadow & (vq->split.vring.num - 1); vq->split.vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(vq->weak_barriers); vq->split.avail_idx_shadow++; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); vq->num_added++; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); /* This is very unlikely, but theoretically possible. Kick * just in case. */ if (unlikely(vq->num_added == (1 << 16) - 1)) virtqueue_kick(_vq); return 0; unmap_release: err_idx = i; if (indirect) i = 0; else i = head; for (n = 0; n < total_sg; n++) { if (i == err_idx) break; if (indirect) { vring_unmap_one_split_indirect(vq, &desc[i]); i = virtio16_to_cpu(_vq->vdev, desc[i].next); } else i = vring_unmap_one_split(vq, i); } free_indirect: if (indirect) kfree(desc); END_USE(vq); return -ENOMEM; } static bool virtqueue_kick_prepare_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old; bool needs_kick; START_USE(vq); /* We need to expose available array entries before checking avail * event. */ virtio_mb(vq->weak_barriers); old = vq->split.avail_idx_shadow - vq->num_added; new = vq->split.avail_idx_shadow; vq->num_added = 0; LAST_ADD_TIME_CHECK(vq); LAST_ADD_TIME_INVALID(vq); if (vq->event) { needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->split.vring)), new, old); } else { needs_kick = !(vq->split.vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY)); } END_USE(vq); return needs_kick; } static void detach_buf_split(struct vring_virtqueue *vq, unsigned int head, void **ctx) { unsigned int i, j; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); /* Clear data ptr. */ vq->split.desc_state[head].data = NULL; /* Put back on free list: unmap first-level descriptors and find end */ i = head; while (vq->split.vring.desc[i].flags & nextflag) { vring_unmap_one_split(vq, i); i = vq->split.desc_extra[i].next; vq->vq.num_free++; } vring_unmap_one_split(vq, i); vq->split.desc_extra[i].next = vq->free_head; vq->free_head = head; /* Plus final descriptor */ vq->vq.num_free++; if (vq->indirect) { struct vring_desc *indir_desc = vq->split.desc_state[head].indir_desc; u32 len; /* Free the indirect table, if any, now that it's unmapped. 
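 * The table mapping itself was torn down by vring_unmap_one_split()
 * above; here we unmap the entries it contains (when we own the
 * mappings) and then free the kmalloc'ed array.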
*/ if (!indir_desc) return; len = vq->split.desc_extra[head].len; BUG_ON(!(vq->split.desc_extra[head].flags & VRING_DESC_F_INDIRECT)); BUG_ON(len == 0 || len % sizeof(struct vring_desc)); if (vq->do_unmap) { for (j = 0; j < len / sizeof(struct vring_desc); j++) vring_unmap_one_split_indirect(vq, &indir_desc[j]); } kfree(indir_desc); vq->split.desc_state[head].indir_desc = NULL; } else if (ctx) { *ctx = vq->split.desc_state[head].indir_desc; } } static bool more_used_split(const struct vring_virtqueue *vq) { return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->split.vring.used->idx); } static void *virtqueue_get_buf_ctx_split(struct virtqueue *_vq, unsigned int *len, void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); void *ret; unsigned int i; u16 last_used; START_USE(vq); if (unlikely(vq->broken)) { END_USE(vq); return NULL; } if (!more_used_split(vq)) { pr_debug("No more buffers in queue\n"); END_USE(vq); return NULL; } /* Only get used array entries after they have been exposed by host. */ virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->split.vring.num - 1)); i = virtio32_to_cpu(_vq->vdev, vq->split.vring.used->ring[last_used].id); *len = virtio32_to_cpu(_vq->vdev, vq->split.vring.used->ring[last_used].len); if (unlikely(i >= vq->split.vring.num)) { BAD_RING(vq, "id %u out of range\n", i); return NULL; } if (unlikely(!vq->split.desc_state[i].data)) { BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } /* detach_buf_split clears data, so grab it now. */ ret = vq->split.desc_state[i].data; detach_buf_split(vq, i, ctx); vq->last_used_idx++; /* If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before * the read in the next get_buf call. */ if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), cpu_to_virtio16(_vq->vdev, vq->last_used_idx)); LAST_ADD_TIME_INVALID(vq); END_USE(vq); return ret; } static void virtqueue_disable_cb_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (!(vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) { vq->split.avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; /* * If device triggered an event already it won't trigger one again: * no need to disable. */ if (vq->event_triggered) return; if (vq->event) /* TODO: this is a hack. Figure out a cleaner value to write. */ vring_used_event(&vq->split.vring) = 0x0; else vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); } } static unsigned int virtqueue_enable_cb_prepare_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used_idx; START_USE(vq); /* We optimistically turn back on interrupts, then check if there was * more to do. */ /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to * either clear the flags bit or point the event index at the next * entry. Always do both to keep code simple. 
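 * The opaque value returned here is a snapshot of last_used_idx for
 * the caller to hand to virtqueue_poll().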
*/ if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); } vring_used_event(&vq->split.vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx); END_USE(vq); return last_used_idx; } static bool virtqueue_poll_split(struct virtqueue *_vq, unsigned int last_used_idx) { struct vring_virtqueue *vq = to_vvq(_vq); return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx); } static bool virtqueue_enable_cb_delayed_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 bufs; START_USE(vq); /* We optimistically turn back on interrupts, then check if there was * more to do. */ /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to * either clear the flags bit or point the event index at the next * entry. Always update the event index to keep code simple. */ if (vq->split.avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) { vq->split.avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vq->split.vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->split.avail_flags_shadow); } /* TODO: tune this threshold */ bufs = (u16)(vq->split.avail_idx_shadow - vq->last_used_idx) * 3 / 4; virtio_store_mb(vq->weak_barriers, &vring_used_event(&vq->split.vring), cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs)); if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->split.vring.used->idx) - vq->last_used_idx) > bufs)) { END_USE(vq); return false; } END_USE(vq); return true; } static void *virtqueue_detach_unused_buf_split(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; START_USE(vq); for (i = 0; i < vq->split.vring.num; i++) { if (!vq->split.desc_state[i].data) continue; /* detach_buf_split clears data, so grab it now. */ buf = vq->split.desc_state[i].data; detach_buf_split(vq, i, NULL); vq->split.avail_idx_shadow--; vq->split.vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->split.avail_idx_shadow); END_USE(vq); return buf; } /* That should have freed everything. */ BUG_ON(vq->vq.num_free != vq->split.vring.num); END_USE(vq); return NULL; } static void virtqueue_vring_init_split(struct vring_virtqueue_split *vring_split, struct vring_virtqueue *vq) { struct virtio_device *vdev; vdev = vq->vq.vdev; vring_split->avail_flags_shadow = 0; vring_split->avail_idx_shadow = 0; /* No callback? Tell other side not to bother us. */ if (!vq->vq.callback) { vring_split->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT; if (!vq->event) vring_split->vring.avail->flags = cpu_to_virtio16(vdev, vring_split->avail_flags_shadow); } } static void virtqueue_reinit_split(struct vring_virtqueue *vq) { int num; num = vq->split.vring.num; vq->split.vring.avail->flags = 0; vq->split.vring.avail->idx = 0; /* reset avail event */ vq->split.vring.avail->ring[num] = 0; vq->split.vring.used->flags = 0; vq->split.vring.used->idx = 0; /* reset used event */ *(__virtio16 *)&(vq->split.vring.used->ring[num]) = 0; virtqueue_init(vq, num); virtqueue_vring_init_split(&vq->split, vq); } static void virtqueue_vring_attach_split(struct vring_virtqueue *vq, struct vring_virtqueue_split *vring_split) { vq->split = *vring_split; /* Put everything in free lists. 
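 * vring_alloc_desc_extra() already chained desc_extra[i].next = i + 1,
 * so pointing free_head at 0 strings every descriptor onto the free list.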
*/ vq->free_head = 0; } static int vring_alloc_state_extra_split(struct vring_virtqueue_split *vring_split) { struct vring_desc_state_split *state; struct vring_desc_extra *extra; u32 num = vring_split->vring.num; state = kmalloc_array(num, sizeof(struct vring_desc_state_split), GFP_KERNEL); if (!state) goto err_state; extra = vring_alloc_desc_extra(num); if (!extra) goto err_extra; memset(state, 0, num * sizeof(struct vring_desc_state_split)); vring_split->desc_state = state; vring_split->desc_extra = extra; return 0; err_extra: kfree(state); err_state: return -ENOMEM; } static void vring_free_split(struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, struct device *dma_dev) { vring_free_queue(vdev, vring_split->queue_size_in_bytes, vring_split->vring.desc, vring_split->queue_dma_addr, dma_dev); kfree(vring_split->desc_state); kfree(vring_split->desc_extra); } static int vring_alloc_queue_split(struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, u32 num, unsigned int vring_align, bool may_reduce_num, struct device *dma_dev) { void *queue = NULL; dma_addr_t dma_addr; /* We assume num is a power of 2. */ if (!is_power_of_2(num)) { dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); return -EINVAL; } /* TODO: allocate each queue chunk individually */ for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (queue) break; if (!may_reduce_num) return -ENOMEM; } if (!num) return -ENOMEM; if (!queue) { /* Try to get a single page. You are my only hope! */ queue = vring_alloc_queue(vdev, vring_size(num, vring_align), &dma_addr, GFP_KERNEL | __GFP_ZERO, dma_dev); } if (!queue) return -ENOMEM; vring_init(&vring_split->vring, num, queue, vring_align); vring_split->queue_dma_addr = dma_addr; vring_split->queue_size_in_bytes = vring_size(num, vring_align); vring_split->vring_align = vring_align; vring_split->may_reduce_num = may_reduce_num; return 0; } static struct virtqueue *vring_create_virtqueue_split( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue_split vring_split = {}; struct virtqueue *vq; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, vring_align, may_reduce_num, dma_dev); if (err) return NULL; vq = __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, context, notify, callback, name, dma_dev); if (!vq) { vring_free_split(&vring_split, vdev, dma_dev); return NULL; } to_vvq(vq)->we_own_ring = true; return vq; } static int virtqueue_resize_split(struct virtqueue *_vq, u32 num) { struct vring_virtqueue_split vring_split = {}; struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = _vq->vdev; int err; err = vring_alloc_queue_split(&vring_split, vdev, num, vq->split.vring_align, vq->split.may_reduce_num, vring_dma_dev(vq)); if (err) goto err; err = vring_alloc_state_extra_split(&vring_split); if (err) goto err_state_extra; vring_free(&vq->vq); virtqueue_vring_init_split(&vring_split, vq); virtqueue_init(vq, vring_split.vring.num); virtqueue_vring_attach_split(vq, &vring_split); return 0; err_state_extra: vring_free_split(&vring_split, vdev, vring_dma_dev(vq)); err: virtqueue_reinit_split(vq); return -ENOMEM; } /* * Packed ring specific functions 
- *_packed(). */ static bool packed_used_wrap_counter(u16 last_used_idx) { return !!(last_used_idx & (1 << VRING_PACKED_EVENT_F_WRAP_CTR)); } static u16 packed_last_used(u16 last_used_idx) { return last_used_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR)); } static void vring_unmap_extra_packed(const struct vring_virtqueue *vq, const struct vring_desc_extra *extra) { u16 flags; flags = extra->flags; if (flags & VRING_DESC_F_INDIRECT) { if (!vq->use_dma_api) return; dma_unmap_single(vring_dma_dev(vq), extra->addr, extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } else { if (!vq->do_unmap) return; dma_unmap_page(vring_dma_dev(vq), extra->addr, extra->len, (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } } static void vring_unmap_desc_packed(const struct vring_virtqueue *vq, const struct vring_packed_desc *desc) { u16 flags; if (!vq->do_unmap) return; flags = le16_to_cpu(desc->flags); dma_unmap_page(vring_dma_dev(vq), le64_to_cpu(desc->addr), le32_to_cpu(desc->len), (flags & VRING_DESC_F_WRITE) ? DMA_FROM_DEVICE : DMA_TO_DEVICE); } static struct vring_packed_desc *alloc_indirect_packed(unsigned int total_sg, gfp_t gfp) { struct vring_packed_desc *desc; /* * We require lowmem mappings for the descriptors because * otherwise virt_to_phys will give us bogus addresses in the * virtqueue. */ gfp &= ~__GFP_HIGHMEM; desc = kmalloc_array(total_sg, sizeof(struct vring_packed_desc), gfp); return desc; } static int virtqueue_add_indirect_packed(struct vring_virtqueue *vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, gfp_t gfp) { struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, err_idx; u16 head, id; dma_addr_t addr; head = vq->packed.next_avail_idx; desc = alloc_indirect_packed(total_sg, gfp); if (!desc) return -ENOMEM; if (unlikely(vq->vq.num_free < 1)) { pr_debug("Can't add buf len 1 - avail = 0\n"); kfree(desc); END_USE(vq); return -ENOSPC; } i = 0; id = vq->free_head; BUG_ON(id == vq->packed.vring.num); for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr)) goto unmap_release; desc[i].flags = cpu_to_le16(n < out_sgs ? 0 : VRING_DESC_F_WRITE); desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(sg->length); i++; } } /* Now that the indirect table is filled in, map it. */ addr = vring_map_single(vq, desc, total_sg * sizeof(struct vring_packed_desc), DMA_TO_DEVICE); if (vring_mapping_error(vq, addr)) { if (vq->premapped) goto free_desc; goto unmap_release; } vq->packed.vring.desc[head].addr = cpu_to_le64(addr); vq->packed.vring.desc[head].len = cpu_to_le32(total_sg * sizeof(struct vring_packed_desc)); vq->packed.vring.desc[head].id = cpu_to_le16(id); if (vq->use_dma_api) { vq->packed.desc_extra[id].addr = addr; vq->packed.desc_extra[id].len = total_sg * sizeof(struct vring_packed_desc); vq->packed.desc_extra[id].flags = VRING_DESC_F_INDIRECT | vq->packed.avail_used_flags; } /* * A driver MUST NOT make the first descriptor in the list * available before all subsequent descriptors comprising * the list are made available. */ virtio_wmb(vq->weak_barriers); vq->packed.vring.desc[head].flags = cpu_to_le16(VRING_DESC_F_INDIRECT | vq->packed.avail_used_flags); /* We're using some buffers from the free list. 
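 * An indirect chain occupies a single entry in the packed ring, so
 * only one slot is charged no matter how large total_sg is.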
*/ vq->vq.num_free -= 1; /* Update free pointer */ n = head + 1; if (n >= vq->packed.vring.num) { n = 0; vq->packed.avail_wrap_counter ^= 1; vq->packed.avail_used_flags ^= 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } vq->packed.next_avail_idx = n; vq->free_head = vq->packed.desc_extra[id].next; /* Store token and indirect buffer state. */ vq->packed.desc_state[id].num = 1; vq->packed.desc_state[id].data = data; vq->packed.desc_state[id].indir_desc = desc; vq->packed.desc_state[id].last = id; vq->num_added += 1; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); return 0; unmap_release: err_idx = i; for (i = 0; i < err_idx; i++) vring_unmap_desc_packed(vq, &desc[i]); free_desc: kfree(desc); END_USE(vq); return -ENOMEM; } static inline int virtqueue_add_packed(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, void *ctx, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); struct vring_packed_desc *desc; struct scatterlist *sg; unsigned int i, n, c, descs_used, err_idx; __le16 head_flags, flags; u16 head, id, prev, curr, avail_used_flags; int err; START_USE(vq); BUG_ON(data == NULL); BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); return -EIO; } LAST_ADD_TIME_UPDATE(vq); BUG_ON(total_sg == 0); if (virtqueue_use_indirect(vq, total_sg)) { err = virtqueue_add_indirect_packed(vq, sgs, total_sg, out_sgs, in_sgs, data, gfp); if (err != -ENOMEM) { END_USE(vq); return err; } /* fall back on direct */ } head = vq->packed.next_avail_idx; avail_used_flags = vq->packed.avail_used_flags; WARN_ON_ONCE(total_sg > vq->packed.vring.num && !vq->indirect); desc = vq->packed.vring.desc; i = head; descs_used = total_sg; if (unlikely(vq->vq.num_free < descs_used)) { pr_debug("Can't add buf len %i - avail = %i\n", descs_used, vq->vq.num_free); END_USE(vq); return -ENOSPC; } id = vq->free_head; BUG_ON(id == vq->packed.vring.num); curr = id; c = 0; for (n = 0; n < out_sgs + in_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { dma_addr_t addr; if (vring_map_one_sg(vq, sg, n < out_sgs ? DMA_TO_DEVICE : DMA_FROM_DEVICE, &addr)) goto unmap_release; flags = cpu_to_le16(vq->packed.avail_used_flags | (++c == total_sg ? 0 : VRING_DESC_F_NEXT) | (n < out_sgs ? 0 : VRING_DESC_F_WRITE)); if (i == head) head_flags = flags; else desc[i].flags = flags; desc[i].addr = cpu_to_le64(addr); desc[i].len = cpu_to_le32(sg->length); desc[i].id = cpu_to_le16(id); if (unlikely(vq->use_dma_api)) { vq->packed.desc_extra[curr].addr = addr; vq->packed.desc_extra[curr].len = sg->length; vq->packed.desc_extra[curr].flags = le16_to_cpu(flags); } prev = curr; curr = vq->packed.desc_extra[curr].next; if ((unlikely(++i >= vq->packed.vring.num))) { i = 0; vq->packed.avail_used_flags ^= 1 << VRING_PACKED_DESC_F_AVAIL | 1 << VRING_PACKED_DESC_F_USED; } } } if (i <= head) vq->packed.avail_wrap_counter ^= 1; /* We're using some buffers from the free list. */ vq->vq.num_free -= descs_used; /* Update free pointer */ vq->packed.next_avail_idx = i; vq->free_head = curr; /* Store token. */ vq->packed.desc_state[id].num = descs_used; vq->packed.desc_state[id].data = data; vq->packed.desc_state[id].indir_desc = ctx; vq->packed.desc_state[id].last = prev; /* * A driver MUST NOT make the first descriptor in the list * available before all subsequent descriptors comprising * the list are made available. 
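 * That is why head_flags was held back in the loop above and is only
 * written after the barrier below.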
*/ virtio_wmb(vq->weak_barriers); vq->packed.vring.desc[head].flags = head_flags; vq->num_added += descs_used; pr_debug("Added buffer head %i to %p\n", head, vq); END_USE(vq); return 0; unmap_release: err_idx = i; i = head; curr = vq->free_head; vq->packed.avail_used_flags = avail_used_flags; for (n = 0; n < total_sg; n++) { if (i == err_idx) break; vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]); curr = vq->packed.desc_extra[curr].next; i++; if (i >= vq->packed.vring.num) i = 0; } END_USE(vq); return -EIO; } static bool virtqueue_kick_prepare_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 new, old, off_wrap, flags, wrap_counter, event_idx; bool needs_kick; union { struct { __le16 off_wrap; __le16 flags; }; u32 u32; } snapshot; START_USE(vq); /* * We need to expose the new flags value before checking notification * suppressions. */ virtio_mb(vq->weak_barriers); old = vq->packed.next_avail_idx - vq->num_added; new = vq->packed.next_avail_idx; vq->num_added = 0; snapshot.u32 = *(u32 *)vq->packed.vring.device; flags = le16_to_cpu(snapshot.flags); LAST_ADD_TIME_CHECK(vq); LAST_ADD_TIME_INVALID(vq); if (flags != VRING_PACKED_EVENT_FLAG_DESC) { needs_kick = (flags != VRING_PACKED_EVENT_FLAG_DISABLE); goto out; } off_wrap = le16_to_cpu(snapshot.off_wrap); wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; event_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); if (wrap_counter != vq->packed.avail_wrap_counter) event_idx -= vq->packed.vring.num; needs_kick = vring_need_event(event_idx, new, old); out: END_USE(vq); return needs_kick; } static void detach_buf_packed(struct vring_virtqueue *vq, unsigned int id, void **ctx) { struct vring_desc_state_packed *state = NULL; struct vring_packed_desc *desc; unsigned int i, curr; state = &vq->packed.desc_state[id]; /* Clear data ptr. */ state->data = NULL; vq->packed.desc_extra[state->last].next = vq->free_head; vq->free_head = id; vq->vq.num_free += state->num; if (unlikely(vq->use_dma_api)) { curr = id; for (i = 0; i < state->num; i++) { vring_unmap_extra_packed(vq, &vq->packed.desc_extra[curr]); curr = vq->packed.desc_extra[curr].next; } } if (vq->indirect) { u32 len; /* Free the indirect table, if any, now that it's unmapped. 
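 * As in the split case, only the entries inside the table are unmapped
 * here; the mapping of the table itself was already torn down through
 * desc_extra above.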
*/ desc = state->indir_desc; if (!desc) return; if (vq->do_unmap) { len = vq->packed.desc_extra[id].len; for (i = 0; i < len / sizeof(struct vring_packed_desc); i++) vring_unmap_desc_packed(vq, &desc[i]); } kfree(desc); state->indir_desc = NULL; } else if (ctx) { *ctx = state->indir_desc; } } static inline bool is_used_desc_packed(const struct vring_virtqueue *vq, u16 idx, bool used_wrap_counter) { bool avail, used; u16 flags; flags = le16_to_cpu(vq->packed.vring.desc[idx].flags); avail = !!(flags & (1 << VRING_PACKED_DESC_F_AVAIL)); used = !!(flags & (1 << VRING_PACKED_DESC_F_USED)); return avail == used && used == used_wrap_counter; } static bool more_used_packed(const struct vring_virtqueue *vq) { u16 last_used; u16 last_used_idx; bool used_wrap_counter; last_used_idx = READ_ONCE(vq->last_used_idx); last_used = packed_last_used(last_used_idx); used_wrap_counter = packed_used_wrap_counter(last_used_idx); return is_used_desc_packed(vq, last_used, used_wrap_counter); } static void *virtqueue_get_buf_ctx_packed(struct virtqueue *_vq, unsigned int *len, void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); u16 last_used, id, last_used_idx; bool used_wrap_counter; void *ret; START_USE(vq); if (unlikely(vq->broken)) { END_USE(vq); return NULL; } if (!more_used_packed(vq)) { pr_debug("No more buffers in queue\n"); END_USE(vq); return NULL; } /* Only get used elements after they have been exposed by host. */ virtio_rmb(vq->weak_barriers); last_used_idx = READ_ONCE(vq->last_used_idx); used_wrap_counter = packed_used_wrap_counter(last_used_idx); last_used = packed_last_used(last_used_idx); id = le16_to_cpu(vq->packed.vring.desc[last_used].id); *len = le32_to_cpu(vq->packed.vring.desc[last_used].len); if (unlikely(id >= vq->packed.vring.num)) { BAD_RING(vq, "id %u out of range\n", id); return NULL; } if (unlikely(!vq->packed.desc_state[id].data)) { BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } /* detach_buf_packed clears data, so grab it now. */ ret = vq->packed.desc_state[id].data; detach_buf_packed(vq, id, ctx); last_used += vq->packed.desc_state[id].num; if (unlikely(last_used >= vq->packed.vring.num)) { last_used -= vq->packed.vring.num; used_wrap_counter ^= 1; } last_used = (last_used | (used_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); WRITE_ONCE(vq->last_used_idx, last_used); /* * If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before * the read in the next get_buf call. */ if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DESC) virtio_store_mb(vq->weak_barriers, &vq->packed.vring.driver->off_wrap, cpu_to_le16(vq->last_used_idx)); LAST_ADD_TIME_INVALID(vq); END_USE(vq); return ret; } static void virtqueue_disable_cb_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->packed.event_flags_shadow != VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; /* * If device triggered an event already it won't trigger one again: * no need to disable. */ if (vq->event_triggered) return; vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } } static unsigned int virtqueue_enable_cb_prepare_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); START_USE(vq); /* * We optimistically turn back on interrupts, then check if there was * more to do. 
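 * As with the split variant, the actual re-check is done by the caller
 * via virtqueue_poll() on the value returned here.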
*/ if (vq->event) { vq->packed.vring.driver->off_wrap = cpu_to_le16(vq->last_used_idx); /* * We need to update event offset and event wrap * counter first before updating event flags. */ virtio_wmb(vq->weak_barriers); } if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = vq->event ? VRING_PACKED_EVENT_FLAG_DESC : VRING_PACKED_EVENT_FLAG_ENABLE; vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } END_USE(vq); return vq->last_used_idx; } static bool virtqueue_poll_packed(struct virtqueue *_vq, u16 off_wrap) { struct vring_virtqueue *vq = to_vvq(_vq); bool wrap_counter; u16 used_idx; wrap_counter = off_wrap >> VRING_PACKED_EVENT_F_WRAP_CTR; used_idx = off_wrap & ~(1 << VRING_PACKED_EVENT_F_WRAP_CTR); return is_used_desc_packed(vq, used_idx, wrap_counter); } static bool virtqueue_enable_cb_delayed_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 used_idx, wrap_counter, last_used_idx; u16 bufs; START_USE(vq); /* * We optimistically turn back on interrupts, then check if there was * more to do. */ if (vq->event) { /* TODO: tune this threshold */ bufs = (vq->packed.vring.num - vq->vq.num_free) * 3 / 4; last_used_idx = READ_ONCE(vq->last_used_idx); wrap_counter = packed_used_wrap_counter(last_used_idx); used_idx = packed_last_used(last_used_idx) + bufs; if (used_idx >= vq->packed.vring.num) { used_idx -= vq->packed.vring.num; wrap_counter ^= 1; } vq->packed.vring.driver->off_wrap = cpu_to_le16(used_idx | (wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR)); /* * We need to update event offset and event wrap * counter first before updating event flags. */ virtio_wmb(vq->weak_barriers); } if (vq->packed.event_flags_shadow == VRING_PACKED_EVENT_FLAG_DISABLE) { vq->packed.event_flags_shadow = vq->event ? VRING_PACKED_EVENT_FLAG_DESC : VRING_PACKED_EVENT_FLAG_ENABLE; vq->packed.vring.driver->flags = cpu_to_le16(vq->packed.event_flags_shadow); } /* * We need to update event suppression structure first * before re-checking for more used buffers. */ virtio_mb(vq->weak_barriers); last_used_idx = READ_ONCE(vq->last_used_idx); wrap_counter = packed_used_wrap_counter(last_used_idx); used_idx = packed_last_used(last_used_idx); if (is_used_desc_packed(vq, used_idx, wrap_counter)) { END_USE(vq); return false; } END_USE(vq); return true; } static void *virtqueue_detach_unused_buf_packed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); unsigned int i; void *buf; START_USE(vq); for (i = 0; i < vq->packed.vring.num; i++) { if (!vq->packed.desc_state[i].data) continue; /* detach_buf clears data, so grab it now. */ buf = vq->packed.desc_state[i].data; detach_buf_packed(vq, i, NULL); END_USE(vq); return buf; } /* That should have freed everything. 
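 * i.e. every in-flight buffer was detached in the loop above, so
 * num_free must be back to the full ring size.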
*/ BUG_ON(vq->vq.num_free != vq->packed.vring.num); END_USE(vq); return NULL; } static struct vring_desc_extra *vring_alloc_desc_extra(unsigned int num) { struct vring_desc_extra *desc_extra; unsigned int i; desc_extra = kmalloc_array(num, sizeof(struct vring_desc_extra), GFP_KERNEL); if (!desc_extra) return NULL; memset(desc_extra, 0, num * sizeof(struct vring_desc_extra)); for (i = 0; i < num - 1; i++) desc_extra[i].next = i + 1; return desc_extra; } static void vring_free_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, struct device *dma_dev) { if (vring_packed->vring.desc) vring_free_queue(vdev, vring_packed->ring_size_in_bytes, vring_packed->vring.desc, vring_packed->ring_dma_addr, dma_dev); if (vring_packed->vring.driver) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.driver, vring_packed->driver_event_dma_addr, dma_dev); if (vring_packed->vring.device) vring_free_queue(vdev, vring_packed->event_size_in_bytes, vring_packed->vring.device, vring_packed->device_event_dma_addr, dma_dev); kfree(vring_packed->desc_state); kfree(vring_packed->desc_extra); } static int vring_alloc_queue_packed(struct vring_virtqueue_packed *vring_packed, struct virtio_device *vdev, u32 num, struct device *dma_dev) { struct vring_packed_desc *ring; struct vring_packed_desc_event *driver, *device; dma_addr_t ring_dma_addr, driver_event_dma_addr, device_event_dma_addr; size_t ring_size_in_bytes, event_size_in_bytes; ring_size_in_bytes = num * sizeof(struct vring_packed_desc); ring = vring_alloc_queue(vdev, ring_size_in_bytes, &ring_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (!ring) goto err; vring_packed->vring.desc = ring; vring_packed->ring_dma_addr = ring_dma_addr; vring_packed->ring_size_in_bytes = ring_size_in_bytes; event_size_in_bytes = sizeof(struct vring_packed_desc_event); driver = vring_alloc_queue(vdev, event_size_in_bytes, &driver_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (!driver) goto err; vring_packed->vring.driver = driver; vring_packed->event_size_in_bytes = event_size_in_bytes; vring_packed->driver_event_dma_addr = driver_event_dma_addr; device = vring_alloc_queue(vdev, event_size_in_bytes, &device_event_dma_addr, GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO, dma_dev); if (!device) goto err; vring_packed->vring.device = device; vring_packed->device_event_dma_addr = device_event_dma_addr; vring_packed->vring.num = num; return 0; err: vring_free_packed(vring_packed, vdev, dma_dev); return -ENOMEM; } static int vring_alloc_state_extra_packed(struct vring_virtqueue_packed *vring_packed) { struct vring_desc_state_packed *state; struct vring_desc_extra *extra; u32 num = vring_packed->vring.num; state = kmalloc_array(num, sizeof(struct vring_desc_state_packed), GFP_KERNEL); if (!state) goto err_desc_state; memset(state, 0, num * sizeof(struct vring_desc_state_packed)); extra = vring_alloc_desc_extra(num); if (!extra) goto err_desc_extra; vring_packed->desc_state = state; vring_packed->desc_extra = extra; return 0; err_desc_extra: kfree(state); err_desc_state: return -ENOMEM; } static void virtqueue_vring_init_packed(struct vring_virtqueue_packed *vring_packed, bool callback) { vring_packed->next_avail_idx = 0; vring_packed->avail_wrap_counter = 1; vring_packed->event_flags_shadow = 0; vring_packed->avail_used_flags = 1 << VRING_PACKED_DESC_F_AVAIL; /* No callback? Tell other side not to bother us. 
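 * For the packed ring that means latching VRING_PACKED_EVENT_FLAG_DISABLE
 * into the driver event suppression area.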
*/ if (!callback) { vring_packed->event_flags_shadow = VRING_PACKED_EVENT_FLAG_DISABLE; vring_packed->vring.driver->flags = cpu_to_le16(vring_packed->event_flags_shadow); } } static void virtqueue_vring_attach_packed(struct vring_virtqueue *vq, struct vring_virtqueue_packed *vring_packed) { vq->packed = *vring_packed; /* Put everything in free lists. */ vq->free_head = 0; } static void virtqueue_reinit_packed(struct vring_virtqueue *vq) { memset(vq->packed.vring.device, 0, vq->packed.event_size_in_bytes); memset(vq->packed.vring.driver, 0, vq->packed.event_size_in_bytes); /* we need to reset the desc.flags. For more, see is_used_desc_packed() */ memset(vq->packed.vring.desc, 0, vq->packed.ring_size_in_bytes); virtqueue_init(vq, vq->packed.vring.num); virtqueue_vring_init_packed(&vq->packed, !!vq->vq.callback); } static struct virtqueue *vring_create_virtqueue_packed( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue_packed vring_packed = {}; struct vring_virtqueue *vq; int err; if (vring_alloc_queue_packed(&vring_packed, vdev, num, dma_dev)) goto err_ring; vq = kmalloc(sizeof(*vq), GFP_KERNEL); if (!vq) goto err_vq; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; vq->vq.index = index; vq->vq.reset = false; vq->we_own_ring = true; vq->notify = notify; vq->weak_barriers = weak_barriers; #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION vq->broken = true; #else vq->broken = false; #endif vq->packed_ring = true; vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); vq->premapped = false; vq->do_unmap = vq->use_dma_api; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; err = vring_alloc_state_extra_packed(&vring_packed); if (err) goto err_state_extra; virtqueue_vring_init_packed(&vring_packed, !!callback); virtqueue_init(vq, num); virtqueue_vring_attach_packed(vq, &vring_packed); spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); spin_unlock(&vdev->vqs_list_lock); return &vq->vq; err_state_extra: kfree(vq); err_vq: vring_free_packed(&vring_packed, vdev, dma_dev); err_ring: return NULL; } static int virtqueue_resize_packed(struct virtqueue *_vq, u32 num) { struct vring_virtqueue_packed vring_packed = {}; struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = _vq->vdev; int err; if (vring_alloc_queue_packed(&vring_packed, vdev, num, vring_dma_dev(vq))) goto err_ring; err = vring_alloc_state_extra_packed(&vring_packed); if (err) goto err_state_extra; vring_free(&vq->vq); virtqueue_vring_init_packed(&vring_packed, !!vq->vq.callback); virtqueue_init(vq, vring_packed.vring.num); virtqueue_vring_attach_packed(vq, &vring_packed); return 0; err_state_extra: vring_free_packed(&vring_packed, vdev, vring_dma_dev(vq)); err_ring: virtqueue_reinit_packed(vq); return -ENOMEM; } static int virtqueue_disable_and_recycle(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void *buf)) { struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = vq->vq.vdev; void *buf; int err; if (!vq->we_own_ring) return -EPERM; if (!vdev->config->disable_vq_and_reset) return -ENOENT; if (!vdev->config->enable_vq_after_reset) return -ENOENT; 
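	/* Reset the queue at the transport level, then hand every buffer
	 * that was still queued back to the driver through @recycle.
	 */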
err = vdev->config->disable_vq_and_reset(_vq); if (err) return err; while ((buf = virtqueue_detach_unused_buf(_vq)) != NULL) recycle(_vq, buf); return 0; } static int virtqueue_enable_after_reset(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); struct virtio_device *vdev = vq->vq.vdev; if (vdev->config->enable_vq_after_reset(_vq)) return -EBUSY; return 0; } /* * Generic functions and exported symbols. */ static inline int virtqueue_add(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int total_sg, unsigned int out_sgs, unsigned int in_sgs, void *data, void *ctx, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_add_packed(_vq, sgs, total_sg, out_sgs, in_sgs, data, ctx, gfp) : virtqueue_add_split(_vq, sgs, total_sg, out_sgs, in_sgs, data, ctx, gfp); } /** * virtqueue_add_sgs - expose buffers to other end * @_vq: the struct virtqueue we're talking about. * @sgs: array of terminated scatterlists. * @out_sgs: the number of scatterlists readable by other side * @in_sgs: the number of scatterlists which are writable (after readable ones) * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_sgs(struct virtqueue *_vq, struct scatterlist *sgs[], unsigned int out_sgs, unsigned int in_sgs, void *data, gfp_t gfp) { unsigned int i, total_sg = 0; /* Count them first. */ for (i = 0; i < out_sgs + in_sgs; i++) { struct scatterlist *sg; for (sg = sgs[i]; sg; sg = sg_next(sg)) total_sg++; } return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); /** * virtqueue_add_outbuf - expose output buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg readable by other side * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_outbuf(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); /** * virtqueue_add_inbuf - expose input buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg writable by other side * @data: the token identifying the buffer. * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_inbuf(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); /** * virtqueue_add_inbuf_ctx - expose input buffers to other end * @vq: the struct virtqueue we're talking about. * @sg: scatterlist (must be well-formed and terminated!) * @num: the number of entries in @sg writable by other side * @data: the token identifying the buffer. 
* @ctx: extra context for the token * @gfp: how to do memory allocations (if necessary). * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). */ int virtqueue_add_inbuf_ctx(struct virtqueue *vq, struct scatterlist *sg, unsigned int num, void *data, void *ctx, gfp_t gfp) { return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); /** * virtqueue_dma_dev - get the dma dev * @_vq: the struct virtqueue we're talking about. * * Returns the dma dev. That can been used for dma api. */ struct device *virtqueue_dma_dev(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->use_dma_api) return vring_dma_dev(vq); else return NULL; } EXPORT_SYMBOL_GPL(virtqueue_dma_dev); /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @_vq: the struct virtqueue * * Instead of virtqueue_kick(), you can do: * if (virtqueue_kick_prepare(vq)) * virtqueue_notify(vq); * * This is sometimes useful because the virtqueue_kick_prepare() needs * to be serialized, but the actual virtqueue_notify() call does not. */ bool virtqueue_kick_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_kick_prepare_packed(_vq) : virtqueue_kick_prepare_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_kick_prepare); /** * virtqueue_notify - second half of split virtqueue_kick call. * @_vq: the struct virtqueue * * This does not need to be serialized. * * Returns false if host notify failed or queue is broken, otherwise true. */ bool virtqueue_notify(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (unlikely(vq->broken)) return false; /* Prod other side to tell it about changes. */ if (!vq->notify(_vq)) { vq->broken = true; return false; } return true; } EXPORT_SYMBOL_GPL(virtqueue_notify); /** * virtqueue_kick - update after add_buf * @vq: the struct virtqueue * * After one or more virtqueue_add_* calls, invoke this to kick * the other side. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). * * Returns false if kick failed, otherwise true. */ bool virtqueue_kick(struct virtqueue *vq) { if (virtqueue_kick_prepare(vq)) return virtqueue_notify(vq); return true; } EXPORT_SYMBOL_GPL(virtqueue_kick); /** * virtqueue_get_buf_ctx - get the next used buffer * @_vq: the struct virtqueue we're talking about. * @len: the length written into the buffer * @ctx: extra context for the token * * If the device wrote data into the buffer, @len will be set to the * amount written. This means you don't need to clear the buffer * beforehand to ensure there's no data leakage in the case of short * writes. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). * * Returns NULL if there are no used buffers, or the "data" token * handed to virtqueue_add_*(). */ void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? 
virtqueue_get_buf_ctx_packed(_vq, len, ctx) : virtqueue_get_buf_ctx_split(_vq, len, ctx); } EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) { return virtqueue_get_buf_ctx(_vq, len, NULL); } EXPORT_SYMBOL_GPL(virtqueue_get_buf); /** * virtqueue_disable_cb - disable callbacks * @_vq: the struct virtqueue we're talking about. * * Note that this is not necessarily synchronous, hence unreliable and only * useful as an optimization. * * Unlike other operations, this need not be serialized. */ void virtqueue_disable_cb(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->packed_ring) virtqueue_disable_cb_packed(_vq); else virtqueue_disable_cb_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_disable_cb); /** * virtqueue_enable_cb_prepare - restart callbacks after disable_cb * @_vq: the struct virtqueue we're talking about. * * This re-enables callbacks; it returns current queue state * in an opaque unsigned value. This value should be later tested by * virtqueue_poll, to detect a possible race between the driver checking for * more work, and enabling callbacks. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). */ unsigned int virtqueue_enable_cb_prepare(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->event_triggered) vq->event_triggered = false; return vq->packed_ring ? virtqueue_enable_cb_prepare_packed(_vq) : virtqueue_enable_cb_prepare_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare); /** * virtqueue_poll - query pending used buffers * @_vq: the struct virtqueue we're talking about. * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare). * * Returns "true" if there are pending used buffers in the queue. * * This does not need to be serialized. */ bool virtqueue_poll(struct virtqueue *_vq, unsigned int last_used_idx) { struct vring_virtqueue *vq = to_vvq(_vq); if (unlikely(vq->broken)) return false; virtio_mb(vq->weak_barriers); return vq->packed_ring ? virtqueue_poll_packed(_vq, last_used_idx) : virtqueue_poll_split(_vq, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_poll); /** * virtqueue_enable_cb - restart callbacks after disable_cb. * @_vq: the struct virtqueue we're talking about. * * This re-enables callbacks; it returns "false" if there are pending * buffers in the queue, to detect a possible race between the driver * checking for more work, and enabling callbacks. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). */ bool virtqueue_enable_cb(struct virtqueue *_vq) { unsigned int last_used_idx = virtqueue_enable_cb_prepare(_vq); return !virtqueue_poll(_vq, last_used_idx); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb); /** * virtqueue_enable_cb_delayed - restart callbacks after disable_cb. * @_vq: the struct virtqueue we're talking about. * * This re-enables callbacks but hints to the other side to delay * interrupts until most of the available buffers have been processed; * it returns "false" if there are many pending buffers in the queue, * to detect a possible race between the driver checking for more work, * and enabling callbacks. * * Caller must ensure we don't call this with other virtqueue * operations at the same time (except where noted). */ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->event_triggered) vq->event_triggered = false; return vq->packed_ring ? 
virtqueue_enable_cb_delayed_packed(_vq) : virtqueue_enable_cb_delayed_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); /** * virtqueue_detach_unused_buf - detach first unused buffer * @_vq: the struct virtqueue we're talking about. * * Returns NULL or the "data" token handed to virtqueue_add_*(). * This is not valid on an active queue; it is useful for device * shutdown or the reset queue. */ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? virtqueue_detach_unused_buf_packed(_vq) : virtqueue_detach_unused_buf_split(_vq); } EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf); static inline bool more_used(const struct vring_virtqueue *vq) { return vq->packed_ring ? more_used_packed(vq) : more_used_split(vq); } /** * vring_interrupt - notify a virtqueue on an interrupt * @irq: the IRQ number (ignored) * @_vq: the struct virtqueue to notify * * Calls the callback function of @_vq to process the virtqueue * notification. */ irqreturn_t vring_interrupt(int irq, void *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (!more_used(vq)) { pr_debug("virtqueue interrupt with no work for %p\n", vq); return IRQ_NONE; } if (unlikely(vq->broken)) { #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION dev_warn_once(&vq->vq.vdev->dev, "virtio vring IRQ raised before DRIVER_OK"); return IRQ_NONE; #else return IRQ_HANDLED; #endif } /* Just a hint for performance: so it's ok that this can be racy! */ if (vq->event) vq->event_triggered = true; pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback); if (vq->vq.callback) vq->vq.callback(&vq->vq); return IRQ_HANDLED; } EXPORT_SYMBOL_GPL(vring_interrupt); /* Only available for split ring */ static struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring_virtqueue_split *vring_split, struct virtio_device *vdev, bool weak_barriers, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { struct vring_virtqueue *vq; int err; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return NULL; vq = kmalloc(sizeof(*vq), GFP_KERNEL); if (!vq) return NULL; vq->packed_ring = false; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; vq->vq.index = index; vq->vq.reset = false; vq->we_own_ring = false; vq->notify = notify; vq->weak_barriers = weak_barriers; #ifdef CONFIG_VIRTIO_HARDEN_NOTIFICATION vq->broken = true; #else vq->broken = false; #endif vq->dma_dev = dma_dev; vq->use_dma_api = vring_use_dma_api(vdev); vq->premapped = false; vq->do_unmap = vq->use_dma_api; vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); if (virtio_has_feature(vdev, VIRTIO_F_ORDER_PLATFORM)) vq->weak_barriers = false; err = vring_alloc_state_extra_split(vring_split); if (err) { kfree(vq); return NULL; } virtqueue_vring_init_split(vring_split, vq); virtqueue_init(vq, vring_split->vring.num); virtqueue_vring_attach_split(vq, vring_split); spin_lock(&vdev->vqs_list_lock); list_add_tail(&vq->vq.list, &vdev->vqs); spin_unlock(&vdev->vqs_list_lock); return &vq->vq; } struct virtqueue *vring_create_virtqueue( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) { if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return 
vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, vdev->dev.parent); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, vdev->dev.parent); } EXPORT_SYMBOL_GPL(vring_create_virtqueue); struct virtqueue *vring_create_virtqueue_dma( unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name, struct device *dma_dev) { if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return vring_create_virtqueue_packed(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, dma_dev); return vring_create_virtqueue_split(index, num, vring_align, vdev, weak_barriers, may_reduce_num, context, notify, callback, name, dma_dev); } EXPORT_SYMBOL_GPL(vring_create_virtqueue_dma); /** * virtqueue_resize - resize the vring of vq * @_vq: the struct virtqueue we're talking about. * @num: new ring num * @recycle: callback to recycle unused buffers * * When it is really necessary to create a new vring, it will set the current vq * into the reset state. Then call the passed callback to recycle the buffer * that is no longer used. Only after the new vring is successfully created, the * old vring will be released. * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error. * 0: success. * -ENOMEM: Failed to allocate a new ring, fall back to the original ring size. * vq can still work normally * -EBUSY: Failed to sync with device, vq may not work properly * -ENOENT: Transport or device not supported * -E2BIG/-EINVAL: num error * -EPERM: Operation not permitted * */ int virtqueue_resize(struct virtqueue *_vq, u32 num, void (*recycle)(struct virtqueue *vq, void *buf)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; if (num > vq->vq.num_max) return -E2BIG; if (!num) return -EINVAL; if ((vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num) == num) return 0; err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; if (vq->packed_ring) err = virtqueue_resize_packed(_vq, num); else err = virtqueue_resize_split(_vq, num); return virtqueue_enable_after_reset(_vq); } EXPORT_SYMBOL_GPL(virtqueue_resize); /** * virtqueue_set_dma_premapped - set the vring premapped mode * @_vq: the struct virtqueue we're talking about. * * Enable the premapped mode of the vq. * * The vring in premapped mode does not do dma internally, so the driver must * do dma mapping in advance. The driver must pass the dma_address through * dma_address of scatterlist. When the driver got a used buffer from * the vring, it has to unmap the dma address. * * This function must be called immediately after creating the vq, or after vq * reset, and before adding any buffers to it. * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error. * 0: success. * -EINVAL: vring does not use the dma api, so we can not enable premapped mode. */ int virtqueue_set_dma_premapped(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u32 num; START_USE(vq); num = vq->packed_ring ? 
vq->packed.vring.num : vq->split.vring.num; if (num != vq->vq.num_free) { END_USE(vq); return -EINVAL; } if (!vq->use_dma_api) { END_USE(vq); return -EINVAL; } vq->premapped = true; vq->do_unmap = false; END_USE(vq); return 0; } EXPORT_SYMBOL_GPL(virtqueue_set_dma_premapped); /** * virtqueue_reset - detach and recycle all unused buffers * @_vq: the struct virtqueue we're talking about. * @recycle: callback to recycle unused buffers * * Caller must ensure we don't call this with other virtqueue operations * at the same time (except where noted). * * Returns zero or a negative error. * 0: success. * -EBUSY: Failed to sync with device, vq may not work properly * -ENOENT: Transport or device not supported * -EPERM: Operation not permitted */ int virtqueue_reset(struct virtqueue *_vq, void (*recycle)(struct virtqueue *vq, void *buf)) { struct vring_virtqueue *vq = to_vvq(_vq); int err; err = virtqueue_disable_and_recycle(_vq, recycle); if (err) return err; if (vq->packed_ring) virtqueue_reinit_packed(vq); else virtqueue_reinit_split(vq); return virtqueue_enable_after_reset(_vq); } EXPORT_SYMBOL_GPL(virtqueue_reset); /* Only available for split ring */ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int num, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, bool context, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), const char *name) { struct vring_virtqueue_split vring_split = {}; if (virtio_has_feature(vdev, VIRTIO_F_RING_PACKED)) return NULL; vring_init(&vring_split.vring, num, pages, vring_align); return __vring_new_virtqueue(index, &vring_split, vdev, weak_barriers, context, notify, callback, name, vdev->dev.parent); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); static void vring_free(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); if (vq->we_own_ring) { if (vq->packed_ring) { vring_free_queue(vq->vq.vdev, vq->packed.ring_size_in_bytes, vq->packed.vring.desc, vq->packed.ring_dma_addr, vring_dma_dev(vq)); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.driver, vq->packed.driver_event_dma_addr, vring_dma_dev(vq)); vring_free_queue(vq->vq.vdev, vq->packed.event_size_in_bytes, vq->packed.vring.device, vq->packed.device_event_dma_addr, vring_dma_dev(vq)); kfree(vq->packed.desc_state); kfree(vq->packed.desc_extra); } else { vring_free_queue(vq->vq.vdev, vq->split.queue_size_in_bytes, vq->split.vring.desc, vq->split.queue_dma_addr, vring_dma_dev(vq)); } } if (!vq->packed_ring) { kfree(vq->split.desc_state); kfree(vq->split.desc_extra); } } void vring_del_virtqueue(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); spin_lock(&vq->vq.vdev->vqs_list_lock); list_del(&_vq->list); spin_unlock(&vq->vq.vdev->vqs_list_lock); vring_free(_vq); kfree(vq); } EXPORT_SYMBOL_GPL(vring_del_virtqueue); u32 vring_notification_data(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); u16 next; if (vq->packed_ring) next = (vq->packed.next_avail_idx & ~(-(1 << VRING_PACKED_EVENT_F_WRAP_CTR))) | vq->packed.avail_wrap_counter << VRING_PACKED_EVENT_F_WRAP_CTR; else next = vq->split.avail_idx_shadow; return next << 16 | _vq->index; } EXPORT_SYMBOL_GPL(vring_notification_data); /* Manipulates transport-specific feature bits. 
*/ void vring_transport_features(struct virtio_device *vdev) { unsigned int i; for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) { switch (i) { case VIRTIO_RING_F_INDIRECT_DESC: break; case VIRTIO_RING_F_EVENT_IDX: break; case VIRTIO_F_VERSION_1: break; case VIRTIO_F_ACCESS_PLATFORM: break; case VIRTIO_F_RING_PACKED: break; case VIRTIO_F_ORDER_PLATFORM: break; case VIRTIO_F_NOTIFICATION_DATA: break; default: /* We don't understand this bit. */ __virtio_clear_bit(vdev, i); } } } EXPORT_SYMBOL_GPL(vring_transport_features); /** * virtqueue_get_vring_size - return the size of the virtqueue's vring * @_vq: the struct virtqueue containing the vring of interest. * * Returns the size of the vring. This is mainly used for boasting to * userspace. Unlike other operations, this need not be serialized. */ unsigned int virtqueue_get_vring_size(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); return vq->packed_ring ? vq->packed.vring.num : vq->split.vring.num; } EXPORT_SYMBOL_GPL(virtqueue_get_vring_size); /* * This function should only be called by the core, not directly by the driver. */ void __virtqueue_break(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, true); } EXPORT_SYMBOL_GPL(__virtqueue_break); /* * This function should only be called by the core, not directly by the driver. */ void __virtqueue_unbreak(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, false); } EXPORT_SYMBOL_GPL(__virtqueue_unbreak); bool virtqueue_is_broken(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); return READ_ONCE(vq->broken); } EXPORT_SYMBOL_GPL(virtqueue_is_broken); /* * This should prevent the device from being used, allowing drivers to * recover. You may need to grab appropriate locks to flush. */ void virtio_break_device(struct virtio_device *dev) { struct virtqueue *_vq; spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). */ WRITE_ONCE(vq->broken, true); } spin_unlock(&dev->vqs_list_lock); } EXPORT_SYMBOL_GPL(virtio_break_device); /* * This should allow the device to be used by the driver. You may * need to grab appropriate locks to flush the write to * vq->broken. This should only be used in some specific case e.g * (probing and restoring). This function should only be called by the * core, not directly by the driver. */ void __virtio_unbreak_device(struct virtio_device *dev) { struct virtqueue *_vq; spin_lock(&dev->vqs_list_lock); list_for_each_entry(_vq, &dev->vqs, list) { struct vring_virtqueue *vq = to_vvq(_vq); /* Pairs with READ_ONCE() in virtqueue_is_broken(). 
*/ WRITE_ONCE(vq->broken, false); } spin_unlock(&dev->vqs_list_lock); } EXPORT_SYMBOL_GPL(__virtio_unbreak_device); dma_addr_t virtqueue_get_desc_addr(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); if (vq->packed_ring) return vq->packed.ring_dma_addr; return vq->split.queue_dma_addr; } EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr); dma_addr_t virtqueue_get_avail_addr(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); if (vq->packed_ring) return vq->packed.driver_event_dma_addr; return vq->split.queue_dma_addr + ((char *)vq->split.vring.avail - (char *)vq->split.vring.desc); } EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr); dma_addr_t virtqueue_get_used_addr(const struct virtqueue *_vq) { const struct vring_virtqueue *vq = to_vvq(_vq); BUG_ON(!vq->we_own_ring); if (vq->packed_ring) return vq->packed.device_event_dma_addr; return vq->split.queue_dma_addr + ((char *)vq->split.vring.used - (char *)vq->split.vring.desc); } EXPORT_SYMBOL_GPL(virtqueue_get_used_addr); /* Only available for split ring */ const struct vring *virtqueue_get_vring(const struct virtqueue *vq) { return &to_vvq(vq)->split.vring; } EXPORT_SYMBOL_GPL(virtqueue_get_vring); /** * virtqueue_dma_map_single_attrs - map DMA for _vq * @_vq: the struct virtqueue we're talking about. * @ptr: the pointer of the buffer to do dma * @size: the size of the buffer to do dma * @dir: DMA direction * @attrs: DMA Attrs * * The caller calls this to do dma mapping in advance. The DMA address can be * passed to this _vq when it is in pre-mapped mode. * * return DMA address. Caller should check that by virtqueue_dma_mapping_error(). */ dma_addr_t virtqueue_dma_map_single_attrs(struct virtqueue *_vq, void *ptr, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return (dma_addr_t)virt_to_phys(ptr); return dma_map_single_attrs(vring_dma_dev(vq), ptr, size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_dma_map_single_attrs); /** * virtqueue_dma_unmap_single_attrs - unmap DMA for _vq * @_vq: the struct virtqueue we're talking about. * @addr: the dma address to unmap * @size: the size of the buffer * @dir: DMA direction * @attrs: DMA Attrs * * Unmap the address that is mapped by the virtqueue_dma_map_* APIs. * */ void virtqueue_dma_unmap_single_attrs(struct virtqueue *_vq, dma_addr_t addr, size_t size, enum dma_data_direction dir, unsigned long attrs) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return; dma_unmap_single_attrs(vring_dma_dev(vq), addr, size, dir, attrs); } EXPORT_SYMBOL_GPL(virtqueue_dma_unmap_single_attrs); /** * virtqueue_dma_mapping_error - check dma address * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * * Returns 0 means dma valid. Other means invalid dma address. */ int virtqueue_dma_mapping_error(struct virtqueue *_vq, dma_addr_t addr) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return 0; return dma_mapping_error(vring_dma_dev(vq), addr); } EXPORT_SYMBOL_GPL(virtqueue_dma_mapping_error); /** * virtqueue_dma_need_sync - check a dma address needs sync * @_vq: the struct virtqueue we're talking about. 
* @addr: DMA address * * Check if the dma address mapped by the virtqueue_dma_map_* APIs needs to be * synchronized * * return bool */ bool virtqueue_dma_need_sync(struct virtqueue *_vq, dma_addr_t addr) { struct vring_virtqueue *vq = to_vvq(_vq); if (!vq->use_dma_api) return false; return dma_need_sync(vring_dma_dev(vq), addr); } EXPORT_SYMBOL_GPL(virtqueue_dma_need_sync); /** * virtqueue_dma_sync_single_range_for_cpu - dma sync for cpu * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * * Before calling this function, use virtqueue_dma_need_sync() to confirm that * the DMA address really needs to be synchronized * */ void virtqueue_dma_sync_single_range_for_cpu(struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) return; dma_sync_single_range_for_cpu(dev, addr, offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_cpu); /** * virtqueue_dma_sync_single_range_for_device - dma sync for device * @_vq: the struct virtqueue we're talking about. * @addr: DMA address * @offset: DMA address offset * @size: buf size for sync * @dir: DMA direction * * Before calling this function, use virtqueue_dma_need_sync() to confirm that * the DMA address really needs to be synchronized */ void virtqueue_dma_sync_single_range_for_device(struct virtqueue *_vq, dma_addr_t addr, unsigned long offset, size_t size, enum dma_data_direction dir) { struct vring_virtqueue *vq = to_vvq(_vq); struct device *dev = vring_dma_dev(vq); if (!vq->use_dma_api) return; dma_sync_single_range_for_device(dev, addr, offset, size, dir); } EXPORT_SYMBOL_GPL(virtqueue_dma_sync_single_range_for_device); MODULE_LICENSE("GPL");
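/*
 * Illustrative sketch (not part of virtio_ring.c above): how a driver might
 * use the exported virtqueue API to queue one output buffer and later reap
 * completions. The function names example_send_buf()/example_reap_used() and
 * the calling context are hypothetical; a real driver obtains its virtqueue
 * from the transport (e.g. virtio_find_vqs()), includes <linux/virtio.h> and
 * <linux/scatterlist.h>, and serializes these calls as the kernel-doc above
 * requires.
 */
static int example_send_buf(struct virtqueue *vq, void *buf, unsigned int len)
{
	struct scatterlist sg;
	int err;

	sg_init_one(&sg, buf, len);

	/* Expose one device-readable buffer; "buf" doubles as the token. */
	err = virtqueue_add_outbuf(vq, &sg, 1, buf, GFP_ATOMIC);
	if (err)
		return err;

	/*
	 * Split kick: virtqueue_kick_prepare() must be serialized with other
	 * virtqueue operations, but virtqueue_notify() need not be.
	 */
	if (virtqueue_kick_prepare(vq))
		virtqueue_notify(vq);

	return 0;
}

static void example_reap_used(struct virtqueue *vq)
{
	unsigned int len;
	void *token;

	/* Tokens come back in completion order with the written length. */
	while ((token = virtqueue_get_buf(vq, &len)) != NULL)
		pr_debug("buffer %p completed, %u bytes used\n", token, len);
}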
// SPDX-License-Identifier: GPL-2.0-only /* * * Copyright (C) Hans Alblas PE1AYX <hans@esrac.ele.tue.nl> * Copyright (C) 2004, 05 Ralf Baechle DL5RB <ralf@linux-mips.org> * Copyright (C) 2004, 05 Thomas Osterried DL9SAU <thomas@x-berg.in-berlin.de> */ #include <linux/module.h> #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/crc16.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/major.h> #include <linux/init.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/jiffies.h> #include <linux/refcount.h> #include <net/ax25.h> #define AX_MTU 236 /* some arch define END as assembly function ending, just undef it */ #undef END /* SLIP/KISS protocol characters. */ #define END 0300 /* indicates end of frame */ #define ESC 0333 /* indicates byte stuffing */ #define ESC_END 0334 /* ESC ESC_END means END 'data' */ #define ESC_ESC 0335 /* ESC ESC_ESC means ESC 'data' */ struct mkiss { struct tty_struct *tty; /* ptr to TTY structure */ struct net_device *dev; /* easy for intr handling */ /* These are pointers to the malloc()ed frame buffers. */ spinlock_t buflock;/* lock for rbuf and xbuf */ unsigned char *rbuff; /* receiver buffer */ int rcount; /* received chars counter */ unsigned char *xbuff; /* transmitter buffer */ unsigned char *xhead; /* pointer to next byte to XMIT */ int xleft; /* bytes left in XMIT queue */ /* Detailed SLIP statistics. */ int mtu; /* Our mtu (to spot changes!) */ int buffsize; /* Max buffers sizes */ unsigned long flags; /* Flag values/ mode etc */ /* long req'd: used by set_bit --RR */ #define AXF_INUSE 0 /* Channel in use */ #define AXF_ESCAPE 1 /* ESC received */ #define AXF_ERROR 2 /* Parity, etc. error */ #define AXF_KEEPTEST 3 /* Keepalive test flag */ #define AXF_OUTWAIT 4 /* is outpacket was flag */ int mode; int crcmode; /* MW: for FlexNet, SMACK etc.
*/ int crcauto; /* CRC auto mode */ #define CRC_MODE_NONE 0 #define CRC_MODE_FLEX 1 #define CRC_MODE_SMACK 2 #define CRC_MODE_FLEX_TEST 3 #define CRC_MODE_SMACK_TEST 4 refcount_t refcnt; struct completion dead; }; /*---------------------------------------------------------------------------*/ static const unsigned short crc_flex_table[] = { 0x0f87, 0x1e0e, 0x2c95, 0x3d1c, 0x49a3, 0x582a, 0x6ab1, 0x7b38, 0x83cf, 0x9246, 0xa0dd, 0xb154, 0xc5eb, 0xd462, 0xe6f9, 0xf770, 0x1f06, 0x0e8f, 0x3c14, 0x2d9d, 0x5922, 0x48ab, 0x7a30, 0x6bb9, 0x934e, 0x82c7, 0xb05c, 0xa1d5, 0xd56a, 0xc4e3, 0xf678, 0xe7f1, 0x2e85, 0x3f0c, 0x0d97, 0x1c1e, 0x68a1, 0x7928, 0x4bb3, 0x5a3a, 0xa2cd, 0xb344, 0x81df, 0x9056, 0xe4e9, 0xf560, 0xc7fb, 0xd672, 0x3e04, 0x2f8d, 0x1d16, 0x0c9f, 0x7820, 0x69a9, 0x5b32, 0x4abb, 0xb24c, 0xa3c5, 0x915e, 0x80d7, 0xf468, 0xe5e1, 0xd77a, 0xc6f3, 0x4d83, 0x5c0a, 0x6e91, 0x7f18, 0x0ba7, 0x1a2e, 0x28b5, 0x393c, 0xc1cb, 0xd042, 0xe2d9, 0xf350, 0x87ef, 0x9666, 0xa4fd, 0xb574, 0x5d02, 0x4c8b, 0x7e10, 0x6f99, 0x1b26, 0x0aaf, 0x3834, 0x29bd, 0xd14a, 0xc0c3, 0xf258, 0xe3d1, 0x976e, 0x86e7, 0xb47c, 0xa5f5, 0x6c81, 0x7d08, 0x4f93, 0x5e1a, 0x2aa5, 0x3b2c, 0x09b7, 0x183e, 0xe0c9, 0xf140, 0xc3db, 0xd252, 0xa6ed, 0xb764, 0x85ff, 0x9476, 0x7c00, 0x6d89, 0x5f12, 0x4e9b, 0x3a24, 0x2bad, 0x1936, 0x08bf, 0xf048, 0xe1c1, 0xd35a, 0xc2d3, 0xb66c, 0xa7e5, 0x957e, 0x84f7, 0x8b8f, 0x9a06, 0xa89d, 0xb914, 0xcdab, 0xdc22, 0xeeb9, 0xff30, 0x07c7, 0x164e, 0x24d5, 0x355c, 0x41e3, 0x506a, 0x62f1, 0x7378, 0x9b0e, 0x8a87, 0xb81c, 0xa995, 0xdd2a, 0xcca3, 0xfe38, 0xefb1, 0x1746, 0x06cf, 0x3454, 0x25dd, 0x5162, 0x40eb, 0x7270, 0x63f9, 0xaa8d, 0xbb04, 0x899f, 0x9816, 0xeca9, 0xfd20, 0xcfbb, 0xde32, 0x26c5, 0x374c, 0x05d7, 0x145e, 0x60e1, 0x7168, 0x43f3, 0x527a, 0xba0c, 0xab85, 0x991e, 0x8897, 0xfc28, 0xeda1, 0xdf3a, 0xceb3, 0x3644, 0x27cd, 0x1556, 0x04df, 0x7060, 0x61e9, 0x5372, 0x42fb, 0xc98b, 0xd802, 0xea99, 0xfb10, 0x8faf, 0x9e26, 0xacbd, 0xbd34, 0x45c3, 0x544a, 0x66d1, 0x7758, 0x03e7, 0x126e, 0x20f5, 0x317c, 0xd90a, 0xc883, 0xfa18, 0xeb91, 0x9f2e, 0x8ea7, 0xbc3c, 0xadb5, 0x5542, 0x44cb, 0x7650, 0x67d9, 0x1366, 0x02ef, 0x3074, 0x21fd, 0xe889, 0xf900, 0xcb9b, 0xda12, 0xaead, 0xbf24, 0x8dbf, 0x9c36, 0x64c1, 0x7548, 0x47d3, 0x565a, 0x22e5, 0x336c, 0x01f7, 0x107e, 0xf808, 0xe981, 0xdb1a, 0xca93, 0xbe2c, 0xafa5, 0x9d3e, 0x8cb7, 0x7440, 0x65c9, 0x5752, 0x46db, 0x3264, 0x23ed, 0x1176, 0x00ff }; static unsigned short calc_crc_flex(unsigned char *cp, int size) { unsigned short crc = 0xffff; while (size--) crc = (crc << 8) ^ crc_flex_table[((crc >> 8) ^ *cp++) & 0xff]; return crc; } static int check_crc_flex(unsigned char *cp, int size) { unsigned short crc = 0xffff; if (size < 3) return -1; while (size--) crc = (crc << 8) ^ crc_flex_table[((crc >> 8) ^ *cp++) & 0xff]; if ((crc & 0xffff) != 0x7070) return -1; return 0; } static int check_crc_16(unsigned char *cp, int size) { unsigned short crc = 0x0000; if (size < 3) return -1; crc = crc16(0, cp, size); if (crc != 0x0000) return -1; return 0; } /* * Standard encapsulation */ static int kiss_esc(unsigned char *s, unsigned char *d, int len) { unsigned char *ptr = d; unsigned char c; /* * Send an initial END character to flush out any data that may have * accumulated in the receiver due to line noise. 
*/ *ptr++ = END; while (len-- > 0) { switch (c = *s++) { case END: *ptr++ = ESC; *ptr++ = ESC_END; break; case ESC: *ptr++ = ESC; *ptr++ = ESC_ESC; break; default: *ptr++ = c; break; } } *ptr++ = END; return ptr - d; } /* * MW: * OK its ugly, but tell me a better solution without copying the * packet to a temporary buffer :-) */ static int kiss_esc_crc(unsigned char *s, unsigned char *d, unsigned short crc, int len) { unsigned char *ptr = d; unsigned char c=0; *ptr++ = END; while (len > 0) { if (len > 2) c = *s++; else if (len > 1) c = crc >> 8; else c = crc & 0xff; len--; switch (c) { case END: *ptr++ = ESC; *ptr++ = ESC_END; break; case ESC: *ptr++ = ESC; *ptr++ = ESC_ESC; break; default: *ptr++ = c; break; } } *ptr++ = END; return ptr - d; } /* Send one completely decapsulated AX.25 packet to the AX.25 layer. */ static void ax_bump(struct mkiss *ax) { struct sk_buff *skb; int count; spin_lock_bh(&ax->buflock); if (ax->rbuff[0] > 0x0f) { if (ax->rbuff[0] & 0x80) { if (check_crc_16(ax->rbuff, ax->rcount) < 0) { ax->dev->stats.rx_errors++; spin_unlock_bh(&ax->buflock); return; } if (ax->crcmode != CRC_MODE_SMACK && ax->crcauto) { printk(KERN_INFO "mkiss: %s: Switching to crc-smack\n", ax->dev->name); ax->crcmode = CRC_MODE_SMACK; } ax->rcount -= 2; *ax->rbuff &= ~0x80; } else if (ax->rbuff[0] & 0x20) { if (check_crc_flex(ax->rbuff, ax->rcount) < 0) { ax->dev->stats.rx_errors++; spin_unlock_bh(&ax->buflock); return; } if (ax->crcmode != CRC_MODE_FLEX && ax->crcauto) { printk(KERN_INFO "mkiss: %s: Switching to crc-flexnet\n", ax->dev->name); ax->crcmode = CRC_MODE_FLEX; } ax->rcount -= 2; /* * dl9sau bugfix: the trailling two bytes flexnet crc * will not be passed to the kernel. thus we have to * correct the kissparm signature, because it indicates * a crc but there's none */ *ax->rbuff &= ~0x20; } } count = ax->rcount; if ((skb = dev_alloc_skb(count)) == NULL) { printk(KERN_ERR "mkiss: %s: memory squeeze, dropping packet.\n", ax->dev->name); ax->dev->stats.rx_dropped++; spin_unlock_bh(&ax->buflock); return; } skb_put_data(skb, ax->rbuff, count); skb->protocol = ax25_type_trans(skb, ax->dev); netif_rx(skb); ax->dev->stats.rx_packets++; ax->dev->stats.rx_bytes += count; spin_unlock_bh(&ax->buflock); } static void kiss_unesc(struct mkiss *ax, unsigned char s) { switch (s) { case END: /* drop keeptest bit = VSV */ if (test_bit(AXF_KEEPTEST, &ax->flags)) clear_bit(AXF_KEEPTEST, &ax->flags); if (!test_and_clear_bit(AXF_ERROR, &ax->flags) && (ax->rcount > 2)) ax_bump(ax); clear_bit(AXF_ESCAPE, &ax->flags); ax->rcount = 0; return; case ESC: set_bit(AXF_ESCAPE, &ax->flags); return; case ESC_ESC: if (test_and_clear_bit(AXF_ESCAPE, &ax->flags)) s = ESC; break; case ESC_END: if (test_and_clear_bit(AXF_ESCAPE, &ax->flags)) s = END; break; } spin_lock_bh(&ax->buflock); if (!test_bit(AXF_ERROR, &ax->flags)) { if (ax->rcount < ax->buffsize) { ax->rbuff[ax->rcount++] = s; spin_unlock_bh(&ax->buflock); return; } ax->dev->stats.rx_over_errors++; set_bit(AXF_ERROR, &ax->flags); } spin_unlock_bh(&ax->buflock); } static int ax_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr_ax25 *sa = addr; netif_tx_lock_bh(dev); netif_addr_lock(dev); __dev_addr_set(dev, &sa->sax25_call, AX25_ADDR_LEN); netif_addr_unlock(dev); netif_tx_unlock_bh(dev); return 0; } /*---------------------------------------------------------------------------*/ static void ax_changedmtu(struct mkiss *ax) { struct net_device *dev = ax->dev; unsigned char *xbuff, *rbuff, *oxbuff, *orbuff; int len; len = dev->mtu * 2; /* * allow 
for arrival of larger UDP packets, even if we say not to * also fixes a bug in which SunOS sends 512-byte packets even with * an MSS of 128 */ if (len < 576 * 2) len = 576 * 2; xbuff = kmalloc(len + 4, GFP_ATOMIC); rbuff = kmalloc(len + 4, GFP_ATOMIC); if (xbuff == NULL || rbuff == NULL) { printk(KERN_ERR "mkiss: %s: unable to grow ax25 buffers, " "MTU change cancelled.\n", ax->dev->name); dev->mtu = ax->mtu; kfree(xbuff); kfree(rbuff); return; } spin_lock_bh(&ax->buflock); oxbuff = ax->xbuff; ax->xbuff = xbuff; orbuff = ax->rbuff; ax->rbuff = rbuff; if (ax->xleft) { if (ax->xleft <= len) { memcpy(ax->xbuff, ax->xhead, ax->xleft); } else { ax->xleft = 0; dev->stats.tx_dropped++; } } ax->xhead = ax->xbuff; if (ax->rcount) { if (ax->rcount <= len) { memcpy(ax->rbuff, orbuff, ax->rcount); } else { ax->rcount = 0; dev->stats.rx_over_errors++; set_bit(AXF_ERROR, &ax->flags); } } ax->mtu = dev->mtu + 73; ax->buffsize = len; spin_unlock_bh(&ax->buflock); kfree(oxbuff); kfree(orbuff); } /* Encapsulate one AX.25 packet and stuff into a TTY queue. */ static void ax_encaps(struct net_device *dev, unsigned char *icp, int len) { struct mkiss *ax = netdev_priv(dev); unsigned char *p; int actual, count; if (ax->mtu != ax->dev->mtu + 73) /* Someone has been ifconfigging */ ax_changedmtu(ax); if (len > ax->mtu) { /* Sigh, shouldn't occur BUT ... */ printk(KERN_ERR "mkiss: %s: truncating oversized transmit packet!\n", ax->dev->name); dev->stats.tx_dropped++; netif_start_queue(dev); return; } p = icp; spin_lock_bh(&ax->buflock); if ((*p & 0x0f) != 0) { /* Configuration Command (kissparms(1). * Protocol spec says: never append CRC. * This fixes a very old bug in the linux * kiss driver. -- dl9sau */ switch (*p & 0xff) { case 0x85: /* command from userspace especially for us, * not for delivery to the tnc */ if (len > 1) { int cmd = (p[1] & 0xff); switch(cmd) { case 3: ax->crcmode = CRC_MODE_SMACK; break; case 2: ax->crcmode = CRC_MODE_FLEX; break; case 1: ax->crcmode = CRC_MODE_NONE; break; case 0: default: ax->crcmode = CRC_MODE_SMACK_TEST; cmd = 0; } ax->crcauto = (cmd ? 0 : 1); printk(KERN_INFO "mkiss: %s: crc mode set to %d\n", ax->dev->name, cmd); } spin_unlock_bh(&ax->buflock); netif_start_queue(dev); return; default: count = kiss_esc(p, ax->xbuff, len); } } else { unsigned short crc; switch (ax->crcmode) { case CRC_MODE_SMACK_TEST: ax->crcmode = CRC_MODE_FLEX_TEST; printk(KERN_INFO "mkiss: %s: Trying crc-smack\n", ax->dev->name); fallthrough; case CRC_MODE_SMACK: *p |= 0x80; crc = swab16(crc16(0, p, len)); count = kiss_esc_crc(p, ax->xbuff, crc, len+2); break; case CRC_MODE_FLEX_TEST: ax->crcmode = CRC_MODE_NONE; printk(KERN_INFO "mkiss: %s: Trying crc-flexnet\n", ax->dev->name); fallthrough; case CRC_MODE_FLEX: *p |= 0x20; crc = calc_crc_flex(p, len); count = kiss_esc_crc(p, ax->xbuff, crc, len+2); break; default: count = kiss_esc(p, ax->xbuff, len); } } spin_unlock_bh(&ax->buflock); set_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); actual = ax->tty->ops->write(ax->tty, ax->xbuff, count); dev->stats.tx_packets++; dev->stats.tx_bytes += actual; netif_trans_update(ax->dev); ax->xleft = count - actual; ax->xhead = ax->xbuff + actual; } /* Encapsulate an AX.25 packet and kick it into a TTY queue. 
*/ static netdev_tx_t ax_xmit(struct sk_buff *skb, struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); if (skb->protocol == htons(ETH_P_IP)) return ax25_ip_xmit(skb); if (!netif_running(dev)) { printk(KERN_ERR "mkiss: %s: xmit call when iface is down\n", dev->name); return NETDEV_TX_BUSY; } if (netif_queue_stopped(dev)) { /* * May be we must check transmitter timeout here ? * 14 Oct 1994 Dmitry Gorodchanin. */ if (time_before(jiffies, dev_trans_start(dev) + 20 * HZ)) { /* 20 sec timeout not reached */ return NETDEV_TX_BUSY; } printk(KERN_ERR "mkiss: %s: transmit timed out, %s?\n", dev->name, (tty_chars_in_buffer(ax->tty) || ax->xleft) ? "bad line quality" : "driver error"); ax->xleft = 0; clear_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); netif_start_queue(dev); } /* We were not busy, so we are now... :-) */ netif_stop_queue(dev); ax_encaps(dev, skb->data, skb->len); kfree_skb(skb); return NETDEV_TX_OK; } static int ax_open_dev(struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); if (ax->tty == NULL) return -ENODEV; return 0; } /* Open the low-level part of the AX25 channel. Easy! */ static int ax_open(struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); unsigned long len; if (ax->tty == NULL) return -ENODEV; /* * Allocate the frame buffers: * * rbuff Receive buffer. * xbuff Transmit buffer. */ len = dev->mtu * 2; /* * allow for arrival of larger UDP packets, even if we say not to * also fixes a bug in which SunOS sends 512-byte packets even with * an MSS of 128 */ if (len < 576 * 2) len = 576 * 2; if ((ax->rbuff = kmalloc(len + 4, GFP_KERNEL)) == NULL) goto norbuff; if ((ax->xbuff = kmalloc(len + 4, GFP_KERNEL)) == NULL) goto noxbuff; ax->mtu = dev->mtu + 73; ax->buffsize = len; ax->rcount = 0; ax->xleft = 0; ax->flags &= (1 << AXF_INUSE); /* Clear ESCAPE & ERROR flags */ spin_lock_init(&ax->buflock); return 0; noxbuff: kfree(ax->rbuff); norbuff: return -ENOMEM; } /* Close the low-level part of the AX25 channel. Easy! */ static int ax_close(struct net_device *dev) { struct mkiss *ax = netdev_priv(dev); if (ax->tty) clear_bit(TTY_DO_WRITE_WAKEUP, &ax->tty->flags); netif_stop_queue(dev); return 0; } static const struct net_device_ops ax_netdev_ops = { .ndo_open = ax_open_dev, .ndo_stop = ax_close, .ndo_start_xmit = ax_xmit, .ndo_set_mac_address = ax_set_mac_address, }; static void ax_setup(struct net_device *dev) { /* Finish setting up the DEVICE info. */ dev->mtu = AX_MTU; dev->hard_header_len = AX25_MAX_HEADER_LEN; dev->addr_len = AX25_ADDR_LEN; dev->type = ARPHRD_AX25; dev->tx_queue_len = 10; dev->header_ops = &ax25_header_ops; dev->netdev_ops = &ax_netdev_ops; memcpy(dev->broadcast, &ax25_bcast, AX25_ADDR_LEN); dev_addr_set(dev, (u8 *)&ax25_defaddr); dev->flags = IFF_BROADCAST | IFF_MULTICAST; } /* * We have a potential race on dereferencing tty->disc_data, because the tty * layer provides no locking at all - thus one cpu could be running * sixpack_receive_buf while another calls sixpack_close, which zeroes * tty->disc_data and frees the memory that sixpack_receive_buf is using. The * best way to fix this is to use a rwlock in the tty struct, but for now we * use a single global rwlock for all ttys in ppp line discipline. 
*/ static DEFINE_RWLOCK(disc_data_lock); static struct mkiss *mkiss_get(struct tty_struct *tty) { struct mkiss *ax; read_lock(&disc_data_lock); ax = tty->disc_data; if (ax) refcount_inc(&ax->refcnt); read_unlock(&disc_data_lock); return ax; } static void mkiss_put(struct mkiss *ax) { if (refcount_dec_and_test(&ax->refcnt)) complete(&ax->dead); } static int crc_force = 0; /* Can be overridden with insmod */ static int mkiss_open(struct tty_struct *tty) { struct net_device *dev; struct mkiss *ax; int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; if (tty->ops->write == NULL) return -EOPNOTSUPP; dev = alloc_netdev(sizeof(struct mkiss), "ax%d", NET_NAME_UNKNOWN, ax_setup); if (!dev) { err = -ENOMEM; goto out; } ax = netdev_priv(dev); ax->dev = dev; spin_lock_init(&ax->buflock); refcount_set(&ax->refcnt, 1); init_completion(&ax->dead); ax->tty = tty; tty->disc_data = ax; tty->receive_room = 65535; tty_driver_flush_buffer(tty); /* Restore default settings */ dev->type = ARPHRD_AX25; /* Perform the low-level AX25 initialization. */ err = ax_open(ax->dev); if (err) goto out_free_netdev; err = register_netdev(dev); if (err) goto out_free_buffers; /* after register_netdev() - because else printk smashes the kernel */ switch (crc_force) { case 3: ax->crcmode = CRC_MODE_SMACK; printk(KERN_INFO "mkiss: %s: crc mode smack forced.\n", ax->dev->name); break; case 2: ax->crcmode = CRC_MODE_FLEX; printk(KERN_INFO "mkiss: %s: crc mode flexnet forced.\n", ax->dev->name); break; case 1: ax->crcmode = CRC_MODE_NONE; printk(KERN_INFO "mkiss: %s: crc mode disabled.\n", ax->dev->name); break; case 0: default: crc_force = 0; printk(KERN_INFO "mkiss: %s: crc mode is auto.\n", ax->dev->name); ax->crcmode = CRC_MODE_SMACK_TEST; } ax->crcauto = (crc_force ? 0 : 1); netif_start_queue(dev); /* Done. We have linked the TTY line to a channel. */ return 0; out_free_buffers: kfree(ax->rbuff); kfree(ax->xbuff); out_free_netdev: free_netdev(dev); out: return err; } static void mkiss_close(struct tty_struct *tty) { struct mkiss *ax; write_lock_irq(&disc_data_lock); ax = tty->disc_data; tty->disc_data = NULL; write_unlock_irq(&disc_data_lock); if (!ax) return; /* * We have now ensured that nobody can start using ap from now on, but * we have to wait for all existing users to finish. */ if (!refcount_dec_and_test(&ax->refcnt)) wait_for_completion(&ax->dead); /* * Halt the transmit queue so that a new transmit cannot scribble * on our buffers */ netif_stop_queue(ax->dev); unregister_netdev(ax->dev); /* Free all AX25 frame buffers after unreg. */ kfree(ax->rbuff); kfree(ax->xbuff); ax->tty = NULL; free_netdev(ax->dev); } /* Perform I/O control on an active ax25 channel. */ static int mkiss_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned long arg) { struct mkiss *ax = mkiss_get(tty); struct net_device *dev; unsigned int tmp, err; /* First make sure we're connected. */ if (ax == NULL) return -ENXIO; dev = ax->dev; switch (cmd) { case SIOCGIFNAME: err = copy_to_user((void __user *) arg, ax->dev->name, strlen(ax->dev->name) + 1) ? 
-EFAULT : 0; break; case SIOCGIFENCAP: err = put_user(4, (int __user *) arg); break; case SIOCSIFENCAP: if (get_user(tmp, (int __user *) arg)) { err = -EFAULT; break; } ax->mode = tmp; dev->addr_len = AX25_ADDR_LEN; dev->hard_header_len = AX25_KISS_HEADER_LEN + AX25_MAX_HEADER_LEN + 3; dev->type = ARPHRD_AX25; err = 0; break; case SIOCSIFHWADDR: { char addr[AX25_ADDR_LEN]; if (copy_from_user(&addr, (void __user *) arg, AX25_ADDR_LEN)) { err = -EFAULT; break; } netif_tx_lock_bh(dev); __dev_addr_set(dev, addr, AX25_ADDR_LEN); netif_tx_unlock_bh(dev); err = 0; break; } default: err = -ENOIOCTLCMD; } mkiss_put(ax); return err; } /* * Handle the 'receiver data ready' interrupt. * This function is called by the 'tty_io' module in the kernel when * a block of data has been received, which can now be decapsulated * and sent on to the AX.25 layer for further processing. */ static void mkiss_receive_buf(struct tty_struct *tty, const u8 *cp, const u8 *fp, size_t count) { struct mkiss *ax = mkiss_get(tty); if (!ax) return; /* * Argh! mtu change time! - costs us the packet part received * at the change */ if (ax->mtu != ax->dev->mtu + 73) ax_changedmtu(ax); /* Read the characters out of the buffer */ while (count--) { if (fp != NULL && *fp++) { if (!test_and_set_bit(AXF_ERROR, &ax->flags)) ax->dev->stats.rx_errors++; cp++; continue; } kiss_unesc(ax, *cp++); } mkiss_put(ax); tty_unthrottle(tty); } /* * Called by the driver when there's room for more data. If we have * more packets to send, we send them here. */ static void mkiss_write_wakeup(struct tty_struct *tty) { struct mkiss *ax = mkiss_get(tty); int actual; if (!ax) return; if (ax->xleft <= 0) { /* Now serial buffer is almost free & we can start * transmission of another packet */ clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags); netif_wake_queue(ax->dev); goto out; } actual = tty->ops->write(tty, ax->xhead, ax->xleft); ax->xleft -= actual; ax->xhead += actual; out: mkiss_put(ax); } static struct tty_ldisc_ops ax_ldisc = { .owner = THIS_MODULE, .num = N_AX25, .name = "mkiss", .open = mkiss_open, .close = mkiss_close, .ioctl = mkiss_ioctl, .receive_buf = mkiss_receive_buf, .write_wakeup = mkiss_write_wakeup }; static const char banner[] __initconst = KERN_INFO \ "mkiss: AX.25 Multikiss, Hans Albas PE1AYX\n"; static const char msg_regfail[] __initconst = KERN_ERR \ "mkiss: can't register line discipline (err = %d)\n"; static int __init mkiss_init_driver(void) { int status; printk(banner); status = tty_register_ldisc(&ax_ldisc); if (status != 0) printk(msg_regfail, status); return status; } static void __exit mkiss_exit_driver(void) { tty_unregister_ldisc(&ax_ldisc); } MODULE_AUTHOR("Ralf Baechle DL5RB <ralf@linux-mips.org>"); MODULE_DESCRIPTION("KISS driver for AX.25 over TTYs"); module_param(crc_force, int, 0); MODULE_PARM_DESC(crc_force, "crc [0 = auto | 1 = none | 2 = flexnet | 3 = smack]"); MODULE_LICENSE("GPL"); MODULE_ALIAS_LDISC(N_AX25); module_init(mkiss_init_driver); module_exit(mkiss_exit_driver);
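/*
 * Illustrative sketch (not part of mkiss.c above): the KISS byte stuffing
 * performed by kiss_esc(), shown on a 3-byte payload. Worst case every
 * payload byte is escaped, which is why the transmit/receive buffers are
 * sized at roughly twice the MTU in ax_open()/ax_changedmtu(). The function
 * name example_kiss_framing() and the local buffers are hypothetical.
 */
static void example_kiss_framing(void)
{
	unsigned char payload[3] = { 0x42, END, ESC };	/* END/ESC need stuffing */
	unsigned char frame[2 + 2 * sizeof(payload)];	/* delimiters + worst case */
	int n;

	n = kiss_esc(payload, frame, sizeof(payload));

	/*
	 * frame now holds: END, 0x42, ESC, ESC_END, ESC, ESC_ESC, END,
	 * i.e. n == 7: payload bytes equal to END or ESC were replaced by
	 * two-byte escape sequences and the frame is wrapped in END markers.
	 */
	pr_debug("mkiss example: escaped %zu payload bytes into %d frame bytes\n",
		 sizeof(payload), n);
}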
/* SPDX-License-Identifier: GPL-2.0-only */ /* include/net/xdp.h * * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. */ #ifndef __LINUX_NET_XDP_H__ #define __LINUX_NET_XDP_H__ #include <linux/bitfield.h> #include <linux/filter.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* skb_shared_info */ /** * DOC: XDP RX-queue information * * The XDP RX-queue info (xdp_rxq_info) is associated with the driver * level RX-ring queues. It is information that is specific to how * the driver has configured a given RX-ring queue. * * Each xdp_buff frame received in the driver carries a (pointer) * reference to this xdp_rxq_info structure. This provides the XDP * data-path read-access to RX-info for both kernel and bpf-side * (limited subset). * * For now, direct access is only safe while running in NAPI/softirq * context. Contents are read-mostly and must not be updated during * driver NAPI/softirq poll. * * The driver usage API is a register and unregister API. * * The struct is not directly tied to the XDP prog. A new XDP prog * can be attached as long as it doesn't change the underlying * RX-ring. If the RX-ring does change significantly, the NIC driver * naturally needs to stop the RX-ring before purging and reallocating * memory. In that process the driver MUST call unregister (which * also applies for driver shutdown and unload). The register API is * also mandatory during RX-ring setup.
*/ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; /* XDP flags for ndo_xdp_xmit */ #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; }; struct page_pool; struct xdp_rxq_info { struct net_device *dev; u32 queue_index; u32 reg_state; struct xdp_mem_info mem; unsigned int napi_id; u32 frag_size; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ struct xdp_txq_info { struct net_device *dev; }; enum xdp_buff_flags { XDP_FLAGS_HAS_FRAGS = BIT(0), /* non-linear xdp buff */ XDP_FLAGS_FRAGS_PF_MEMALLOC = BIT(1), /* xdp paged memory is under * pressure */ }; struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_buff_has_frags(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline void xdp_buff_set_frags_flag(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_HAS_FRAGS; } static __always_inline void xdp_buff_clear_frags_flag(struct xdp_buff *xdp) { xdp->flags &= ~XDP_FLAGS_HAS_FRAGS; } static __always_inline bool xdp_buff_is_frag_pfmemalloc(struct xdp_buff *xdp) { return !!(xdp->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } static __always_inline void xdp_buff_set_frag_pfmemalloc(struct xdp_buff *xdp) { xdp->flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } static __always_inline void xdp_init_buff(struct xdp_buff *xdp, u32 frame_sz, struct xdp_rxq_info *rxq) { xdp->frame_sz = frame_sz; xdp->rxq = rxq; xdp->flags = 0; } static __always_inline void xdp_prepare_buff(struct xdp_buff *xdp, unsigned char *hard_start, int headroom, int data_len, const bool meta_valid) { unsigned char *data = hard_start + headroom; xdp->data_hard_start = hard_start; xdp->data = data; xdp->data_end = data + data_len; xdp->data_meta = meta_valid ? data : data + 1; } /* Reserve memory area at end-of data area. * * This macro reserves tailroom in the XDP buffer by limiting the * XDP/BPF data access to data_hard_end. Notice same area (and size) * is used for XDP_PASS, when constructing the SKB via build_skb(). */ #define xdp_data_hard_end(xdp) \ ((xdp)->data_hard_start + (xdp)->frame_sz - \ SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) static inline struct skb_shared_info * xdp_get_shared_info_from_buff(struct xdp_buff *xdp) { return (struct skb_shared_info *)xdp_data_hard_end(xdp); } static __always_inline unsigned int xdp_get_buff_len(struct xdp_buff *xdp) { unsigned int len = xdp->data_end - xdp->data; struct skb_shared_info *sinfo; if (likely(!xdp_buff_has_frags(xdp))) goto out; sinfo = xdp_get_shared_info_from_buff(xdp); len += sinfo->xdp_frags_size; out: return len; } struct xdp_frame { void *data; u16 len; u16 headroom; u32 metasize; /* uses lower 8-bits */ /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. 
*/ struct xdp_mem_info mem; struct net_device *dev_rx; /* used by cpumap */ u32 frame_sz; u32 flags; /* supported values defined in xdp_buff_flags */ }; static __always_inline bool xdp_frame_has_frags(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_HAS_FRAGS); } static __always_inline bool xdp_frame_is_frag_pfmemalloc(struct xdp_frame *frame) { return !!(frame->flags & XDP_FLAGS_FRAGS_PF_MEMALLOC); } #define XDP_BULK_QUEUE_SIZE 16 struct xdp_frame_bulk { int count; void *xa; void *q[XDP_BULK_QUEUE_SIZE]; }; static __always_inline void xdp_frame_bulk_init(struct xdp_frame_bulk *bq) { /* bq->count will be zero'ed when bq->xa gets updated */ bq->xa = NULL; } static inline struct skb_shared_info * xdp_get_shared_info_from_frame(struct xdp_frame *frame) { void *data_hard_start = frame->data - frame->headroom - sizeof(*frame); return (struct skb_shared_info *)(data_hard_start + frame->frame_sz - SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); } struct xdp_cpumap_stats { unsigned int redirect; unsigned int pass; unsigned int drop; }; /* Clear kernel pointers in xdp_frame */ static inline void xdp_scrub_frame(struct xdp_frame *frame) { frame->data = NULL; frame->dev_rx = NULL; } static inline void xdp_update_skb_shared_info(struct sk_buff *skb, u8 nr_frags, unsigned int size, unsigned int truesize, bool pfmemalloc) { skb_shinfo(skb)->nr_frags = nr_frags; skb->len += size; skb->data_len += size; skb->truesize += truesize; skb->pfmemalloc |= pfmemalloc; } /* Avoids inlining WARN macro in fast-path */ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct sk_buff *skb, struct net_device *dev); struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf, struct net_device *dev); int xdp_alloc_skb_bulk(void **skbs, int n_skb, gfp_t gfp); struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) { xdp->data_hard_start = frame->data - frame->headroom - sizeof(*frame); xdp->data = frame->data; xdp->data_end = frame->data + frame->len; xdp->data_meta = frame->data - frame->metasize; xdp->frame_sz = frame->frame_sz; xdp->flags = frame->flags; } static inline int xdp_update_frame_from_buff(struct xdp_buff *xdp, struct xdp_frame *xdp_frame) { int metasize, headroom; /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; metasize = metasize > 0 ? 
metasize : 0; if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return -ENOSPC; /* Catch if driver didn't reserve tailroom for skb_shared_info */ if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { XDP_WARN("Driver BUG: missing reserved tailroom"); return -ENOSPC; } xdp_frame->data = xdp->data; xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; xdp_frame->frame_sz = xdp->frame_sz; xdp_frame->flags = xdp->flags; return 0; } /* Convert xdp_buff to xdp_frame */ static inline struct xdp_frame *xdp_convert_buff_to_frame(struct xdp_buff *xdp) { struct xdp_frame *xdp_frame; if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; if (unlikely(xdp_update_frame_from_buff(xdp, xdp_frame) < 0)) return NULL; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; return xdp_frame; } void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, struct xdp_buff *xdp); void xdp_return_frame(struct xdp_frame *xdpf); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf); void xdp_return_buff(struct xdp_buff *xdp); void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; unsigned int len = xdpf->len; if (likely(!xdp_frame_has_frags(xdpf))) goto out; sinfo = xdp_get_shared_info_from_frame(xdpf); len += sinfo->xdp_frags_size; out: return len; } int __xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id, u32 frag_size); static inline int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, struct net_device *dev, u32 queue_index, unsigned int napi_id) { return __xdp_rxq_info_reg(xdp_rxq, dev, queue_index, napi_id, 0); } void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq); void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq); bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq); int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, enum xdp_mem_type type, void *allocator); void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq); int xdp_reg_mem_model(struct xdp_mem_info *mem, enum xdp_mem_type type, void *allocator); void xdp_unreg_mem_model(struct xdp_mem_info *mem); /* Drivers not supporting XDP metadata can use this helper, which * rejects any room expansion for metadata as a result. 
*/ static __always_inline void xdp_set_data_meta_invalid(struct xdp_buff *xdp) { xdp->data_meta = xdp->data + 1; } static __always_inline bool xdp_data_meta_unsupported(const struct xdp_buff *xdp) { return unlikely(xdp->data_meta > xdp->data); } static inline bool xdp_metalen_invalid(unsigned long metalen) { unsigned long meta_max; meta_max = type_max(typeof_member(struct skb_shared_info, meta_len)); BUILD_BUG_ON(!__builtin_constant_p(meta_max)); return !IS_ALIGNED(metalen, sizeof(u32)) || metalen > meta_max; } struct xdp_attachment_info { struct bpf_prog *prog; u32 flags; }; struct netdev_bpf; void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf); #define DEV_MAP_BULK_SIZE XDP_BULK_QUEUE_SIZE /* Define the relationship between xdp-rx-metadata kfunc and * various other entities: * - xdp_rx_metadata enum * - netdev netlink enum (Documentation/netlink/specs/netdev.yaml) * - kfunc name * - xdp_metadata_ops field */ #define XDP_METADATA_KFUNC_xxx \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_TIMESTAMP, \ NETDEV_XDP_RX_METADATA_TIMESTAMP, \ bpf_xdp_metadata_rx_timestamp, \ xmo_rx_timestamp) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_HASH, \ NETDEV_XDP_RX_METADATA_HASH, \ bpf_xdp_metadata_rx_hash, \ xmo_rx_hash) \ XDP_METADATA_KFUNC(XDP_METADATA_KFUNC_RX_VLAN_TAG, \ NETDEV_XDP_RX_METADATA_VLAN_TAG, \ bpf_xdp_metadata_rx_vlan_tag, \ xmo_rx_vlan_tag) \ enum xdp_rx_metadata { #define XDP_METADATA_KFUNC(name, _, __, ___) name, XDP_METADATA_KFUNC_xxx #undef XDP_METADATA_KFUNC MAX_XDP_METADATA_KFUNC, }; enum xdp_rss_hash_type { /* First part: Individual bits for L3/L4 types */ XDP_RSS_L3_IPV4 = BIT(0), XDP_RSS_L3_IPV6 = BIT(1), /* The fixed (L3) IPv4 and IPv6 headers can both be followed by * variable/dynamic headers, IPv4 called Options and IPv6 called * Extension Headers. HW RSS type can contain this info. */ XDP_RSS_L3_DYNHDR = BIT(2), /* When RSS hash covers L4 then drivers MUST set XDP_RSS_L4 bit in * addition to the protocol specific bit. This ease interaction with * SKBs and avoids reserving a fixed mask for future L4 protocol bits. 
*/ XDP_RSS_L4 = BIT(3), /* L4 based hash, proto can be unknown */ XDP_RSS_L4_TCP = BIT(4), XDP_RSS_L4_UDP = BIT(5), XDP_RSS_L4_SCTP = BIT(6), XDP_RSS_L4_IPSEC = BIT(7), /* L4 based hash include IPSEC SPI */ XDP_RSS_L4_ICMP = BIT(8), /* Second part: RSS hash type combinations used for driver HW mapping */ XDP_RSS_TYPE_NONE = 0, XDP_RSS_TYPE_L2 = XDP_RSS_TYPE_NONE, XDP_RSS_TYPE_L3_IPV4 = XDP_RSS_L3_IPV4, XDP_RSS_TYPE_L3_IPV6 = XDP_RSS_L3_IPV6, XDP_RSS_TYPE_L3_IPV4_OPT = XDP_RSS_L3_IPV4 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L3_IPV6_EX = XDP_RSS_L3_IPV6 | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_ANY = XDP_RSS_L4, XDP_RSS_TYPE_L4_IPV4_TCP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV4_UDP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV4_SCTP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV4_IPSEC = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV4_ICMP = XDP_RSS_L3_IPV4 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_TCP, XDP_RSS_TYPE_L4_IPV6_UDP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_UDP, XDP_RSS_TYPE_L4_IPV6_SCTP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_SCTP, XDP_RSS_TYPE_L4_IPV6_IPSEC = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_IPSEC, XDP_RSS_TYPE_L4_IPV6_ICMP = XDP_RSS_L3_IPV6 | XDP_RSS_L4 | XDP_RSS_L4_ICMP, XDP_RSS_TYPE_L4_IPV6_TCP_EX = XDP_RSS_TYPE_L4_IPV6_TCP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_UDP_EX = XDP_RSS_TYPE_L4_IPV6_UDP | XDP_RSS_L3_DYNHDR, XDP_RSS_TYPE_L4_IPV6_SCTP_EX = XDP_RSS_TYPE_L4_IPV6_SCTP | XDP_RSS_L3_DYNHDR, }; struct xdp_metadata_ops { int (*xmo_rx_timestamp)(const struct xdp_md *ctx, u64 *timestamp); int (*xmo_rx_hash)(const struct xdp_md *ctx, u32 *hash, enum xdp_rss_hash_type *rss_type); int (*xmo_rx_vlan_tag)(const struct xdp_md *ctx, __be16 *vlan_proto, u16 *vlan_tci); }; #ifdef CONFIG_NET u32 bpf_xdp_metadata_kfunc_id(int id); bool bpf_dev_bound_kfunc_id(u32 btf_id); void xdp_set_features_flag(struct net_device *dev, xdp_features_t val); void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg); void xdp_features_clear_redirect_target(struct net_device *dev); #else static inline u32 bpf_xdp_metadata_kfunc_id(int id) { return 0; } static inline bool bpf_dev_bound_kfunc_id(u32 btf_id) { return false; } static inline void xdp_set_features_flag(struct net_device *dev, xdp_features_t val) { } static inline void xdp_features_set_redirect_target(struct net_device *dev, bool support_sg) { } static inline void xdp_features_clear_redirect_target(struct net_device *dev) { } #endif static inline void xdp_clear_features_flag(struct net_device *dev) { xdp_set_features_flag(dev, 0); } static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog, struct xdp_buff *xdp) { /* Driver XDP hooks are invoked within a single NAPI poll cycle and thus * under local_bh_disable(), which provides the needed RCU protection * for accessing map entries. */ u32 act = __bpf_prog_run(prog, xdp, BPF_DISPATCHER_FUNC(xdp)); if (static_branch_unlikely(&bpf_master_redirect_enabled_key)) { if (act == XDP_TX && netif_is_bond_slave(xdp->rxq->dev)) act = xdp_master_redirect(xdp); } return act; } #endif /* __LINUX_NET_XDP_H__ */
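/*
 * Example (not part of this header): a minimal sketch of the register/
 * unregister contract described in the DOC comment above, plus the
 * per-packet xdp_buff setup and program run. The driver structure
 * (struct my_rx_ring) and the my_* function names are hypothetical;
 * only the xdp_rxq_info_*, xdp_*_buff and bpf_prog_run_xdp helpers
 * come from <net/xdp.h>.
 */
#include <net/xdp.h>

struct my_rx_ring {
	struct net_device *netdev;
	struct page_pool *pp;
	struct xdp_rxq_info xdp_rxq;
	u32 queue_index;
	unsigned int napi_id;
};

/* Called while the RX-ring is being set up (register is mandatory here). */
static int my_ring_setup_xdp(struct my_rx_ring *ring)
{
	int err;

	err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
			       ring->queue_index, ring->napi_id);
	if (err)
		return err;

	/* Tell the core how buffers for this ring are recycled on return. */
	err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
					 MEM_TYPE_PAGE_POOL, ring->pp);
	if (err)
		xdp_rxq_info_unreg(&ring->xdp_rxq);
	return err;
}

/* MUST run before ring memory is purged/reallocated or the driver unloads. */
static void my_ring_teardown_xdp(struct my_rx_ring *ring)
{
	xdp_rxq_info_unreg(&ring->xdp_rxq);
}

/* Per-packet fast path, invoked from NAPI poll (softirq context). */
static u32 my_run_xdp(struct my_rx_ring *ring, struct bpf_prog *prog,
		      void *hard_start, int headroom, int len, u32 frame_sz)
{
	struct xdp_buff xdp;

	xdp_init_buff(&xdp, frame_sz, &ring->xdp_rxq);
	xdp_prepare_buff(&xdp, hard_start, headroom, len, false);

	return bpf_prog_run_xdp(prog, &xdp);
}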
// SPDX-License-Identifier: GPL-2.0
/*
 *
 * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved.
 *
 * Directory handling functions for NTFS-based filesystems.
 *
 */

#include <linux/fs.h>
#include <linux/nls.h>

#include "debug.h"
#include "ntfs.h"
#include "ntfs_fs.h"

/* Convert little endian UTF-16 to NLS string.
*/ int ntfs_utf16_to_nls(struct ntfs_sb_info *sbi, const __le16 *name, u32 len, u8 *buf, int buf_len) { int ret, warn; u8 *op; struct nls_table *nls = sbi->options->nls; static_assert(sizeof(wchar_t) == sizeof(__le16)); if (!nls) { /* UTF-16 -> UTF-8 */ ret = utf16s_to_utf8s((wchar_t *)name, len, UTF16_LITTLE_ENDIAN, buf, buf_len); buf[ret] = '\0'; return ret; } op = buf; warn = 0; while (len--) { u16 ec; int charlen; char dump[5]; if (buf_len < NLS_MAX_CHARSET_SIZE) { ntfs_warn(sbi->sb, "filename was truncated while converting."); break; } ec = le16_to_cpu(*name++); charlen = nls->uni2char(ec, op, buf_len); if (charlen > 0) { op += charlen; buf_len -= charlen; continue; } *op++ = '_'; buf_len -= 1; if (warn) continue; warn = 1; hex_byte_pack(&dump[0], ec >> 8); hex_byte_pack(&dump[2], ec); dump[4] = 0; ntfs_err(sbi->sb, "failed to convert \"%s\" to %s", dump, nls->charset); } *op = '\0'; return op - buf; } // clang-format off #define PLANE_SIZE 0x00010000 #define SURROGATE_PAIR 0x0000d800 #define SURROGATE_LOW 0x00000400 #define SURROGATE_BITS 0x000003ff // clang-format on /* * put_utf16 - Modified version of put_utf16 from fs/nls/nls_base.c * * Function is sparse warnings free. */ static inline void put_utf16(wchar_t *s, unsigned int c, enum utf16_endian endian) { static_assert(sizeof(wchar_t) == sizeof(__le16)); static_assert(sizeof(wchar_t) == sizeof(__be16)); switch (endian) { default: *s = (wchar_t)c; break; case UTF16_LITTLE_ENDIAN: *(__le16 *)s = __cpu_to_le16(c); break; case UTF16_BIG_ENDIAN: *(__be16 *)s = __cpu_to_be16(c); break; } } /* * _utf8s_to_utf16s * * Modified version of 'utf8s_to_utf16s' allows to * detect -ENAMETOOLONG without writing out of expected maximum. */ static int _utf8s_to_utf16s(const u8 *s, int inlen, enum utf16_endian endian, wchar_t *pwcs, int maxout) { u16 *op; int size; unicode_t u; op = pwcs; while (inlen > 0 && *s) { if (*s & 0x80) { size = utf8_to_utf32(s, inlen, &u); if (size < 0) return -EINVAL; s += size; inlen -= size; if (u >= PLANE_SIZE) { if (maxout < 2) return -ENAMETOOLONG; u -= PLANE_SIZE; put_utf16(op++, SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS), endian); put_utf16(op++, SURROGATE_PAIR | SURROGATE_LOW | (u & SURROGATE_BITS), endian); maxout -= 2; } else { if (maxout < 1) return -ENAMETOOLONG; put_utf16(op++, u, endian); maxout--; } } else { if (maxout < 1) return -ENAMETOOLONG; put_utf16(op++, *s++, endian); inlen--; maxout--; } } return op - pwcs; } /* * ntfs_nls_to_utf16 - Convert input string to UTF-16. * @name: Input name. * @name_len: Input name length. * @uni: Destination memory. * @max_ulen: Destination memory. * @endian: Endian of target UTF-16 string. * * This function is called: * - to create NTFS name * - to create symlink * * Return: UTF-16 string length or error (if negative). 
*/ int ntfs_nls_to_utf16(struct ntfs_sb_info *sbi, const u8 *name, u32 name_len, struct cpu_str *uni, u32 max_ulen, enum utf16_endian endian) { int ret, slen; const u8 *end; struct nls_table *nls = sbi->options->nls; u16 *uname = uni->name; static_assert(sizeof(wchar_t) == sizeof(u16)); if (!nls) { /* utf8 -> utf16 */ ret = _utf8s_to_utf16s(name, name_len, endian, uname, max_ulen); uni->len = ret; return ret; } for (ret = 0, end = name + name_len; name < end; ret++, name += slen) { if (ret >= max_ulen) return -ENAMETOOLONG; slen = nls->char2uni(name, end - name, uname + ret); if (!slen) return -EINVAL; if (slen < 0) return slen; } #ifdef __BIG_ENDIAN if (endian == UTF16_LITTLE_ENDIAN) { int i = ret; while (i--) { __cpu_to_le16s(uname); uname++; } } #else if (endian == UTF16_BIG_ENDIAN) { int i = ret; while (i--) { __cpu_to_be16s(uname); uname++; } } #endif uni->len = ret; return ret; } /* * dir_search_u - Helper function. */ struct inode *dir_search_u(struct inode *dir, const struct cpu_str *uni, struct ntfs_fnd *fnd) { int err = 0; struct super_block *sb = dir->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(dir); struct NTFS_DE *e; int diff; struct inode *inode = NULL; struct ntfs_fnd *fnd_a = NULL; if (!fnd) { fnd_a = fnd_get(); if (!fnd_a) { err = -ENOMEM; goto out; } fnd = fnd_a; } err = indx_find(&ni->dir, ni, NULL, uni, 0, sbi, &diff, &e, fnd); if (err) goto out; if (diff) { err = -ENOENT; goto out; } inode = ntfs_iget5(sb, &e->ref, uni); if (!IS_ERR(inode) && is_bad_inode(inode)) { iput(inode); err = -EINVAL; } out: fnd_put(fnd_a); return err == -ENOENT ? NULL : err ? ERR_PTR(err) : inode; } static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, const struct NTFS_DE *e, u8 *name, struct dir_context *ctx) { const struct ATTR_FILE_NAME *fname; unsigned long ino; int name_len; u32 dt_type; fname = Add2Ptr(e, sizeof(struct NTFS_DE)); if (fname->type == FILE_NAME_DOS) return 0; if (!mi_is_ref(&ni->mi, &fname->home)) return 0; ino = ino_get(&e->ref); if (ino == MFT_REC_ROOT) return 0; /* Skip meta files. Unless option to show metafiles is set. */ if (!sbi->options->showmeta && ntfs_is_meta_file(sbi, ino)) return 0; if (sbi->options->nohidden && (fname->dup.fa & FILE_ATTRIBUTE_HIDDEN)) return 0; name_len = ntfs_utf16_to_nls(sbi, fname->name, fname->name_len, name, PATH_MAX); if (name_len <= 0) { ntfs_warn(sbi->sb, "failed to convert name for inode %lx.", ino); return 0; } /* * NTFS: symlinks are "dir + reparse" or "file + reparse" * Unfortunately reparse attribute is used for many purposes (several dozens). * It is not possible here to know is this name symlink or not. * To get exactly the type of name we should to open inode (read mft). * getattr for opened file (fstat) correctly returns symlink. */ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG; /* * It is not reliable to detect the type of name using duplicated information * stored in parent directory. * The only correct way to get the type of name - read MFT record and find ATTR_STD. * The code below is not good idea. * It does additional locks/reads just to get the type of name. * Should we use additional mount option to enable branch below? 
*/ if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) && ino != ni->mi.rno) { struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL); if (!IS_ERR_OR_NULL(inode)) { dt_type = fs_umode_to_dtype(inode->i_mode); iput(inode); } } return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type); } /* * ntfs_read_hdr - Helper function for ntfs_readdir(). */ static int ntfs_read_hdr(struct ntfs_sb_info *sbi, struct ntfs_inode *ni, const struct INDEX_HDR *hdr, u64 vbo, u64 pos, u8 *name, struct dir_context *ctx) { int err; const struct NTFS_DE *e; u32 e_size; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); for (;; off += e_size) { if (off + sizeof(struct NTFS_DE) > end) return -1; e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) return -1; if (de_is_last(e)) return 0; /* Skip already enumerated. */ if (vbo + off < pos) continue; if (le16_to_cpu(e->key_size) < SIZEOF_ATTRIBUTE_FILENAME) return -1; ctx->pos = vbo + off; /* Submit the name to the filldir callback. */ err = ntfs_filldir(sbi, ni, e, name, ctx); if (err) return err; } } /* * ntfs_readdir - file_operations::iterate_shared * * Use non sorted enumeration. * We have an example of broken volume where sorted enumeration * counts each name twice. */ static int ntfs_readdir(struct file *file, struct dir_context *ctx) { const struct INDEX_ROOT *root; u64 vbo; size_t bit; loff_t eod; int err = 0; struct inode *dir = file_inode(file); struct ntfs_inode *ni = ntfs_i(dir); struct super_block *sb = dir->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; loff_t i_size = i_size_read(dir); u32 pos = ctx->pos; u8 *name = NULL; struct indx_node *node = NULL; u8 index_bits = ni->dir.index_bits; /* Name is a buffer of PATH_MAX length. */ static_assert(NTFS_NAME_LEN * 4 < PATH_MAX); eod = i_size + sbi->record_size; if (pos >= eod) return 0; if (!dir_emit_dots(file, ctx)) return 0; /* Allocate PATH_MAX bytes. */ name = __getname(); if (!name) return -ENOMEM; if (!ni->mi_loaded && ni->attr_list.size) { /* * Directory inode is locked for read. * Load all subrecords to avoid 'write' access to 'ni' during * directory reading. 
*/ ni_lock(ni); if (!ni->mi_loaded && ni->attr_list.size) { err = ni_load_all_mi(ni); if (!err) ni->mi_loaded = true; } ni_unlock(ni); if (err) goto out; } root = indx_get_root(&ni->dir, ni, NULL, NULL); if (!root) { err = -EINVAL; goto out; } if (pos >= sbi->record_size) { bit = (pos - sbi->record_size) >> index_bits; } else { err = ntfs_read_hdr(sbi, ni, &root->ihdr, 0, pos, name, ctx); if (err) goto out; bit = 0; } if (!i_size) { ctx->pos = eod; goto out; } for (;;) { vbo = (u64)bit << index_bits; if (vbo >= i_size) { ctx->pos = eod; goto out; } err = indx_used_bit(&ni->dir, ni, &bit); if (err) goto out; if (bit == MINUS_ONE_T) { ctx->pos = eod; goto out; } vbo = (u64)bit << index_bits; if (vbo >= i_size) { ntfs_inode_err(dir, "Looks like your dir is corrupt"); err = -EINVAL; goto out; } err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, &node); if (err) goto out; err = ntfs_read_hdr(sbi, ni, &node->index->ihdr, vbo + sbi->record_size, pos, name, ctx); if (err) goto out; bit += 1; } out: __putname(name); put_indx_node(node); if (err == -ENOENT) { err = 0; ctx->pos = pos; } return err; } static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs, size_t *files) { int err = 0; struct ntfs_inode *ni = ntfs_i(dir); struct NTFS_DE *e = NULL; struct INDEX_ROOT *root; struct INDEX_HDR *hdr; const struct ATTR_FILE_NAME *fname; u32 e_size, off, end; size_t drs = 0, fles = 0, bit = 0; struct indx_node *node = NULL; size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits; if (is_empty) *is_empty = true; root = indx_get_root(&ni->dir, ni, NULL, NULL); if (!root) return -EINVAL; hdr = &root->ihdr; for (;;) { end = le32_to_cpu(hdr->used); off = le32_to_cpu(hdr->de_off); for (; off + sizeof(struct NTFS_DE) <= end; off += e_size) { e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) { /* Looks like corruption. */ break; } if (de_is_last(e)) break; fname = de_get_fname(e); if (!fname) continue; if (fname->type == FILE_NAME_DOS) continue; if (is_empty) { *is_empty = false; if (!dirs && !files) goto out; } if (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) drs += 1; else fles += 1; } if (bit >= max_indx) goto out; err = indx_used_bit(&ni->dir, ni, &bit); if (err) goto out; if (bit == MINUS_ONE_T) goto out; if (bit >= max_indx) goto out; err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits, &node); if (err) goto out; hdr = &node->index->ihdr; bit += 1; } out: put_indx_node(node); if (dirs) *dirs = drs; if (files) *files = fles; return err; } bool dir_is_empty(struct inode *dir) { bool is_empty = false; ntfs_dir_count(dir, &is_empty, NULL, NULL); return is_empty; } // clang-format off const struct file_operations ntfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = ntfs_readdir, .fsync = generic_file_fsync, .open = ntfs_file_open, .unlocked_ioctl = ntfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ntfs_compat_ioctl, #endif }; // clang-format on
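/*
 * Example (not part of fs/ntfs3): a standalone userspace illustration of
 * the surrogate-pair split performed by _utf8s_to_utf16s() above for code
 * points outside the Basic Multilingual Plane. The constants mirror the
 * ones defined in this file; split_surrogates() is a hypothetical helper
 * used only for this demonstration.
 */
#include <stdio.h>

#define PLANE_SIZE     0x00010000
#define SURROGATE_PAIR 0x0000d800
#define SURROGATE_LOW  0x00000400
#define SURROGATE_BITS 0x000003ff

static void split_surrogates(unsigned int u, unsigned short out[2])
{
	u -= PLANE_SIZE;
	/* High surrogate carries the upper 10 bits, low surrogate the rest. */
	out[0] = SURROGATE_PAIR | ((u >> 10) & SURROGATE_BITS);
	out[1] = SURROGATE_PAIR | SURROGATE_LOW | (u & SURROGATE_BITS);
}

int main(void)
{
	unsigned short w[2];

	split_surrogates(0x1F600, w); /* U+1F600 encodes as 0xd83d 0xde00 */
	printf("%04x %04x\n", w[0], w[1]);
	return 0;
}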
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Squashfs - a compressed read only filesystem for Linux
 *
 * Copyright (c) 2010
 * Phillip Lougher <phillip@squashfs.org.uk>
 *
 * xattr_id.c
 */

/*
 * This file implements code to map the 32-bit xattr id stored in the inode
 * into the on disk location of the xattr data.
 */

#include <linux/fs.h>
#include <linux/vfs.h>
#include <linux/slab.h>

#include "squashfs_fs.h"
#include "squashfs_fs_sb.h"
#include "squashfs.h"
#include "xattr.h"

/*
 * Map xattr id using the xattr id look up table
 */
int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
		int *count, unsigned int *size, unsigned long long *xattr)
{
	struct squashfs_sb_info *msblk = sb->s_fs_info;
	int block = SQUASHFS_XATTR_BLOCK(index);
	int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
	u64 start_block;
	struct squashfs_xattr_id id;
	int err;

	if (index >= msblk->xattr_ids)
		return -EINVAL;

	start_block = le64_to_cpu(msblk->xattr_id_table[block]);

	err = squashfs_read_metadata(sb, &id, &start_block, &offset,
				     sizeof(id));
	if (err < 0)
		return err;

	*xattr = le64_to_cpu(id.xattr);
	*size = le32_to_cpu(id.size);
	*count = le32_to_cpu(id.count);
	return 0;
}

/*
 * Read uncompressed xattr id lookup table indexes from disk into memory
 */
__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
		u64 *xattr_table_start, unsigned int *xattr_ids)
{
	struct squashfs_sb_info *msblk = sb->s_fs_info;
	unsigned int len, indexes;
	struct squashfs_xattr_id_table *id_table;
	__le64 *table;
	u64 start, end;
	int n;

	id_table = squashfs_read_table(sb, table_start, sizeof(*id_table));
	if (IS_ERR(id_table))
		return (__le64 *) id_table;

	*xattr_table_start = le64_to_cpu(id_table->xattr_table_start);
	*xattr_ids = le32_to_cpu(id_table->xattr_ids);
	kfree(id_table);

	/* Sanity check values */

	/* there is always at least one xattr id */
	if (*xattr_ids == 0)
		return ERR_PTR(-EINVAL);

	len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
	indexes = SQUASHFS_XATTR_BLOCKS(*xattr_ids);

	/*
	 * The computed size of the index table (len bytes) should exactly
	 * match the table start and end points
	 */
	start = table_start + sizeof(*id_table);
	end = msblk->bytes_used;

	if (len != (end - start))
		return ERR_PTR(-EINVAL);

	table = squashfs_read_table(sb, start, len);
	if (IS_ERR(table))
		return table;

	/* table[0], table[1], ... table[indexes - 1] store the locations
	 * of the compressed xattr id blocks. Each entry should be less than
	 * the next (i.e. table[0] < table[1]), and the difference between them
	 * should be SQUASHFS_METADATA_SIZE or less. table[indexes - 1]
	 * should be less than table_start, and again the difference
	 * should be SQUASHFS_METADATA_SIZE or less.
	 *
	 * Finally xattr_table_start should be less than table[0].
*/ for (n = 0; n < (indexes - 1); n++) { start = le64_to_cpu(table[n]); end = le64_to_cpu(table[n + 1]); if (start >= end || (end - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } } start = le64_to_cpu(table[indexes - 1]); if (start >= table_start || (table_start - start) > (SQUASHFS_METADATA_SIZE + SQUASHFS_BLOCK_OFFSET)) { kfree(table); return ERR_PTR(-EINVAL); } if (*xattr_table_start >= le64_to_cpu(table[0])) { kfree(table); return ERR_PTR(-EINVAL); } return table; }
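/*
 * Example (not from the squashfs sources): a hedged userspace sketch of the
 * id -> (metadata block, offset) split that squashfs_xattr_lookup() relies
 * on via SQUASHFS_XATTR_BLOCK()/SQUASHFS_XATTR_BLOCK_OFFSET(). The ENTRY_SIZE
 * and METADATA_SIZE values are assumptions stated for illustration (a 16-byte
 * on-disk id entry and 8 KiB metadata blocks); the real macros and sizes live
 * in squashfs_fs.h.
 */
#include <stdio.h>

#define ENTRY_SIZE        16   /* assumed sizeof(struct squashfs_xattr_id) */
#define METADATA_SIZE     8192 /* assumed uncompressed metadata block size */
#define ENTRIES_PER_BLOCK (METADATA_SIZE / ENTRY_SIZE)

static void xattr_id_to_location(unsigned int index,
				 unsigned int *block, unsigned int *offset)
{
	*block = index / ENTRIES_PER_BLOCK;                  /* table[] slot */
	*offset = (index % ENTRIES_PER_BLOCK) * ENTRY_SIZE;  /* byte offset  */
}

int main(void)
{
	unsigned int block, offset;

	xattr_id_to_location(1000, &block, &offset);
	printf("id 1000 -> block %u, offset %u\n", block, offset);
	return 0;
}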
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright Samuel Mendoza-Jonas, IBM Corporation 2018.
*/ #include <linux/module.h> #include <linux/kernel.h> #include <linux/if_arp.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <net/genetlink.h> #include <net/ncsi.h> #include <linux/skbuff.h> #include <net/sock.h> #include <uapi/linux/ncsi.h> #include "internal.h" #include "ncsi-pkt.h" #include "ncsi-netlink.h" static struct genl_family ncsi_genl_family; static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_IFINDEX] = { .type = NLA_U32 }, [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, [NCSI_ATTR_MULTI_FLAG] = { .type = NLA_FLAG }, [NCSI_ATTR_PACKAGE_MASK] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_MASK] = { .type = NLA_U32 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) { struct ncsi_dev_priv *ndp; struct net_device *dev; struct ncsi_dev *nd; struct ncsi_dev; if (!net) return NULL; dev = dev_get_by_index(net, ifindex); if (!dev) { pr_err("NCSI netlink: No device for ifindex %u\n", ifindex); return NULL; } nd = ncsi_find_dev(dev); ndp = nd ? TO_NCSI_DEV_PRIV(nd) : NULL; dev_put(dev); return ndp; } static int ncsi_write_channel_info(struct sk_buff *skb, struct ncsi_dev_priv *ndp, struct ncsi_channel *nc) { struct ncsi_channel_vlan_filter *ncf; struct ncsi_channel_mode *m; struct nlattr *vid_nest; int i; nla_put_u32(skb, NCSI_CHANNEL_ATTR_ID, nc->id); m = &nc->modes[NCSI_MODE_LINK]; nla_put_u32(skb, NCSI_CHANNEL_ATTR_LINK_STATE, m->data[2]); if (nc->state == NCSI_CHANNEL_ACTIVE) nla_put_flag(skb, NCSI_CHANNEL_ATTR_ACTIVE); if (nc == nc->package->preferred_channel) nla_put_flag(skb, NCSI_CHANNEL_ATTR_FORCED); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MAJOR, nc->version.major); nla_put_u32(skb, NCSI_CHANNEL_ATTR_VERSION_MINOR, nc->version.minor); nla_put_string(skb, NCSI_CHANNEL_ATTR_VERSION_STR, nc->version.fw_name); vid_nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR_VLAN_LIST); if (!vid_nest) return -ENOMEM; ncf = &nc->vlan_filter; i = -1; while ((i = find_next_bit((void *)&ncf->bitmap, ncf->n_vids, i + 1)) < ncf->n_vids) { if (ncf->vids[i]) nla_put_u16(skb, NCSI_CHANNEL_ATTR_VLAN_ID, ncf->vids[i]); } nla_nest_end(skb, vid_nest); return 0; } static int ncsi_write_package_info(struct sk_buff *skb, struct ncsi_dev_priv *ndp, unsigned int id) { struct nlattr *pnest, *cnest, *nest; struct ncsi_package *np; struct ncsi_channel *nc; bool found; int rc; if (id > ndp->package_num - 1) { netdev_info(ndp->ndev.dev, "NCSI: No package with id %u\n", id); return -ENODEV; } found = false; NCSI_FOR_EACH_PACKAGE(ndp, np) { if (np->id != id) continue; pnest = nla_nest_start_noflag(skb, NCSI_PKG_ATTR); if (!pnest) return -ENOMEM; rc = nla_put_u32(skb, NCSI_PKG_ATTR_ID, np->id); if (rc) { nla_nest_cancel(skb, pnest); return rc; } if ((0x1 << np->id) == ndp->package_whitelist) nla_put_flag(skb, NCSI_PKG_ATTR_FORCED); cnest = nla_nest_start_noflag(skb, NCSI_PKG_ATTR_CHANNEL_LIST); if (!cnest) { nla_nest_cancel(skb, pnest); return -ENOMEM; } NCSI_FOR_EACH_CHANNEL(np, nc) { nest = nla_nest_start_noflag(skb, NCSI_CHANNEL_ATTR); if (!nest) { nla_nest_cancel(skb, cnest); nla_nest_cancel(skb, pnest); return -ENOMEM; } rc = ncsi_write_channel_info(skb, ndp, nc); if (rc) { nla_nest_cancel(skb, nest); nla_nest_cancel(skb, cnest); nla_nest_cancel(skb, pnest); return rc; } nla_nest_end(skb, nest); } nla_nest_end(skb, cnest); nla_nest_end(skb, pnest); found = true; } if (!found) return 
-ENODEV; return 0; } static int ncsi_pkg_info_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; unsigned int package_id; struct sk_buff *skb; struct nlattr *attr; void *hdr; int rc; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) return -EINVAL; ndp = ndp_from_ifindex(genl_info_net(info), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ncsi_genl_family, 0, NCSI_CMD_PKG_INFO); if (!hdr) { kfree_skb(skb); return -EMSGSIZE; } package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); attr = nla_nest_start_noflag(skb, NCSI_ATTR_PACKAGE_LIST); if (!attr) { kfree_skb(skb); return -EMSGSIZE; } rc = ncsi_write_package_info(skb, ndp, package_id); if (rc) { nla_nest_cancel(skb, attr); goto err; } nla_nest_end(skb, attr); genlmsg_end(skb, hdr); return genlmsg_reply(skb, info); err: kfree_skb(skb); return rc; } static int ncsi_pkg_info_all_nl(struct sk_buff *skb, struct netlink_callback *cb) { struct nlattr *attrs[NCSI_ATTR_MAX + 1]; struct ncsi_package *np, *package; struct ncsi_dev_priv *ndp; unsigned int package_id; struct nlattr *attr; void *hdr; int rc; rc = genlmsg_parse_deprecated(cb->nlh, &ncsi_genl_family, attrs, NCSI_ATTR_MAX, ncsi_genl_policy, NULL); if (rc) return rc; if (!attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(skb->sk)), nla_get_u32(attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; package_id = cb->args[0]; package = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) return 0; /* done */ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ncsi_genl_family, NLM_F_MULTI, NCSI_CMD_PKG_INFO); if (!hdr) { rc = -EMSGSIZE; goto err; } attr = nla_nest_start_noflag(skb, NCSI_ATTR_PACKAGE_LIST); if (!attr) { rc = -EMSGSIZE; goto err; } rc = ncsi_write_package_info(skb, ndp, package->id); if (rc) { nla_nest_cancel(skb, attr); goto err; } nla_nest_end(skb, attr); genlmsg_end(skb, hdr); cb->args[0] = package_id + 1; return skb->len; err: genlmsg_cancel(skb, hdr); return rc; } static int ncsi_set_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_package *np, *package; struct ncsi_channel *nc, *channel; u32 package_id, channel_id; struct ncsi_dev_priv *ndp; unsigned long flags; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == package_id) package = np; if (!package) { /* The user has set a package that does not exist */ return -ERANGE; } channel = NULL; if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(package, nc) if (nc->id == channel_id) { channel = nc; break; } if (!channel) { netdev_info(ndp->ndev.dev, "NCSI: Channel %u does not exist!\n", channel_id); return -ERANGE; } } spin_lock_irqsave(&ndp->lock, flags); ndp->package_whitelist = 0x1 << package->id; ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); spin_lock_irqsave(&package->lock, flags); package->multi_channel = 
false; if (channel) { package->channel_whitelist = 0x1 << channel->id; package->preferred_channel = channel; } else { /* Allow any channel */ package->channel_whitelist = UINT_MAX; package->preferred_channel = NULL; } spin_unlock_irqrestore(&package->lock, flags); if (channel) netdev_info(ndp->ndev.dev, "Set package 0x%x, channel 0x%x as preferred\n", package_id, channel_id); else netdev_info(ndp->ndev.dev, "Set package 0x%x as preferred\n", package_id); /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); return 0; } static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; struct ncsi_package *np; unsigned long flags; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; /* Reset any whitelists and disable multi mode */ spin_lock_irqsave(&ndp->lock, flags); ndp->package_whitelist = UINT_MAX; ndp->multi_package = false; spin_unlock_irqrestore(&ndp->lock, flags); NCSI_FOR_EACH_PACKAGE(ndp, np) { spin_lock_irqsave(&np->lock, flags); np->multi_channel = false; np->channel_whitelist = UINT_MAX; np->preferred_channel = NULL; spin_unlock_irqrestore(&np->lock, flags); } netdev_info(ndp->ndev.dev, "NCSI: Cleared preferred package/channel\n"); /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); return 0; } static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; struct ncsi_pkt_hdr *hdr; struct ncsi_cmd_arg nca; unsigned char *data; u32 package_id; u32 channel_id; int len, ret; if (!info || !info->attrs) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_IFINDEX]) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { ret = -EINVAL; goto out; } if (!info->attrs[NCSI_ATTR_DATA]) { ret = -EINVAL; goto out; } ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) { ret = -ENODEV; goto out; } package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) { ret = -ERANGE; goto out_netlink; } len = nla_len(info->attrs[NCSI_ATTR_DATA]); if (len < sizeof(struct ncsi_pkt_hdr)) { netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n", package_id); ret = -EINVAL; goto out_netlink; } else { data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]); } hdr = (struct ncsi_pkt_hdr *)data; nca.ndp = ndp; nca.package = (unsigned char)package_id; nca.channel = (unsigned char)channel_id; nca.type = hdr->type; nca.req_flags = NCSI_REQ_FLAG_NETLINK_DRIVEN; nca.info = info; nca.payload = ntohs(hdr->length); nca.data = data + sizeof(*hdr); ret = ncsi_xmit_cmd(&nca); out_netlink: if (ret != 0) { netdev_err(ndp->ndev.dev, "NCSI: Error %d sending command\n", ret); ncsi_send_netlink_err(ndp->ndev.dev, info->snd_seq, info->snd_portid, info->nlhdr, ret); } out: return ret; } int ncsi_send_netlink_rsp(struct ncsi_request *nr, struct ncsi_package *np, struct ncsi_channel *nc) { struct sk_buff *skb; struct net *net; void *hdr; int rc; net = dev_net(nr->rsp->dev); skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, &ncsi_genl_family, 
0, NCSI_CMD_SEND_CMD); if (!hdr) { kfree_skb(skb); return -EMSGSIZE; } nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex); if (np) nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); if (nc) nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); else nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data); if (rc) goto err; genlmsg_end(skb, hdr); return genlmsg_unicast(net, skb, nr->snd_portid); err: kfree_skb(skb); return rc; } int ncsi_send_netlink_timeout(struct ncsi_request *nr, struct ncsi_package *np, struct ncsi_channel *nc) { struct sk_buff *skb; struct net *net; void *hdr; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); if (!hdr) { kfree_skb(skb); return -EMSGSIZE; } net = dev_net(nr->cmd->dev); nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex); if (np) nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); else nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *) nr->cmd->data)->channel))); if (nc) nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); else nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); genlmsg_end(skb, hdr); return genlmsg_unicast(net, skb, nr->snd_portid); } int ncsi_send_netlink_err(struct net_device *dev, u32 snd_seq, u32 snd_portid, const struct nlmsghdr *nlhdr, int err) { struct nlmsghdr *nlh; struct nlmsgerr *nle; struct sk_buff *skb; struct net *net; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); if (!skb) return -ENOMEM; net = dev_net(dev); nlh = nlmsg_put(skb, snd_portid, snd_seq, NLMSG_ERROR, sizeof(*nle), 0); nle = (struct nlmsgerr *)nlmsg_data(nlh); nle->error = err; memcpy(&nle->msg, nlhdr, sizeof(*nlh)); nlmsg_end(skb, nlh); return nlmsg_unicast(net->genl_sock, skb, snd_portid); } static int ncsi_set_package_mask_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_dev_priv *ndp; unsigned long flags; int rc; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_MASK]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; spin_lock_irqsave(&ndp->lock, flags); if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { if (ndp->flags & NCSI_DEV_HWA) { ndp->multi_package = true; rc = 0; } else { netdev_err(ndp->ndev.dev, "NCSI: Can't use multiple packages without HWA\n"); rc = -EPERM; } } else { ndp->multi_package = false; rc = 0; } if (!rc) ndp->package_whitelist = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_MASK]); spin_unlock_irqrestore(&ndp->lock, flags); if (!rc) { /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); } return rc; } static int ncsi_set_channel_mask_nl(struct sk_buff *msg, struct genl_info *info) { struct ncsi_package *np, *package; struct ncsi_channel *nc, *channel; u32 package_id, channel_id; struct ncsi_dev_priv *ndp; unsigned long flags; if (!info || !info->attrs) return -EINVAL; if (!info->attrs[NCSI_ATTR_IFINDEX]) return -EINVAL; if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) return -EINVAL; if (!info->attrs[NCSI_ATTR_CHANNEL_MASK]) return -EINVAL; ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); if (!ndp) return -ENODEV; package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); package = NULL; NCSI_FOR_EACH_PACKAGE(ndp, np) if (np->id == 
package_id) { package = np; break; } if (!package) return -ERANGE; spin_lock_irqsave(&package->lock, flags); channel = NULL; if (info->attrs[NCSI_ATTR_CHANNEL_ID]) { channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); NCSI_FOR_EACH_CHANNEL(np, nc) if (nc->id == channel_id) { channel = nc; break; } if (!channel) { spin_unlock_irqrestore(&package->lock, flags); return -ERANGE; } netdev_dbg(ndp->ndev.dev, "NCSI: Channel %u set as preferred channel\n", channel->id); } package->channel_whitelist = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_MASK]); if (package->channel_whitelist == 0) netdev_dbg(ndp->ndev.dev, "NCSI: Package %u set to all channels disabled\n", package->id); package->preferred_channel = channel; if (nla_get_flag(info->attrs[NCSI_ATTR_MULTI_FLAG])) { package->multi_channel = true; netdev_info(ndp->ndev.dev, "NCSI: Multi-channel enabled on package %u\n", package_id); } else { package->multi_channel = false; } spin_unlock_irqrestore(&package->lock, flags); /* Update channel configuration */ if (!(ndp->flags & NCSI_DEV_RESET)) ncsi_reset_dev(&ndp->ndev); return 0; } static const struct genl_small_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_pkg_info_nl, .dumpit = ncsi_pkg_info_all_nl, .flags = 0, }, { .cmd = NCSI_CMD_SET_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_interface_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_CLEAR_INTERFACE, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_clear_interface_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SEND_CMD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_send_cmd_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SET_PACKAGE_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_package_mask_nl, .flags = GENL_ADMIN_PERM, }, { .cmd = NCSI_CMD_SET_CHANNEL_MASK, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = ncsi_set_channel_mask_nl, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family ncsi_genl_family __ro_after_init = { .name = "NCSI", .version = 0, .maxattr = NCSI_ATTR_MAX, .policy = ncsi_genl_policy, .module = THIS_MODULE, .small_ops = ncsi_ops, .n_small_ops = ARRAY_SIZE(ncsi_ops), .resv_start_op = NCSI_CMD_SET_CHANNEL_MASK + 1, }; static int __init ncsi_init_netlink(void) { return genl_register_family(&ncsi_genl_family); } subsys_initcall(ncsi_init_netlink);
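/*
 * Example (not part of net/ncsi): a hedged, illustrative sketch of the
 * attribute-checking pattern shared by the doit handlers above.
 * ncsi_example_nl() is hypothetical and not wired into ncsi_ops;
 * ndp_from_ifindex() and the NCSI_ATTR_* / NCSI_MAX_PACKAGE names are the
 * ones used in this file.
 */
static int ncsi_example_nl(struct sk_buff *msg, struct genl_info *info)
{
	struct ncsi_dev_priv *ndp;
	u32 package_id;

	/* Every handler first validates that the required attributes exist. */
	if (!info || !info->attrs ||
	    !info->attrs[NCSI_ATTR_IFINDEX] ||
	    !info->attrs[NCSI_ATTR_PACKAGE_ID])
		return -EINVAL;

	/* Resolve the NC-SI private state from the interface index. */
	ndp = ndp_from_ifindex(genl_info_net(info),
			       nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX]));
	if (!ndp)
		return -ENODEV;

	package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]);
	if (package_id >= NCSI_MAX_PACKAGE)
		return -ERANGE;

	/* ... act on the package under ndp->lock, as the handlers above do. */
	return 0;
}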
/*
 * Any part of this program may be used in documents licensed under
 * the GNU Free Documentation License, Version 1.1 or any later version
 * published by the Free Software Foundation.
 */

#ifndef _PARPORT_H_
#define _PARPORT_H_

#include <linux/jiffies.h>
#include <linux/proc_fs.h>
#include <linux/spinlock.h>
#include <linux/wait.h>
#include <linux/irqreturn.h>
#include <linux/semaphore.h>
#include <linux/device.h>
#include <asm/ptrace.h>
#include <uapi/linux/parport.h>

/* Define this later. */
struct parport;
struct pardevice;

struct pc_parport_state {
	unsigned int ctr;
	unsigned int ecr;
};

struct ax_parport_state {
	unsigned int ctr;
	unsigned int ecr;
	unsigned int dcsr;
};

/* used by both parport_amiga and parport_mfc3 */
struct amiga_parport_state {
	unsigned char data;	/* ciaa.prb */
	unsigned char datadir;	/* ciaa.ddrb */
	unsigned char status;	/* ciab.pra & 7 */
	unsigned char statusdir;/* ciab.ddrb & 7 */
};

struct ip32_parport_state {
	unsigned int dcr;
	unsigned int ecr;
};

struct parport_state {
	union {
		struct pc_parport_state pc;
		/* ARC has no state. */
		struct ax_parport_state ax;
		struct amiga_parport_state amiga;
		/* Atari has no state. */
		struct ip32_parport_state ip32;
		void *misc;
	} u;
};

struct parport_operations {
	/* IBM PC-style virtual registers.
*/ void (*write_data)(struct parport *, unsigned char); unsigned char (*read_data)(struct parport *); void (*write_control)(struct parport *, unsigned char); unsigned char (*read_control)(struct parport *); unsigned char (*frob_control)(struct parport *, unsigned char mask, unsigned char val); unsigned char (*read_status)(struct parport *); /* IRQs. */ void (*enable_irq)(struct parport *); void (*disable_irq)(struct parport *); /* Data direction. */ void (*data_forward) (struct parport *); void (*data_reverse) (struct parport *); /* For core parport code. */ void (*init_state)(struct pardevice *, struct parport_state *); void (*save_state)(struct parport *, struct parport_state *); void (*restore_state)(struct parport *, struct parport_state *); /* Block read/write */ size_t (*epp_write_data) (struct parport *port, const void *buf, size_t len, int flags); size_t (*epp_read_data) (struct parport *port, void *buf, size_t len, int flags); size_t (*epp_write_addr) (struct parport *port, const void *buf, size_t len, int flags); size_t (*epp_read_addr) (struct parport *port, void *buf, size_t len, int flags); size_t (*ecp_write_data) (struct parport *port, const void *buf, size_t len, int flags); size_t (*ecp_read_data) (struct parport *port, void *buf, size_t len, int flags); size_t (*ecp_write_addr) (struct parport *port, const void *buf, size_t len, int flags); size_t (*compat_write_data) (struct parport *port, const void *buf, size_t len, int flags); size_t (*nibble_read_data) (struct parport *port, void *buf, size_t len, int flags); size_t (*byte_read_data) (struct parport *port, void *buf, size_t len, int flags); struct module *owner; }; struct parport_device_info { parport_device_class class; const char *class_name; const char *mfr; const char *model; const char *cmdset; const char *description; }; /* Each device can have two callback functions: * 1) a preemption function, called by the resource manager to request * that the driver relinquish control of the port. The driver should * return zero if it agrees to release the port, and nonzero if it * refuses. Do not call parport_release() - the kernel will do this * implicitly. * * 2) a wake-up function, called by the resource manager to tell drivers * that the port is available to be claimed. If a driver wants to use * the port, it should call parport_claim() here. */ /* A parallel port device */ struct pardevice { const char *name; struct parport *port; int daisy; int (*preempt)(void *); void (*wakeup)(void *); void *private; void (*irq_func)(void *); unsigned int flags; struct pardevice *next; struct pardevice *prev; struct device dev; bool devmodel; struct parport_state *state; /* saved status over preemption */ wait_queue_head_t wait_q; unsigned long int time; unsigned long int timeslice; volatile long int timeout; unsigned long waiting; /* long req'd for set_bit --RR */ struct pardevice *waitprev; struct pardevice *waitnext; void * sysctl_table; }; #define to_pardevice(n) container_of(n, struct pardevice, dev) /* IEEE1284 information */ /* IEEE1284 phases. These are exposed to userland through ppdev IOCTL * PP[GS]ETPHASE, so do not change existing values. 
*/ enum ieee1284_phase { IEEE1284_PH_FWD_DATA, IEEE1284_PH_FWD_IDLE, IEEE1284_PH_TERMINATE, IEEE1284_PH_NEGOTIATION, IEEE1284_PH_HBUSY_DNA, IEEE1284_PH_REV_IDLE, IEEE1284_PH_HBUSY_DAVAIL, IEEE1284_PH_REV_DATA, IEEE1284_PH_ECP_SETUP, IEEE1284_PH_ECP_FWD_TO_REV, IEEE1284_PH_ECP_REV_TO_FWD, IEEE1284_PH_ECP_DIR_UNKNOWN, }; struct ieee1284_info { int mode; volatile enum ieee1284_phase phase; struct semaphore irq; }; /* A parallel port */ struct parport { unsigned long base; /* base address */ unsigned long base_hi; /* base address (hi - ECR) */ unsigned int size; /* IO extent */ const char *name; unsigned int modes; int irq; /* interrupt (or -1 for none) */ int dma; int muxport; /* which muxport (if any) this is */ int portnum; /* which physical parallel port (not mux) */ struct device *dev; /* Physical device associated with IO/DMA. * This may unfortulately be null if the * port has a legacy driver. */ struct device bus_dev; /* to link with the bus */ struct parport *physport; /* If this is a non-default mux parport, i.e. we're a clone of a real physical port, this is a pointer to that port. The locking is only done in the real port. For a clone port, the following structure members are meaningless: devices, cad, muxsel, waithead, waittail, flags, pdir, dev, ieee1284, *_lock. It this is a default mux parport, or there is no mux involved, this points to ourself. */ struct pardevice *devices; struct pardevice *cad; /* port owner */ int daisy; /* currently selected daisy addr */ int muxsel; /* currently selected mux port */ struct pardevice *waithead; struct pardevice *waittail; struct list_head list; struct timer_list timer; unsigned int flags; void *sysctl_table; struct parport_device_info probe_info[5]; /* 0-3 + non-IEEE1284.3 */ struct ieee1284_info ieee1284; struct parport_operations *ops; void *private_data; /* for lowlevel driver */ int number; /* port index - the `n' in `parportn' */ spinlock_t pardevice_lock; spinlock_t waitlist_lock; rwlock_t cad_lock; int spintime; atomic_t ref_count; unsigned long devflags; #define PARPORT_DEVPROC_REGISTERED 0 struct pardevice *proc_device; /* Currently register proc device */ struct list_head full_list; struct parport *slaves[3]; }; #define to_parport_dev(n) container_of(n, struct parport, bus_dev) #define DEFAULT_SPIN_TIME 500 /* us */ struct parport_driver { const char *name; void (*attach) (struct parport *); void (*detach) (struct parport *); void (*match_port)(struct parport *); int (*probe)(struct pardevice *); struct device_driver driver; bool devmodel; struct list_head list; }; #define to_parport_driver(n) container_of(n, struct parport_driver, driver) int parport_bus_init(void); void parport_bus_exit(void); /* parport_register_port registers a new parallel port at the given address (if one does not already exist) and returns a pointer to it. This entails claiming the I/O region, IRQ and DMA. NULL is returned if initialisation fails. */ struct parport *parport_register_port(unsigned long base, int irq, int dma, struct parport_operations *ops); /* Once a registered port is ready for high-level drivers to use, the low-level driver that registered it should announce it. This will call the high-level drivers' attach() functions (after things like determining the IEEE 1284.3 topology of the port and collecting DeviceIDs). */ void parport_announce_port (struct parport *port); /* Unregister a port. */ extern void parport_remove_port(struct parport *port); /* Register a new high-level driver. 
*/ int __must_check __parport_register_driver(struct parport_driver *, struct module *, const char *mod_name); /* * parport_register_driver must be a macro so that KBUILD_MODNAME can * be expanded */ /** * parport_register_driver - register a parallel port device driver * @driver: structure describing the driver * * This can be called by a parallel port device driver in order * to receive notifications about ports being found in the * system, as well as ports no longer available. * * If devmodel is true then the new device model is used * for registration. * * The @driver structure is allocated by the caller and must not be * deallocated until after calling parport_unregister_driver(). * * If using the non-device model: * The driver's attach() function may block. The port that * attach() is given will be valid for the duration of the * callback, but if the driver wants to take a copy of the * pointer it must call parport_get_port() to do so. Calling * parport_register_device() on that port will do this for you. * * The driver's detach() function may block. The port that * detach() is given will be valid for the duration of the * callback, but if the driver wants to take a copy of the * pointer it must call parport_get_port() to do so. * * * Returns 0 on success. The non-device model always succeeds, * but the new device model can fail and will return an error code. **/ #define parport_register_driver(driver) \ __parport_register_driver(driver, THIS_MODULE, KBUILD_MODNAME) /* Unregister a high-level driver. */ void parport_unregister_driver(struct parport_driver *); /** * module_parport_driver() - Helper macro for registering a modular parport driver * @__parport_driver: struct parport_driver to be used * * Helper macro for parport drivers which do not do anything special in module * init and exit. This eliminates a lot of boilerplate. Each module may only * use this macro once, and calling it replaces module_init() and module_exit(). */ #define module_parport_driver(__parport_driver) \ module_driver(__parport_driver, parport_register_driver, parport_unregister_driver) /* If parport_register_driver doesn't fit your needs, perhaps * parport_find_xxx does. */ extern struct parport *parport_find_number (int); extern struct parport *parport_find_base (unsigned long); /* generic irq handler, if it suits your needs */ extern irqreturn_t parport_irq_handler(int irq, void *dev_id); /* Reference counting for ports. */ extern struct parport *parport_get_port (struct parport *); extern void parport_put_port (struct parport *); void parport_del_port(struct parport *); struct pardev_cb { int (*preempt)(void *); void (*wakeup)(void *); void *private; void (*irq_func)(void *); unsigned int flags; }; /* * parport_register_dev_model declares that a device is connected to a * port, and tells the kernel all it needs to know. */ struct pardevice * parport_register_dev_model(struct parport *port, const char *name, const struct pardev_cb *par_dev_cb, int cnt); /* parport_unregister unlinks a device from the chain. */ extern void parport_unregister_device(struct pardevice *dev); /* parport_claim tries to gain ownership of the port for a particular driver. This may fail (return non-zero) if another driver is busy. If this driver has registered an interrupt handler, it will be enabled. */ extern int parport_claim(struct pardevice *dev); /* parport_claim_or_block is the same, but sleeps if the port cannot be claimed. Return value is 1 if it slept, 0 normally and -errno on error.
*/ extern int parport_claim_or_block(struct pardevice *dev); /* parport_release reverses a previous parport_claim. This can never fail, though the effects are undefined (except that they are bad) if you didn't previously own the port. Once you have released the port you should make sure that neither your code nor the hardware on the port tries to initiate any communication without first re-claiming the port. If you mess with the port state (enabling ECP for example) you should clean up before releasing the port. */ extern void parport_release(struct pardevice *dev); /** * parport_yield - relinquish a parallel port temporarily * @dev: a device on the parallel port * * This function relinquishes the port if it would be helpful to other * drivers to do so. Afterwards it tries to reclaim the port using * parport_claim(), and the return value is the same as for * parport_claim(). If it fails, the port is left unclaimed and it is * the driver's responsibility to reclaim the port. * * The parport_yield() and parport_yield_blocking() functions are for * marking points in the driver at which other drivers may claim the * port and use their devices. Yielding the port is similar to * releasing it and reclaiming it, but is more efficient because no * action is taken if there are no other devices needing the port. In * fact, nothing is done even if there are other devices waiting but * the current device is still within its "timeslice". The default * timeslice is half a second, but it can be adjusted via the /proc * interface. **/ static __inline__ int parport_yield(struct pardevice *dev) { unsigned long int timeslip = (jiffies - dev->time); if ((dev->port->waithead == NULL) || (timeslip < dev->timeslice)) return 0; parport_release(dev); return parport_claim(dev); } /** * parport_yield_blocking - relinquish a parallel port temporarily * @dev: a device on the parallel port * * This function relinquishes the port if it would be helpful to other * drivers to do so. Afterwards it tries to reclaim the port using * parport_claim_or_block(), and the return value is the same as for * parport_claim_or_block(). **/ static __inline__ int parport_yield_blocking(struct pardevice *dev) { unsigned long int timeslip = (jiffies - dev->time); if ((dev->port->waithead == NULL) || (timeslip < dev->timeslice)) return 0; parport_release(dev); return parport_claim_or_block(dev); } /* Flags used to identify what a device does. */ #define PARPORT_DEV_TRAN 0 /* WARNING !! DEPRECATED !! */ #define PARPORT_DEV_LURK (1<<0) /* WARNING !! DEPRECATED !! */ #define PARPORT_DEV_EXCL (1<<1) /* Need exclusive access. */ #define PARPORT_FLAG_EXCL (1<<1) /* EXCL driver registered. 
*/ /* IEEE1284 functions */ extern void parport_ieee1284_interrupt (void *); extern int parport_negotiate (struct parport *, int mode); extern ssize_t parport_write (struct parport *, const void *buf, size_t len); extern ssize_t parport_read (struct parport *, void *buf, size_t len); #define PARPORT_INACTIVITY_O_NONBLOCK 1 extern long parport_set_timeout (struct pardevice *, long inactivity); extern int parport_wait_event (struct parport *, long timeout); extern int parport_wait_peripheral (struct parport *port, unsigned char mask, unsigned char val); extern int parport_poll_peripheral (struct parport *port, unsigned char mask, unsigned char val, int usec); /* For architectural drivers */ extern size_t parport_ieee1284_write_compat (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_read_nibble (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_read_byte (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_ecp_read_data (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_ecp_write_data (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_ecp_write_addr (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_epp_write_data (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_epp_read_data (struct parport *, void *, size_t, int); extern size_t parport_ieee1284_epp_write_addr (struct parport *, const void *, size_t, int); extern size_t parport_ieee1284_epp_read_addr (struct parport *, void *, size_t, int); /* IEEE1284.3 functions */ #define daisy_dev_name "Device ID probe" extern int parport_daisy_init (struct parport *port); extern void parport_daisy_fini (struct parport *port); extern struct pardevice *parport_open (int devnum, const char *name); extern void parport_close (struct pardevice *dev); extern ssize_t parport_device_id (int devnum, char *buffer, size_t len); extern void parport_daisy_deselect_all (struct parport *port); extern int parport_daisy_select (struct parport *port, int daisy, int mode); /* Lowlevel drivers _can_ call this support function to handle irqs. */ static inline void parport_generic_irq(struct parport *port) { parport_ieee1284_interrupt (port); read_lock(&port->cad_lock); if (port->cad && port->cad->irq_func) port->cad->irq_func(port->cad->private); read_unlock(&port->cad_lock); } /* Prototypes from parport_procfs */ extern int parport_proc_register(struct parport *pp); extern int parport_proc_unregister(struct parport *pp); extern int parport_device_proc_register(struct pardevice *device); extern int parport_device_proc_unregister(struct pardevice *device); /* If PC hardware is the only type supported, we can optimise a bit. */ #if !defined(CONFIG_PARPORT_NOT_PC) && defined(CONFIG_PARPORT_PC) #include <linux/parport_pc.h> #define parport_write_data(p,x) parport_pc_write_data(p,x) #define parport_read_data(p) parport_pc_read_data(p) #define parport_write_control(p,x) parport_pc_write_control(p,x) #define parport_read_control(p) parport_pc_read_control(p) #define parport_frob_control(p,m,v) parport_pc_frob_control(p,m,v) #define parport_read_status(p) parport_pc_read_status(p) #define parport_enable_irq(p) parport_pc_enable_irq(p) #define parport_disable_irq(p) parport_pc_disable_irq(p) #define parport_data_forward(p) parport_pc_data_forward(p) #define parport_data_reverse(p) parport_pc_data_reverse(p) #else /* !CONFIG_PARPORT_NOT_PC */ /* Generic operations vector through the dispatch table. 
*/ #define parport_write_data(p,x) (p)->ops->write_data(p,x) #define parport_read_data(p) (p)->ops->read_data(p) #define parport_write_control(p,x) (p)->ops->write_control(p,x) #define parport_read_control(p) (p)->ops->read_control(p) #define parport_frob_control(p,m,v) (p)->ops->frob_control(p,m,v) #define parport_read_status(p) (p)->ops->read_status(p) #define parport_enable_irq(p) (p)->ops->enable_irq(p) #define parport_disable_irq(p) (p)->ops->disable_irq(p) #define parport_data_forward(p) (p)->ops->data_forward(p) #define parport_data_reverse(p) (p)->ops->data_reverse(p) #endif /* !CONFIG_PARPORT_NOT_PC */ extern unsigned long parport_default_timeslice; extern int parport_default_spintime; #endif /* _PARPORT_H_ */
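/*
 * Illustrative sketch, not part of the header above: a minimal device-model
 * parport driver showing the call sequence parport_register_dev_model() ->
 * parport_claim_or_block() -> parport_write() -> parport_release().  The
 * driver name "demo_writer" and the choice to write from match_port() are
 * hypothetical and exist only to demonstrate the API.
 */
#include <linux/module.h>
#include <linux/parport.h>

static void demo_match_port(struct parport *port)
{
	struct pardev_cb cb = { };	/* no preempt/wakeup/irq callbacks */
	struct pardevice *dev;
	static const char msg[] = "hello, parport\n";

	dev = parport_register_dev_model(port, "demo_writer", &cb, 0);
	if (!dev)
		return;

	/* Sleep until the port can be claimed; negative return means error. */
	if (parport_claim_or_block(dev) < 0) {
		parport_unregister_device(dev);
		return;
	}

	/* Forward transfer using the currently negotiated IEEE 1284 mode. */
	parport_write(port, msg, sizeof(msg) - 1);

	parport_release(dev);
	parport_unregister_device(dev);
}

static void demo_detach(struct parport *port)
{
}

static struct parport_driver demo_driver = {
	.name		= "demo_writer",
	.match_port	= demo_match_port,
	.detach		= demo_detach,
	.devmodel	= true,
};
module_parport_driver(demo_driver);

MODULE_DESCRIPTION("Hypothetical parport usage sketch");
MODULE_LICENSE("GPL");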
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/bitmap.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * Modified for 680x0 by Hamish Macdonald * Fixed for 680x0 by Andreas Schwab */ /* bitmap.c contains the code that handles the inode and block bitmaps */ #include "minix.h" #include <linux/buffer_head.h> #include <linux/bitops.h> #include <linux/sched.h> static DEFINE_SPINLOCK(bitmap_lock); /* * bitmap consists of blocks filled with 16bit words * bit set == busy, bit clear == free * endianness is a mess, but for counting zero bits it really doesn't matter... */ static __u32 count_free(struct buffer_head *map[], unsigned blocksize, __u32 numbits) { __u32 sum = 0; unsigned blocks = DIV_ROUND_UP(numbits, blocksize * 8); while (blocks--) { unsigned words = blocksize / 2; __u16 *p = (__u16 *)(*map++)->b_data; while (words--) sum += 16 - hweight16(*p++); } return sum; } void minix_free_block(struct inode *inode, unsigned long block) { struct super_block *sb = inode->i_sb; struct minix_sb_info *sbi = minix_sb(sb); struct buffer_head *bh; int k = sb->s_blocksize_bits + 3; unsigned long bit, zone; if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { printk("Trying to free block not in datazone\n"); return; } zone = block - sbi->s_firstdatazone + 1; bit = zone & ((1<<k) - 1); zone >>= k; if (zone >= sbi->s_zmap_blocks) { printk("minix_free_block: nonexistent bitmap buffer\n"); return; } bh = sbi->s_zmap[zone]; spin_lock(&bitmap_lock); if (!minix_test_and_clear_bit(bit, bh->b_data)) printk("minix_free_block (%s:%lu): bit already cleared\n", sb->s_id, block); spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); return; } int minix_new_block(struct inode * inode) { struct minix_sb_info *sbi = minix_sb(inode->i_sb); int bits_per_zone = 8 * inode->i_sb->s_blocksize; int i; for (i = 0; i < sbi->s_zmap_blocks; i++) { struct buffer_head *bh = sbi->s_zmap[i]; int j; spin_lock(&bitmap_lock); j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); if (j < bits_per_zone) { minix_set_bit(j, bh->b_data); spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone + sbi->s_firstdatazone-1; if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) break; return j; } spin_unlock(&bitmap_lock); } return 0; } unsigned long minix_count_free_blocks(struct super_block *sb) { struct minix_sb_info *sbi = minix_sb(sb); u32 bits = sbi->s_nzones - sbi->s_firstdatazone + 1; return (count_free(sbi->s_zmap, sb->s_blocksize, bits) << sbi->s_log_zone_size); } struct minix_inode * minix_V1_raw_inode(struct
super_block *sb, ino_t ino, struct buffer_head **bh) { int block; struct minix_sb_info *sbi = minix_sb(sb); struct minix_inode *p; if (!ino || ino > sbi->s_ninodes) { printk("Bad inode number on dev %s: %ld is out of range\n", sb->s_id, (long)ino); return NULL; } ino--; block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + ino / MINIX_INODES_PER_BLOCK; *bh = sb_bread(sb, block); if (!*bh) { printk("Unable to read inode block\n"); return NULL; } p = (void *)(*bh)->b_data; return p + ino % MINIX_INODES_PER_BLOCK; } struct minix2_inode * minix_V2_raw_inode(struct super_block *sb, ino_t ino, struct buffer_head **bh) { int block; struct minix_sb_info *sbi = minix_sb(sb); struct minix2_inode *p; int minix2_inodes_per_block = sb->s_blocksize / sizeof(struct minix2_inode); *bh = NULL; if (!ino || ino > sbi->s_ninodes) { printk("Bad inode number on dev %s: %ld is out of range\n", sb->s_id, (long)ino); return NULL; } ino--; block = 2 + sbi->s_imap_blocks + sbi->s_zmap_blocks + ino / minix2_inodes_per_block; *bh = sb_bread(sb, block); if (!*bh) { printk("Unable to read inode block\n"); return NULL; } p = (void *)(*bh)->b_data; return p + ino % minix2_inodes_per_block; } /* Clear the link count and mode of a deleted inode on disk. */ static void minix_clear_inode(struct inode *inode) { struct buffer_head *bh = NULL; if (INODE_VERSION(inode) == MINIX_V1) { struct minix_inode *raw_inode; raw_inode = minix_V1_raw_inode(inode->i_sb, inode->i_ino, &bh); if (raw_inode) { raw_inode->i_nlinks = 0; raw_inode->i_mode = 0; } } else { struct minix2_inode *raw_inode; raw_inode = minix_V2_raw_inode(inode->i_sb, inode->i_ino, &bh); if (raw_inode) { raw_inode->i_nlinks = 0; raw_inode->i_mode = 0; } } if (bh) { mark_buffer_dirty(bh); brelse (bh); } } void minix_free_inode(struct inode * inode) { struct super_block *sb = inode->i_sb; struct minix_sb_info *sbi = minix_sb(inode->i_sb); struct buffer_head *bh; int k = sb->s_blocksize_bits + 3; unsigned long ino, bit; ino = inode->i_ino; if (ino < 1 || ino > sbi->s_ninodes) { printk("minix_free_inode: inode 0 or nonexistent inode\n"); return; } bit = ino & ((1<<k) - 1); ino >>= k; if (ino >= sbi->s_imap_blocks) { printk("minix_free_inode: nonexistent imap in superblock\n"); return; } minix_clear_inode(inode); /* clear on-disk copy */ bh = sbi->s_imap[ino]; spin_lock(&bitmap_lock); if (!minix_test_and_clear_bit(bit, bh->b_data)) printk("minix_free_inode: bit %lu already cleared\n", bit); spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); } struct inode *minix_new_inode(const struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct minix_sb_info *sbi = minix_sb(sb); struct inode *inode = new_inode(sb); struct buffer_head * bh; int bits_per_zone = 8 * sb->s_blocksize; unsigned long j; int i; if (!inode) return ERR_PTR(-ENOMEM); j = bits_per_zone; bh = NULL; spin_lock(&bitmap_lock); for (i = 0; i < sbi->s_imap_blocks; i++) { bh = sbi->s_imap[i]; j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); if (j < bits_per_zone) break; } if (!bh || j >= bits_per_zone) { spin_unlock(&bitmap_lock); iput(inode); return ERR_PTR(-ENOSPC); } if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ spin_unlock(&bitmap_lock); printk("minix_new_inode: bit already set\n"); iput(inode); return ERR_PTR(-ENOSPC); } spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone; if (!j || j > sbi->s_ninodes) { iput(inode); return ERR_PTR(-ENOSPC); } inode_init_owner(&nop_mnt_idmap, inode, dir, mode); inode->i_ino = j; simple_inode_init_ts(inode); 
inode->i_blocks = 0; memset(&minix_i(inode)->u, 0, sizeof(minix_i(inode)->u)); insert_inode_hash(inode); mark_inode_dirty(inode); return inode; } unsigned long minix_count_free_inodes(struct super_block *sb) { struct minix_sb_info *sbi = minix_sb(sb); u32 bits = sbi->s_ninodes + 1; return count_free(sbi->s_imap, sb->s_blocksize, bits); }
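/*
 * Illustrative sketch (not from the kernel sources): the zone-to-bitmap
 * arithmetic used by minix_free_block() above, reproduced as a stand-alone
 * userspace program.  Each bitmap block covers 8 * blocksize bits, so with
 * k = s_blocksize_bits + 3 a zone number splits into a bitmap-block index
 * (zone >> k) and a bit offset inside that block (zone & ((1 << k) - 1)).
 * The block size, first data zone and block number below are made-up
 * values for demonstration only.
 */
#include <stdio.h>

int main(void)
{
	unsigned int blocksize_bits = 10;	/* 1 KiB blocks */
	int k = blocksize_bits + 3;		/* log2(bits per bitmap block) */
	unsigned long firstdatazone = 520;	/* hypothetical superblock value */
	unsigned long block = 12345;		/* block being freed */

	unsigned long zone = block - firstdatazone + 1;
	unsigned long bit = zone & ((1UL << k) - 1);	/* bit within that bitmap block */
	unsigned long map = zone >> k;			/* index into s_zmap[] */

	printf("block %lu -> s_zmap[%lu], bit %lu\n", block, map, bit);
	return 0;
}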
// SPDX-License-Identifier: GPL-2.0 /* * ACPI helpers for GPIO API * * Copyright (C) 2012, Intel Corporation * Authors: Mathias Nyman <mathias.nyman@linux.intel.com> * Mika Westerberg <mika.westerberg@linux.intel.com> */ #include <linux/acpi.h> #include <linux/dmi.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/irq.h> #include <linux/mutex.h> #include <linux/pinctrl/pinctrl.h> #include <linux/gpio/consumer.h> #include <linux/gpio/driver.h> #include <linux/gpio/machine.h> #include "gpiolib.h" #include "gpiolib-acpi.h" static int run_edge_events_on_boot = -1; module_param(run_edge_events_on_boot, int, 0444); MODULE_PARM_DESC(run_edge_events_on_boot, "Run edge _AEI event-handlers at boot: 0=no, 1=yes, -1=auto"); static char *ignore_wake; module_param(ignore_wake, charp, 0444); MODULE_PARM_DESC(ignore_wake, "controller@pin combos on which to ignore the ACPI wake flag " "ignore_wake=controller@pin[,controller@pin[,...]]"); static char *ignore_interrupt; module_param(ignore_interrupt, charp, 0444); MODULE_PARM_DESC(ignore_interrupt, "controller@pin combos on which to ignore interrupt " "ignore_interrupt=controller@pin[,controller@pin[,...]]"); struct acpi_gpiolib_dmi_quirk { bool no_edge_events_on_boot; char *ignore_wake; char *ignore_interrupt; }; /** * struct acpi_gpio_event - ACPI GPIO event handler data * * @node: list-entry of the events list of the struct acpi_gpio_chip * @handle: handle of ACPI method to execute when the IRQ triggers * @handler: handler function to pass to request_irq() when requesting the IRQ * @pin: GPIO pin number on the struct gpio_chip * @irq: Linux IRQ number for the event, for request_irq() / free_irq() * @irqflags: flags to pass to request_irq() when requesting the IRQ * @irq_is_wake: If the ACPI flags indicate the IRQ is a wakeup source * @irq_requested: True if request_irq() has been done * @desc: struct gpio_desc for the GPIO pin for this event */ struct acpi_gpio_event { struct list_head node; acpi_handle handle; irq_handler_t handler; unsigned int pin; unsigned int irq; unsigned long irqflags; bool irq_is_wake; bool irq_requested; struct gpio_desc *desc; }; struct acpi_gpio_connection { struct list_head node; unsigned int pin; struct gpio_desc *desc; }; struct acpi_gpio_chip { /* * ACPICA requires that the first field of the context parameter * passed to acpi_install_address_space_handler() is large enough * to hold struct acpi_connection_info.
*/ struct acpi_connection_info conn_info; struct list_head conns; struct mutex conn_lock; struct gpio_chip *chip; struct list_head events; struct list_head deferred_req_irqs_list_entry; }; /** * struct acpi_gpio_info - ACPI GPIO specific information * @adev: reference to ACPI device which consumes GPIO resource * @flags: GPIO initialization flags * @gpioint: if %true this GPIO is of type GpioInt otherwise type is GpioIo * @pin_config: pin bias as provided by ACPI * @polarity: interrupt polarity as provided by ACPI * @triggering: triggering type as provided by ACPI * @wake_capable: wake capability as provided by ACPI * @debounce: debounce timeout as provided by ACPI * @quirks: Linux specific quirks as provided by struct acpi_gpio_mapping */ struct acpi_gpio_info { struct acpi_device *adev; enum gpiod_flags flags; bool gpioint; int pin_config; int polarity; int triggering; bool wake_capable; unsigned int debounce; unsigned int quirks; }; /* * For GPIO chips which call acpi_gpiochip_request_interrupts() before late_init * (so builtin drivers) we register the ACPI GpioInt IRQ handlers from a * late_initcall_sync() handler, so that other builtin drivers can register their * OpRegions before the event handlers can run. This list contains GPIO chips * for which the acpi_gpiochip_request_irqs() call has been deferred. */ static DEFINE_MUTEX(acpi_gpio_deferred_req_irqs_lock); static LIST_HEAD(acpi_gpio_deferred_req_irqs_list); static bool acpi_gpio_deferred_req_irqs_done; static int acpi_gpiochip_find(struct gpio_chip *gc, const void *data) { return device_match_acpi_handle(&gc->gpiodev->dev, data); } /** * acpi_get_gpiod() - Translate ACPI GPIO pin to GPIO descriptor usable with GPIO API * @path: ACPI GPIO controller full path name, (e.g. "\\_SB.GPO1") * @pin: ACPI GPIO pin number (0-based, controller-relative) * * Return: GPIO descriptor to use with Linux generic GPIO API, or ERR_PTR * error value. Specifically returns %-EPROBE_DEFER if the referenced GPIO * controller does not have GPIO chip registered at the moment. This is to * support probe deferral. */ static struct gpio_desc *acpi_get_gpiod(char *path, unsigned int pin) { acpi_handle handle; acpi_status status; status = acpi_get_handle(NULL, path, &handle); if (ACPI_FAILURE(status)) return ERR_PTR(-ENODEV); struct gpio_device *gdev __free(gpio_device_put) = gpio_device_find(handle, acpi_gpiochip_find); if (!gdev) return ERR_PTR(-EPROBE_DEFER); /* * FIXME: keep track of the reference to the GPIO device somehow * instead of putting it here. */ return gpio_device_get_desc(gdev, pin); } static irqreturn_t acpi_gpio_irq_handler(int irq, void *data) { struct acpi_gpio_event *event = data; acpi_evaluate_object(event->handle, NULL, NULL, NULL); return IRQ_HANDLED; } static irqreturn_t acpi_gpio_irq_handler_evt(int irq, void *data) { struct acpi_gpio_event *event = data; acpi_execute_simple_method(event->handle, NULL, event->pin); return IRQ_HANDLED; } static void acpi_gpio_chip_dh(acpi_handle handle, void *data) { /* The address of this function is used as a key. */ } bool acpi_gpio_get_irq_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { struct acpi_resource_gpio *gpio; if (ares->type != ACPI_RESOURCE_TYPE_GPIO) return false; gpio = &ares->data.gpio; if (gpio->connection_type != ACPI_RESOURCE_GPIO_TYPE_INT) return false; *agpio = gpio; return true; } EXPORT_SYMBOL_GPL(acpi_gpio_get_irq_resource); /** * acpi_gpio_get_io_resource - Fetch details of an ACPI resource if it is a GPIO * I/O resource or return False if not. 
* @ares: Pointer to the ACPI resource to fetch * @agpio: Pointer to a &struct acpi_resource_gpio to store the output pointer */ bool acpi_gpio_get_io_resource(struct acpi_resource *ares, struct acpi_resource_gpio **agpio) { struct acpi_resource_gpio *gpio; if (ares->type != ACPI_RESOURCE_TYPE_GPIO) return false; gpio = &ares->data.gpio; if (gpio->connection_type != ACPI_RESOURCE_GPIO_TYPE_IO) return false; *agpio = gpio; return true; } EXPORT_SYMBOL_GPL(acpi_gpio_get_io_resource); static void acpi_gpiochip_request_irq(struct acpi_gpio_chip *acpi_gpio, struct acpi_gpio_event *event) { struct device *parent = acpi_gpio->chip->parent; int ret, value; ret = request_threaded_irq(event->irq, NULL, event->handler, event->irqflags | IRQF_ONESHOT, "ACPI:Event", event); if (ret) { dev_err(parent, "Failed to setup interrupt handler for %d\n", event->irq); return; } if (event->irq_is_wake) enable_irq_wake(event->irq); event->irq_requested = true; /* Make sure we trigger the initial state of edge-triggered IRQs */ if (run_edge_events_on_boot && (event->irqflags & (IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING))) { value = gpiod_get_raw_value_cansleep(event->desc); if (((event->irqflags & IRQF_TRIGGER_RISING) && value == 1) || ((event->irqflags & IRQF_TRIGGER_FALLING) && value == 0)) event->handler(event->irq, event); } } static void acpi_gpiochip_request_irqs(struct acpi_gpio_chip *acpi_gpio) { struct acpi_gpio_event *event; list_for_each_entry(event, &acpi_gpio->events, node) acpi_gpiochip_request_irq(acpi_gpio, event); } static enum gpiod_flags acpi_gpio_to_gpiod_flags(const struct acpi_resource_gpio *agpio, int polarity) { /* GpioInt() implies input configuration */ if (agpio->connection_type == ACPI_RESOURCE_GPIO_TYPE_INT) return GPIOD_IN; switch (agpio->io_restriction) { case ACPI_IO_RESTRICT_INPUT: return GPIOD_IN; case ACPI_IO_RESTRICT_OUTPUT: /* * ACPI GPIO resources don't contain an initial value for the * GPIO. Therefore we deduce that value from the pull field * and the polarity instead. If the pin is pulled up we assume * default to be high, if it is pulled down we assume default * to be low, otherwise we leave pin untouched. For active low * polarity values will be switched. See also * Documentation/firmware-guide/acpi/gpio-properties.rst. */ switch (agpio->pin_config) { case ACPI_PIN_CONFIG_PULLUP: return polarity == GPIO_ACTIVE_LOW ? GPIOD_OUT_LOW : GPIOD_OUT_HIGH; case ACPI_PIN_CONFIG_PULLDOWN: return polarity == GPIO_ACTIVE_LOW ? GPIOD_OUT_HIGH : GPIOD_OUT_LOW; default: break; } break; default: break; } /* * Assume that the BIOS has configured the direction and pull * accordingly. 
*/ return GPIOD_ASIS; } static struct gpio_desc *acpi_request_own_gpiod(struct gpio_chip *chip, struct acpi_resource_gpio *agpio, unsigned int index, const char *label) { int polarity = GPIO_ACTIVE_HIGH; enum gpiod_flags flags = acpi_gpio_to_gpiod_flags(agpio, polarity); unsigned int pin = agpio->pin_table[index]; struct gpio_desc *desc; int ret; desc = gpiochip_request_own_desc(chip, pin, label, polarity, flags); if (IS_ERR(desc)) return desc; /* ACPI uses hundredths of milliseconds units */ ret = gpio_set_debounce_timeout(desc, agpio->debounce_timeout * 10); if (ret) dev_warn(chip->parent, "Failed to set debounce-timeout for pin 0x%04X, err %d\n", pin, ret); return desc; } static bool acpi_gpio_in_ignore_list(const char *ignore_list, const char *controller_in, unsigned int pin_in) { const char *controller, *pin_str; unsigned int pin; char *endp; int len; controller = ignore_list; while (controller) { pin_str = strchr(controller, '@'); if (!pin_str) goto err; len = pin_str - controller; if (len == strlen(controller_in) && strncmp(controller, controller_in, len) == 0) { pin = simple_strtoul(pin_str + 1, &endp, 10); if (*endp != 0 && *endp != ',') goto err; if (pin == pin_in) return true; } controller = strchr(controller, ','); if (controller) controller++; } return false; err: pr_err_once("Error: Invalid value for gpiolib_acpi.ignore_...: %s\n", ignore_list); return false; } static bool acpi_gpio_irq_is_wake(struct device *parent, const struct acpi_resource_gpio *agpio) { unsigned int pin = agpio->pin_table[0]; if (agpio->wake_capable != ACPI_WAKE_CAPABLE) return false; if (acpi_gpio_in_ignore_list(ignore_wake, dev_name(parent), pin)) { dev_info(parent, "Ignoring wakeup on pin %u\n", pin); return false; } return true; } /* Always returns AE_OK so that we keep looping over the resources */ static acpi_status acpi_gpiochip_alloc_event(struct acpi_resource *ares, void *context) { struct acpi_gpio_chip *acpi_gpio = context; struct gpio_chip *chip = acpi_gpio->chip; struct acpi_resource_gpio *agpio; acpi_handle handle, evt_handle; struct acpi_gpio_event *event; irq_handler_t handler = NULL; struct gpio_desc *desc; unsigned int pin; int ret, irq; if (!acpi_gpio_get_irq_resource(ares, &agpio)) return AE_OK; handle = ACPI_HANDLE(chip->parent); pin = agpio->pin_table[0]; if (pin <= 255) { char ev_name[8]; sprintf(ev_name, "_%c%02X", agpio->triggering == ACPI_EDGE_SENSITIVE ? 
'E' : 'L', pin); if (ACPI_SUCCESS(acpi_get_handle(handle, ev_name, &evt_handle))) handler = acpi_gpio_irq_handler; } if (!handler) { if (ACPI_SUCCESS(acpi_get_handle(handle, "_EVT", &evt_handle))) handler = acpi_gpio_irq_handler_evt; } if (!handler) return AE_OK; if (acpi_gpio_in_ignore_list(ignore_interrupt, dev_name(chip->parent), pin)) { dev_info(chip->parent, "Ignoring interrupt on pin %u\n", pin); return AE_OK; } desc = acpi_request_own_gpiod(chip, agpio, 0, "ACPI:Event"); if (IS_ERR(desc)) { dev_err(chip->parent, "Failed to request GPIO for pin 0x%04X, err %ld\n", pin, PTR_ERR(desc)); return AE_OK; } ret = gpiochip_lock_as_irq(chip, pin); if (ret) { dev_err(chip->parent, "Failed to lock GPIO pin 0x%04X as interrupt, err %d\n", pin, ret); goto fail_free_desc; } irq = gpiod_to_irq(desc); if (irq < 0) { dev_err(chip->parent, "Failed to translate GPIO pin 0x%04X to IRQ, err %d\n", pin, irq); goto fail_unlock_irq; } event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) goto fail_unlock_irq; event->irqflags = IRQF_ONESHOT; if (agpio->triggering == ACPI_LEVEL_SENSITIVE) { if (agpio->polarity == ACPI_ACTIVE_HIGH) event->irqflags |= IRQF_TRIGGER_HIGH; else event->irqflags |= IRQF_TRIGGER_LOW; } else { switch (agpio->polarity) { case ACPI_ACTIVE_HIGH: event->irqflags |= IRQF_TRIGGER_RISING; break; case ACPI_ACTIVE_LOW: event->irqflags |= IRQF_TRIGGER_FALLING; break; default: event->irqflags |= IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING; break; } } event->handle = evt_handle; event->handler = handler; event->irq = irq; event->irq_is_wake = acpi_gpio_irq_is_wake(chip->parent, agpio); event->pin = pin; event->desc = desc; list_add_tail(&event->node, &acpi_gpio->events); return AE_OK; fail_unlock_irq: gpiochip_unlock_as_irq(chip, pin); fail_free_desc: gpiochip_free_own_desc(desc); return AE_OK; } /** * acpi_gpiochip_request_interrupts() - Register isr for gpio chip ACPI events * @chip: GPIO chip * * ACPI5 platforms can use GPIO signaled ACPI events. These GPIO interrupts are * handled by ACPI event methods which need to be called from the GPIO * chip's interrupt handler. acpi_gpiochip_request_interrupts() finds out which * GPIO pins have ACPI event methods and assigns interrupt handlers that call * the ACPI event methods for those pins. */ void acpi_gpiochip_request_interrupts(struct gpio_chip *chip) { struct acpi_gpio_chip *acpi_gpio; acpi_handle handle; acpi_status status; bool defer; if (!chip->parent || !chip->to_irq) return; handle = ACPI_HANDLE(chip->parent); if (!handle) return; status = acpi_get_data(handle, acpi_gpio_chip_dh, (void **)&acpi_gpio); if (ACPI_FAILURE(status)) return; if (acpi_quirk_skip_gpio_event_handlers()) return; acpi_walk_resources(handle, METHOD_NAME__AEI, acpi_gpiochip_alloc_event, acpi_gpio); mutex_lock(&acpi_gpio_deferred_req_irqs_lock); defer = !acpi_gpio_deferred_req_irqs_done; if (defer) list_add(&acpi_gpio->deferred_req_irqs_list_entry, &acpi_gpio_deferred_req_irqs_list); mutex_unlock(&acpi_gpio_deferred_req_irqs_lock); if (defer) return; acpi_gpiochip_request_irqs(acpi_gpio); } EXPORT_SYMBOL_GPL(acpi_gpiochip_request_interrupts); /** * acpi_gpiochip_free_interrupts() - Free GPIO ACPI event interrupts. * @chip: GPIO chip * * Free interrupts associated with GPIO ACPI event method for the given * GPIO chip.
*/ void acpi_gpiochip_free_interrupts(struct gpio_chip *chip) { struct acpi_gpio_chip *acpi_gpio; struct acpi_gpio_event *event, *ep; acpi_handle handle; acpi_status status; if (!chip->parent || !chip->to_irq) return; handle = ACPI_HANDLE(chip->parent); if (!handle) return; status = acpi_get_data(handle, acpi_gpio_chip_dh, (void **)&acpi_gpio); if (ACPI_FAILURE(status)) return; mutex_lock(&acpi_gpio_deferred_req_irqs_lock); if (!list_empty(&acpi_gpio->deferred_req_irqs_list_entry)) list_del_init(&acpi_gpio->deferred_req_irqs_list_entry); mutex_unlock(&acpi_gpio_deferred_req_irqs_lock); list_for_each_entry_safe_reverse(event, ep, &acpi_gpio->events, node) { if (event->irq_requested) { if (event->irq_is_wake) disable_irq_wake(event->irq); free_irq(event->irq, event); } gpiochip_unlock_as_irq(chip, event->pin); gpiochip_free_own_desc(event->desc); list_del(&event->node); kfree(event); } } EXPORT_SYMBOL_GPL(acpi_gpiochip_free_interrupts); int acpi_dev_add_driver_gpios(struct acpi_device *adev, const struct acpi_gpio_mapping *gpios) { if (adev && gpios) { adev->driver_gpios = gpios; return 0; } return -EINVAL; } EXPORT_SYMBOL_GPL(acpi_dev_add_driver_gpios); void acpi_dev_remove_driver_gpios(struct acpi_device *adev) { if (adev) adev->driver_gpios = NULL; } EXPORT_SYMBOL_GPL(acpi_dev_remove_driver_gpios); static void acpi_dev_release_driver_gpios(void *adev) { acpi_dev_remove_driver_gpios(adev); } int devm_acpi_dev_add_driver_gpios(struct device *dev, const struct acpi_gpio_mapping *gpios) { struct acpi_device *adev = ACPI_COMPANION(dev); int ret; ret = acpi_dev_add_driver_gpios(adev, gpios); if (ret) return ret; return devm_add_action_or_reset(dev, acpi_dev_release_driver_gpios, adev); } EXPORT_SYMBOL_GPL(devm_acpi_dev_add_driver_gpios); static bool acpi_get_driver_gpio_data(struct acpi_device *adev, const char *name, int index, struct fwnode_reference_args *args, unsigned int *quirks) { const struct acpi_gpio_mapping *gm; if (!adev || !adev->driver_gpios) return false; for (gm = adev->driver_gpios; gm->name; gm++) if (!strcmp(name, gm->name) && gm->data && index < gm->size) { const struct acpi_gpio_params *par = gm->data + index; args->fwnode = acpi_fwnode_handle(adev); args->args[0] = par->crs_entry_index; args->args[1] = par->line_index; args->args[2] = par->active_low; args->nargs = 3; *quirks = gm->quirks; return true; } return false; } static int __acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, enum gpiod_flags update) { const enum gpiod_flags mask = GPIOD_FLAGS_BIT_DIR_SET | GPIOD_FLAGS_BIT_DIR_OUT | GPIOD_FLAGS_BIT_DIR_VAL; int ret = 0; /* * Check if the BIOS has IoRestriction with explicitly set direction * and update @flags accordingly. Otherwise use whatever caller asked * for. */ if (update & GPIOD_FLAGS_BIT_DIR_SET) { enum gpiod_flags diff = *flags ^ update; /* * Check if caller supplied incompatible GPIO initialization * flags. * * Return %-EINVAL to notify that firmware has different * settings and we are going to use them. 
*/ if (((*flags & GPIOD_FLAGS_BIT_DIR_SET) && (diff & GPIOD_FLAGS_BIT_DIR_OUT)) || ((*flags & GPIOD_FLAGS_BIT_DIR_OUT) && (diff & GPIOD_FLAGS_BIT_DIR_VAL))) ret = -EINVAL; *flags = (*flags & ~mask) | (update & mask); } return ret; } static int acpi_gpio_update_gpiod_flags(enum gpiod_flags *flags, struct acpi_gpio_info *info) { struct device *dev = &info->adev->dev; enum gpiod_flags old = *flags; int ret; ret = __acpi_gpio_update_gpiod_flags(&old, info->flags); if (info->quirks & ACPI_GPIO_QUIRK_NO_IO_RESTRICTION) { if (ret) dev_warn(dev, FW_BUG "GPIO not in correct mode, fixing\n"); } else { if (ret) dev_dbg(dev, "Override GPIO initialization flags\n"); *flags = old; } return ret; } static int acpi_gpio_update_gpiod_lookup_flags(unsigned long *lookupflags, struct acpi_gpio_info *info) { switch (info->pin_config) { case ACPI_PIN_CONFIG_PULLUP: *lookupflags |= GPIO_PULL_UP; break; case ACPI_PIN_CONFIG_PULLDOWN: *lookupflags |= GPIO_PULL_DOWN; break; case ACPI_PIN_CONFIG_NOPULL: *lookupflags |= GPIO_PULL_DISABLE; break; default: break; } if (info->polarity == GPIO_ACTIVE_LOW) *lookupflags |= GPIO_ACTIVE_LOW; return 0; } struct acpi_gpio_lookup { struct acpi_gpio_info info; int index; u16 pin_index; bool active_low; struct gpio_desc *desc; int n; }; static int acpi_populate_gpio_lookup(struct acpi_resource *ares, void *data) { struct acpi_gpio_lookup *lookup = data; if (ares->type != ACPI_RESOURCE_TYPE_GPIO) return 1; if (!lookup->desc) { const struct acpi_resource_gpio *agpio = &ares->data.gpio; bool gpioint = agpio->connection_type == ACPI_RESOURCE_GPIO_TYPE_INT; struct gpio_desc *desc; u16 pin_index; if (lookup->info.quirks & ACPI_GPIO_QUIRK_ONLY_GPIOIO && gpioint) lookup->index++; if (lookup->n++ != lookup->index) return 1; pin_index = lookup->pin_index; if (pin_index >= agpio->pin_table_length) return 1; if (lookup->info.quirks & ACPI_GPIO_QUIRK_ABSOLUTE_NUMBER) desc = gpio_to_desc(agpio->pin_table[pin_index]); else desc = acpi_get_gpiod(agpio->resource_source.string_ptr, agpio->pin_table[pin_index]); lookup->desc = desc; lookup->info.pin_config = agpio->pin_config; lookup->info.debounce = agpio->debounce_timeout; lookup->info.gpioint = gpioint; lookup->info.wake_capable = acpi_gpio_irq_is_wake(&lookup->info.adev->dev, agpio); /* * Polarity and triggering are only specified for GpioInt * resource. 
* Note: we expect here: * - ACPI_ACTIVE_LOW == GPIO_ACTIVE_LOW * - ACPI_ACTIVE_HIGH == GPIO_ACTIVE_HIGH */ if (lookup->info.gpioint) { lookup->info.polarity = agpio->polarity; lookup->info.triggering = agpio->triggering; } else { lookup->info.polarity = lookup->active_low; } lookup->info.flags = acpi_gpio_to_gpiod_flags(agpio, lookup->info.polarity); } return 1; } static int acpi_gpio_resource_lookup(struct acpi_gpio_lookup *lookup, struct acpi_gpio_info *info) { struct acpi_device *adev = lookup->info.adev; struct list_head res_list; int ret; INIT_LIST_HEAD(&res_list); ret = acpi_dev_get_resources(adev, &res_list, acpi_populate_gpio_lookup, lookup); if (ret < 0) return ret; acpi_dev_free_resource_list(&res_list); if (!lookup->desc) return -ENOENT; if (info) *info = lookup->info; return 0; } static int acpi_gpio_property_lookup(struct fwnode_handle *fwnode, const char *propname, int index, struct acpi_gpio_lookup *lookup) { struct fwnode_reference_args args; unsigned int quirks = 0; int ret; memset(&args, 0, sizeof(args)); ret = __acpi_node_get_property_reference(fwnode, propname, index, 3, &args); if (ret) { struct acpi_device *adev; adev = to_acpi_device_node(fwnode); if (!acpi_get_driver_gpio_data(adev, propname, index, &args, &quirks)) return ret; } /* * The property was found and resolved, so need to lookup the GPIO based * on returned args. */ if (!to_acpi_device_node(args.fwnode)) return -EINVAL; if (args.nargs != 3) return -EPROTO; lookup->index = args.args[0]; lookup->pin_index = args.args[1]; lookup->active_low = !!args.args[2]; lookup->info.adev = to_acpi_device_node(args.fwnode); lookup->info.quirks = quirks; return 0; } /** * acpi_get_gpiod_by_index() - get a GPIO descriptor from device resources * @adev: pointer to a ACPI device to get GPIO from * @propname: Property name of the GPIO (optional) * @index: index of GpioIo/GpioInt resource (starting from %0) * @info: info pointer to fill in (optional) * * Function goes through ACPI resources for @adev and based on @index looks * up a GpioIo/GpioInt resource, translates it to the Linux GPIO descriptor, * and returns it. @index matches GpioIo/GpioInt resources only so if there * are total %3 GPIO resources, the index goes from %0 to %2. * * If @propname is specified the GPIO is looked using device property. In * that case @index is used to select the GPIO entry in the property value * (in case of multiple). * * If the GPIO cannot be translated or there is an error, an ERR_PTR is * returned. * * Note: if the GPIO resource has multiple entries in the pin list, this * function only returns the first. */ static struct gpio_desc *acpi_get_gpiod_by_index(struct acpi_device *adev, const char *propname, int index, struct acpi_gpio_info *info) { struct acpi_gpio_lookup lookup; int ret; if (!adev) return ERR_PTR(-ENODEV); memset(&lookup, 0, sizeof(lookup)); lookup.index = index; if (propname) { dev_dbg(&adev->dev, "GPIO: looking up %s\n", propname); ret = acpi_gpio_property_lookup(acpi_fwnode_handle(adev), propname, index, &lookup); if (ret) return ERR_PTR(ret); dev_dbg(&adev->dev, "GPIO: _DSD returned %s %d %u %u\n", dev_name(&lookup.info.adev->dev), lookup.index, lookup.pin_index, lookup.active_low); } else { dev_dbg(&adev->dev, "GPIO: looking up %d in _CRS\n", index); lookup.info.adev = adev; } ret = acpi_gpio_resource_lookup(&lookup, info); return ret ? 
ERR_PTR(ret) : lookup.desc; } /** * acpi_get_gpiod_from_data() - get a GPIO descriptor from ACPI data node * @fwnode: pointer to an ACPI firmware node to get the GPIO information from * @propname: Property name of the GPIO * @index: index of GpioIo/GpioInt resource (starting from %0) * @info: info pointer to fill in (optional) * * This function uses the property-based GPIO lookup to get to the GPIO * resource with the relevant information from a data-only ACPI firmware node * and uses that to obtain the GPIO descriptor to return. * * If the GPIO cannot be translated or there is an error an ERR_PTR is * returned. */ static struct gpio_desc *acpi_get_gpiod_from_data(struct fwnode_handle *fwnode, const char *propname, int index, struct acpi_gpio_info *info) { struct acpi_gpio_lookup lookup; int ret; if (!is_acpi_data_node(fwnode)) return ERR_PTR(-ENODEV); if (!propname) return ERR_PTR(-EINVAL); memset(&lookup, 0, sizeof(lookup)); lookup.index = index; ret = acpi_gpio_property_lookup(fwnode, propname, index, &lookup); if (ret) return ERR_PTR(ret); ret = acpi_gpio_resource_lookup(&lookup, info); return ret ? ERR_PTR(ret) : lookup.desc; } static bool acpi_can_fallback_to_crs(struct acpi_device *adev, const char *con_id) { /* Never allow fallback if the device has properties */ if (acpi_dev_has_props(adev) || adev->driver_gpios) return false; return con_id == NULL; } struct gpio_desc *acpi_find_gpio(struct fwnode_handle *fwnode, const char *con_id, unsigned int idx, enum gpiod_flags *dflags, unsigned long *lookupflags) { struct acpi_device *adev = to_acpi_device_node(fwnode); struct acpi_gpio_info info; struct gpio_desc *desc; char propname[32]; int i; /* Try first from _DSD */ for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) { if (con_id) { snprintf(propname, sizeof(propname), "%s-%s", con_id, gpio_suffixes[i]); } else { snprintf(propname, sizeof(propname), "%s", gpio_suffixes[i]); } if (adev) desc = acpi_get_gpiod_by_index(adev, propname, idx, &info); else desc = acpi_get_gpiod_from_data(fwnode, propname, idx, &info); if (!IS_ERR(desc)) break; if (PTR_ERR(desc) == -EPROBE_DEFER) return ERR_CAST(desc); } /* Then from plain _CRS GPIOs */ if (IS_ERR(desc)) { if (!adev || !acpi_can_fallback_to_crs(adev, con_id)) return ERR_PTR(-ENOENT); desc = acpi_get_gpiod_by_index(adev, NULL, idx, &info); if (IS_ERR(desc)) return desc; } if (info.gpioint && (*dflags == GPIOD_OUT_LOW || *dflags == GPIOD_OUT_HIGH)) { dev_dbg(&adev->dev, "refusing GpioInt() entry when doing GPIOD_OUT_* lookup\n"); return ERR_PTR(-ENOENT); } acpi_gpio_update_gpiod_flags(dflags, &info); acpi_gpio_update_gpiod_lookup_flags(lookupflags, &info); return desc; } /** * acpi_dev_gpio_irq_wake_get_by() - Find GpioInt and translate it to Linux IRQ number * @adev: pointer to a ACPI device to get IRQ from * @name: optional name of GpioInt resource * @index: index of GpioInt resource (starting from %0) * @wake_capable: Set to true if the IRQ is wake capable * * If the device has one or more GpioInt resources, this function can be * used to translate from the GPIO offset in the resource to the Linux IRQ * number. * * The function is idempotent, though each time it runs it will configure GPIO * pin direction according to the flags in GpioInt resource. * * The function takes optional @name parameter. If the resource has a property * name, then only those will be taken into account. * * The GPIO is considered wake capable if the GpioInt resource specifies * SharedAndWake or ExclusiveAndWake. 
* * Return: Linux IRQ number (> %0) on success, negative errno on failure. */ int acpi_dev_gpio_irq_wake_get_by(struct acpi_device *adev, const char *name, int index, bool *wake_capable) { int idx, i; unsigned int irq_flags; int ret; for (i = 0, idx = 0; idx <= index; i++) { struct acpi_gpio_info info; struct gpio_desc *desc; desc = acpi_get_gpiod_by_index(adev, name, i, &info); /* Ignore -EPROBE_DEFER, it only matters if idx matches */ if (IS_ERR(desc) && PTR_ERR(desc) != -EPROBE_DEFER) return PTR_ERR(desc); if (info.gpioint && idx++ == index) { unsigned long lflags = GPIO_LOOKUP_FLAGS_DEFAULT; enum gpiod_flags dflags = GPIOD_ASIS; char label[32]; int irq; if (IS_ERR(desc)) return PTR_ERR(desc); irq = gpiod_to_irq(desc); if (irq < 0) return irq; acpi_gpio_update_gpiod_flags(&dflags, &info); acpi_gpio_update_gpiod_lookup_flags(&lflags, &info); snprintf(label, sizeof(label), "GpioInt() %d", index); ret = gpiod_configure_flags(desc, label, lflags, dflags); if (ret < 0) return ret; /* ACPI uses hundredths of milliseconds units */ ret = gpio_set_debounce_timeout(desc, info.debounce * 10); if (ret) return ret; irq_flags = acpi_dev_get_irq_type(info.triggering, info.polarity); /* * If the IRQ is not already in use then set type * if specified and different than the current one. */ if (can_request_irq(irq, irq_flags)) { if (irq_flags != IRQ_TYPE_NONE && irq_flags != irq_get_trigger_type(irq)) irq_set_irq_type(irq, irq_flags); } else { dev_dbg(&adev->dev, "IRQ %d already in use\n", irq); } /* avoid suspend issues with GPIOs when systems are using S3 */ if (wake_capable && acpi_gbl_FADT.flags & ACPI_FADT_LOW_POWER_S0) *wake_capable = info.wake_capable; return irq; } } return -ENOENT; } EXPORT_SYMBOL_GPL(acpi_dev_gpio_irq_wake_get_by); static acpi_status acpi_gpio_adr_space_handler(u32 function, acpi_physical_address address, u32 bits, u64 *value, void *handler_context, void *region_context) { struct acpi_gpio_chip *achip = region_context; struct gpio_chip *chip = achip->chip; struct acpi_resource_gpio *agpio; struct acpi_resource *ares; u16 pin_index = address; acpi_status status; int length; int i; status = acpi_buffer_to_resource(achip->conn_info.connection, achip->conn_info.length, &ares); if (ACPI_FAILURE(status)) return status; if (WARN_ON(ares->type != ACPI_RESOURCE_TYPE_GPIO)) { ACPI_FREE(ares); return AE_BAD_PARAMETER; } agpio = &ares->data.gpio; if (WARN_ON(agpio->io_restriction == ACPI_IO_RESTRICT_INPUT && function == ACPI_WRITE)) { ACPI_FREE(ares); return AE_BAD_PARAMETER; } length = min_t(u16, agpio->pin_table_length, pin_index + bits); for (i = pin_index; i < length; ++i) { unsigned int pin = agpio->pin_table[i]; struct acpi_gpio_connection *conn; struct gpio_desc *desc; bool found; mutex_lock(&achip->conn_lock); found = false; list_for_each_entry(conn, &achip->conns, node) { if (conn->pin == pin) { found = true; desc = conn->desc; break; } } /* * The same GPIO can be shared between operation region and * event but only if the access here is ACPI_READ. In that * case we "borrow" the event GPIO instead. 
*/ if (!found && agpio->shareable == ACPI_SHARED && function == ACPI_READ) { struct acpi_gpio_event *event; list_for_each_entry(event, &achip->events, node) { if (event->pin == pin) { desc = event->desc; found = true; break; } } } if (!found) { desc = acpi_request_own_gpiod(chip, agpio, i, "ACPI:OpRegion"); if (IS_ERR(desc)) { mutex_unlock(&achip->conn_lock); status = AE_ERROR; goto out; } conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) { gpiochip_free_own_desc(desc); mutex_unlock(&achip->conn_lock); status = AE_NO_MEMORY; goto out; } conn->pin = pin; conn->desc = desc; list_add_tail(&conn->node, &achip->conns); } mutex_unlock(&achip->conn_lock); if (function == ACPI_WRITE) gpiod_set_raw_value_cansleep(desc, !!(*value & BIT(i))); else *value |= (u64)gpiod_get_raw_value_cansleep(desc) << i; } out: ACPI_FREE(ares); return status; } static void acpi_gpiochip_request_regions(struct acpi_gpio_chip *achip) { struct gpio_chip *chip = achip->chip; acpi_handle handle = ACPI_HANDLE(chip->parent); acpi_status status; INIT_LIST_HEAD(&achip->conns); mutex_init(&achip->conn_lock); status = acpi_install_address_space_handler(handle, ACPI_ADR_SPACE_GPIO, acpi_gpio_adr_space_handler, NULL, achip); if (ACPI_FAILURE(status)) dev_err(chip->parent, "Failed to install GPIO OpRegion handler\n"); } static void acpi_gpiochip_free_regions(struct acpi_gpio_chip *achip) { struct gpio_chip *chip = achip->chip; acpi_handle handle = ACPI_HANDLE(chip->parent); struct acpi_gpio_connection *conn, *tmp; acpi_status status; status = acpi_remove_address_space_handler(handle, ACPI_ADR_SPACE_GPIO, acpi_gpio_adr_space_handler); if (ACPI_FAILURE(status)) { dev_err(chip->parent, "Failed to remove GPIO OpRegion handler\n"); return; } list_for_each_entry_safe_reverse(conn, tmp, &achip->conns, node) { gpiochip_free_own_desc(conn->desc); list_del(&conn->node); kfree(conn); } } static struct gpio_desc * acpi_gpiochip_parse_own_gpio(struct acpi_gpio_chip *achip, struct fwnode_handle *fwnode, const char **name, unsigned long *lflags, enum gpiod_flags *dflags) { struct gpio_chip *chip = achip->chip; struct gpio_desc *desc; u32 gpios[2]; int ret; *lflags = GPIO_LOOKUP_FLAGS_DEFAULT; *dflags = GPIOD_ASIS; *name = NULL; ret = fwnode_property_read_u32_array(fwnode, "gpios", gpios, ARRAY_SIZE(gpios)); if (ret < 0) return ERR_PTR(ret); desc = gpiochip_get_desc(chip, gpios[0]); if (IS_ERR(desc)) return desc; if (gpios[1]) *lflags |= GPIO_ACTIVE_LOW; if (fwnode_property_present(fwnode, "input")) *dflags |= GPIOD_IN; else if (fwnode_property_present(fwnode, "output-low")) *dflags |= GPIOD_OUT_LOW; else if (fwnode_property_present(fwnode, "output-high")) *dflags |= GPIOD_OUT_HIGH; else return ERR_PTR(-EINVAL); fwnode_property_read_string(fwnode, "line-name", name); return desc; } static void acpi_gpiochip_scan_gpios(struct acpi_gpio_chip *achip) { struct gpio_chip *chip = achip->chip; struct fwnode_handle *fwnode; device_for_each_child_node(chip->parent, fwnode) { unsigned long lflags; enum gpiod_flags dflags; struct gpio_desc *desc; const char *name; int ret; if (!fwnode_property_present(fwnode, "gpio-hog")) continue; desc = acpi_gpiochip_parse_own_gpio(achip, fwnode, &name, &lflags, &dflags); if (IS_ERR(desc)) continue; ret = gpiod_hog(desc, name, lflags, dflags); if (ret) { dev_err(chip->parent, "Failed to hog GPIO\n"); fwnode_handle_put(fwnode); return; } } } void acpi_gpiochip_add(struct gpio_chip *chip) { struct acpi_gpio_chip *acpi_gpio; struct acpi_device *adev; acpi_status status; if (!chip || !chip->parent) return; adev = 
ACPI_COMPANION(chip->parent); if (!adev) return; acpi_gpio = kzalloc(sizeof(*acpi_gpio), GFP_KERNEL); if (!acpi_gpio) { dev_err(chip->parent, "Failed to allocate memory for ACPI GPIO chip\n"); return; } acpi_gpio->chip = chip; INIT_LIST_HEAD(&acpi_gpio->events); INIT_LIST_HEAD(&acpi_gpio->deferred_req_irqs_list_entry); status = acpi_attach_data(adev->handle, acpi_gpio_chip_dh, acpi_gpio); if (ACPI_FAILURE(status)) { dev_err(chip->parent, "Failed to attach ACPI GPIO chip\n"); kfree(acpi_gpio); return; } acpi_gpiochip_request_regions(acpi_gpio); acpi_gpiochip_scan_gpios(acpi_gpio); acpi_dev_clear_dependencies(adev); } void acpi_gpiochip_remove(struct gpio_chip *chip) { struct acpi_gpio_chip *acpi_gpio; acpi_handle handle; acpi_status status; if (!chip || !chip->parent) return; handle = ACPI_HANDLE(chip->parent); if (!handle) return; status = acpi_get_data(handle, acpi_gpio_chip_dh, (void **)&acpi_gpio); if (ACPI_FAILURE(status)) { dev_warn(chip->parent, "Failed to retrieve ACPI GPIO chip\n"); return; } acpi_gpiochip_free_regions(acpi_gpio); acpi_detach_data(handle, acpi_gpio_chip_dh); kfree(acpi_gpio); } static int acpi_gpio_package_count(const union acpi_object *obj) { const union acpi_object *element = obj->package.elements; const union acpi_object *end = element + obj->package.count; unsigned int count = 0; while (element < end) { switch (element->type) { case ACPI_TYPE_LOCAL_REFERENCE: element += 3; fallthrough; case ACPI_TYPE_INTEGER: element++; count++; break; default: return -EPROTO; } } return count; } static int acpi_find_gpio_count(struct acpi_resource *ares, void *data) { unsigned int *count = data; if (ares->type == ACPI_RESOURCE_TYPE_GPIO) *count += ares->data.gpio.pin_table_length; return 1; } /** * acpi_gpio_count - count the GPIOs associated with a firmware node / function * @fwnode: firmware node of the GPIO consumer * @con_id: function within the GPIO consumer * * Return: * The number of GPIOs associated with a firmware node / function or %-ENOENT, * if no GPIO has been assigned to the requested function. */ int acpi_gpio_count(const struct fwnode_handle *fwnode, const char *con_id) { struct acpi_device *adev = to_acpi_device_node(fwnode); const union acpi_object *obj; const struct acpi_gpio_mapping *gm; int count = -ENOENT; int ret; char propname[32]; unsigned int i; /* Try first from _DSD */ for (i = 0; i < ARRAY_SIZE(gpio_suffixes); i++) { if (con_id) snprintf(propname, sizeof(propname), "%s-%s", con_id, gpio_suffixes[i]); else snprintf(propname, sizeof(propname), "%s", gpio_suffixes[i]); ret = acpi_dev_get_property(adev, propname, ACPI_TYPE_ANY, &obj); if (ret == 0) { if (obj->type == ACPI_TYPE_LOCAL_REFERENCE) count = 1; else if (obj->type == ACPI_TYPE_PACKAGE) count = acpi_gpio_package_count(obj); } else if (adev->driver_gpios) { for (gm = adev->driver_gpios; gm->name; gm++) if (strcmp(propname, gm->name) == 0) { count = gm->size; break; } } if (count > 0) break; } /* Then from plain _CRS GPIOs */ if (count < 0) { struct list_head resource_list; unsigned int crs_count = 0; if (!acpi_can_fallback_to_crs(adev, con_id)) return count; INIT_LIST_HEAD(&resource_list); acpi_dev_get_resources(adev, &resource_list, acpi_find_gpio_count, &crs_count); acpi_dev_free_resource_list(&resource_list); if (crs_count > 0) count = crs_count; } return count ? 
count : -ENOENT; } /* Run deferred acpi_gpiochip_request_irqs() */ static int __init acpi_gpio_handle_deferred_request_irqs(void) { struct acpi_gpio_chip *acpi_gpio, *tmp; mutex_lock(&acpi_gpio_deferred_req_irqs_lock); list_for_each_entry_safe(acpi_gpio, tmp, &acpi_gpio_deferred_req_irqs_list, deferred_req_irqs_list_entry) acpi_gpiochip_request_irqs(acpi_gpio); acpi_gpio_deferred_req_irqs_done = true; mutex_unlock(&acpi_gpio_deferred_req_irqs_lock); return 0; } /* We must use _sync so that this runs after the first deferred_probe run */ late_initcall_sync(acpi_gpio_handle_deferred_request_irqs); static const struct dmi_system_id gpiolib_acpi_quirks[] __initconst = { { /* * The Minix Neo Z83-4 has a micro-USB-B id-pin handler for * a non existing micro-USB-B connector which puts the HDMI * DDC pins in GPIO mode, breaking HDMI support. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "MINIX"), DMI_MATCH(DMI_PRODUCT_NAME, "Z83-4"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .no_edge_events_on_boot = true, }, }, { /* * The Terra Pad 1061 has a micro-USB-B id-pin handler, which * instead of controlling the actual micro-USB-B turns the 5V * boost for its USB-A connector off. The actual micro-USB-B * connector is wired for charging only. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Wortmann_AG"), DMI_MATCH(DMI_PRODUCT_NAME, "TERRA_PAD_1061"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .no_edge_events_on_boot = true, }, }, { /* * The Dell Venue 10 Pro 5055, with Bay Trail SoC + TI PMIC uses an * external embedded-controller connected via I2C + an ACPI GPIO * event handler on INT33FFC:02 pin 12, causing spurious wakeups. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), DMI_MATCH(DMI_PRODUCT_NAME, "Venue 10 Pro 5055"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "INT33FC:02@12", }, }, { /* * HP X2 10 models with Cherry Trail SoC + TI PMIC use an * external embedded-controller connected via I2C + an ACPI GPIO * event handler on INT33FF:01 pin 0, causing spurious wakeups. * When suspending by closing the LID, the power to the USB * keyboard is turned off, causing INT0002 ACPI events to * trigger once the XHCI controller notices the keyboard is * gone. So INT0002 events cause spurious wakeups too. Ignoring * EC wakes breaks wakeup when opening the lid, the user needs * to press the power-button to wakeup the system. The * alternative is suspend simply not working, which is worse. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "HP x2 Detachable 10-p0XX"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "INT33FF:01@0,INT0002:00@2", }, }, { /* * HP X2 10 models with Bay Trail SoC + AXP288 PMIC use an * external embedded-controller connected via I2C + an ACPI GPIO * event handler on INT33FC:02 pin 28, causing spurious wakeups. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion x2 Detachable"), DMI_MATCH(DMI_BOARD_NAME, "815D"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "INT33FC:02@28", }, }, { /* * HP X2 10 models with Cherry Trail SoC + AXP288 PMIC use an * external embedded-controller connected via I2C + an ACPI GPIO * event handler on INT33FF:01 pin 0, causing spurious wakeups. 
*/ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "HP"), DMI_MATCH(DMI_PRODUCT_NAME, "HP Pavilion x2 Detachable"), DMI_MATCH(DMI_BOARD_NAME, "813E"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "INT33FF:01@0", }, }, { /* * Interrupt storm caused from edge triggered floating pin * Found in BIOS UX325UAZ.300 * https://bugzilla.kernel.org/show_bug.cgi?id=216208 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."), DMI_MATCH(DMI_PRODUCT_NAME, "ZenBook UX325UAZ_UM325UAZ"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_interrupt = "AMDI0030:00@18", }, }, { /* * Spurious wakeups from TP_ATTN# pin * Found in BIOS 1.7.8 * https://gitlab.freedesktop.org/drm/amd/-/issues/1722#note_1720627 */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "NL5xNU"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "ELAN0415:00@9", }, }, { /* * Spurious wakeups from TP_ATTN# pin * Found in BIOS 1.7.8 * https://gitlab.freedesktop.org/drm/amd/-/issues/1722#note_1720627 */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "NL5xRU"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "ELAN0415:00@9", }, }, { /* * Spurious wakeups from TP_ATTN# pin * Found in BIOS 1.7.7 */ .matches = { DMI_MATCH(DMI_BOARD_NAME, "NH5xAx"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "SYNA1202:00@16", }, }, { /* * On the Peaq C1010 2-in-1 INT33FC:00 pin 3 is connected to * a "dolby" button. At the ACPI level an _AEI event-handler * is connected which sets an ACPI variable to 1 on both * edges. This variable can be polled + cleared to 0 using * WMI. But since the variable is set on both edges the WMI * interface is pretty useless even when polling. * So instead the x86-android-tablets code instantiates * a gpio-keys platform device for it. * Ignore the _AEI handler for the pin, so that it is not busy. */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "PEAQ"), DMI_MATCH(DMI_PRODUCT_NAME, "PEAQ PMM C1010 MD99187"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_interrupt = "INT33FC:00@3", }, }, { /* * Spurious wakeups from TP_ATTN# pin * Found in BIOS 0.35 * https://gitlab.freedesktop.org/drm/amd/-/issues/3073 */ .matches = { DMI_MATCH(DMI_SYS_VENDOR, "GPD"), DMI_MATCH(DMI_PRODUCT_NAME, "G1619-04"), }, .driver_data = &(struct acpi_gpiolib_dmi_quirk) { .ignore_wake = "PNP0C50:00@8", }, }, {} /* Terminating entry */ }; static int __init acpi_gpio_setup_params(void) { const struct acpi_gpiolib_dmi_quirk *quirk = NULL; const struct dmi_system_id *id; id = dmi_first_match(gpiolib_acpi_quirks); if (id) quirk = id->driver_data; if (run_edge_events_on_boot < 0) { if (quirk && quirk->no_edge_events_on_boot) run_edge_events_on_boot = 0; else run_edge_events_on_boot = 1; } if (ignore_wake == NULL && quirk && quirk->ignore_wake) ignore_wake = quirk->ignore_wake; if (ignore_interrupt == NULL && quirk && quirk->ignore_interrupt) ignore_interrupt = quirk->ignore_interrupt; return 0; } /* Directly after dmi_setup() which runs as core_initcall() */ postcore_initcall(acpi_gpio_setup_params);
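/*
 * Hedged illustration -- not part of gpiolib-acpi.c. A minimal sketch of how
 * a consumer driver might use acpi_dev_gpio_irq_wake_get_by() above: index 0
 * with a NULL name selects the first GpioInt() resource of the ACPI
 * companion, and *wake_capable reflects the resource's wake capability
 * (subject to the low-power-S0 check inside the function). The "foo" names
 * and the IRQ handler are invented for the example.
 */
#include <linux/acpi.h>
#include <linux/interrupt.h>
#include <linux/platform_device.h>
#include <linux/pm_wakeup.h>

static irqreturn_t foo_gpio_irq(int irq, void *data)
{
	return IRQ_HANDLED;	/* hypothetical handler */
}

static int foo_probe(struct platform_device *pdev)
{
	struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);
	bool wake_capable = false;
	int irq;

	if (!adev)
		return -ENODEV;

	irq = acpi_dev_gpio_irq_wake_get_by(adev, NULL, 0, &wake_capable);
	if (irq < 0)
		return irq;	/* may be -EPROBE_DEFER */

	if (wake_capable)
		device_set_wakeup_capable(&pdev->dev, true);

	/* The trigger type was already configured from the GpioInt() resource. */
	return devm_request_irq(&pdev->dev, irq, foo_gpio_irq, 0,
				"foo-gpio-irq", pdev);
}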
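/*
 * Hedged illustration -- the shape of an additional gpiolib_acpi_quirks[]
 * entry. The DMI strings and the "VNDR0001:00@4" value are invented; real
 * entries name the ACPI device instance and pin ("<HID>:<UID>@<pin>", with
 * multiple entries separated by commas, as in the table above) whose wake
 * event or _AEI interrupt should be ignored.
 */
static const struct dmi_system_id example_quirk[] __initconst = {
	{
		.matches = {
			DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
			DMI_MATCH(DMI_PRODUCT_NAME, "Example Tablet"),
		},
		.driver_data = &(struct acpi_gpiolib_dmi_quirk) {
			.ignore_wake = "VNDR0001:00@4",
		},
	},
	{}	/* Terminating entry */
};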
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_TTY_DRIVER_H
#define _LINUX_TTY_DRIVER_H

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/cdev.h>
#include <linux/uaccess.h>
#include <linux/termios.h>
#include <linux/seq_file.h>

struct tty_struct;
struct tty_driver;
struct serial_icounter_struct;
struct serial_struct;

/**
 * struct tty_operations -- interface between driver and tty
 *
 * @lookup: ``struct tty_struct *()(struct tty_driver *self, struct file *,
 *	    int idx)``
 *
 *	Return the tty device corresponding to @idx, %NULL if there is not
 *	one currently in use and an %ERR_PTR value on error. Called under
 *	%tty_mutex (for now!)
 *
 *	Optional method. Default behaviour is to use the @self->ttys array.
 *
 * @install: ``int ()(struct tty_driver *self, struct tty_struct *tty)``
 *
 *	Install a new @tty into the @self's internal tables. Used in
 *	conjunction with @lookup and @remove methods.
 *
 *	Optional method. Default behaviour is to use the @self->ttys array.
 *
 * @remove: ``void ()(struct tty_driver *self, struct tty_struct *tty)``
 *
 *	Remove a closed @tty from the @self's internal tables. Used in
 *	conjunction with @lookup and @remove methods.
 *
 *	Optional method. Default behaviour is to use the @self->ttys array.
* * @open: ``int ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is opened. This * routine is mandatory; if this routine is not filled in, the attempted * open will fail with %ENODEV. * * Required method. Called with tty lock held. May sleep. * * @close: ``void ()(struct tty_struct *tty, struct file *)`` * * This routine is called when a particular @tty device is closed. At the * point of return from this call the driver must make no further ldisc * calls of any kind. * * Remark: called even if the corresponding @open() failed. * * Required method. Called with tty lock held. May sleep. * * @shutdown: ``void ()(struct tty_struct *tty)`` * * This routine is called under the tty lock when a particular @tty device * is closed for the last time. It executes before the @tty resources * are freed so may execute while another function holds a @tty kref. * * @cleanup: ``void ()(struct tty_struct *tty)`` * * This routine is called asynchronously when a particular @tty device * is closed for the last time freeing up the resources. This is * actually the second part of shutdown for routines that might sleep. * * @write: ``ssize_t ()(struct tty_struct *tty, const u8 *buf, size_t count)`` * * This routine is called by the kernel to write a series (@count) of * characters (@buf) to the @tty device. The characters may come from * user space or kernel space. This routine will return the * number of characters actually accepted for writing. * * May occur in parallel in special cases. Because this includes panic * paths drivers generally shouldn't try and do clever locking here. * * Optional: Required for writable devices. May not sleep. * * @put_char: ``int ()(struct tty_struct *tty, u8 ch)`` * * This routine is called by the kernel to write a single character @ch to * the @tty device. If the kernel uses this routine, it must call the * @flush_chars() routine (if defined) when it is done stuffing characters * into the driver. If there is no room in the queue, the character is * ignored. * * Optional: Kernel will use the @write method if not provided. Do not * call this function directly, call tty_put_char(). * * @flush_chars: ``void ()(struct tty_struct *tty)`` * * This routine is called by the kernel after it has written a * series of characters to the tty device using @put_char(). * * Optional. Do not call this function directly, call * tty_driver_flush_chars(). * * @write_room: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the numbers of characters the @tty driver * will accept for queuing to be written. This number is subject * to change as output buffers get emptied, or if the output flow * control is acted. * * The ldisc is responsible for being intelligent about multi-threading of * write_room/write calls * * Required if @write method is provided else not needed. Do not call this * function directly, call tty_write_room() * * @chars_in_buffer: ``unsigned int ()(struct tty_struct *tty)`` * * This routine returns the number of characters in the device private * output queue. Used in tty_wait_until_sent() and for poll() * implementation. * * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call tty_chars_in_buffer(). * * @ioctl: ``int ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * This routine allows the @tty driver to implement device-specific * ioctls. 
If the ioctl number passed in @cmd is not recognized by the * driver, it should return %ENOIOCTLCMD. * * Optional. * * @compat_ioctl: ``long ()(struct tty_struct *tty, unsigned int cmd, * unsigned long arg)`` * * Implement ioctl processing for 32 bit process on 64 bit system. * * Optional. * * @set_termios: ``void ()(struct tty_struct *tty, const struct ktermios *old)`` * * This routine allows the @tty driver to be notified when device's * termios settings have changed. New settings are in @tty->termios. * Previous settings are passed in the @old argument. * * The API is defined such that the driver should return the actual modes * selected. This means that the driver is responsible for modifying any * bits in @tty->termios it cannot fulfill to indicate the actual modes * being used. * * Optional. Called under the @tty->termios_rwsem. May sleep. * * @set_ldisc: ``void ()(struct tty_struct *tty)`` * * This routine allows the @tty driver to be notified when the device's * line discipline is being changed. At the point this is done the * discipline is not yet usable. * * Optional. Called under the @tty->ldisc_sem and @tty->termios_rwsem. * * @throttle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that input buffers for the line * discipline are close to full, and it should somehow signal that no more * characters should be sent to the @tty. * * Serialization including with @unthrottle() is the job of the ldisc * layer. * * Optional: Always invoke via tty_throttle_safe(). Called under the * @tty->termios_rwsem. * * @unthrottle: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should signal that * characters can now be sent to the @tty without fear of overrunning the * input buffers of the line disciplines. * * Optional. Always invoke via tty_unthrottle(). Called under the * @tty->termios_rwsem. * * @stop: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should stop outputting * characters to the tty device. * * Called with @tty->flow.lock held. Serialized with @start() method. * * Optional. Always invoke via stop_tty(). * * @start: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it resumed sending * characters to the @tty device. * * Called with @tty->flow.lock held. Serialized with stop() method. * * Optional. Always invoke via start_tty(). * * @hangup: ``void ()(struct tty_struct *tty)`` * * This routine notifies the @tty driver that it should hang up the @tty * device. * * Optional. Called with tty lock held. * * @break_ctl: ``int ()(struct tty_struct *tty, int state)`` * * This optional routine requests the @tty driver to turn on or off BREAK * status on the RS-232 port. If @state is -1, then the BREAK status * should be turned on; if @state is 0, then BREAK should be turned off. * * If this routine is implemented, the high-level tty driver will handle * the following ioctls: %TCSBRK, %TCSBRKP, %TIOCSBRK, %TIOCCBRK. * * If the driver sets %TTY_DRIVER_HARDWARE_BREAK in tty_alloc_driver(), * then the interface will also be called with actual times and the * hardware is expected to do the delay work itself. 0 and -1 are still * used for on/off. * * Optional: Required for %TCSBRK/%BRKP/etc. handling. May sleep. * * @flush_buffer: ``void ()(struct tty_struct *tty)`` * * This routine discards device private output buffer. Invoked on close, * hangup, to implement %TCOFLUSH ioctl and similar. 
* * Optional: if not provided, it is assumed there is no queue on the * device. Do not call this function directly, call * tty_driver_flush_buffer(). * * @wait_until_sent: ``void ()(struct tty_struct *tty, int timeout)`` * * This routine waits until the device has written out all of the * characters in its transmitter FIFO. Or until @timeout (in jiffies) is * reached. * * Optional: If not provided, the device is assumed to have no FIFO. * Usually correct to invoke via tty_wait_until_sent(). May sleep. * * @send_xchar: ``void ()(struct tty_struct *tty, u8 ch)`` * * This routine is used to send a high-priority XON/XOFF character (@ch) * to the @tty device. * * Optional: If not provided, then the @write method is called under * the @tty->atomic_write_lock to keep it serialized with the ldisc. * * @tiocmget: ``int ()(struct tty_struct *tty)`` * * This routine is used to obtain the modem status bits from the @tty * driver. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMGET * ioctl. Do not call this function directly, call tty_tiocmget(). * * @tiocmset: ``int ()(struct tty_struct *tty, * unsigned int set, unsigned int clear)`` * * This routine is used to set the modem status bits to the @tty driver. * First, @clear bits should be cleared, then @set bits set. * * Optional: If not provided, then %ENOTTY is returned from the %TIOCMSET * ioctl. Do not call this function directly, call tty_tiocmset(). * * @resize: ``int ()(struct tty_struct *tty, struct winsize *ws)`` * * Called when a termios request is issued which changes the requested * terminal geometry to @ws. * * Optional: the default action is to update the termios structure * without error. This is usually the correct behaviour. Drivers should * not force errors here if they are not resizable objects (e.g. a serial * line). See tty_do_resize() if you need to wrap the standard method * in your own logic -- the usual case. * * @get_icount: ``int ()(struct tty_struct *tty, * struct serial_icounter *icount)`` * * Called when the @tty device receives a %TIOCGICOUNT ioctl. Passed a * kernel structure @icount to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * * @get_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCGSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to complete. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocgserial(). * * @set_serial: ``int ()(struct tty_struct *tty, struct serial_struct *p)`` * * Called when the @tty device receives a %TIOCSSERIAL ioctl. Passed a * kernel structure @p (&struct serial_struct) to set the values from. * * Optional: called only if provided, otherwise %ENOTTY will be returned. * Do not call this function directly, call tty_tiocsserial(). * * @show_fdinfo: ``void ()(struct tty_struct *tty, struct seq_file *m)`` * * Called when the @tty device file descriptor receives a fdinfo request * from VFS (to show in /proc/<pid>/fdinfo/). @m should be filled with * information. * * Optional: called only if provided, otherwise nothing is written to @m. * Do not call this function directly, call tty_show_fdinfo(). * * @poll_init: ``int ()(struct tty_driver *driver, int line, char *options)`` * * kgdboc support (Documentation/dev-tools/kgdb.rst). This routine is * called to initialize the HW for later use by calling @poll_get_char or * @poll_put_char. 
* * Optional: called only if provided, otherwise skipped as a non-polling * driver. * * @poll_get_char: ``int ()(struct tty_driver *driver, int line)`` * * kgdboc support (see @poll_init). @driver should read a character from a * tty identified by @line and return it. * * Optional: called only if @poll_init provided. * * @poll_put_char: ``void ()(struct tty_driver *driver, int line, char ch)`` * * kgdboc support (see @poll_init). @driver should write character @ch to * a tty identified by @line. * * Optional: called only if @poll_init provided. * * @proc_show: ``int ()(struct seq_file *m, void *driver)`` * * Driver @driver (cast to &struct tty_driver) can show additional info in * /proc/tty/driver/<driver_name>. It is enough to fill in the information * into @m. * * Optional: called only if provided, otherwise no /proc entry created. * * This structure defines the interface between the low-level tty driver and * the tty routines. These routines can be defined. Unless noted otherwise, * they are optional, and can be filled in with a %NULL pointer. */ struct tty_operations { struct tty_struct * (*lookup)(struct tty_driver *driver, struct file *filp, int idx); int (*install)(struct tty_driver *driver, struct tty_struct *tty); void (*remove)(struct tty_driver *driver, struct tty_struct *tty); int (*open)(struct tty_struct * tty, struct file * filp); void (*close)(struct tty_struct * tty, struct file * filp); void (*shutdown)(struct tty_struct *tty); void (*cleanup)(struct tty_struct *tty); ssize_t (*write)(struct tty_struct *tty, const u8 *buf, size_t count); int (*put_char)(struct tty_struct *tty, u8 ch); void (*flush_chars)(struct tty_struct *tty); unsigned int (*write_room)(struct tty_struct *tty); unsigned int (*chars_in_buffer)(struct tty_struct *tty); int (*ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); long (*compat_ioctl)(struct tty_struct *tty, unsigned int cmd, unsigned long arg); void (*set_termios)(struct tty_struct *tty, const struct ktermios *old); void (*throttle)(struct tty_struct * tty); void (*unthrottle)(struct tty_struct * tty); void (*stop)(struct tty_struct *tty); void (*start)(struct tty_struct *tty); void (*hangup)(struct tty_struct *tty); int (*break_ctl)(struct tty_struct *tty, int state); void (*flush_buffer)(struct tty_struct *tty); void (*set_ldisc)(struct tty_struct *tty); void (*wait_until_sent)(struct tty_struct *tty, int timeout); void (*send_xchar)(struct tty_struct *tty, u8 ch); int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); int (*resize)(struct tty_struct *tty, struct winsize *ws); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); int (*get_serial)(struct tty_struct *tty, struct serial_struct *p); int (*set_serial)(struct tty_struct *tty, struct serial_struct *p); void (*show_fdinfo)(struct tty_struct *tty, struct seq_file *m); #ifdef CONFIG_CONSOLE_POLL int (*poll_init)(struct tty_driver *driver, int line, char *options); int (*poll_get_char)(struct tty_driver *driver, int line); void (*poll_put_char)(struct tty_driver *driver, int line, char ch); #endif int (*proc_show)(struct seq_file *m, void *driver); } __randomize_layout; /** * struct tty_driver -- driver for TTY devices * * @kref: reference counting. Reaching zero frees all the internals and the * driver. * @cdevs: allocated/registered character /dev devices * @owner: modules owning this driver. Used drivers cannot be rmmod'ed. * Automatically set by tty_alloc_driver(). 
* @driver_name: name of the driver used in /proc/tty * @name: used for constructing /dev node name * @name_base: used as a number base for constructing /dev node name * @major: major /dev device number (zero for autoassignment) * @minor_start: the first minor /dev device number * @num: number of devices allocated * @type: type of tty driver (%TTY_DRIVER_TYPE_) * @subtype: subtype of tty driver (%SYSTEM_TYPE_, %PTY_TYPE_, %SERIAL_TYPE_) * @init_termios: termios to set to each tty initially (e.g. %tty_std_termios) * @flags: tty driver flags (%TTY_DRIVER_) * @proc_entry: proc fs entry, used internally * @other: driver of the linked tty; only used for the PTY driver * @ttys: array of active &struct tty_struct, set by tty_standard_install() * @ports: array of &struct tty_port; can be set during initialization by * tty_port_link_device() and similar * @termios: storage for termios at each TTY close for the next open * @driver_state: pointer to driver's arbitrary data * @ops: driver hooks for TTYs. Set them using tty_set_operations(). Use &struct * tty_port helpers in them as much as possible. * @tty_drivers: used internally to link tty_drivers together * * The usual handling of &struct tty_driver is to allocate it by * tty_alloc_driver(), set up all the necessary members, and register it by * tty_register_driver(). At last, the driver is torn down by calling * tty_unregister_driver() followed by tty_driver_kref_put(). * * The fields required to be set before calling tty_register_driver() include * @driver_name, @name, @type, @subtype, @init_termios, and @ops. */ struct tty_driver { struct kref kref; struct cdev **cdevs; struct module *owner; const char *driver_name; const char *name; int name_base; int major; int minor_start; unsigned int num; short type; short subtype; struct ktermios init_termios; unsigned long flags; struct proc_dir_entry *proc_entry; struct tty_driver *other; /* * Pointer to the tty data structures */ struct tty_struct **ttys; struct tty_port **ports; struct ktermios **termios; void *driver_state; /* * Driver methods */ const struct tty_operations *ops; struct list_head tty_drivers; } __randomize_layout; extern struct list_head tty_drivers; struct tty_driver *__tty_alloc_driver(unsigned int lines, struct module *owner, unsigned long flags); struct tty_driver *tty_find_polling_driver(char *name, int *line); void tty_driver_kref_put(struct tty_driver *driver); /* Use TTY_DRIVER_* flags below */ #define tty_alloc_driver(lines, flags) \ __tty_alloc_driver(lines, THIS_MODULE, flags) static inline struct tty_driver *tty_driver_kref_get(struct tty_driver *d) { kref_get(&d->kref); return d; } static inline void tty_set_operations(struct tty_driver *driver, const struct tty_operations *op) { driver->ops = op; } /** * DOC: TTY Driver Flags * * TTY_DRIVER_RESET_TERMIOS * Requests the tty layer to reset the termios setting when the last * process has closed the device. Used for PTYs, in particular. * * TTY_DRIVER_REAL_RAW * Indicates that the driver will guarantee not to set any special * character handling flags if this is set for the tty: * * ``(IGNBRK || (!BRKINT && !PARMRK)) && (IGNPAR || !INPCK)`` * * That is, if there is no reason for the driver to * send notifications of parity and break characters up to the line * driver, it won't do so. This allows the line driver to optimize for * this case if this flag is set. (Note that there is also a promise, if * the above case is true, not to signal overruns, either.) 
* * TTY_DRIVER_DYNAMIC_DEV * The individual tty devices need to be registered with a call to * tty_register_device() when the device is found in the system and * unregistered with a call to tty_unregister_device() so the devices will * be show up properly in sysfs. If not set, all &tty_driver.num entries * will be created by the tty core in sysfs when tty_register_driver() is * called. This is to be used by drivers that have tty devices that can * appear and disappear while the main tty driver is registered with the * tty core. * * TTY_DRIVER_DEVPTS_MEM * Don't use the standard arrays (&tty_driver.ttys and * &tty_driver.termios), instead use dynamic memory keyed through the * devpts filesystem. This is only applicable to the PTY driver. * * TTY_DRIVER_HARDWARE_BREAK * Hardware handles break signals. Pass the requested timeout to the * &tty_operations.break_ctl instead of using a simple on/off interface. * * TTY_DRIVER_DYNAMIC_ALLOC * Do not allocate structures which are needed per line for this driver * (&tty_driver.ports) as it would waste memory. The driver will take * care. This is only applicable to the PTY driver. * * TTY_DRIVER_UNNUMBERED_NODE * Do not create numbered ``/dev`` nodes. For example, create * ``/dev/ttyprintk`` and not ``/dev/ttyprintk0``. Applicable only when a * driver for a single tty device is being allocated. */ #define TTY_DRIVER_INSTALLED 0x0001 #define TTY_DRIVER_RESET_TERMIOS 0x0002 #define TTY_DRIVER_REAL_RAW 0x0004 #define TTY_DRIVER_DYNAMIC_DEV 0x0008 #define TTY_DRIVER_DEVPTS_MEM 0x0010 #define TTY_DRIVER_HARDWARE_BREAK 0x0020 #define TTY_DRIVER_DYNAMIC_ALLOC 0x0040 #define TTY_DRIVER_UNNUMBERED_NODE 0x0080 /* tty driver types */ #define TTY_DRIVER_TYPE_SYSTEM 0x0001 #define TTY_DRIVER_TYPE_CONSOLE 0x0002 #define TTY_DRIVER_TYPE_SERIAL 0x0003 #define TTY_DRIVER_TYPE_PTY 0x0004 #define TTY_DRIVER_TYPE_SCC 0x0005 /* scc driver */ #define TTY_DRIVER_TYPE_SYSCONS 0x0006 /* system subtypes (magic, used by tty_io.c) */ #define SYSTEM_TYPE_TTY 0x0001 #define SYSTEM_TYPE_CONSOLE 0x0002 #define SYSTEM_TYPE_SYSCONS 0x0003 #define SYSTEM_TYPE_SYSPTMX 0x0004 /* pty subtypes (magic, used by tty_io.c) */ #define PTY_TYPE_MASTER 0x0001 #define PTY_TYPE_SLAVE 0x0002 /* serial subtype definitions */ #define SERIAL_TYPE_NORMAL 1 int tty_register_driver(struct tty_driver *driver); void tty_unregister_driver(struct tty_driver *driver); struct device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *dev); struct device *tty_register_device_attr(struct tty_driver *driver, unsigned index, struct device *device, void *drvdata, const struct attribute_group **attr_grp); void tty_unregister_device(struct tty_driver *driver, unsigned index); #ifdef CONFIG_PROC_FS void proc_tty_register_driver(struct tty_driver *); void proc_tty_unregister_driver(struct tty_driver *); #else static inline void proc_tty_register_driver(struct tty_driver *d) {} static inline void proc_tty_unregister_driver(struct tty_driver *d) {} #endif #endif /* #ifdef _LINUX_TTY_DRIVER_H */
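/*
 * Hedged example -- not part of this header. A minimal sketch of a driver
 * filling in the callbacks documented for struct tty_operations above. The
 * "foo_*" names and the fixed 4096-byte room are invented; a real driver
 * would report the space actually left in its own output path.
 */
#include <linux/tty.h>
#include <linux/tty_driver.h>

static int foo_open(struct tty_struct *tty, struct file *filp)
{
	return 0;	/* nothing to set up in this sketch */
}

static void foo_close(struct tty_struct *tty, struct file *filp)
{
}

static ssize_t foo_write(struct tty_struct *tty, const u8 *buf, size_t count)
{
	/* Pretend everything was accepted; must not sleep here. */
	return count;
}

static unsigned int foo_write_room(struct tty_struct *tty)
{
	return 4096;	/* required because @write is provided */
}

static const struct tty_operations foo_tty_ops = {
	.open		= foo_open,
	.close		= foo_close,
	.write		= foo_write,
	.write_room	= foo_write_room,
};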
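/*
 * Hedged example, continuing the sketch above: allocate and register the
 * driver following the lifecycle documented for struct tty_driver (allocate,
 * fill in the required fields, register; tear down with unregister followed
 * by a kref put). The 4-line count and the "ttyFOO"/"foo_tty" names are
 * invented.
 */
#include <linux/err.h>
#include <linux/module.h>

static struct tty_driver *foo_tty_driver;

static int __init foo_tty_init(void)
{
	int ret;

	foo_tty_driver = tty_alloc_driver(4, TTY_DRIVER_REAL_RAW |
					     TTY_DRIVER_DYNAMIC_DEV);
	if (IS_ERR(foo_tty_driver))
		return PTR_ERR(foo_tty_driver);

	foo_tty_driver->driver_name = "foo_tty";
	foo_tty_driver->name = "ttyFOO";
	foo_tty_driver->type = TTY_DRIVER_TYPE_SERIAL;
	foo_tty_driver->subtype = SERIAL_TYPE_NORMAL;
	foo_tty_driver->init_termios = tty_std_termios;
	tty_set_operations(foo_tty_driver, &foo_tty_ops);

	ret = tty_register_driver(foo_tty_driver);
	if (ret) {
		tty_driver_kref_put(foo_tty_driver);
		return ret;
	}

	/*
	 * With TTY_DRIVER_DYNAMIC_DEV each port is announced separately,
	 * e.g. tty_register_device(foo_tty_driver, 0, parent_dev), once the
	 * underlying hardware shows up.
	 */
	return 0;
}

static void __exit foo_tty_exit(void)
{
	tty_unregister_driver(foo_tty_driver);
	tty_driver_kref_put(foo_tty_driver);
}

module_init(foo_tty_init);
module_exit(foo_tty_exit);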
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/slab.h>
#include <linux/hashtable.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/tc_wrapper.h>

#include <trace/events/qdisc.h>

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. queueing disciplines manager frontend.
   2. traffic classes manager frontend.

   Generally, queueing discipline ("qdisc") is a black box,
   which is able to enqueue packets and to dequeue them (when
   device is ready to send something) in order and at times
   determined by algorithm hidden in it.

   qdisc's are divided to two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets to "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate
   information supplied by user in the form of handles
   to more intelligible for kernel form, to make some sanity
   checks and part of work, which is common to all qdiscs
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.

   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send.
It is allowed to return NULL, but it does not mean that queue is empty, it just means that discipline does not want to send anything this time. Queue is really empty if q->q.qlen == 0. For complicated disciplines with multiple queues q->q is not real packet queue, but however q->q.qlen must be valid. ---enqueue enqueue returns 0, if packet was enqueued successfully. If packet (this one or another one) was dropped, it returns not zero error code. NET_XMIT_DROP - this packet dropped Expected action: do not backoff, but wait until queue will clear. NET_XMIT_CN - probably this packet enqueued, but another one dropped. Expected action: backoff or ignore Auxiliary routines: ---peek like dequeue but without removing a packet from the queue ---reset returns qdisc to initial state: purge all buffers, clear all timers, counters (except for statistics) etc. ---init initializes newly created qdisc. ---destroy destroys resources allocated by init and during lifetime of qdisc. ---change changes qdisc parameters. */ /* Protects list of registered TC modules. It is pure SMP lock. */ static DEFINE_RWLOCK(qdisc_mod_lock); /************************************************ * Queueing disciplines manipulation. * ************************************************/ /* The list of all installed queueing disciplines. */ static struct Qdisc_ops *qdisc_base; /* Register/unregister queueing discipline */ int register_qdisc(struct Qdisc_ops *qops) { struct Qdisc_ops *q, **qp; int rc = -EEXIST; write_lock(&qdisc_mod_lock); for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (!strcmp(qops->id, q->id)) goto out; if (qops->enqueue == NULL) qops->enqueue = noop_qdisc_ops.enqueue; if (qops->peek == NULL) { if (qops->dequeue == NULL) qops->peek = noop_qdisc_ops.peek; else goto out_einval; } if (qops->dequeue == NULL) qops->dequeue = noop_qdisc_ops.dequeue; if (qops->cl_ops) { const struct Qdisc_class_ops *cops = qops->cl_ops; if (!(cops->find && cops->walk && cops->leaf)) goto out_einval; if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf)) goto out_einval; } qops->next = NULL; *qp = qops; rc = 0; out: write_unlock(&qdisc_mod_lock); return rc; out_einval: rc = -EINVAL; goto out; } EXPORT_SYMBOL(register_qdisc); void unregister_qdisc(struct Qdisc_ops *qops) { struct Qdisc_ops *q, **qp; int err = -ENOENT; write_lock(&qdisc_mod_lock); for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) if (q == qops) break; if (q) { *qp = q->next; q->next = NULL; err = 0; } write_unlock(&qdisc_mod_lock); WARN(err, "unregister qdisc(%s) failed\n", qops->id); } EXPORT_SYMBOL(unregister_qdisc); /* Get default qdisc if not otherwise specified */ void qdisc_get_default(char *name, size_t len) { read_lock(&qdisc_mod_lock); strscpy(name, default_qdisc_ops->id, len); read_unlock(&qdisc_mod_lock); } static struct Qdisc_ops *qdisc_lookup_default(const char *name) { struct Qdisc_ops *q = NULL; for (q = qdisc_base; q; q = q->next) { if (!strcmp(name, q->id)) { if (!try_module_get(q->owner)) q = NULL; break; } } return q; } /* Set new default qdisc to use */ int qdisc_set_default(const char *name) { const struct Qdisc_ops *ops; if (!capable(CAP_NET_ADMIN)) return -EPERM; write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); if (!ops) { /* Not found, drop lock and try to load module */ write_unlock(&qdisc_mod_lock); request_module(NET_SCH_ALIAS_PREFIX "%s", name); write_lock(&qdisc_mod_lock); ops = qdisc_lookup_default(name); } if (ops) { /* Set new default */ module_put(default_qdisc_ops->owner); default_qdisc_ops = ops; } 
write_unlock(&qdisc_mod_lock); return ops ? 0 : -ENOENT; } #ifdef CONFIG_NET_SCH_DEFAULT /* Set default value from kernel config */ static int __init sch_default_qdisc(void) { return qdisc_set_default(CONFIG_DEFAULT_NET_SCH); } late_initcall(sch_default_qdisc); #endif /* We know handle. Find qdisc among all qdisc's attached to device * (root qdisc, all its children, children of children etc.) * Note: caller either uses rtnl or rcu_read_lock() */ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) { struct Qdisc *q; if (!qdisc_dev(root)) return (root->handle == handle ? root : NULL); if (!(root->flags & TCQ_F_BUILTIN) && root->handle == handle) return root; hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle, lockdep_rtnl_is_held()) { if (q->handle == handle) return q; } return NULL; } void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); if (invisible) q->flags |= TCQ_F_INVISIBLE; } } EXPORT_SYMBOL(qdisc_hash_add); void qdisc_hash_del(struct Qdisc *q) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { ASSERT_RTNL(); hash_del_rcu(&q->hash); } } EXPORT_SYMBOL(qdisc_hash_del); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { struct Qdisc *q; if (!handle) return NULL; q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle); if (q) goto out; if (dev_ingress_queue(dev)) q = qdisc_match_from_root( rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping), handle); out: return q; } struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle) { struct netdev_queue *nq; struct Qdisc *q; if (!handle) return NULL; q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle); if (q) goto out; nq = dev_ingress_queue_rcu(dev); if (nq) q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping), handle); out: return q; } static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) { unsigned long cl; const struct Qdisc_class_ops *cops = p->ops->cl_ops; if (cops == NULL) return NULL; cl = cops->find(p, classid); if (cl == 0) return NULL; return cops->leaf(p, cl); } /* Find queueing discipline by name */ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind) { struct Qdisc_ops *q = NULL; if (kind) { read_lock(&qdisc_mod_lock); for (q = qdisc_base; q; q = q->next) { if (nla_strcmp(kind, q->id) == 0) { if (!try_module_get(q->owner)) q = NULL; break; } } read_unlock(&qdisc_mod_lock); } return q; } /* The linklayer setting were not transferred from iproute2, in older * versions, and the rate tables lookup systems have been dropped in * the kernel. To keep backward compatible with older iproute2 tc * utils, we detect the linklayer setting by detecting if the rate * table were modified. * * For linklayer ATM table entries, the rate table will be aligned to * 48 bytes, thus some table entries will contain the same value. The * mpu (min packet unit) is also encoded into the old rate table, thus * starting from the mpu, we find low and high table entries for * mapping this cell. If these entries contain the same value, when * the rate tables have been modified for linklayer ATM. * * This is done by rounding mpu to the nearest 48 bytes cell/entry, * and then roundup to the next cell, calc the table entry one below, * and compare. 
*/ static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab) { int low = roundup(r->mpu, 48); int high = roundup(low+1, 48); int cell_low = low >> r->cell_log; int cell_high = (high >> r->cell_log) - 1; /* rtab is too inaccurate at rates > 100Mbit/s */ if ((r->rate > (100000000/8)) || (rtab[0] == 0)) { pr_debug("TC linklayer: Giving up ATM detection\n"); return TC_LINKLAYER_ETHERNET; } if ((cell_high > cell_low) && (cell_high < 256) && (rtab[cell_low] == rtab[cell_high])) { pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n", cell_low, cell_high, rtab[cell_high]); return TC_LINKLAYER_ATM; } return TC_LINKLAYER_ETHERNET; } static struct qdisc_rate_table *qdisc_rtab_list; struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab, struct netlink_ext_ack *extack) { struct qdisc_rate_table *rtab; if (tab == NULL || r->rate == 0 || r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; } for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && !memcmp(&rtab->data, nla_data(tab), 1024)) { rtab->refcnt++; return rtab; } } rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); if (rtab) { rtab->rate = *r; rtab->refcnt = 1; memcpy(rtab->data, nla_data(tab), 1024); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; qdisc_rtab_list = rtab; } else { NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table"); } return rtab; } EXPORT_SYMBOL(qdisc_get_rtab); void qdisc_put_rtab(struct qdisc_rate_table *tab) { struct qdisc_rate_table *rtab, **rtabp; if (!tab || --tab->refcnt) return; for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) { if (rtab == tab) { *rtabp = rtab->next; kfree(rtab); return; } } } EXPORT_SYMBOL(qdisc_put_rtab); static LIST_HEAD(qdisc_stab_list); static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = { [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) }, [TCA_STAB_DATA] = { .type = NLA_BINARY }, }; static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_STAB_MAX + 1]; struct qdisc_size_table *stab; struct tc_sizespec *s; unsigned int tsize = 0; u16 *tab = NULL; int err; err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy, extack); if (err < 0) return ERR_PTR(err); if (!tb[TCA_STAB_BASE]) { NL_SET_ERR_MSG(extack, "Size table base attribute is missing"); return ERR_PTR(-EINVAL); } s = nla_data(tb[TCA_STAB_BASE]); if (s->tsize > 0) { if (!tb[TCA_STAB_DATA]) { NL_SET_ERR_MSG(extack, "Size table data attribute is missing"); return ERR_PTR(-EINVAL); } tab = nla_data(tb[TCA_STAB_DATA]); tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); } if (tsize != s->tsize || (!tab && tsize > 0)) { NL_SET_ERR_MSG(extack, "Invalid size of size table"); return ERR_PTR(-EINVAL); } list_for_each_entry(stab, &qdisc_stab_list, list) { if (memcmp(&stab->szopts, s, sizeof(*s))) continue; if (tsize > 0 && memcmp(stab->data, tab, flex_array_size(stab, data, tsize))) continue; stab->refcnt++; return stab; } if (s->size_log > STAB_SIZE_LOG_MAX || s->cell_log > STAB_SIZE_LOG_MAX) { NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table"); return ERR_PTR(-EINVAL); } stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL); if (!stab) return ERR_PTR(-ENOMEM); stab->refcnt = 1; stab->szopts = *s; if (tsize > 0) memcpy(stab->data, 
tab, flex_array_size(stab, data, tsize)); list_add_tail(&stab->list, &qdisc_stab_list); return stab; } void qdisc_put_stab(struct qdisc_size_table *tab) { if (!tab) return; if (--tab->refcnt == 0) { list_del(&tab->list); kfree_rcu(tab, rcu); } } EXPORT_SYMBOL(qdisc_put_stab); static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, TCA_STAB); if (nest == NULL) goto nla_put_failure; if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts)) goto nla_put_failure; nla_nest_end(skb, nest); return skb->len; nla_put_failure: return -1; } void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab) { int pkt_len, slot; pkt_len = skb->len + stab->szopts.overhead; if (unlikely(!stab->szopts.tsize)) goto out; slot = pkt_len + stab->szopts.cell_align; if (unlikely(slot < 0)) slot = 0; slot >>= stab->szopts.cell_log; if (likely(slot < stab->szopts.tsize)) pkt_len = stab->data[slot]; else pkt_len = stab->data[stab->szopts.tsize - 1] * (slot / stab->szopts.tsize) + stab->data[slot % stab->szopts.tsize]; pkt_len <<= stab->szopts.size_log; out: if (unlikely(pkt_len < 1)) pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { if (!(qdisc->flags & TCQ_F_WARN_NONWC)) { pr_warn("%s: %s qdisc %X: is non-work-conserving?\n", txt, qdisc->ops->id, qdisc->handle >> 16); qdisc->flags |= TCQ_F_WARN_NONWC; } } EXPORT_SYMBOL(qdisc_warn_nonwc); static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer) { struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog, timer); rcu_read_lock(); __netif_schedule(qdisc_root(wd->qdisc)); rcu_read_unlock(); return HRTIMER_NORESTART; } void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc, clockid_t clockid) { hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED); wd->timer.function = qdisc_watchdog; wd->qdisc = qdisc; } EXPORT_SYMBOL(qdisc_watchdog_init_clockid); void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc) { qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC); } EXPORT_SYMBOL(qdisc_watchdog_init); void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires, u64 delta_ns) { bool deactivated; rcu_read_lock(); deactivated = test_bit(__QDISC_STATE_DEACTIVATED, &qdisc_root_sleeping(wd->qdisc)->state); rcu_read_unlock(); if (deactivated) return; if (hrtimer_is_queued(&wd->timer)) { u64 softexpires; softexpires = ktime_to_ns(hrtimer_get_softexpires(&wd->timer)); /* If timer is already set in [expires, expires + delta_ns], * do not reprogram it. 
*/ if (softexpires - expires <= delta_ns) return; } hrtimer_start_range_ns(&wd->timer, ns_to_ktime(expires), delta_ns, HRTIMER_MODE_ABS_PINNED); } EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns); void qdisc_watchdog_cancel(struct qdisc_watchdog *wd) { hrtimer_cancel(&wd->timer); } EXPORT_SYMBOL(qdisc_watchdog_cancel); static struct hlist_head *qdisc_class_hash_alloc(unsigned int n) { struct hlist_head *h; unsigned int i; h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL); if (h != NULL) { for (i = 0; i < n; i++) INIT_HLIST_HEAD(&h[i]); } return h; } void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash) { struct Qdisc_class_common *cl; struct hlist_node *next; struct hlist_head *nhash, *ohash; unsigned int nsize, nmask, osize; unsigned int i, h; /* Rehash when load factor exceeds 0.75 */ if (clhash->hashelems * 4 <= clhash->hashsize * 3) return; nsize = clhash->hashsize * 2; nmask = nsize - 1; nhash = qdisc_class_hash_alloc(nsize); if (nhash == NULL) return; ohash = clhash->hash; osize = clhash->hashsize; sch_tree_lock(sch); for (i = 0; i < osize; i++) { hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) { h = qdisc_class_hash(cl->classid, nmask); hlist_add_head(&cl->hnode, &nhash[h]); } } clhash->hash = nhash; clhash->hashsize = nsize; clhash->hashmask = nmask; sch_tree_unlock(sch); kvfree(ohash); } EXPORT_SYMBOL(qdisc_class_hash_grow); int qdisc_class_hash_init(struct Qdisc_class_hash *clhash) { unsigned int size = 4; clhash->hash = qdisc_class_hash_alloc(size); if (!clhash->hash) return -ENOMEM; clhash->hashsize = size; clhash->hashmask = size - 1; clhash->hashelems = 0; return 0; } EXPORT_SYMBOL(qdisc_class_hash_init); void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash) { kvfree(clhash->hash); } EXPORT_SYMBOL(qdisc_class_hash_destroy); void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash, struct Qdisc_class_common *cl) { unsigned int h; INIT_HLIST_NODE(&cl->hnode); h = qdisc_class_hash(cl->classid, clhash->hashmask); hlist_add_head(&cl->hnode, &clhash->hash[h]); clhash->hashelems++; } EXPORT_SYMBOL(qdisc_class_hash_insert); void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash, struct Qdisc_class_common *cl) { hlist_del(&cl->hnode); clhash->hashelems--; } EXPORT_SYMBOL(qdisc_class_hash_remove); /* Allocate an unique handle from space managed by kernel * Possible range is [8000-FFFF]:0000 (0x8000 values) */ static u32 qdisc_alloc_handle(struct net_device *dev) { int i = 0x8000; static u32 autohandle = TC_H_MAKE(0x80000000U, 0); do { autohandle += TC_H_MAKE(0x10000U, 0); if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) autohandle = TC_H_MAKE(0x80000000U, 0); if (!qdisc_lookup(dev, autohandle)) return autohandle; cond_resched(); } while (--i > 0); return 0; } void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len) { bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED; const struct Qdisc_class_ops *cops; unsigned long cl; u32 parentid; bool notify; int drops; if (n == 0 && len == 0) return; drops = max_t(int, n, 0); rcu_read_lock(); while ((parentid = sch->parent)) { if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS)) break; if (sch->flags & TCQ_F_NOPARENT) break; /* Notify parent qdisc only if child qdisc becomes empty. * * If child was empty even before update then backlog * counter is screwed and we skip notification because * parent class is already passive. * * If the original child was offloaded then it is allowed * to be seem as empty, so the parent is notified anyway. 
*/ notify = !sch->q.qlen && !WARN_ON_ONCE(!n && !qdisc_is_offloaded); /* TODO: perform the search on a per txq basis */ sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid)); if (sch == NULL) { WARN_ON_ONCE(parentid != TC_H_ROOT); break; } cops = sch->ops->cl_ops; if (notify && cops->qlen_notify) { cl = cops->find(sch, parentid); cops->qlen_notify(sch, cl); } sch->q.qlen -= n; sch->qstats.backlog -= len; __qdisc_qstats_drop(sch, drops); } rcu_read_unlock(); } EXPORT_SYMBOL(qdisc_tree_reduce_backlog); int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type, void *type_data) { struct net_device *dev = qdisc_dev(sch); int err; sch->flags &= ~TCQ_F_OFFLOADED; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return 0; err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); if (err == -EOPNOTSUPP) return 0; if (!err) sch->flags |= TCQ_F_OFFLOADED; return err; } EXPORT_SYMBOL(qdisc_offload_dump_helper); void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, struct Qdisc *new, struct Qdisc *old, enum tc_setup_type type, void *type_data, struct netlink_ext_ack *extack) { bool any_qdisc_is_offloaded; int err; if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) return; err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data); /* Don't report error if the graft is part of destroy operation. */ if (!err || !new || new == &noop_qdisc) return; /* Don't report error if the parent, the old child and the new * one are not offloaded. */ any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED; any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED; any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED; if (any_qdisc_is_offloaded) NL_SET_ERR_MSG(extack, "Offloading graft operation failed."); } EXPORT_SYMBOL(qdisc_offload_graft_helper); void qdisc_offload_query_caps(struct net_device *dev, enum tc_setup_type type, void *caps, size_t caps_len) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_query_caps_base base = { .type = type, .caps = caps, }; memset(caps, 0, caps_len); if (ops->ndo_setup_tc) ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base); } EXPORT_SYMBOL(qdisc_offload_query_caps); static void qdisc_offload_graft_root(struct net_device *dev, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) { struct tc_root_qopt_offload graft_offload = { .command = TC_ROOT_GRAFT, .handle = new ? 
new->handle : 0, .ingress = (new && new->flags & TCQ_F_INGRESS) || (old && old->flags & TCQ_F_INGRESS), }; qdisc_offload_graft_helper(dev, NULL, new, old, TC_SETUP_ROOT_QDISC, &graft_offload, extack); } static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, u32 portid, u32 seq, u16 flags, int event, struct netlink_ext_ack *extack) { struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; struct gnet_stats_queue __percpu *cpu_qstats = NULL; struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; struct qdisc_size_table *stab; u32 block_index; __u32 qlen; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = clid; tcm->tcm_handle = q->handle; tcm->tcm_info = refcount_read(&q->refcnt); if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; if (q->ops->ingress_block_get) { block_index = q->ops->ingress_block_get(q); if (block_index && nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index)) goto nla_put_failure; } if (q->ops->egress_block_get) { block_index = q->ops->egress_block_get(q); if (block_index && nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index)) goto nla_put_failure; } if (q->ops->dump && q->ops->dump(q, skb) < 0) goto nla_put_failure; if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED))) goto nla_put_failure; qlen = qdisc_qlen_sum(q); stab = rtnl_dereference(q->stab); if (stab && qdisc_dump_stab(skb, stab) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) goto nla_put_failure; if (qdisc_is_percpu_stats(q)) { cpu_bstats = q->cpu_bstats; cpu_qstats = q->cpu_qstats; } if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; } static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) { if (q->flags & TCQ_F_BUILTIN) return true; if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) return true; return false; } static int qdisc_get_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, struct Qdisc *q, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (!tc_qdisc_dump_ignore(q, false)) { if (tc_fill_qdisc(skb, q, clid, portid, n->nlmsg_seq, 0, RTM_NEWQDISC, extack) < 0) goto err_out; } if (skb->len) return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } static int qdisc_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC, extack) < 0) goto err_out; } if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0) goto err_out; } if (skb->len) return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); err_out: kfree_skb(skb); return -EINVAL; } static void notify_and_destroy(struct net *net, struct sk_buff *skb, struct nlmsghdr *n, u32 clid, struct Qdisc *old, struct Qdisc *new, struct netlink_ext_ack *extack) { if (new || old) qdisc_notify(net, skb, n, clid, old, new, extack); if (old) qdisc_put(old); } static void qdisc_clear_nolock(struct Qdisc *sch) { sch->flags &= ~TCQ_F_NOLOCK; if (!(sch->flags & TCQ_F_CPUSTATS)) return; free_percpu(sch->cpu_bstats); free_percpu(sch->cpu_qstats); sch->cpu_bstats = NULL; sch->cpu_qstats = NULL; sch->flags &= ~TCQ_F_CPUSTATS; } /* Graft qdisc "new" to class "classid" of qdisc "parent" or * to device "dev". * * When appropriate send a netlink notification using 'skb' * and "n". * * On success, destroy old qdisc. */ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, struct sk_buff *skb, struct nlmsghdr *n, u32 classid, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) { struct Qdisc *q = old; struct net *net = dev_net(dev); if (parent == NULL) { unsigned int i, num_q, ingress; struct netdev_queue *dev_queue; ingress = 0; num_q = dev->num_tx_queues; if ((q && q->flags & TCQ_F_INGRESS) || (new && new->flags & TCQ_F_INGRESS)) { ingress = 1; dev_queue = dev_ingress_queue(dev); if (!dev_queue) { NL_SET_ERR_MSG(extack, "Device does not have an ingress queue"); return -ENOENT; } q = rtnl_dereference(dev_queue->qdisc_sleeping); /* This is the counterpart of that qdisc_refcount_inc_nz() call in * __tcf_qdisc_find() for filter requests. */ if (!qdisc_refcount_dec_if_one(q)) { NL_SET_ERR_MSG(extack, "Current ingress or clsact Qdisc has ongoing filter requests"); return -EBUSY; } } if (dev->flags & IFF_UP) dev_deactivate(dev); qdisc_offload_graft_root(dev, new, old, extack); if (new && new->ops->attach && !ingress) goto skip; if (!ingress) { for (i = 0; i < num_q; i++) { dev_queue = netdev_get_tx_queue(dev, i); old = dev_graft_qdisc(dev_queue, new); if (new && i > 0) qdisc_refcount_inc(new); qdisc_put(old); } } else { old = dev_graft_qdisc(dev_queue, NULL); /* {ingress,clsact}_destroy() @old before grafting @new to avoid * unprotected concurrent accesses to net_device::miniq_{in,e}gress * pointer(s) in mini_qdisc_pair_swap(). */ qdisc_notify(net, skb, n, classid, old, new, extack); qdisc_destroy(old); dev_graft_qdisc(dev_queue, new); } skip: if (!ingress) { old = rtnl_dereference(dev->qdisc); if (new && !new->ops->attach) qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? 
: &noop_qdisc); notify_and_destroy(net, skb, n, classid, old, new, extack); if (new && new->ops->attach) new->ops->attach(new); } if (dev->flags & IFF_UP) dev_activate(dev); } else { const struct Qdisc_class_ops *cops = parent->ops->cl_ops; unsigned long cl; int err; /* Only support running class lockless if parent is lockless */ if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK)) qdisc_clear_nolock(new); if (!cops || !cops->graft) return -EOPNOTSUPP; cl = cops->find(parent, classid); if (!cl) { NL_SET_ERR_MSG(extack, "Specified class not found"); return -ENOENT; } if (new && new->ops == &noqueue_qdisc_ops) { NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class"); return -EINVAL; } err = cops->graft(parent, cl, new, &old, extack); if (err) return err; notify_and_destroy(net, skb, n, classid, old, new, extack); } return 0; } static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, struct netlink_ext_ack *extack) { u32 block_index; if (tca[TCA_INGRESS_BLOCK]) { block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]); if (!block_index) { NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0"); return -EINVAL; } if (!sch->ops->ingress_block_set) { NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported"); return -EOPNOTSUPP; } sch->ops->ingress_block_set(sch, block_index); } if (tca[TCA_EGRESS_BLOCK]) { block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]); if (!block_index) { NL_SET_ERR_MSG(extack, "Egress block index cannot be 0"); return -EINVAL; } if (!sch->ops->egress_block_set) { NL_SET_ERR_MSG(extack, "Egress block sharing is not supported"); return -EOPNOTSUPP; } sch->ops->egress_block_set(sch, block_index); } return 0; } /* Allocate and initialize new qdisc. Parameters are passed via opt. */ static struct Qdisc *qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue, u32 parent, u32 handle, struct nlattr **tca, int *errp, struct netlink_ext_ack *extack) { int err; struct nlattr *kind = tca[TCA_KIND]; struct Qdisc *sch; struct Qdisc_ops *ops; struct qdisc_size_table *stab; ops = qdisc_lookup_ops(kind); #ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to * tell the caller to replay the request. We * indicate this using -EAGAIN. * We replay the request because the device may * go away in the mean time. */ rtnl_unlock(); request_module(NET_SCH_ALIAS_PREFIX "%s", name); rtnl_lock(); ops = qdisc_lookup_ops(kind); if (ops != NULL) { /* We will try again qdisc_lookup_ops, * so don't keep a reference. 
*/ module_put(ops->owner); err = -EAGAIN; goto err_out; } } } #endif err = -ENOENT; if (!ops) { NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown"); goto err_out; } sch = qdisc_alloc(dev_queue, ops, extack); if (IS_ERR(sch)) { err = PTR_ERR(sch); goto err_out2; } sch->parent = parent; if (handle == TC_H_INGRESS) { if (!(sch->flags & TCQ_F_INGRESS)) { NL_SET_ERR_MSG(extack, "Specified parent ID is reserved for ingress and clsact Qdiscs"); err = -EINVAL; goto err_out3; } handle = TC_H_MAKE(TC_H_INGRESS, 0); } else { if (handle == 0) { handle = qdisc_alloc_handle(dev); if (handle == 0) { NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded"); err = -ENOSPC; goto err_out3; } } if (!netif_is_multiqueue(dev)) sch->flags |= TCQ_F_ONETXQUEUE; } sch->handle = handle; /* This exist to keep backward compatible with a userspace * loophole, what allowed userspace to get IFF_NO_QUEUE * facility on older kernels by setting tx_queue_len=0 (prior * to qdisc init), and then forgot to reinit tx_queue_len * before again attaching a qdisc. */ if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) { dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; netdev_info(dev, "Caught tx_queue_len zero misconfig\n"); } err = qdisc_block_indexes_set(sch, tca, extack); if (err) goto err_out3; if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) { err = PTR_ERR(stab); goto err_out3; } rcu_assign_pointer(sch->stab, stab); } if (ops->init) { err = ops->init(sch, tca[TCA_OPTIONS], extack); if (err != 0) goto err_out4; } if (tca[TCA_RATE]) { err = -EOPNOTSUPP; if (sch->flags & TCQ_F_MQROOT) { NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); goto err_out4; } err = gen_new_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, true, tca[TCA_RATE]); if (err) { NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); goto err_out4; } } qdisc_hash_add(sch, false); trace_qdisc_create(ops, dev, parent); return sch; err_out4: /* Even if ops->init() failed, we call ops->destroy() * like qdisc_create_dflt(). */ if (ops->destroy) ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: module_put(ops->owner); err_out: *errp = err; return NULL; } static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, struct netlink_ext_ack *extack) { struct qdisc_size_table *ostab, *stab = NULL; int err = 0; if (tca[TCA_OPTIONS]) { if (!sch->ops->change) { NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc"); return -EINVAL; } if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { NL_SET_ERR_MSG(extack, "Change of blocks is not supported"); return -EOPNOTSUPP; } err = sch->ops->change(sch, tca[TCA_OPTIONS], extack); if (err) return err; } if (tca[TCA_STAB]) { stab = qdisc_get_stab(tca[TCA_STAB], extack); if (IS_ERR(stab)) return PTR_ERR(stab); } ostab = rtnl_dereference(sch->stab); rcu_assign_pointer(sch->stab, stab); qdisc_put_stab(ostab); if (tca[TCA_RATE]) { /* NB: ignores errors from replace_estimator because change can't be undone. 
*/ if (sch->flags & TCQ_F_MQROOT) goto out; gen_replace_estimator(&sch->bstats, sch->cpu_bstats, &sch->rate_est, NULL, true, tca[TCA_RATE]); } out: return 0; } struct check_loop_arg { struct qdisc_walker w; struct Qdisc *p; int depth; }; static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) { struct check_loop_arg arg; if (q->ops->cl_ops == NULL) return 0; arg.w.stop = arg.w.skip = arg.w.count = 0; arg.w.fn = check_loop_fn; arg.depth = depth; arg.p = p; q->ops->cl_ops->walk(q, &arg.w); return arg.w.stop ? -ELOOP : 0; } static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct Qdisc *leaf; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct check_loop_arg *arg = (struct check_loop_arg *)w; leaf = cops->leaf(q, cl); if (leaf) { if (leaf == arg->p || arg->depth > 7) return -ELOOP; return check_loop(leaf, arg->p, arg->depth + 1); } return 0; } const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_KIND] = { .type = NLA_STRING }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG }, [TCA_CHAIN] = { .type = NLA_U32 }, [TCA_INGRESS_BLOCK] = { .type = NLA_U32 }, [TCA_EGRESS_BLOCK] = { .type = NLA_U32 }, }; /* * Delete/get qdisc. */ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q = NULL; struct Qdisc *p = NULL; int err; err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; clid = tcm->tcm_parent; if (clid) { if (clid != TC_H_ROOT) { if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { p = qdisc_lookup(dev, TC_H_MAJ(clid)); if (!p) { NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid"); return -ENOENT; } q = qdisc_leaf(p, clid); } else if (dev_ingress_queue(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } } else { q = rtnl_dereference(dev->qdisc); } if (!q) { NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device"); return -ENOENT; } if (tcm->tcm_handle && q->handle != tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Invalid handle"); return -EINVAL; } } else { q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) { NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle"); return -ENOENT; } } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } if (n->nlmsg_type == RTM_DELQDISC) { if (!clid) { NL_SET_ERR_MSG(extack, "Classid cannot be zero"); return -EINVAL; } if (q->handle == 0) { NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero"); return -ENOENT; } err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack); if (err != 0) return err; } else { qdisc_get_notify(net, skb, n, clid, q, NULL); } return 0; } static bool req_create_or_replace(struct nlmsghdr *n) { return (n->nlmsg_flags & NLM_F_CREATE && n->nlmsg_flags & NLM_F_REPLACE); } static bool req_create_exclusive(struct nlmsghdr *n) { return (n->nlmsg_flags & NLM_F_CREATE && n->nlmsg_flags & NLM_F_EXCL); } static bool req_change(struct nlmsghdr *n) { return (!(n->nlmsg_flags & NLM_F_CREATE) && !(n->nlmsg_flags & NLM_F_REPLACE) && 
!(n->nlmsg_flags & NLM_F_EXCL)); } /* * Create/change qdisc. */ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm; struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; u32 clid; struct Qdisc *q, *p; int err; replay: /* Reinit, just in case something touches this. */ err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; tcm = nlmsg_data(n); clid = tcm->tcm_parent; q = p = NULL; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; if (clid) { if (clid != TC_H_ROOT) { if (clid != TC_H_INGRESS) { p = qdisc_lookup(dev, TC_H_MAJ(clid)); if (!p) { NL_SET_ERR_MSG(extack, "Failed to find specified qdisc"); return -ENOENT; } q = qdisc_leaf(p, clid); } else if (dev_ingress_queue_create(dev)) { q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping); } } else { q = rtnl_dereference(dev->qdisc); } /* It may be default qdisc, ignore it */ if (q && q->handle == 0) q = NULL; if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { if (tcm->tcm_handle) { if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) { NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override"); return -EEXIST; } if (TC_H_MIN(tcm->tcm_handle)) { NL_SET_ERR_MSG(extack, "Invalid minor handle"); return -EINVAL; } q = qdisc_lookup(dev, tcm->tcm_handle); if (!q) goto create_n_graft; if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } if (q->flags & TCQ_F_INGRESS) { NL_SET_ERR_MSG(extack, "Cannot regraft ingress or clsact Qdiscs"); return -EINVAL; } if (q == p || (p && check_loop(q, p, 0))) { NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected"); return -ELOOP; } if (clid == TC_H_INGRESS) { NL_SET_ERR_MSG(extack, "Ingress cannot graft directly"); return -EINVAL; } qdisc_refcount_inc(q); goto graft; } else { if (!q) goto create_n_graft; /* This magic test requires explanation. * * We know, that some child q is already * attached to this parent and have choice: * 1) change it or 2) create/graft new one. * If the requested qdisc kind is different * than the existing one, then we choose graft. * If they are the same then this is "change" * operation - just let it fallthrough.. * * 1. We are allowed to create/graft only * if the request is explicitly stating * "please create if it doesn't exist". * * 2. If the request is to exclusive create * then the qdisc tcm_handle is not expected * to exist, so that we choose create/graft too. * * 3. The last case is when no flags are set. * This will happen when for example tc * utility issues a "change" command. * Alas, it is sort of hole in API, we * cannot decide what to do unambiguously. * For now we select create/graft. 
*/ if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { if (req_create_or_replace(n) || req_create_exclusive(n)) goto create_n_graft; else if (req_change(n)) goto create_n_graft2; } } } } else { if (!tcm->tcm_handle) { NL_SET_ERR_MSG(extack, "Handle cannot be zero"); return -EINVAL; } q = qdisc_lookup(dev, tcm->tcm_handle); } /* Change qdisc parameters */ if (!q) { NL_SET_ERR_MSG(extack, "Specified qdisc not found"); return -ENOENT; } if (n->nlmsg_flags & NLM_F_EXCL) { NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify"); return -EEXIST; } if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) { NL_SET_ERR_MSG(extack, "Invalid qdisc name"); return -EINVAL; } err = qdisc_change(q, tca, extack); if (err == 0) qdisc_notify(net, skb, n, clid, NULL, q, extack); return err; create_n_graft: if (!(n->nlmsg_flags & NLM_F_CREATE)) { NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag"); return -ENOENT; } create_n_graft2: if (clid == TC_H_INGRESS) { if (dev_ingress_queue(dev)) { q = qdisc_create(dev, dev_ingress_queue(dev), tcm->tcm_parent, tcm->tcm_parent, tca, &err, extack); } else { NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device"); err = -ENOENT; } } else { struct netdev_queue *dev_queue; if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) dev_queue = p->ops->cl_ops->select_queue(p, tcm); else if (p) dev_queue = p->dev_queue; else dev_queue = netdev_get_tx_queue(dev, 0); q = qdisc_create(dev, dev_queue, tcm->tcm_parent, tcm->tcm_handle, tca, &err, extack); } if (q == NULL) { if (err == -EAGAIN) goto replay; return err; } graft: err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack); if (err) { if (q) qdisc_put(q); return err; } return 0; } static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, int *q_idx_p, int s_q_idx, bool recur, bool dump_invisible) { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; int b; if (!root) return 0; q = root; if (q_idx < s_q_idx) { q_idx++; } else { if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } /* If dumping singletons, there is no qdisc_dev(root) and the singleton * itself has already been dumped. 
* * If we've already dumped the top-level (ingress) qdisc above and the global * qdisc hashtable, we don't want to hit it again */ if (!qdisc_dev(root) || !recur) goto out; hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (q_idx < s_q_idx) { q_idx++; continue; } if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC, NULL) <= 0) goto done; q_idx++; } out: *q_idx_p = q_idx; return ret; done: ret = -1; goto out; } static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; const struct nlmsghdr *nlh = cb->nlh; struct nlattr *tca[TCA_MAX + 1]; int err; s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; idx = 0; ASSERT_RTNL(); err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX, rtm_tca_policy, cb->extack); if (err < 0) return err; for_each_netdev(net, dev) { struct netdev_queue *dev_queue; if (idx < s_idx) goto cont; if (idx > s_idx) s_q_idx = 0; q_idx = 0; if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc), skb, cb, &q_idx, s_q_idx, true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping), skb, cb, &q_idx, s_q_idx, false, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; cont: idx++; } done: cb->args[0] = idx; cb->args[1] = q_idx; return skb->len; } /************************************************ * Traffic classes manipulation. * ************************************************/ static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, unsigned long cl, u32 portid, u32 seq, u16 flags, int event, struct netlink_ext_ack *extack) { struct tcmsg *tcm; struct nlmsghdr *nlh; unsigned char *b = skb_tail_pointer(skb); struct gnet_dump d; const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; cond_resched(); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags); if (!nlh) goto out_nlmsg_trim; tcm = nlmsg_data(nlh); tcm->tcm_family = AF_UNSPEC; tcm->tcm__pad1 = 0; tcm->tcm__pad2 = 0; tcm->tcm_ifindex = qdisc_dev(q)->ifindex; tcm->tcm_parent = q->handle; tcm->tcm_handle = q->handle; tcm->tcm_info = 0; if (nla_put_string(skb, TCA_KIND, q->ops->id)) goto nla_put_failure; if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) goto nla_put_failure; if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS, NULL, &d, TCA_PAD) < 0) goto nla_put_failure; if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) goto nla_put_failure; if (gnet_stats_finish_copy(&d) < 0) goto nla_put_failure; if (extack && extack->_msg && nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg)) goto out_nlmsg_trim; nlh->nlmsg_len = skb_tail_pointer(skb) - b; return skb->len; out_nlmsg_trim: nla_put_failure: nlmsg_trim(skb, b); return -1; } static int tclass_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, unsigned long cl, int event, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? 
NETLINK_CB(oskb).portid : 0; if (!rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) return 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) { kfree_skb(skb); return -EINVAL; } return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tclass_get_notify(struct net *net, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, unsigned long cl, struct netlink_ext_ack *extack) { struct sk_buff *skb; u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_NEWTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } return rtnetlink_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); } static int tclass_del_notify(struct net *net, const struct Qdisc_class_ops *cops, struct sk_buff *oskb, struct nlmsghdr *n, struct Qdisc *q, unsigned long cl, struct netlink_ext_ack *extack) { u32 portid = oskb ? NETLINK_CB(oskb).portid : 0; struct sk_buff *skb; int err = 0; if (!cops->delete) return -EOPNOTSUPP; if (rtnl_notify_needed(net, n->nlmsg_flags, RTNLGRP_TC)) { skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); if (!skb) return -ENOBUFS; if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, RTM_DELTCLASS, extack) < 0) { kfree_skb(skb); return -EINVAL; } } else { skb = NULL; } err = cops->delete(q, cl, extack); if (err) { kfree_skb(skb); return err; } err = rtnetlink_maybe_send(skb, net, portid, RTNLGRP_TC, n->nlmsg_flags & NLM_F_ECHO); return err; } #ifdef CONFIG_NET_CLS struct tcf_bind_args { struct tcf_walker w; unsigned long base; unsigned long cl; u32 classid; }; static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) { struct tcf_bind_args *a = (void *)arg; if (n && tp->ops->bind_class) { struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); tp->ops->bind_class(n, a->classid, a->cl, q, a->base); sch_tree_unlock(q); } return 0; } struct tc_bind_class_args { struct qdisc_walker w; unsigned long new_cl; u32 portid; u32 clid; }; static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) { struct tc_bind_class_args *a = (struct tc_bind_class_args *)w; const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tcf_block *block; struct tcf_chain *chain; block = cops->tcf_block(q, cl, NULL); if (!block) return 0; for (chain = tcf_get_next_chain(block, NULL); chain; chain = tcf_get_next_chain(block, chain)) { struct tcf_proto *tp; for (tp = tcf_get_next_proto(chain, NULL); tp; tp = tcf_get_next_proto(chain, tp)) { struct tcf_bind_args arg = {}; arg.w.fn = tcf_node_bind; arg.classid = a->clid; arg.base = cl; arg.cl = a->new_cl; tp->ops->walk(tp, &arg.w, true); } } return 0; } static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, unsigned long new_cl) { const struct Qdisc_class_ops *cops = q->ops->cl_ops; struct tc_bind_class_args args = {}; if (!cops->tcf_block) return; args.portid = portid; args.clid = clid; args.new_cl = new_cl; args.w.fn = tc_bind_class_walker; q->ops->cl_ops->walk(q, &args.w); } #else static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid, unsigned long new_cl) { } #endif static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); struct nlattr *tca[TCA_MAX + 1]; struct net_device *dev; struct Qdisc *q = NULL; const 
struct Qdisc_class_ops *cops; unsigned long cl = 0; unsigned long new_cl; u32 portid; u32 clid; u32 qid; int err; err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy, extack); if (err < 0) return err; dev = __dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return -ENODEV; /* parent == TC_H_UNSPEC - unspecified parent. parent == TC_H_ROOT - class is root, which has no parent. parent == X:0 - parent is root class. parent == X:Y - parent is a node in hierarchy. parent == 0:Y - parent is X:Y, where X:0 is qdisc. handle == 0:0 - generate handle from kernel pool. handle == 0:Y - class is X:Y, where X:0 is qdisc. handle == X:Y - clear. handle == X:0 - root class. */ /* Step 1. Determine qdisc handle X:0 */ portid = tcm->tcm_parent; clid = tcm->tcm_handle; qid = TC_H_MAJ(clid); if (portid != TC_H_ROOT) { u32 qid1 = TC_H_MAJ(portid); if (qid && qid1) { /* If both majors are known, they must be identical. */ if (qid != qid1) return -EINVAL; } else if (qid1) { qid = qid1; } else if (qid == 0) qid = rtnl_dereference(dev->qdisc)->handle; /* Now qid is a genuine qdisc handle consistent * with both parent and child. * * TC_H_MAJ(portid) still may be unspecified, complete it now. */ if (portid) portid = TC_H_MAKE(qid, portid); } else { if (qid == 0) qid = rtnl_dereference(dev->qdisc)->handle; } /* OK. Locate qdisc */ q = qdisc_lookup(dev, qid); if (!q) return -ENOENT; /* And check that it supports classes */ cops = q->ops->cl_ops; if (cops == NULL) return -EINVAL; /* Now try to get class */ if (clid == 0) { if (portid == TC_H_ROOT) clid = qid; } else clid = TC_H_MAKE(qid, clid); if (clid) cl = cops->find(q, clid); if (cl == 0) { err = -ENOENT; if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags & NLM_F_CREATE)) goto out; } else { switch (n->nlmsg_type) { case RTM_NEWTCLASS: err = -EEXIST; if (n->nlmsg_flags & NLM_F_EXCL) goto out; break; case RTM_DELTCLASS: err = tclass_del_notify(net, cops, skb, n, q, cl, extack); /* Unbind the class's filters by rebinding them to 0 */ tc_bind_tclass(q, portid, clid, 0); goto out; case RTM_GETTCLASS: err = tclass_get_notify(net, skb, n, q, cl, extack); goto out; default: err = -EINVAL; goto out; } } if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) { NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes"); return -EOPNOTSUPP; } new_cl = cl; err = -EOPNOTSUPP; if (cops->change) err = cops->change(q, clid, portid, tca, &new_cl, extack); if (err == 0) { tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack); /* We just created a new class, so we need to do the reverse binding.
*/ if (cl != new_cl) tc_bind_tclass(q, portid, clid, new_cl); } out: return err; } struct qdisc_dump_args { struct qdisc_walker w; struct sk_buff *skb; struct netlink_callback *cb; }; static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) { struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid, a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS, NULL); } static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, struct tcmsg *tcm, struct netlink_callback *cb, int *t_p, int s_t) { struct qdisc_dump_args arg; if (tc_qdisc_dump_ignore(q, false) || *t_p < s_t || !q->ops->cl_ops || (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)) { (*t_p)++; return 0; } if (*t_p > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); arg.w.fn = qdisc_class_dump; arg.skb = skb; arg.cb = cb; arg.w.stop = 0; arg.w.skip = cb->args[1]; arg.w.count = 0; q->ops->cl_ops->walk(q, &arg.w); cb->args[1] = arg.w.count; if (arg.w.stop) return -1; (*t_p)++; return 0; } static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb, struct tcmsg *tcm, struct netlink_callback *cb, int *t_p, int s_t, bool recur) { struct Qdisc *q; int b; if (!root) return 0; if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0) return -1; if (!qdisc_dev(root) || !recur) return 0; if (tcm->tcm_parent) { q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent)); if (q && q != root && tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; return 0; } hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) { if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0) return -1; } return 0; } static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) { struct tcmsg *tcm = nlmsg_data(cb->nlh); struct net *net = sock_net(skb->sk); struct netdev_queue *dev_queue; struct net_device *dev; int t, s_t; if (nlmsg_len(cb->nlh) < sizeof(*tcm)) return 0; dev = dev_get_by_index(net, tcm->tcm_ifindex); if (!dev) return 0; s_t = cb->args[0]; t = 0; if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc), skb, tcm, cb, &t, s_t, true) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping), skb, tcm, cb, &t, s_t, false) < 0) goto done; done: cb->args[0] = t; dev_put(dev); return skb->len; } #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } static int __net_init psched_net_init(struct net *net) { struct proc_dir_entry *e; e = proc_create_single("psched", 0, net->proc_net, psched_show); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit psched_net_exit(struct net *net) { remove_proc_entry("psched", net->proc_net); } #else static int __net_init psched_net_init(struct net *net) { return 0; } static void __net_exit psched_net_exit(struct net *net) { } #endif static struct pernet_operations psched_net_ops = { .init = psched_net_init, .exit = psched_net_exit, }; #if IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); #endif static int __init pktsched_init(void) { int err; err = register_pernet_subsys(&psched_net_ops); if (err) { pr_err("pktsched_init: " "cannot initialize per netns operations\n"); return err; } register_qdisc(&pfifo_fast_ops); register_qdisc(&pfifo_qdisc_ops); 
register_qdisc(&bfifo_qdisc_ops); register_qdisc(&pfifo_head_drop_qdisc_ops); register_qdisc(&mq_qdisc_ops); register_qdisc(&noqueue_qdisc_ops); rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, 0); rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0); rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0); rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, 0); tc_wrapper_init(); return 0; } subsys_initcall(pktsched_init);
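/*
 * A minimal, hypothetical userspace sketch (not part of the kernel sources
 * above): the rtnetlink request that the tc_modify_qdisc() handler registered
 * in pktsched_init() services.  It grafts a "pfifo" qdisc as the root qdisc of
 * interface index 1, roughly what "tc qdisc replace dev lo root pfifo" sends.
 * The ifindex is an assumption, error handling is omitted, and CAP_NET_ADMIN
 * is required.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_sched.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct tcmsg tcm;
		char attrs[64];
	} req;
	struct rtattr *kind;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct tcmsg));
	req.nlh.nlmsg_type = RTM_NEWQDISC;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE | NLM_F_ACK;
	req.tcm.tcm_family = AF_UNSPEC;
	req.tcm.tcm_ifindex = 1;		/* assumed: "lo" */
	req.tcm.tcm_parent = TC_H_ROOT;		/* graft at the root, see qdisc_graft() */
	req.tcm.tcm_handle = 0;			/* 0 lets qdisc_alloc_handle() pick 8000: and up */

	/* TCA_KIND selects the Qdisc_ops registered under that name */
	kind = (struct rtattr *)((char *)&req + NLMSG_ALIGN(req.nlh.nlmsg_len));
	kind->rta_type = TCA_KIND;
	kind->rta_len = RTA_LENGTH(strlen("pfifo") + 1);
	strcpy(RTA_DATA(kind), "pfifo");
	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + RTA_ALIGN(kind->rta_len);

	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	close(fd);
	return 0;
}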
// SPDX-License-Identifier: GPL-2.0-or-later /* * udp_diag.c Module for monitoring UDP transport protocols sockets. * * Authors: Pavel Emelyanov, <xemul@parallels.com> */ #include <linux/module.h> #include <linux/inet_diag.h> #include <linux/udp.h> #include <net/udp.h> #include <net/udplite.h> #include <linux/sock_diag.h> static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, struct nlattr *bc, bool net_admin) { if (!inet_diag_bc_sk(bc, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, net_admin); } static int udp_dump_one(struct udp_table *tbl, struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { struct sk_buff *in_skb = cb->skb; int err; struct sock *sk = NULL; struct sk_buff *rep; struct net *net = sock_net(in_skb->sk); rcu_read_lock(); if (req->sdiag_family == AF_INET) /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, req->id.idiag_if, 0, tbl, NULL); #endif if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); err = -ENOENT; if (!sk) goto out_nosk; err = sock_diag_check_cookie(sk, req->id.idiag_cookie); if (err) goto out; err = -ENOMEM; rep = nlmsg_new(nla_total_size(sizeof(struct inet_diag_msg)) + inet_diag_msg_attrs_size() + nla_total_size(sizeof(struct inet_diag_meminfo)) + 64, GFP_KERNEL); if (!rep) goto out; err = inet_sk_diag_fill(sk, NULL, rep, cb, req, 0, netlink_net_capable(in_skb, CAP_NET_ADMIN)); if (err < 0) { WARN_ON(err == -EMSGSIZE); kfree_skb(rep); goto out; } err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); out: if (sk) sock_put(sk); out_nosk: return err; } static void udp_dump(struct udp_table *table, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct nlattr *bc; cb_data = cb->data; bc =
cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; for (slot = s_slot; slot <= table->mask; s_num = 0, slot++) { struct udp_hslot *hslot = &table->hash[slot]; struct sock *sk; num = 0; if (hlist_empty(&hslot->head)) continue; spin_lock_bh(&hslot->lock); sk_for_each(sk, &hslot->head) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) continue; if (num < s_num) goto next; if (!(r->idiag_states & (1 << sk->sk_state))) goto next; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) goto next; if (r->id.idiag_sport != inet->inet_sport && r->id.idiag_sport) goto next; if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } next: num++; } spin_unlock_bh(&hslot->lock); } done: cb->args[0] = slot; cb->args[1] = num; } static void udp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(sock_net(cb->skb->sk)->ipv4.udp_table, skb, cb, r); } static int udp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return udp_dump_one(sock_net(cb->skb->sk)->ipv4.udp_table, cb, req); } static void udp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, void *info) { r->idiag_rqueue = udp_rqueue_get(sk); r->idiag_wqueue = sk_wmem_alloc_get(sk); } #ifdef CONFIG_INET_DIAG_DESTROY static int __udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req, struct udp_table *tbl) { struct net *net = sock_net(in_skb->sk); struct sock *sk; int err; rcu_read_lock(); if (req->sdiag_family == AF_INET) sk = __udp4_lib_lookup(net, req->id.idiag_dst[0], req->id.idiag_dport, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); #if IS_ENABLED(CONFIG_IPV6) else if (req->sdiag_family == AF_INET6) { if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) sk = __udp4_lib_lookup(net, req->id.idiag_dst[3], req->id.idiag_dport, req->id.idiag_src[3], req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); else sk = __udp6_lib_lookup(net, (struct in6_addr *)req->id.idiag_dst, req->id.idiag_dport, (struct in6_addr *)req->id.idiag_src, req->id.idiag_sport, req->id.idiag_if, 0, tbl, NULL); } #endif else { rcu_read_unlock(); return -EINVAL; } if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; rcu_read_unlock(); if (!sk) return -ENOENT; if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { sock_put(sk); return -ENOENT; } err = sock_diag_destroy(sk, ECONNABORTED); sock_put(sk); return err; } static int udp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, sock_net(in_skb->sk)->ipv4.udp_table); } static int udplite_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { return __udp_diag_destroy(in_skb, req, &udplite_table); } #endif static const struct inet_diag_handler udp_diag_handler = { .owner = THIS_MODULE, .dump = udp_diag_dump, .dump_one = udp_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDP, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udp_diag_destroy, #endif }; static void udplite_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { udp_dump(&udplite_table, skb, cb, r); } static int udplite_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { return 
udp_dump_one(&udplite_table, cb, req); } static const struct inet_diag_handler udplite_diag_handler = { .owner = THIS_MODULE, .dump = udplite_diag_dump, .dump_one = udplite_diag_dump_one, .idiag_get_info = udp_diag_get_info, .idiag_type = IPPROTO_UDPLITE, .idiag_info_size = 0, #ifdef CONFIG_INET_DIAG_DESTROY .destroy = udplite_diag_destroy, #endif }; static int __init udp_diag_init(void) { int err; err = inet_diag_register(&udp_diag_handler); if (err) goto out; err = inet_diag_register(&udplite_diag_handler); if (err) goto out_lite; out: return err; out_lite: inet_diag_unregister(&udp_diag_handler); goto out; } static void __exit udp_diag_exit(void) { inet_diag_unregister(&udplite_diag_handler); inet_diag_unregister(&udp_diag_handler); } module_init(udp_diag_init); module_exit(udp_diag_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("UDP socket monitoring via SOCK_DIAG"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-17 /* AF_INET - IPPROTO_UDP */); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-136 /* AF_INET - IPPROTO_UDPLITE */);
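/*
 * A minimal, hypothetical userspace sketch (not part of the kernel sources
 * above): the NETLINK_SOCK_DIAG dump request that udp_diag_dump() answers,
 * roughly what "ss -u" issues for IPv4 UDP sockets.  Error handling is
 * omitted for brevity.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netlink.h>
#include <linux/sock_diag.h>
#include <linux/inet_diag.h>

int main(void)
{
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG);
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		struct inet_diag_req_v2 req;
	} msg = {
		.nlh = {
			.nlmsg_len = sizeof(msg),
			.nlmsg_type = SOCK_DIAG_BY_FAMILY,
			.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
		},
		.req = {
			.sdiag_family = AF_INET,
			.sdiag_protocol = IPPROTO_UDP,
			.idiag_states = 0xffffffff,	/* all socket states */
		},
	};
	char buf[8192];
	int len;

	sendto(fd, &msg, sizeof(msg), 0,
	       (struct sockaddr *)&kernel, sizeof(kernel));
	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *)buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			if (h->nlmsg_type == NLMSG_DONE || h->nlmsg_type == NLMSG_ERROR) {
				close(fd);
				return 0;
			}
			/* each record is a struct inet_diag_msg built by inet_sk_diag_fill() */
			printf("UDP socket record, %u bytes\n", h->nlmsg_len);
		}
	}
	close(fd);
	return 0;
}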
// SPDX-License-Identifier: GPL-2.0
/*
 * BlueZ - Bluetooth protocol stack for Linux
 *
 * Copyright (C) 2022 Intel Corporation
 * Copyright 2023-2024 NXP
 */

#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/signal.h>

#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
#include <net/bluetooth/iso.h>
#include "eir.h"

static const struct proto_ops iso_sock_ops;

static struct bt_sock_list iso_sk_list = {
	.lock = __RW_LOCK_UNLOCKED(iso_sk_list.lock)
};

/* ---- ISO connections ---- */
struct iso_conn {
	struct hci_conn	*hcon;

	/* @lock: spinlock protecting changes to iso_conn fields */
	spinlock_t	lock;
	struct sock	*sk;

	struct delayed_work	timeout_work;

	struct sk_buff	*rx_skb;
	__u32		rx_len;
	__u16		tx_sn;
};

#define iso_conn_lock(c)	spin_lock(&(c)->lock)
#define iso_conn_unlock(c)	spin_unlock(&(c)->lock)

static void iso_sock_close(struct sock *sk);
static void iso_sock_kill(struct sock *sk);

/* ----- ISO socket info ----- */
#define iso_pi(sk) ((struct iso_pinfo *)sk)

#define EIR_SERVICE_DATA_LENGTH	4
#define BASE_MAX_LENGTH		(HCI_MAX_PER_AD_LENGTH - EIR_SERVICE_DATA_LENGTH)
#define EIR_BAA_SERVICE_UUID	0x1851

/* iso_pinfo flags values */
enum {
	BT_SK_BIG_SYNC,
	BT_SK_PA_SYNC,
	BT_SK_PA_SYNC_TERM,
};

struct iso_pinfo {
	struct bt_sock		bt;
	bdaddr_t		src;
	__u8			src_type;
	bdaddr_t		dst;
	__u8			dst_type;
	__u8			bc_sid;
	__u8			bc_num_bis;
	__u8			bc_bis[ISO_MAX_NUM_BIS];
	__u16			sync_handle;
	unsigned long		flags;
	struct bt_iso_qos	qos;
	bool			qos_user_set;
	__u8			base_len;
	__u8			base[BASE_MAX_LENGTH];
	struct iso_conn		*conn;
};

static struct bt_iso_qos default_qos;

static bool check_ucast_qos(struct bt_iso_qos *qos);
static bool check_bcast_qos(struct bt_iso_qos *qos);
static bool iso_match_sid(struct sock *sk, void *data);
static bool iso_match_sync_handle(struct sock *sk, void *data);
static void iso_sock_disconn(struct sock *sk);

typedef bool (*iso_sock_match_t)(struct sock *sk, void *data);

static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst,
					iso_sock_match_t match, void *data);

/* ---- ISO timers ---- */
#define ISO_CONN_TIMEOUT	(HZ * 40)
#define ISO_DISCONN_TIMEOUT	(HZ * 2)

static void iso_sock_timeout(struct work_struct *work)
{
	struct iso_conn *conn = container_of(work, struct iso_conn,
					     timeout_work.work);
	struct sock *sk;

	iso_conn_lock(conn);
	sk = conn->sk;
	if (sk)
		sock_hold(sk);
	iso_conn_unlock(conn);

	if (!sk)
		return;

	BT_DBG("sock %p state %d", sk, sk->sk_state);

	lock_sock(sk);
	sk->sk_err = ETIMEDOUT;
	sk->sk_state_change(sk);
	release_sock(sk);
	sock_put(sk);
}

static void iso_sock_set_timer(struct sock *sk, long timeout)
{
	if (!iso_pi(sk)->conn)
		return;

	BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout);
	cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
	schedule_delayed_work(&iso_pi(sk)->conn->timeout_work, timeout);
}

static void iso_sock_clear_timer(struct sock *sk)
{
	if (!iso_pi(sk)->conn)
		return;

	BT_DBG("sock %p state %d", sk, sk->sk_state);
	cancel_delayed_work(&iso_pi(sk)->conn->timeout_work);
}

/* ---- ISO connections ---- */
static struct iso_conn *iso_conn_add(struct hci_conn *hcon)
{
	struct iso_conn *conn = hcon->iso_data;

	if (conn) {
		if (!conn->hcon)
			conn->hcon = hcon;
		return conn;
	}

	conn = kzalloc(sizeof(*conn), GFP_KERNEL);
	if (!conn)
		return NULL;

	spin_lock_init(&conn->lock);
	INIT_DELAYED_WORK(&conn->timeout_work, iso_sock_timeout);

	hcon->iso_data = conn;
	conn->hcon = hcon;
	conn->tx_sn = 0;

	BT_DBG("hcon %p conn %p", hcon, conn);

	return conn;
}

/* Delete channel.
Must be called on the locked socket. */ static void iso_chan_del(struct sock *sk, int err) { struct iso_conn *conn; struct sock *parent; conn = iso_pi(sk)->conn; BT_DBG("sk %p, conn %p, err %d", sk, conn, err); if (conn) { iso_conn_lock(conn); conn->sk = NULL; iso_pi(sk)->conn = NULL; iso_conn_unlock(conn); if (conn->hcon) hci_conn_drop(conn->hcon); } sk->sk_state = BT_CLOSED; sk->sk_err = err; parent = bt_sk(sk)->parent; if (parent) { bt_accept_unlink(sk); parent->sk_data_ready(parent); } else { sk->sk_state_change(sk); } sock_set_flag(sk, SOCK_ZAPPED); } static bool iso_match_conn_sync_handle(struct sock *sk, void *data) { struct hci_conn *hcon = data; if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) return false; return hcon->sync_handle == iso_pi(sk)->sync_handle; } static void iso_conn_del(struct hci_conn *hcon, int err) { struct iso_conn *conn = hcon->iso_data; struct sock *sk; struct sock *parent; if (!conn) return; BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); /* Kill socket */ iso_conn_lock(conn); sk = conn->sk; if (sk) sock_hold(sk); iso_conn_unlock(conn); if (sk) { lock_sock(sk); /* While a PA sync hcon is in the process of closing, * mark parent socket with a flag, so that any residual * BIGInfo adv reports that arrive before PA sync is * terminated are not processed anymore. */ if (test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { parent = iso_get_sock_listen(&hcon->src, &hcon->dst, iso_match_conn_sync_handle, hcon); if (parent) { set_bit(BT_SK_PA_SYNC_TERM, &iso_pi(parent)->flags); sock_put(parent); } } iso_sock_clear_timer(sk); iso_chan_del(sk, err); release_sock(sk); sock_put(sk); } /* Ensure no more work items will run before freeing conn. */ cancel_delayed_work_sync(&conn->timeout_work); hcon->iso_data = NULL; kfree(conn); } static int __iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { BT_DBG("conn %p", conn); if (iso_pi(sk)->conn == conn && conn->sk == sk) return 0; if (conn->sk) { BT_ERR("conn->sk already set"); return -EBUSY; } iso_pi(sk)->conn = conn; conn->sk = sk; if (parent) bt_accept_enqueue(parent, sk, true); return 0; } static int iso_chan_add(struct iso_conn *conn, struct sock *sk, struct sock *parent) { int err; iso_conn_lock(conn); err = __iso_chan_add(conn, sk, parent); iso_conn_unlock(conn); return err; } static inline u8 le_addr_type(u8 bdaddr_type) { if (bdaddr_type == BDADDR_LE_PUBLIC) return ADDR_LE_DEV_PUBLIC; else return ADDR_LE_DEV_RANDOM; } static int iso_connect_bis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR", &iso_pi(sk)->src); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!bis_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if out PHYs are marked as disabled */ if (!iso_pi(sk)->qos.bcast.out.phy) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos, iso_pi(sk)->base_len, iso_pi(sk)->base); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn 
= iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static int iso_connect_cis(struct sock *sk) { struct iso_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; int err; BT_DBG("%pMR -> %pMR", &iso_pi(sk)->src, &iso_pi(sk)->dst); hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); if (!cis_central_capable(hdev)) { err = -EOPNOTSUPP; goto unlock; } /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_ucast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } /* Fail if either PHYs are marked as disabled */ if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) { err = -EINVAL; goto unlock; } /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } lock_sock(sk); err = iso_chan_add(conn, sk, NULL); if (err) { release_sock(sk); goto unlock; } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); if (hcon->state == BT_CONNECTED) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; } else if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECT; } else { sk->sk_state = BT_CONNECT; iso_sock_set_timer(sk, sk->sk_sndtimeo); } release_sock(sk); unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); return err; } static struct bt_iso_qos *iso_sock_get_qos(struct sock *sk) { if (sk->sk_state == BT_CONNECTED || sk->sk_state == BT_CONNECT2) return &iso_pi(sk)->conn->hcon->iso_qos; return &iso_pi(sk)->qos; } static int iso_send_frame(struct sock *sk, struct sk_buff *skb) { struct iso_conn *conn = iso_pi(sk)->conn; struct bt_iso_qos *qos = iso_sock_get_qos(sk); struct hci_iso_data_hdr *hdr; int len = 0; BT_DBG("sk %p len %d", sk, skb->len); if (skb->len > qos->ucast.out.sdu) return -EMSGSIZE; len = skb->len; /* Push ISO data header */ hdr = skb_push(skb, HCI_ISO_DATA_HDR_SIZE); hdr->sn = cpu_to_le16(conn->tx_sn++); hdr->slen = cpu_to_le16(hci_iso_data_len_pack(len, HCI_ISO_STATUS_VALID)); if (sk->sk_state == BT_CONNECTED) hci_send_iso(conn->hcon, skb); else len = -ENOTCONN; return len; } static void iso_recv_frame(struct iso_conn *conn, struct sk_buff *skb) { struct sock *sk; iso_conn_lock(conn); sk = conn->sk; iso_conn_unlock(conn); if (!sk) goto drop; BT_DBG("sk %p len %d", sk, skb->len); if (sk->sk_state != BT_CONNECTED) goto drop; if (!sock_queue_rcv_skb(sk, skb)) return; drop: kfree_skb(skb); } /* -------- Socket interface ---------- */ static struct sock 
*__iso_get_sock_listen_by_addr(bdaddr_t *src, bdaddr_t *dst) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->dst, dst)) continue; if (!bacmp(&iso_pi(sk)->src, src)) return sk; } return NULL; } static struct sock *__iso_get_sock_listen_by_sid(bdaddr_t *ba, bdaddr_t *bc, __u8 sid) { struct sock *sk; sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; if (bacmp(&iso_pi(sk)->src, ba)) continue; if (bacmp(&iso_pi(sk)->dst, bc)) continue; if (iso_pi(sk)->bc_sid == sid) return sk; } return NULL; } /* Find socket listening: * source bdaddr (Unicast) * destination bdaddr (Broadcast only) * match func - pass NULL to ignore * match func data - pass -1 to ignore * Returns closest match. */ static struct sock *iso_get_sock_listen(bdaddr_t *src, bdaddr_t *dst, iso_sock_match_t match, void *data) { struct sock *sk = NULL, *sk1 = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (sk->sk_state != BT_LISTEN) continue; /* Match Broadcast destination */ if (bacmp(dst, BDADDR_ANY) && bacmp(&iso_pi(sk)->dst, dst)) continue; /* Use Match function if provided */ if (match && !match(sk, data)) continue; /* Exact match. */ if (!bacmp(&iso_pi(sk)->src, src)) { sock_hold(sk); break; } /* Closest match */ if (!bacmp(&iso_pi(sk)->src, BDADDR_ANY)) { if (sk1) sock_put(sk1); sk1 = sk; sock_hold(sk1); } } if (sk && sk1) sock_put(sk1); read_unlock(&iso_sk_list.lock); return sk ? sk : sk1; } static struct sock *iso_get_sock_big(struct sock *match_sk, bdaddr_t *src, bdaddr_t *dst, uint8_t big) { struct sock *sk = NULL; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { if (match_sk == sk) continue; /* Look for sockets that have already been * connected to the BIG */ if (sk->sk_state != BT_CONNECTED && sk->sk_state != BT_CONNECT) continue; /* Match Broadcast destination */ if (bacmp(&iso_pi(sk)->dst, dst)) continue; /* Match BIG handle */ if (iso_pi(sk)->qos.bcast.big != big) continue; /* Match source address */ if (bacmp(&iso_pi(sk)->src, src)) continue; sock_hold(sk); break; } read_unlock(&iso_sk_list.lock); return sk; } static void iso_sock_destruct(struct sock *sk) { BT_DBG("sk %p", sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } static void iso_sock_cleanup_listen(struct sock *parent) { struct sock *sk; BT_DBG("parent %p", parent); /* Close not yet accepted channels */ while ((sk = bt_accept_dequeue(parent, NULL))) { iso_sock_close(sk); iso_sock_kill(sk); } /* If listening socket has a hcon, properly disconnect it */ if (iso_pi(parent)->conn && iso_pi(parent)->conn->hcon) { iso_sock_disconn(parent); return; } parent->sk_state = BT_CLOSED; sock_set_flag(parent, SOCK_ZAPPED); } /* Kill socket (only if zapped and orphan) * Must be called on unlocked socket. */ static void iso_sock_kill(struct sock *sk) { if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket || sock_flag(sk, SOCK_DEAD)) return; BT_DBG("sk %p state %d", sk, sk->sk_state); /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); sock_put(sk); } static void iso_sock_disconn(struct sock *sk) { struct sock *bis_sk; struct hci_conn *hcon = iso_pi(sk)->conn->hcon; if (test_bit(HCI_CONN_BIG_CREATED, &hcon->flags)) { bis_sk = iso_get_sock_big(sk, &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->qos.bcast.big); /* If there are any other connected sockets for the * same BIG, just delete the sk and leave the bis * hcon active, in case later rebinding is needed. 
*/ if (bis_sk) { hcon->state = BT_OPEN; iso_pi(sk)->conn->hcon = NULL; iso_sock_clear_timer(sk); iso_chan_del(sk, bt_to_errno(hcon->abort_reason)); sock_put(bis_sk); return; } } sk->sk_state = BT_DISCONN; iso_sock_set_timer(sk, ISO_DISCONN_TIMEOUT); iso_conn_lock(iso_pi(sk)->conn); hci_conn_drop(iso_pi(sk)->conn->hcon); iso_pi(sk)->conn->hcon = NULL; iso_conn_unlock(iso_pi(sk)->conn); } static void __iso_sock_close(struct sock *sk) { BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); switch (sk->sk_state) { case BT_LISTEN: iso_sock_cleanup_listen(sk); break; case BT_CONNECT: case BT_CONNECTED: case BT_CONFIG: if (iso_pi(sk)->conn->hcon) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_CONNECT2: if (iso_pi(sk)->conn->hcon && (test_bit(HCI_CONN_PA_SYNC, &iso_pi(sk)->conn->hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &iso_pi(sk)->conn->hcon->flags))) iso_sock_disconn(sk); else iso_chan_del(sk, ECONNRESET); break; case BT_DISCONN: iso_chan_del(sk, ECONNRESET); break; default: sock_set_flag(sk, SOCK_ZAPPED); break; } } /* Must be called on unlocked socket. */ static void iso_sock_close(struct sock *sk) { iso_sock_clear_timer(sk); lock_sock(sk); __iso_sock_close(sk); release_sock(sk); iso_sock_kill(sk); } static void iso_sock_init(struct sock *sk, struct sock *parent) { BT_DBG("sk %p", sk); if (parent) { sk->sk_type = parent->sk_type; bt_sk(sk)->flags = bt_sk(parent)->flags; security_sk_clone(parent, sk); } } static struct proto iso_proto = { .name = "ISO", .owner = THIS_MODULE, .obj_size = sizeof(struct iso_pinfo) }; #define DEFAULT_IO_QOS \ { \ .interval = 10000u, \ .latency = 10u, \ .sdu = 40u, \ .phy = BT_ISO_PHY_2M, \ .rtn = 2u, \ } static struct bt_iso_qos default_qos = { .bcast = { .big = BT_ISO_QOS_BIG_UNSET, .bis = BT_ISO_QOS_BIS_UNSET, .sync_factor = 0x01, .packing = 0x00, .framing = 0x00, .in = DEFAULT_IO_QOS, .out = DEFAULT_IO_QOS, .encryption = 0x00, .bcode = {0x00}, .options = 0x00, .skip = 0x0000, .sync_timeout = BT_ISO_SYNC_TIMEOUT, .sync_cte_type = 0x00, .mse = 0x00, .timeout = BT_ISO_SYNC_TIMEOUT, }, }; static struct sock *iso_sock_alloc(struct net *net, struct socket *sock, int proto, gfp_t prio, int kern) { struct sock *sk; sk = bt_sock_alloc(net, sock, &iso_proto, proto, prio, kern); if (!sk) return NULL; sk->sk_destruct = iso_sock_destruct; sk->sk_sndtimeo = ISO_CONN_TIMEOUT; /* Set address type as public as default src address is BDADDR_ANY */ iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; iso_pi(sk)->qos = default_qos; bt_sock_link(&iso_sk_list, sk); return sk; } static int iso_sock_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; BT_DBG("sock %p", sock); sock->state = SS_UNCONNECTED; if (sock->type != SOCK_SEQPACKET) return -ESOCKTNOSUPPORT; sock->ops = &iso_sock_ops; sk = iso_sock_alloc(net, sock, protocol, GFP_ATOMIC, kern); if (!sk) return -ENOMEM; iso_sock_init(sk, NULL); return 0; } static int iso_sock_bind_bc(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int i; BT_DBG("sk %p bc_sid %u bc_num_bis %u", sk, sa->iso_bc->bc_sid, sa->iso_bc->bc_num_bis); if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) return -EINVAL; bacpy(&iso_pi(sk)->dst, &sa->iso_bc->bc_bdaddr); /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bc->bc_bdaddr_type)) return -EINVAL; iso_pi(sk)->dst_type = sa->iso_bc->bc_bdaddr_type; iso_pi(sk)->sync_handle = -1; if (sa->iso_bc->bc_sid > 0x0f) return 
-EINVAL; iso_pi(sk)->bc_sid = sa->iso_bc->bc_sid; if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) return -EINVAL; iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) return -EINVAL; memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); return 0; } static int iso_sock_bind_pa_sk(struct sock *sk, struct sockaddr_iso *sa, int addr_len) { int err = 0; if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (addr_len != sizeof(*sa) + sizeof(*sa->iso_bc)) { err = -EINVAL; goto done; } if (sa->iso_bc->bc_num_bis > ISO_MAX_NUM_BIS) { err = -EINVAL; goto done; } iso_pi(sk)->bc_num_bis = sa->iso_bc->bc_num_bis; for (int i = 0; i < iso_pi(sk)->bc_num_bis; i++) if (sa->iso_bc->bc_bis[i] < 0x01 || sa->iso_bc->bc_bis[i] > 0x1f) { err = -EINVAL; goto done; } memcpy(iso_pi(sk)->bc_bis, sa->iso_bc->bc_bis, iso_pi(sk)->bc_num_bis); done: return err; } static int iso_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p %pMR type %u", sk, &sa->iso_bdaddr, sa->iso_bdaddr_type); if (!addr || addr_len < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; lock_sock(sk); /* Allow the user to bind a PA sync socket to a number * of BISes to sync to. */ if (sk->sk_state == BT_CONNECT2 && test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags)) { err = iso_sock_bind_pa_sk(sk, sa, addr_len); goto done; } if (sk->sk_state != BT_OPEN) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) { err = -EINVAL; goto done; } bacpy(&iso_pi(sk)->src, &sa->iso_bdaddr); iso_pi(sk)->src_type = sa->iso_bdaddr_type; /* Check for Broadcast address */ if (addr_len > sizeof(*sa)) { err = iso_sock_bind_bc(sock, addr, addr_len); if (err) goto done; } sk->sk_state = BT_BOUND; done: release_sock(sk); return err; } static int iso_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; int err; BT_DBG("sk %p", sk); if (alen < sizeof(struct sockaddr_iso) || addr->sa_family != AF_BLUETOOTH) return -EINVAL; if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) return -EBADFD; if (sk->sk_type != SOCK_SEQPACKET) return -EINVAL; /* Check if the address type is of LE type */ if (!bdaddr_type_is_le(sa->iso_bdaddr_type)) return -EINVAL; lock_sock(sk); bacpy(&iso_pi(sk)->dst, &sa->iso_bdaddr); iso_pi(sk)->dst_type = sa->iso_bdaddr_type; release_sock(sk); if (bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_connect_cis(sk); else err = iso_connect_bis(sk); if (err) return err; lock_sock(sk); if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { err = bt_sock_wait_state(sk, BT_CONNECTED, sock_sndtimeo(sk, flags & O_NONBLOCK)); } release_sock(sk); return err; } static int iso_listen_bis(struct sock *sk) { struct hci_dev *hdev; int err = 0; struct iso_conn *conn; struct hci_conn *hcon; BT_DBG("%pMR -> %pMR (SID 0x%2.2x)", &iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_sid(&iso_pi(sk)->src, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); if (err) return err; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if 
(!hdev) return -EHOSTUNREACH; hci_dev_lock(hdev); /* Fail if user set invalid QoS */ if (iso_pi(sk)->qos_user_set && !check_bcast_qos(&iso_pi(sk)->qos)) { iso_pi(sk)->qos = default_qos; err = -EINVAL; goto unlock; } hcon = hci_pa_create_sync(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; } conn = iso_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); err = -ENOMEM; goto unlock; } err = iso_chan_add(conn, sk, NULL); if (err) { hci_conn_drop(hcon); goto unlock; } hci_dev_put(hdev); unlock: hci_dev_unlock(hdev); return err; } static int iso_listen_cis(struct sock *sk) { int err = 0; BT_DBG("%pMR", &iso_pi(sk)->src); write_lock(&iso_sk_list.lock); if (__iso_get_sock_listen_by_addr(&iso_pi(sk)->src, &iso_pi(sk)->dst)) err = -EADDRINUSE; write_unlock(&iso_sk_list.lock); return err; } static int iso_sock_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sk %p backlog %d", sk, backlog); lock_sock(sk); if (sk->sk_state != BT_BOUND) { err = -EBADFD; goto done; } if (sk->sk_type != SOCK_SEQPACKET) { err = -EINVAL; goto done; } if (!bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) err = iso_listen_cis(sk); else err = iso_listen_bis(sk); if (err) goto done; sk->sk_max_ack_backlog = backlog; sk->sk_ack_backlog = 0; sk->sk_state = BT_LISTEN; done: release_sock(sk); return err; } static int iso_sock_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct sock *sk = sock->sk, *ch; long timeo; int err = 0; lock_sock(sk); timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); BT_DBG("sk %p timeo %ld", sk, timeo); /* Wait for an incoming connection. (wake-one). */ add_wait_queue_exclusive(sk_sleep(sk), &wait); while (1) { if (sk->sk_state != BT_LISTEN) { err = -EBADFD; break; } ch = bt_accept_dequeue(sk, newsock); if (ch) break; if (!timeo) { err = -EAGAIN; break; } if (signal_pending(current)) { err = sock_intr_errno(timeo); break; } release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); } remove_wait_queue(sk_sleep(sk), &wait); if (err) goto done; newsock->state = SS_CONNECTED; BT_DBG("new socket %p", ch); done: release_sock(sk); return err; } static int iso_sock_getname(struct socket *sock, struct sockaddr *addr, int peer) { struct sockaddr_iso *sa = (struct sockaddr_iso *)addr; struct sock *sk = sock->sk; BT_DBG("sock %p, sk %p", sock, sk); addr->sa_family = AF_BLUETOOTH; if (peer) { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->dst); sa->iso_bdaddr_type = iso_pi(sk)->dst_type; } else { bacpy(&sa->iso_bdaddr, &iso_pi(sk)->src); sa->iso_bdaddr_type = iso_pi(sk)->src_type; } return sizeof(struct sockaddr_iso); } static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb, **frag; size_t mtu; int err; BT_DBG("sock %p, sk %p", sock, sk); err = sock_error(sk); if (err) return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; lock_sock(sk); if (sk->sk_state != BT_CONNECTED) { release_sock(sk); return -ENOTCONN; } mtu = iso_pi(sk)->conn->hcon->hdev->iso_mtu; release_sock(sk); skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0); if (IS_ERR(skb)) return PTR_ERR(skb); len -= skb->len; BT_DBG("skb %p len %d", sk, skb->len); /* Continuation fragments */ frag = &skb_shinfo(skb)->frag_list; while (len) { struct sk_buff *tmp; tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0); if (IS_ERR(tmp)) { kfree_skb(skb); return 
PTR_ERR(tmp); } *frag = tmp; len -= tmp->len; skb->len += tmp->len; skb->data_len += tmp->len; BT_DBG("frag %p len %d", *frag, tmp->len); frag = &(*frag)->next; } lock_sock(sk); if (sk->sk_state == BT_CONNECTED) err = iso_send_frame(sk, skb); else err = -ENOTCONN; release_sock(sk); if (err < 0) kfree_skb(skb); return err; } static void iso_conn_defer_accept(struct hci_conn *conn) { struct hci_cp_le_accept_cis cp; struct hci_dev *hdev = conn->hdev; BT_DBG("conn %p", conn); conn->state = BT_CONFIG; cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(hdev, HCI_OP_LE_ACCEPT_CIS, sizeof(cp), &cp); } static void iso_conn_big_sync(struct sock *sk) { int err; struct hci_dev *hdev; hdev = hci_get_route(&iso_pi(sk)->dst, &iso_pi(sk)->src, iso_pi(sk)->src_type); if (!hdev) return; if (!test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, iso_pi(sk)->conn->hcon, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); } } static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct iso_pinfo *pi = iso_pi(sk); BT_DBG("sk %p", sk); if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: if (pi->conn->hcon && test_bit(HCI_CONN_PA_SYNC, &pi->conn->hcon->flags)) { iso_conn_big_sync(sk); sk->sk_state = BT_LISTEN; } else { iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; } release_sock(sk); return 0; case BT_CONNECT: release_sock(sk); return iso_connect_cis(sk); default: release_sock(sk); break; } } return bt_sock_recvmsg(sock, msg, len, flags); } static bool check_io_qos(struct bt_iso_io_qos *qos) { /* If no PHY is enable SDU must be 0 */ if (!qos->phy && qos->sdu) return false; if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff)) return false; if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0)) return false; if (qos->phy > BT_ISO_PHY_ANY) return false; return true; } static bool check_ucast_qos(struct bt_iso_qos *qos) { if (qos->ucast.cig > 0xef && qos->ucast.cig != BT_ISO_QOS_CIG_UNSET) return false; if (qos->ucast.cis > 0xef && qos->ucast.cis != BT_ISO_QOS_CIS_UNSET) return false; if (qos->ucast.sca > 0x07) return false; if (qos->ucast.packing > 0x01) return false; if (qos->ucast.framing > 0x01) return false; if (!check_io_qos(&qos->ucast.in)) return false; if (!check_io_qos(&qos->ucast.out)) return false; return true; } static bool check_bcast_qos(struct bt_iso_qos *qos) { if (qos->bcast.sync_factor == 0x00) return false; if (qos->bcast.packing > 0x01) return false; if (qos->bcast.framing > 0x01) return false; if (!check_io_qos(&qos->bcast.in)) return false; if (!check_io_qos(&qos->bcast.out)) return false; if (qos->bcast.encryption > 0x01) return false; if (qos->bcast.options > 0x07) return false; if (qos->bcast.skip > 0x01f3) return false; if (qos->bcast.sync_timeout < 0x000a || qos->bcast.sync_timeout > 0x4000) return false; if (qos->bcast.sync_cte_type > 0x1f) return false; if (qos->bcast.mse > 0x1f) return false; if (qos->bcast.timeout < 0x000a || qos->bcast.timeout > 0x4000) return false; return true; } static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_iso_qos qos = default_qos; u32 opt; BT_DBG("sk %p", sk); lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if 
(sk->sk_state != BT_BOUND && sk->sk_state != BT_LISTEN) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); else clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); break; case BT_PKT_STATUS: if (copy_from_sockptr(&opt, optval, sizeof(u32))) { err = -EFAULT; break; } if (opt) set_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); else clear_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags); break; case BT_ISO_QOS: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } len = min_t(unsigned int, sizeof(qos), optlen); if (copy_from_sockptr(&qos, optval, len)) { err = -EFAULT; break; } if (len == sizeof(qos.ucast) && !check_ucast_qos(&qos)) { err = -EINVAL; break; } iso_pi(sk)->qos = qos; iso_pi(sk)->qos_user_set = true; break; case BT_ISO_BASE: if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECT2) { err = -EINVAL; break; } if (optlen > sizeof(iso_pi(sk)->base)) { err = -EOVERFLOW; break; } len = min_t(unsigned int, sizeof(iso_pi(sk)->base), optlen); if (copy_from_sockptr(iso_pi(sk)->base, optval, len)) { err = -EFAULT; break; } iso_pi(sk)->base_len = len; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int len, err = 0; struct bt_iso_qos *qos; u8 base_len; u8 *base; BT_DBG("sk %p", sk); if (get_user(len, optlen)) return -EFAULT; lock_sock(sk); switch (optname) { case BT_DEFER_SETUP: if (sk->sk_state == BT_CONNECTED) { err = -EINVAL; break; } if (put_user(test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags), (u32 __user *)optval)) err = -EFAULT; break; case BT_PKT_STATUS: if (put_user(test_bit(BT_SK_PKT_STATUS, &bt_sk(sk)->flags), (int __user *)optval)) err = -EFAULT; break; case BT_ISO_QOS: qos = iso_sock_get_qos(sk); len = min_t(unsigned int, len, sizeof(*qos)); if (copy_to_user(optval, qos, len)) err = -EFAULT; break; case BT_ISO_BASE: if (sk->sk_state == BT_CONNECTED && !bacmp(&iso_pi(sk)->dst, BDADDR_ANY)) { base_len = iso_pi(sk)->conn->hcon->le_per_adv_data_len; base = iso_pi(sk)->conn->hcon->le_per_adv_data; } else { base_len = iso_pi(sk)->base_len; base = iso_pi(sk)->base; } len = min_t(unsigned int, len, base_len); if (copy_to_user(optval, base, len)) err = -EFAULT; if (put_user(len, optlen)) err = -EFAULT; break; default: err = -ENOPROTOOPT; break; } release_sock(sk); return err; } static int iso_sock_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p, how %d", sock, sk, how); if (!sk) return 0; sock_hold(sk); lock_sock(sk); switch (how) { case SHUT_RD: if (sk->sk_shutdown & RCV_SHUTDOWN) goto unlock; sk->sk_shutdown |= RCV_SHUTDOWN; break; case SHUT_WR: if (sk->sk_shutdown & SEND_SHUTDOWN) goto unlock; sk->sk_shutdown |= SEND_SHUTDOWN; break; case SHUT_RDWR: if (sk->sk_shutdown & SHUTDOWN_MASK) goto unlock; sk->sk_shutdown |= SHUTDOWN_MASK; break; } iso_sock_clear_timer(sk); __iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); unlock: release_sock(sk); sock_put(sk); return err; } static int iso_sock_release(struct socket *sock) { struct sock *sk = sock->sk; int err = 0; BT_DBG("sock %p, sk %p", sock, sk); if (!sk) return 0; iso_sock_close(sk); if (sock_flag(sk, SOCK_LINGER) 
&& READ_ONCE(sk->sk_lingertime) && !(current->flags & PF_EXITING)) { lock_sock(sk); err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); release_sock(sk); } sock_orphan(sk); iso_sock_kill(sk); return err; } static void iso_sock_ready(struct sock *sk) { BT_DBG("sk %p", sk); if (!sk) return; lock_sock(sk); iso_sock_clear_timer(sk); sk->sk_state = BT_CONNECTED; sk->sk_state_change(sk); release_sock(sk); } struct iso_list_data { struct hci_conn *hcon; int count; }; static bool iso_match_big(struct sock *sk, void *data) { struct hci_evt_le_big_sync_estabilished *ev = data; return ev->handle == iso_pi(sk)->qos.bcast.big; } static bool iso_match_pa_sync_flag(struct sock *sk, void *data) { return test_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } static void iso_conn_ready(struct iso_conn *conn) { struct sock *parent = NULL; struct sock *sk = conn->sk; struct hci_ev_le_big_sync_estabilished *ev = NULL; struct hci_ev_le_pa_sync_established *ev2 = NULL; struct hci_evt_le_big_info_adv_report *ev3 = NULL; struct hci_conn *hcon; BT_DBG("conn %p", conn); if (sk) { iso_sock_ready(conn->sk); } else { hcon = conn->hcon; if (!hcon) return; if (test_bit(HCI_CONN_BIG_SYNC, &hcon->flags) || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags)) { ev = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_SYNC_ESTABILISHED); /* Get reference to PA sync parent socket, if it exists */ parent = iso_get_sock_listen(&hcon->src, &hcon->dst, iso_match_pa_sync_flag, NULL); if (!parent && ev) parent = iso_get_sock_listen(&hcon->src, &hcon->dst, iso_match_big, ev); } else if (test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { ev2 = hci_recv_event_data(hcon->hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev2) parent = iso_get_sock_listen(&hcon->src, &hcon->dst, iso_match_sid, ev2); } else if (test_bit(HCI_CONN_PA_SYNC, &hcon->flags)) { ev3 = hci_recv_event_data(hcon->hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev3) parent = iso_get_sock_listen(&hcon->src, &hcon->dst, iso_match_sync_handle, ev3); } if (!parent) parent = iso_get_sock_listen(&hcon->src, BDADDR_ANY, NULL, NULL); if (!parent) return; lock_sock(parent); sk = iso_sock_alloc(sock_net(parent), NULL, BTPROTO_ISO, GFP_ATOMIC, 0); if (!sk) { release_sock(parent); return; } iso_sock_init(sk, parent); bacpy(&iso_pi(sk)->src, &hcon->src); /* Convert from HCI to three-value type */ if (hcon->src_type == ADDR_LE_DEV_PUBLIC) iso_pi(sk)->src_type = BDADDR_LE_PUBLIC; else iso_pi(sk)->src_type = BDADDR_LE_RANDOM; /* If hcon has no destination address (BDADDR_ANY) it means it * was created by HCI_EV_LE_BIG_SYNC_ESTABILISHED or * HCI_EV_LE_PA_SYNC_ESTABLISHED so we need to initialize using * the parent socket destination address. 
*/ if (!bacmp(&hcon->dst, BDADDR_ANY)) { bacpy(&hcon->dst, &iso_pi(parent)->dst); hcon->dst_type = iso_pi(parent)->dst_type; hcon->sync_handle = iso_pi(parent)->sync_handle; } if (ev3) { iso_pi(sk)->qos = iso_pi(parent)->qos; iso_pi(sk)->qos.bcast.encryption = ev3->encryption; hcon->iso_qos = iso_pi(sk)->qos; iso_pi(sk)->bc_num_bis = iso_pi(parent)->bc_num_bis; memcpy(iso_pi(sk)->bc_bis, iso_pi(parent)->bc_bis, ISO_MAX_NUM_BIS); set_bit(BT_SK_PA_SYNC, &iso_pi(sk)->flags); } bacpy(&iso_pi(sk)->dst, &hcon->dst); iso_pi(sk)->dst_type = hcon->dst_type; iso_pi(sk)->sync_handle = iso_pi(parent)->sync_handle; memcpy(iso_pi(sk)->base, iso_pi(parent)->base, iso_pi(parent)->base_len); iso_pi(sk)->base_len = iso_pi(parent)->base_len; hci_conn_hold(hcon); iso_chan_add(conn, sk, parent); if ((ev && ((struct hci_evt_le_big_sync_estabilished *)ev)->status) || (ev2 && ev2->status)) { /* Trigger error signal on child socket */ sk->sk_err = ECONNREFUSED; sk->sk_error_report(sk); } if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags)) sk->sk_state = BT_CONNECT2; else sk->sk_state = BT_CONNECTED; /* Wake up parent */ parent->sk_data_ready(parent); release_sock(parent); sock_put(parent); } } static bool iso_match_sid(struct sock *sk, void *data) { struct hci_ev_le_pa_sync_established *ev = data; return ev->sid == iso_pi(sk)->bc_sid; } static bool iso_match_sync_handle(struct sock *sk, void *data) { struct hci_evt_le_big_info_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } static bool iso_match_sync_handle_pa_report(struct sock *sk, void *data) { struct hci_ev_le_per_adv_report *ev = data; return le16_to_cpu(ev->sync_handle) == iso_pi(sk)->sync_handle; } /* ----- ISO interface with lower layer (HCI) ----- */ int iso_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 *flags) { struct hci_ev_le_pa_sync_established *ev1; struct hci_evt_le_big_info_adv_report *ev2; struct hci_ev_le_per_adv_report *ev3; struct sock *sk; bt_dev_dbg(hdev, "bdaddr %pMR", bdaddr); /* Broadcast receiver requires handling of some events before it can * proceed to establishing a BIG sync: * * 1. HCI_EV_LE_PA_SYNC_ESTABLISHED: The socket may specify a specific * SID to listen to and once sync is estabilished its handle needs to * be stored in iso_pi(sk)->sync_handle so it can be matched once * receiving the BIG Info. * 2. HCI_EVT_LE_BIG_INFO_ADV_REPORT: When connect_ind is triggered by a * a BIG Info it attempts to check if there any listening socket with * the same sync_handle and if it does then attempt to create a sync. * 3. HCI_EV_LE_PER_ADV_REPORT: When a PA report is received, it is stored * in iso_pi(sk)->base so it can be passed up to user, in the case of a * broadcast sink. */ ev1 = hci_recv_event_data(hdev, HCI_EV_LE_PA_SYNC_ESTABLISHED); if (ev1) { sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sid, ev1); if (sk && !ev1->status) iso_pi(sk)->sync_handle = le16_to_cpu(ev1->handle); goto done; } ev2 = hci_recv_event_data(hdev, HCI_EVT_LE_BIG_INFO_ADV_REPORT); if (ev2) { /* Try to get PA sync listening socket, if it exists */ sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_pa_sync_flag, NULL); if (!sk) { sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sync_handle, ev2); /* If PA Sync is in process of terminating, * do not handle any more BIGInfo adv reports. 
*/ if (sk && test_bit(BT_SK_PA_SYNC_TERM, &iso_pi(sk)->flags)) return 0; } if (sk) { int err; if (ev2->num_bis < iso_pi(sk)->bc_num_bis) iso_pi(sk)->bc_num_bis = ev2->num_bis; if (!test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags) && !test_and_set_bit(BT_SK_BIG_SYNC, &iso_pi(sk)->flags)) { err = hci_le_big_create_sync(hdev, NULL, &iso_pi(sk)->qos, iso_pi(sk)->sync_handle, iso_pi(sk)->bc_num_bis, iso_pi(sk)->bc_bis); if (err) { bt_dev_err(hdev, "hci_le_big_create_sync: %d", err); sock_put(sk); sk = NULL; } } } } ev3 = hci_recv_event_data(hdev, HCI_EV_LE_PER_ADV_REPORT); if (ev3) { size_t base_len = 0; u8 *base; struct hci_conn *hcon; sk = iso_get_sock_listen(&hdev->bdaddr, bdaddr, iso_match_sync_handle_pa_report, ev3); if (!sk) goto done; hcon = iso_pi(sk)->conn->hcon; if (!hcon) goto done; if (ev3->data_status == LE_PA_DATA_TRUNCATED) { /* The controller was unable to retrieve PA data. */ memset(hcon->le_per_adv_data, 0, HCI_MAX_PER_AD_TOT_LEN); hcon->le_per_adv_data_len = 0; hcon->le_per_adv_data_offset = 0; goto done; } if (hcon->le_per_adv_data_offset + ev3->length > HCI_MAX_PER_AD_TOT_LEN) goto done; memcpy(hcon->le_per_adv_data + hcon->le_per_adv_data_offset, ev3->data, ev3->length); hcon->le_per_adv_data_offset += ev3->length; if (ev3->data_status == LE_PA_DATA_COMPLETE) { /* All PA data has been received. */ hcon->le_per_adv_data_len = hcon->le_per_adv_data_offset; hcon->le_per_adv_data_offset = 0; /* Extract BASE */ base = eir_get_service_data(hcon->le_per_adv_data, hcon->le_per_adv_data_len, EIR_BAA_SERVICE_UUID, &base_len); if (!base || base_len > BASE_MAX_LENGTH) goto done; memcpy(iso_pi(sk)->base, base, base_len); iso_pi(sk)->base_len = base_len; } else { /* This is a PA data fragment. Keep pa_data_len set to 0 * until all data has been reassembled. */ hcon->le_per_adv_data_len = 0; } } else { sk = iso_get_sock_listen(&hdev->bdaddr, BDADDR_ANY, NULL, NULL); } done: if (!sk) return 0; if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) *flags |= HCI_PROTO_DEFER; sock_put(sk); return HCI_LM_ACCEPT; } static void iso_connect_cfm(struct hci_conn *hcon, __u8 status) { if (hcon->type != ISO_LINK) { if (hcon->type != LE_LINK) return; /* Check if LE link has failed */ if (status) { struct hci_link *link, *t; list_for_each_entry_safe(link, t, &hcon->link_list, list) iso_conn_del(link->conn, bt_to_errno(status)); return; } /* Create CIS if pending */ hci_le_create_cis_pending(hcon->hdev); return; } BT_DBG("hcon %p bdaddr %pMR status %d", hcon, &hcon->dst, status); /* Similar to the success case, if HCI_CONN_BIG_SYNC_FAILED or * HCI_CONN_PA_SYNC_FAILED is set, queue the failed connection * into the accept queue of the listening socket and wake up * userspace, to inform the user about the event. 
*/ if (!status || test_bit(HCI_CONN_BIG_SYNC_FAILED, &hcon->flags) || test_bit(HCI_CONN_PA_SYNC_FAILED, &hcon->flags)) { struct iso_conn *conn; conn = iso_conn_add(hcon); if (conn) iso_conn_ready(conn); } else { iso_conn_del(hcon, bt_to_errno(status)); } } static void iso_disconn_cfm(struct hci_conn *hcon, __u8 reason) { if (hcon->type != ISO_LINK) return; BT_DBG("hcon %p reason %d", hcon, reason); iso_conn_del(hcon, bt_to_errno(reason)); } void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) { struct iso_conn *conn = hcon->iso_data; __u16 pb, ts, len; if (!conn) goto drop; pb = hci_iso_flags_pb(flags); ts = hci_iso_flags_ts(flags); BT_DBG("conn %p len %d pb 0x%x ts 0x%x", conn, skb->len, pb, ts); switch (pb) { case ISO_START: case ISO_SINGLE: if (conn->rx_len) { BT_ERR("Unexpected start frame (len %d)", skb->len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; } if (ts) { struct hci_iso_ts_data_hdr *hdr; /* TODO: add timestamp to the packet? */ hdr = skb_pull_data(skb, HCI_ISO_TS_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } len = __le16_to_cpu(hdr->slen); } else { struct hci_iso_data_hdr *hdr; hdr = skb_pull_data(skb, HCI_ISO_DATA_HDR_SIZE); if (!hdr) { BT_ERR("Frame is too short (len %d)", skb->len); goto drop; } len = __le16_to_cpu(hdr->slen); } flags = hci_iso_data_flags(len); len = hci_iso_data_len(len); BT_DBG("Start: total len %d, frag len %d flags 0x%4.4x", len, skb->len, flags); if (len == skb->len) { /* Complete frame received */ hci_skb_pkt_status(skb) = flags & 0x03; iso_recv_frame(conn, skb); return; } if (pb == ISO_SINGLE) { BT_ERR("Frame malformed (len %d, expected len %d)", skb->len, len); goto drop; } if (skb->len > len) { BT_ERR("Frame is too long (len %d, expected len %d)", skb->len, len); goto drop; } /* Allocate skb for the complete frame (with header) */ conn->rx_skb = bt_skb_alloc(len, GFP_KERNEL); if (!conn->rx_skb) goto drop; hci_skb_pkt_status(conn->rx_skb) = flags & 0x03; skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len = len - skb->len; break; case ISO_CONT: BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); if (!conn->rx_len) { BT_ERR("Unexpected continuation frame (len %d)", skb->len); goto drop; } if (skb->len > conn->rx_len) { BT_ERR("Fragment is too long (len %d, expected %d)", skb->len, conn->rx_len); kfree_skb(conn->rx_skb); conn->rx_skb = NULL; conn->rx_len = 0; goto drop; } skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; return; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; if (!conn->rx_len) { struct sk_buff *rx_skb = conn->rx_skb; /* Complete frame received. iso_recv_frame * takes ownership of the skb so set the global * rx_skb pointer to NULL first. 
*/ conn->rx_skb = NULL; iso_recv_frame(conn, rx_skb); } break; } drop: kfree_skb(skb); } static struct hci_cb iso_cb = { .name = "ISO", .connect_cfm = iso_connect_cfm, .disconn_cfm = iso_disconn_cfm, }; static int iso_debugfs_show(struct seq_file *f, void *p) { struct sock *sk; read_lock(&iso_sk_list.lock); sk_for_each(sk, &iso_sk_list.head) { seq_printf(f, "%pMR %pMR %d\n", &iso_pi(sk)->src, &iso_pi(sk)->dst, sk->sk_state); } read_unlock(&iso_sk_list.lock); return 0; } DEFINE_SHOW_ATTRIBUTE(iso_debugfs); static struct dentry *iso_debugfs; static const struct proto_ops iso_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = iso_sock_release, .bind = iso_sock_bind, .connect = iso_sock_connect, .listen = iso_sock_listen, .accept = iso_sock_accept, .getname = iso_sock_getname, .sendmsg = iso_sock_sendmsg, .recvmsg = iso_sock_recvmsg, .poll = bt_sock_poll, .ioctl = bt_sock_ioctl, .mmap = sock_no_mmap, .socketpair = sock_no_socketpair, .shutdown = iso_sock_shutdown, .setsockopt = iso_sock_setsockopt, .getsockopt = iso_sock_getsockopt }; static const struct net_proto_family iso_sock_family_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .create = iso_sock_create, }; static bool iso_inited; bool iso_enabled(void) { return iso_inited; } int iso_init(void) { int err; BUILD_BUG_ON(sizeof(struct sockaddr_iso) > sizeof(struct sockaddr)); if (iso_inited) return -EALREADY; err = proto_register(&iso_proto, 0); if (err < 0) return err; err = bt_sock_register(BTPROTO_ISO, &iso_sock_family_ops); if (err < 0) { BT_ERR("ISO socket registration failed"); goto error; } err = bt_procfs_init(&init_net, "iso", &iso_sk_list, NULL); if (err < 0) { BT_ERR("Failed to create ISO proc file"); bt_sock_unregister(BTPROTO_ISO); goto error; } BT_INFO("ISO socket layer initialized"); hci_register_cb(&iso_cb); if (IS_ERR_OR_NULL(bt_debugfs)) return 0; if (!iso_debugfs) { iso_debugfs = debugfs_create_file("iso", 0444, bt_debugfs, NULL, &iso_debugfs_fops); } iso_inited = true; return 0; error: proto_unregister(&iso_proto); return err; } int iso_exit(void) { if (!iso_inited) return -EALREADY; bt_procfs_cleanup(&init_net, "iso"); debugfs_remove(iso_debugfs); iso_debugfs = NULL; hci_unregister_cb(&iso_cb); bt_sock_unregister(BTPROTO_ISO); proto_unregister(&iso_proto); iso_inited = false; return 0; }
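The socket layer above is exercised from userspace through the ordinary BSD socket calls. As a rough illustration of the unicast (CIS) path implemented by iso_sock_connect() and iso_connect_cis(), here is a minimal sketch; it is not part of iso.c. The BTPROTO_ISO value, the sockaddr_iso layout (mirrored below as struct sockaddr_iso_example), and the BlueZ helpers bacpy(), str2ba() and BDADDR_ANY from <bluetooth/bluetooth.h> are assumptions and must match the running kernel and installed library.

/* Userspace illustration only -- not part of iso.c above */
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>	/* bdaddr_t, bacpy(), str2ba(), BDADDR_ANY */

#ifndef BTPROTO_ISO
#define BTPROTO_ISO	8		/* assumption: must match the kernel's value */
#endif

#ifndef BDADDR_LE_PUBLIC
#define BDADDR_LE_PUBLIC 0x01		/* assumption: LE public address type */
#endif

/* Assumed to mirror the kernel's struct sockaddr_iso (broadcast fields omitted) */
struct sockaddr_iso_example {
	sa_family_t	iso_family;
	bdaddr_t	iso_bdaddr;
	uint8_t		iso_bdaddr_type;
};

static int iso_cis_connect_example(const char *peer_str)
{
	struct sockaddr_iso_example addr;
	int sk;

	/* iso_sock_create() rejects anything but SOCK_SEQPACKET */
	sk = socket(AF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_ISO);
	if (sk < 0)
		return -1;

	/* Bind the source side; BDADDR_ANY lets hci_get_route() pick the adapter */
	memset(&addr, 0, sizeof(addr));
	addr.iso_family = AF_BLUETOOTH;
	bacpy(&addr.iso_bdaddr, BDADDR_ANY);
	addr.iso_bdaddr_type = BDADDR_LE_PUBLIC;
	if (bind(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		goto fail;

	/* A non-BDADDR_ANY destination makes iso_sock_connect() take the CIS path */
	memset(&addr, 0, sizeof(addr));
	addr.iso_family = AF_BLUETOOTH;
	str2ba(peer_str, &addr.iso_bdaddr);
	addr.iso_bdaddr_type = BDADDR_LE_PUBLIC;
	if (connect(sk, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		goto fail;

	return sk;	/* ready for send()/recv() of ISO SDUs */

fail:
	close(sk);
	return -1;
}

A broadcast (BIS) source or sink would instead pass the extended sockaddr_iso carrying the iso_bc fields, which is the layout parsed by iso_sock_bind_bc() above.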
// SPDX-License-Identifier: GPL-2.0
#include <linux/linkage.h>
#include <linux/errno.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/ioport.h>
#include <linux/interrupt.h>
#include <linux/irq.h>
#include <linux/timex.h>
#include <linux/random.h>
#include <linux/init.h>
#include <linux/kernel_stat.h>
#include <linux/syscore_ops.h>
#include <linux/bitops.h>
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/pgtable.h>

#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>

/*
 * This is the 'legacy' 8259A Programmable Interrupt Controller,
 * present in the majority of PC/AT boxes.
 * plus some generic x86 specific things if generic specifics makes
 * any sense at all.
 */
static void init_8259A(int auto_eoi);

static bool pcat_compat __ro_after_init;
static int i8259A_auto_eoi;
DEFINE_RAW_SPINLOCK(i8259A_lock);

/*
 * 8259A PIC functions to handle ISA devices:
 */

/*
 * This contains the irq mask for both 8259A irq controllers,
 */
unsigned int cached_irq_mask = 0xffff;

/*
 * Not all IRQs can be routed through the IO-APIC, eg. on certain (older)
 * boards the timer interrupt is not really connected to any IO-APIC pin,
 * it's fed to the master 8259A's IR0 line only.
 *
 * Any '1' bit in this mask means the IRQ is routed through the IO-APIC.
 * this 'mixed mode' IRQ handling costs nothing because it's only used
 * at IRQ setup time.
*/ unsigned long io_apic_irqs; static void mask_8259A_irq(unsigned int irq) { unsigned int mask = 1 << irq; unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask |= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static void disable_8259A_irq(struct irq_data *data) { mask_8259A_irq(data->irq); } static void unmask_8259A_irq(unsigned int irq) { unsigned int mask = ~(1 << irq); unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); cached_irq_mask &= mask; if (irq & 8) outb(cached_slave_mask, PIC_SLAVE_IMR); else outb(cached_master_mask, PIC_MASTER_IMR); raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static void enable_8259A_irq(struct irq_data *data) { unmask_8259A_irq(data->irq); } static int i8259A_irq_pending(unsigned int irq) { unsigned int mask = 1<<irq; unsigned long flags; int ret; raw_spin_lock_irqsave(&i8259A_lock, flags); if (irq < 8) ret = inb(PIC_MASTER_CMD) & mask; else ret = inb(PIC_SLAVE_CMD) & (mask >> 8); raw_spin_unlock_irqrestore(&i8259A_lock, flags); return ret; } static void make_8259A_irq(unsigned int irq) { disable_irq_nosync(irq); io_apic_irqs &= ~(1<<irq); irq_set_chip_and_handler(irq, &i8259A_chip, handle_level_irq); irq_set_status_flags(irq, IRQ_LEVEL); enable_irq(irq); lapic_assign_legacy_vector(irq, true); } /* * This function assumes to be called rarely. Switching between * 8259A registers is slow. * This has to be protected by the irq controller spinlock * before being called. */ static inline int i8259A_irq_real(unsigned int irq) { int value; int irqmask = 1<<irq; if (irq < 8) { outb(0x0B, PIC_MASTER_CMD); /* ISR register */ value = inb(PIC_MASTER_CMD) & irqmask; outb(0x0A, PIC_MASTER_CMD); /* back to the IRR register */ return value; } outb(0x0B, PIC_SLAVE_CMD); /* ISR register */ value = inb(PIC_SLAVE_CMD) & (irqmask >> 8); outb(0x0A, PIC_SLAVE_CMD); /* back to the IRR register */ return value; } /* * Careful! The 8259A is a fragile beast, it pretty * much _has_ to be done exactly like this (mask it * first, _then_ send the EOI, and the order of EOI * to the two 8259s is important! */ static void mask_and_ack_8259A(struct irq_data *data) { unsigned int irq = data->irq; unsigned int irqmask = 1 << irq; unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); /* * Lightweight spurious IRQ detection. We do not want * to overdo spurious IRQ handling - it's usually a sign * of hardware problems, so we only do the checks we can * do without slowing down good hardware unnecessarily. * * Note that IRQ7 and IRQ15 (the two spurious IRQs * usually resulting from the 8259A-1|2 PICs) occur * even if the IRQ is masked in the 8259A. Thus we * can check spurious 8259A IRQs without doing the * quite slow i8259A_irq_real() call for every IRQ. * This does not cover 100% of spurious interrupts, * but should be enough to warn the user that there * is something bad going on ... */ if (cached_irq_mask & irqmask) goto spurious_8259A_irq; cached_irq_mask |= irqmask; handle_real_irq: if (irq & 8) { inb(PIC_SLAVE_IMR); /* DUMMY - (do we need this?) */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* 'Specific EOI' to slave */ outb(0x60+(irq&7), PIC_SLAVE_CMD); /* 'Specific EOI' to master-IRQ2 */ outb(0x60+PIC_CASCADE_IR, PIC_MASTER_CMD); } else { inb(PIC_MASTER_IMR); /* DUMMY - (do we need this?) 
*/ outb(cached_master_mask, PIC_MASTER_IMR); outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ } raw_spin_unlock_irqrestore(&i8259A_lock, flags); return; spurious_8259A_irq: /* * this is the slow path - should happen rarely. */ if (i8259A_irq_real(irq)) /* * oops, the IRQ _is_ in service according to the * 8259A - not spurious, go handle it. */ goto handle_real_irq; { static int spurious_irq_mask; /* * At this point we can be sure the IRQ is spurious, * lets ACK and report it. [once per IRQ] */ if (!(spurious_irq_mask & irqmask)) { printk_deferred(KERN_DEBUG "spurious 8259A interrupt: IRQ%d.\n", irq); spurious_irq_mask |= irqmask; } atomic_inc(&irq_err_count); /* * Theoretically we do not have to handle this IRQ, * but in Linux this does not cause problems and is * simpler for us. */ goto handle_real_irq; } } struct irq_chip i8259A_chip = { .name = "XT-PIC", .irq_mask = disable_8259A_irq, .irq_disable = disable_8259A_irq, .irq_unmask = enable_8259A_irq, .irq_mask_ack = mask_and_ack_8259A, }; static char irq_trigger[2]; /* ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ */ static void restore_ELCR(char *trigger) { outb(trigger[0], PIC_ELCR1); outb(trigger[1], PIC_ELCR2); } static void save_ELCR(char *trigger) { /* IRQ 0,1,2,8,13 are marked as reserved */ trigger[0] = inb(PIC_ELCR1) & 0xF8; trigger[1] = inb(PIC_ELCR2) & 0xDE; } static void i8259A_resume(void) { init_8259A(i8259A_auto_eoi); restore_ELCR(irq_trigger); } static int i8259A_suspend(void) { save_ELCR(irq_trigger); return 0; } static void i8259A_shutdown(void) { /* Put the i8259A into a quiescent state that * the kernel initialization code can get it * out of. */ outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ } static struct syscore_ops i8259_syscore_ops = { .suspend = i8259A_suspend, .resume = i8259A_resume, .shutdown = i8259A_shutdown, }; static void mask_8259A(void) { unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static void unmask_8259A(void) { unsigned long flags; raw_spin_lock_irqsave(&i8259A_lock, flags); outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ raw_spin_unlock_irqrestore(&i8259A_lock, flags); } static int probe_8259A(void) { unsigned char new_val, probe_val = ~(1 << PIC_CASCADE_IR); unsigned long flags; /* * If MADT has the PCAT_COMPAT flag set, then do not bother probing * for the PIC. Some BIOSes leave the PIC uninitialized and probing * fails. * * Right now this causes problems as quite some code depends on * nr_legacy_irqs() > 0 or has_legacy_pic() == true. This is silly * when the system has an IO/APIC because then PIC is not required * at all, except for really old machines where the timer interrupt * must be routed through the PIC. So just pretend that the PIC is * there and let legacy_pic->init() initialize it for nothing. * * Alternatively this could just try to initialize the PIC and * repeat the probe, but for cases where there is no PIC that's * just pointless. */ if (pcat_compat) return nr_legacy_irqs(); /* * Check to see if we have a PIC. Mask all except the cascade and * read back the value we just wrote. If we don't have a PIC, we * will read 0xff as opposed to the value we wrote. 
*/ raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; } raw_spin_unlock_irqrestore(&i8259A_lock, flags); return nr_legacy_irqs(); } static void init_8259A(int auto_eoi) { unsigned long flags; i8259A_auto_eoi = auto_eoi; raw_spin_lock_irqsave(&i8259A_lock, flags); outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* * outb_pic - this has to work on a wide range of PC hardware. */ outb_pic(0x11, PIC_MASTER_CMD); /* ICW1: select 8259A-1 init */ /* ICW2: 8259A-1 IR0-7 mapped to ISA_IRQ_VECTOR(0) */ outb_pic(ISA_IRQ_VECTOR(0), PIC_MASTER_IMR); /* 8259A-1 (the master) has a slave on IR2 */ outb_pic(1U << PIC_CASCADE_IR, PIC_MASTER_IMR); if (auto_eoi) /* master does Auto EOI */ outb_pic(MASTER_ICW4_DEFAULT | PIC_ICW4_AEOI, PIC_MASTER_IMR); else /* master expects normal EOI */ outb_pic(MASTER_ICW4_DEFAULT, PIC_MASTER_IMR); outb_pic(0x11, PIC_SLAVE_CMD); /* ICW1: select 8259A-2 init */ /* ICW2: 8259A-2 IR0-7 mapped to ISA_IRQ_VECTOR(8) */ outb_pic(ISA_IRQ_VECTOR(8), PIC_SLAVE_IMR); /* 8259A-2 is a slave on master's IR2 */ outb_pic(PIC_CASCADE_IR, PIC_SLAVE_IMR); /* (slave's support for AEOI in flat mode is to be investigated) */ outb_pic(SLAVE_ICW4_DEFAULT, PIC_SLAVE_IMR); if (auto_eoi) /* * In AEOI mode we just have to mask the interrupt * when acking. */ i8259A_chip.irq_mask_ack = disable_8259A_irq; else i8259A_chip.irq_mask_ack = mask_and_ack_8259A; udelay(100); /* wait for 8259A to initialize */ outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ raw_spin_unlock_irqrestore(&i8259A_lock, flags); } /* * make i8259 a driver so that we can select pic functions at run time. the goal * is to make x86 binary compatible among pc compatible and non-pc compatible * platforms, such as x86 MID. */ static void legacy_pic_noop(void) { }; static void legacy_pic_uint_noop(unsigned int unused) { }; static void legacy_pic_int_noop(int unused) { }; static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } static int legacy_pic_probe(void) { return 0; } struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, .chip = &dummy_irq_chip, .mask = legacy_pic_uint_noop, .unmask = legacy_pic_uint_noop, .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; static struct legacy_pic default_legacy_pic = { .nr_legacy_irqs = NR_IRQS_LEGACY, .chip = &i8259A_chip, .mask = mask_8259A_irq, .unmask = unmask_8259A_irq, .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; struct legacy_pic *legacy_pic = &default_legacy_pic; EXPORT_SYMBOL(legacy_pic); static int __init i8259A_init_ops(void) { if (legacy_pic == &default_legacy_pic) register_syscore_ops(&i8259_syscore_ops); return 0; } device_initcall(i8259A_init_ops); void __init legacy_pic_pcat_compat(void) { pcat_compat = true; }
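/*
 * Illustrative sketch, not part of i8259.c: how platform setup code might
 * drive the legacy_pic ops defined above. Only the ->probe/->init/->mask/
 * ->unmask callbacks are real; the wrapper function and the choice of IRQ 0
 * are made up for the example.
 */
static void __init example_use_legacy_pic(void)
{
	if (!legacy_pic->probe())	/* no PIC: null_legacy_pic, 0 legacy IRQs */
		return;

	legacy_pic->init(0);		/* program both 8259As, normal EOI mode */
	legacy_pic->mask(0);		/* gate IRQ0 (the PIT line) ...         */
	legacy_pic->unmask(0);		/* ... and open it again                */
}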
/* * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL
* You can choose the license that better fits your requirements. * * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 */ /* * Following is how we use various fields and flags of underlying * struct page(s) to form a zspage. * * Usage of struct page fields: * page->private: points to zspage * page->index: links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. * page->page_type: first object offset in a subpage of zspage * * Usage of struct page flags: * PG_private: identifies the first component page * PG_owner_priv_1: identifies the huge component page * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* * lock ordering: * page_lock * pool->lock * zspage->lock */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/pgtable.h> #include <asm/tlbflush.h> #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> #include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/shrinker.h> #include <linux/types.h> #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> #include <linux/migrate.h> #include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> #include <linux/local_lock.h> #define ZSPAGE_MAGIC 0x58 /* * This must be power of 2 and greater than or equal to sizeof(link_free). * These two conditions ensure that any 'struct link_free' itself doesn't * span more than 1 page which avoids complex case of mapping 2 pages simply * to restore link_free pointer values. */ #define ZS_ALIGN 8 #define ZS_HANDLE_SIZE (sizeof(unsigned long)) /* * Object location (<PFN>, <obj_idx>) is encoded as * a single (unsigned long) handle value. * * Note that object index <obj_idx> starts from 0. * * This is made more complicated by various memory models and PAE. */ #ifndef MAX_POSSIBLE_PHYSMEM_BITS #ifdef MAX_PHYSMEM_BITS #define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS #else /* * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just * be PAGE_SHIFT */ #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG #endif #endif #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) /* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. * It's okay to add the status bit in the least bit because * header keeps handle which is 4byte-aligned address so we * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 #define OBJ_TAG_BITS 1 #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define HUGE_BITS 1 #define FULLNESS_BITS 4 #define CLASS_BITS 8 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) /* each chunk includes extra space to keep handle */ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* * On systems with 4K page size, this gives 255 size classes! 
There is a * trade-off here: * - Large number of size classes is potentially wasteful as free pages are * spread across these classes * - Small number of size classes causes large internal fragmentation * - Probably it's better to use specific size classes (empirically * determined). NOTE: all those class sizes must be set as multiple of * ZS_ALIGN to make sure link_free itself never has to span 2 pages. * * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \ ZS_SIZE_CLASS_DELTA) + 1) /* * Pages are distinguished by the ratio of used memory (that is the ratio * of ->inuse objects to all objects that page can store). For example, * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%. * * The number of fullness groups is not random. It allows us to keep * difference between the least busy page in the group (minimum permitted * number of ->inuse objects) and the most busy page (maximum permitted * number of ->inuse objects) at a reasonable value. */ enum fullness_group { ZS_INUSE_RATIO_0, ZS_INUSE_RATIO_10, /* NOTE: 8 more fullness groups here */ ZS_INUSE_RATIO_99 = 10, ZS_INUSE_RATIO_100, NR_FULLNESS_GROUPS, }; enum class_stat_type { /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */ ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS, ZS_OBJS_INUSE, NR_CLASS_STAT_TYPES, }; struct zs_size_stat { unsigned long objs[NR_CLASS_STAT_TYPES]; }; #ifdef CONFIG_ZSMALLOC_STAT static struct dentry *zs_stat_root; #endif static size_t huge_class_size; struct size_class { struct list_head fullness_list[NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. */ int size; int objs_per_zspage; /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; unsigned int index; struct zs_size_stat stats; }; /* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. * * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { union { /* * Free object index; * It's valid for non-allocated object */ unsigned long next; /* * Handle of allocated object.
*/ unsigned long handle; }; }; struct zs_pool { const char *name; struct size_class *size_class[ZS_SIZE_CLASSES]; struct kmem_cache *handle_cachep; struct kmem_cache *zspage_cachep; atomic_long_t pages_allocated; struct zs_pool_stats stats; /* Compact classes */ struct shrinker *shrinker; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif spinlock_t lock; atomic_t compaction_in_progress; }; struct zspage { struct { unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int magic:MAGIC_VAL_BITS; }; unsigned int inuse; unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ struct zs_pool *pool; rwlock_t lock; }; struct mapping_area { local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ static void SetZsHugePage(struct zspage *zspage) { zspage->huge = 1; } static bool ZsHugePage(struct zspage *zspage) { return zspage->huge; } static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); static void migrate_write_lock(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); #ifdef CONFIG_COMPACTION static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} #endif static int create_cache(struct zs_pool *pool) { pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0, NULL); if (!pool->handle_cachep) return 1; pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), 0, 0, NULL); if (!pool->zspage_cachep) { kmem_cache_destroy(pool->handle_cachep); pool->handle_cachep = NULL; return 1; } return 0; } static void destroy_cache(struct zs_pool *pool) { kmem_cache_destroy(pool->handle_cachep); kmem_cache_destroy(pool->zspage_cachep); } static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); } static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { return kmem_cache_zalloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) { kmem_cache_free(pool->zspage_cachep, zspage); } /* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { *(unsigned long *)handle = obj; } /* zpool driver */ #ifdef CONFIG_ZPOOL static void *zs_zpool_create(const char *name, gfp_t gfp) { /* * Ignore global gfp flags: zs_malloc() may be invoked from * different contexts and its caller must provide a valid * gfp mask. 
*/ return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) { zs_destroy_pool(pool); } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { *handle = zs_malloc(pool, size, gfp); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); return 0; } static void zs_zpool_free(void *pool, unsigned long handle) { zs_free(pool, handle); } static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { enum zs_mapmode zs_mm; switch (mm) { case ZPOOL_MM_RO: zs_mm = ZS_MM_RO; break; case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; case ZPOOL_MM_RW: default: zs_mm = ZS_MM_RW; break; } return zs_map_object(pool, handle, zs_mm); } static void zs_zpool_unmap(void *pool, unsigned long handle) { zs_unmap_object(pool, handle); } static u64 zs_zpool_total_size(void *pool) { return zs_get_total_pages(pool) << PAGE_SHIFT; } static struct zpool_driver zs_zpool_driver = { .type = "zsmalloc", .owner = THIS_MODULE, .create = zs_zpool_create, .destroy = zs_zpool_destroy, .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, }; MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { .lock = INIT_LOCAL_LOCK(lock), }; static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); } /* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; } static inline void mod_zspage_inuse(struct zspage *zspage, int val) { zspage->inuse += val; } static inline struct page *get_first_page(struct zspage *zspage) { struct page *first_page = zspage->first_page; VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); return first_page; } static inline unsigned int get_first_obj_offset(struct page *page) { return page->page_type; } static inline void set_first_obj_offset(struct page *page, unsigned int offset) { page->page_type = offset; } static inline unsigned int get_freeobj(struct zspage *zspage) { return zspage->freeobj; } static inline void set_freeobj(struct zspage *zspage, unsigned int obj) { zspage->freeobj = obj; } static struct size_class *zspage_class(struct zs_pool *pool, struct zspage *zspage) { return pool->size_class[zspage->class]; } /* * zsmalloc divides the pool into various size classes where each * class maintains a list of zspages where each zspage is divided * into equal sized chunks. Each allocation falls into one of these * classes depending on its size. This function returns index of the * size class which has chunk size big enough to hold the given size. 
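 *
 * A worked example under common assumptions (PAGE_SIZE == 4096 and
 * CLASS_BITS == 8, so ZS_SIZE_CLASS_DELTA == 16, and ZS_MIN_ALLOC_SIZE == 32):
 * for size == 100, idx = DIV_ROUND_UP(100 - 32, 16) = 5, which selects the
 * class whose chunk size is 32 + 5 * 16 = 112 bytes, the smallest class
 * large enough to hold the request.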
*/ static int get_size_class_index(int size) { int idx = 0; if (likely(size > ZS_MIN_ALLOC_SIZE)) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); return min_t(int, ZS_SIZE_CLASSES - 1, idx); } static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, int type) { return class->stats.objs[type]; } #ifdef CONFIG_ZSMALLOC_STAT static void __init zs_stat_init(void) { if (!debugfs_initialized()) { pr_warn("debugfs not available, stat dir not created\n"); return; } zs_stat_root = debugfs_create_dir("zsmalloc", NULL); } static void __exit zs_stat_exit(void) { debugfs_remove_recursive(zs_stat_root); } static unsigned long zs_can_compact(struct size_class *class); static int zs_stats_size_show(struct seq_file *s, void *v) { int i, fg; struct zs_pool *pool = s->private; struct size_class *class; int objs_per_zspage; unsigned long obj_allocated, obj_used, pages_used, freeable; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; unsigned long total_freeable = 0; unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, }; seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n", "class", "size", "10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "99%", "100%", "obj_allocated", "obj_used", "pages_used", "pages_per_zspage", "freeable"); for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) continue; spin_lock(&pool->lock); seq_printf(s, " %5u %5u ", i, class->size); for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) { inuse_totals[fg] += zs_stat_get(class, fg); seq_printf(s, "%9lu ", zs_stat_get(class, fg)); } obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED); obj_used = zs_stat_get(class, ZS_OBJS_INUSE); freeable = zs_can_compact(class); spin_unlock(&pool->lock); objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n", obj_allocated, obj_used, pages_used, class->pages_per_zspage, freeable); total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; total_freeable += freeable; } seq_puts(s, "\n"); seq_printf(s, " %5s %5s ", "Total", ""); for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) seq_printf(s, "%9lu ", inuse_totals[fg]); seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n", total_objs, total_used_objs, total_pages, "", total_freeable); return 0; } DEFINE_SHOW_ATTRIBUTE(zs_stats_size); static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { if (!zs_stat_root) { pr_warn("no root stat dir, not creating <%s> stat dir\n", name); return; } pool->stat_dentry = debugfs_create_dir(name, zs_stat_root); debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool, &zs_stats_size_fops); } static void zs_pool_stat_destroy(struct zs_pool *pool) { debugfs_remove_recursive(pool->stat_dentry); } #else /* CONFIG_ZSMALLOC_STAT */ static void __init zs_stat_init(void) { } static void __exit zs_stat_exit(void) { } static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) { } static inline void zs_pool_stat_destroy(struct zs_pool *pool) { } #endif /* * For each size class, zspages are divided into different groups * depending on their usage ratio. 
This function returns fullness * status of the given page. */ static int get_fullness_group(struct size_class *class, struct zspage *zspage) { int inuse, objs_per_zspage, ratio; inuse = get_zspage_inuse(zspage); objs_per_zspage = class->objs_per_zspage; if (inuse == 0) return ZS_INUSE_RATIO_0; if (inuse == objs_per_zspage) return ZS_INUSE_RATIO_100; ratio = 100 * inuse / objs_per_zspage; /* * Take integer division into consideration: a page with one inuse * object out of 127 possible, will end up having 0 usage ratio, * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group. */ return ratio / 10 + 1; } /* * Each size class maintains various freelists and zspages are assigned * to one of these freelists based on the number of live objects they * have. This functions inserts the given zspage into the freelist * identified by <class, fullness_group>. */ static void insert_zspage(struct size_class *class, struct zspage *zspage, int fullness) { class_stat_inc(class, fullness, 1); list_add(&zspage->list, &class->fullness_list[fullness]); zspage->fullness = fullness; } /* * This function removes the given zspage from the freelist identified * by <class, fullness_group>. */ static void remove_zspage(struct size_class *class, struct zspage *zspage) { int fullness = zspage->fullness; VM_BUG_ON(list_empty(&class->fullness_list[fullness])); list_del_init(&zspage->list); class_stat_dec(class, fullness, 1); } /* * Each size class maintains zspages in different fullness groups depending * on the number of live objects they contain. When allocating or freeing * objects, the fullness status of the page can change, for instance, from * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function * checks if such a status change has occurred for the given page and * accordingly moves the page from the list of the old fullness group to that * of the new fullness group. 
*/ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) { int newfg; newfg = get_fullness_group(class, zspage); if (newfg == zspage->fullness) goto out; remove_zspage(class, zspage); insert_zspage(class, zspage, newfg); out: return newfg; } static struct zspage *get_zspage(struct page *page) { struct zspage *zspage = (struct zspage *)page_private(page); BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; } static struct page *get_next_page(struct page *page) { struct zspage *zspage = get_zspage(page); if (unlikely(ZsHugePage(zspage))) return NULL; return (struct page *)page->index; } /** * obj_to_location - get (<page>, <obj_idx>) from encoded object value * @obj: the encoded object value * @page: page object resides in zspage * @obj_idx: object index */ static void obj_to_location(unsigned long obj, struct page **page, unsigned int *obj_idx) { *page = pfn_to_page(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } static void obj_to_page(unsigned long obj, struct page **page) { *page = pfn_to_page(obj >> OBJ_INDEX_BITS); } /** * location_to_obj - get obj value encoded from (<page>, <obj_idx>) * @page: page object resides in zspage * @obj_idx: object index */ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) { unsigned long obj; obj = page_to_pfn(page) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; return obj; } static unsigned long handle_to_obj(unsigned long handle) { return *(unsigned long *)handle; } static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { unsigned long handle; struct zspage *zspage = get_zspage(page); if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); handle = page->index; } else handle = *(unsigned long *)obj; if (!(handle & OBJ_ALLOCATED_TAG)) return false; /* Clear all tags before returning the handle */ *phandle = handle & ~OBJ_TAG_MASK; return true; } static void reset_page(struct page *page) { __ClearPageMovable(page); ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); page->index = 0; } static int trylock_zspage(struct zspage *zspage) { struct page *cursor, *fail; for (cursor = get_first_page(zspage); cursor != NULL; cursor = get_next_page(cursor)) { if (!trylock_page(cursor)) { fail = cursor; goto unlock; } } return 1; unlock: for (cursor = get_first_page(zspage); cursor != fail; cursor = get_next_page(cursor)) unlock_page(cursor); return 0; } static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct page *page, *next; assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0); next = page = get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); next = get_next_page(page); reset_page(page); unlock_page(page); dec_zone_page_state(page, NR_ZSPAGES); put_page(page); page = next; } while (page != NULL); cache_free_zspage(pool, zspage); class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } static void free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); /* * Since zs_free couldn't be sleepable, this function cannot call * lock_page. The page locks trylock_zspage got will be released * by __free_zspage. 
*/ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; } remove_zspage(class, zspage); __free_zspage(pool, class, zspage); } /* Initialize a newly allocated zspage */ static void init_zspage(struct size_class *class, struct zspage *zspage) { unsigned int freeobj = 1; unsigned long off = 0; struct page *page = get_first_page(zspage); while (page) { struct page *next_page; struct link_free *link; void *vaddr; set_first_obj_offset(page, off); vaddr = kmap_atomic(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { link->next = freeobj++ << OBJ_TAG_BITS; link += class->size / sizeof(*link); } /* * We now come to the last (full or partial) object on this * page, which must point to the first object on the next * page (if present) */ next_page = get_next_page(page); if (next_page) { link->next = freeobj++ << OBJ_TAG_BITS; } else { /* * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not. */ link->next = -1UL << OBJ_TAG_BITS; } kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } set_freeobj(zspage, 0); } static void create_page_chain(struct size_class *class, struct zspage *zspage, struct page *pages[]) { int i; struct page *page; struct page *prev_page = NULL; int nr_pages = class->pages_per_zspage; /* * Allocate individual pages and link them together as: * 1. all pages are linked together using page->index * 2. each sub-page point to zspage using page->private * * we set PG_private to identify the first page (i.e. no other sub-page * has this flag set). */ for (i = 0; i < nr_pages; i++) { page = pages[i]; set_page_private(page, (unsigned long)zspage); page->index = 0; if (i == 0) { zspage->first_page = page; SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) SetZsHugePage(zspage); } else { prev_page->index = (unsigned long)page; } prev_page = page; } } /* * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, struct size_class *class, gfp_t gfp) { int i; struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp); if (!zspage) return NULL; zspage->magic = ZSPAGE_MAGIC; migrate_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { struct page *page; page = alloc_page(gfp); if (!page) { while (--i >= 0) { dec_zone_page_state(pages[i], NR_ZSPAGES); __free_page(pages[i]); } cache_free_zspage(pool, zspage); return NULL; } inc_zone_page_state(page, NR_ZSPAGES); pages[i] = page; } create_page_chain(class, zspage, pages); init_zspage(class, zspage); zspage->pool = pool; zspage->class = class->index; return zspage; } static struct zspage *find_get_zspage(struct size_class *class) { int i; struct zspage *zspage; for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) { zspage = list_first_entry_or_null(&class->fullness_list[i], struct zspage, list); if (zspage) break; } return zspage; } static inline int __zs_cpu_up(struct mapping_area *area) { /* * Make sure we don't leak memory if a cpu UP notification * and zs_init() race and both call zs_cpu_up() on the same cpu */ if (area->vm_buf) return 0; area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); if (!area->vm_buf) return -ENOMEM; return 0; } static inline void __zs_cpu_down(struct mapping_area *area) { kfree(area->vm_buf); area->vm_buf = NULL; } static void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { int sizes[2]; void *addr; char *buf = area->vm_buf; /* 
disable page faults to match kmap_atomic() return conditions */ pagefault_disable(); /* no read fastpath */ if (area->vm_mm == ZS_MM_WO) goto out; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ addr = kmap_atomic(pages[0]); memcpy(buf, addr + off, sizes[0]); kunmap_atomic(addr); addr = kmap_atomic(pages[1]); memcpy(buf + sizes[0], addr, sizes[1]); kunmap_atomic(addr); out: return area->vm_buf; } static void __zs_unmap_object(struct mapping_area *area, struct page *pages[2], int off, int size) { int sizes[2]; void *addr; char *buf; /* no write fastpath */ if (area->vm_mm == ZS_MM_RO) goto out; buf = area->vm_buf; buf = buf + ZS_HANDLE_SIZE; size -= ZS_HANDLE_SIZE; off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ addr = kmap_atomic(pages[0]); memcpy(addr + off, buf, sizes[0]); kunmap_atomic(addr); addr = kmap_atomic(pages[1]); memcpy(addr, buf + sizes[0], sizes[1]); kunmap_atomic(addr); out: /* enable page faults to match kunmap_atomic() return conditions */ pagefault_enable(); } static int zs_cpu_prepare(unsigned int cpu) { struct mapping_area *area; area = &per_cpu(zs_map_area, cpu); return __zs_cpu_up(area); } static int zs_cpu_dead(unsigned int cpu) { struct mapping_area *area; area = &per_cpu(zs_map_area, cpu); __zs_cpu_down(area); return 0; } static bool can_merge(struct size_class *prev, int pages_per_zspage, int objs_per_zspage) { if (prev->pages_per_zspage == pages_per_zspage && prev->objs_per_zspage == objs_per_zspage) return true; return false; } static bool zspage_full(struct size_class *class, struct zspage *zspage) { return get_zspage_inuse(zspage) == class->objs_per_zspage; } static bool zspage_empty(struct zspage *zspage) { return get_zspage_inuse(zspage) == 0; } /** * zs_lookup_class_index() - Returns index of the zsmalloc &size_class * that hold objects of the provided size. * @pool: zsmalloc pool to use * @size: object size * * Context: Any context. * * Return: the index of the zsmalloc &size_class that hold objects of the * provided size. */ unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) { struct size_class *class; class = pool->size_class[get_size_class_index(size)]; return class->index; } EXPORT_SYMBOL_GPL(zs_lookup_class_index); unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); } EXPORT_SYMBOL_GPL(zs_get_total_pages); /** * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc * @mm: mapping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using * zs_unmap_object. * * Only one object can be mapped per cpu at a time. There is no protection * against nested mappings. * * This function returns with preemption and page faults disabled. */ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct zspage *zspage; struct page *page; unsigned long obj, off; unsigned int obj_idx; struct size_class *class; struct mapping_area *area; struct page *pages[2]; void *ret; /* * Because we use per-cpu mapping areas shared among the * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. 
*/ BUG_ON(in_interrupt()); /* It guarantees it can get zspage from handle safely */ spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); /* * migration cannot move any zpages in this zspage. Here, pool->lock * is too heavy since callers would take some time until they calls * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. */ migrate_read_lock(zspage); spin_unlock(&pool->lock); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); local_lock(&zs_map_area.lock); area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ area->vm_addr = kmap_atomic(page); ret = area->vm_addr + off; goto out; } /* this object spans two pages */ pages[0] = page; pages[1] = get_next_page(page); BUG_ON(!pages[1]); ret = __zs_map_object(area, pages, off, class->size); out: if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; } EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; struct page *page; unsigned long obj, off; unsigned int obj_idx; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { struct page *pages[2]; pages[0] = page; pages[1] = get_next_page(page); BUG_ON(!pages[1]); __zs_unmap_object(area, pages, off, class->size); } local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); } EXPORT_SYMBOL_GPL(zs_unmap_object); /** * zs_huge_class_size() - Returns the size (in bytes) of the first huge * zsmalloc &size_class. * @pool: zsmalloc pool to use * * The function returns the size of the first huge class - any object of equal * or bigger size will be stored in zspage consisting of a single physical * page. * * Context: Any context. * * Return: the size (in bytes) of the first huge zsmalloc &size_class. */ size_t zs_huge_class_size(struct zs_pool *pool) { return huge_class_size; } EXPORT_SYMBOL_GPL(zs_huge_class_size); static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); offset = obj * class->size; nr_page = offset >> PAGE_SHIFT; m_offset = offset_in_page(offset); m_page = get_first_page(zspage); for (i = 0; i < nr_page; i++) m_page = get_next_page(m_page); vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else /* record handle to page->index */ zspage->first_page->index = handle; kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); obj = location_to_obj(m_page, obj); return obj; } /** * zs_malloc - Allocate block of given size from pool. * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). 
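 *
 * Minimal usage sketch (illustrative only; @pool, @len, @src and the local
 * variables are assumed to exist in the caller):
 *
 *	handle = zs_malloc(pool, len, GFP_KERNEL);
 *	if (IS_ERR_VALUE(handle))
 *		return PTR_ERR((void *)handle);
 *	dst = zs_map_object(pool, handle, ZS_MM_WO);
 *	memcpy(dst, src, len);
 *	zs_unmap_object(pool, handle);
 *	...
 *	zs_free(pool, handle);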
* Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; int newfg; struct zspage *zspage; if (unlikely(!size)) return (unsigned long)ERR_PTR(-EINVAL); if (unlikely(size > ZS_MAX_ALLOC_SIZE)) return (unsigned long)ERR_PTR(-ENOSPC); handle = cache_alloc_handle(pool, gfp); if (!handle) return (unsigned long)ERR_PTR(-ENOMEM); /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; /* pool->lock effectively protects the zpage migration */ spin_lock(&pool->lock); zspage = find_get_zspage(class); if (likely(zspage)) { obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, ZS_OBJS_INUSE, 1); goto out; } spin_unlock(&pool->lock); zspage = alloc_zspage(pool, class, gfp); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); } spin_lock(&pool->lock); obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); class_stat_inc(class, ZS_OBJS_INUSE, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); out: spin_unlock(&pool->lock); return handle; } EXPORT_SYMBOL_GPL(zs_malloc); static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; struct page *f_page; unsigned long f_offset; unsigned int f_objidx; void *vaddr; obj_to_location(obj, &f_page, &f_objidx); f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ if (likely(!ZsHugePage(zspage))) link->next = get_freeobj(zspage) << OBJ_TAG_BITS; else f_page->index = 0; set_freeobj(zspage, f_objidx); kunmap_atomic(vaddr); mod_zspage_inuse(zspage, -1); } void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; struct page *f_page; unsigned long obj; struct size_class *class; int fullness; if (IS_ERR_OR_NULL((void *)handle)) return; /* * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. 
*/ spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); class = zspage_class(pool, zspage); class_stat_dec(class, ZS_OBJS_INUSE, 1); obj_free(class->size, obj); fullness = fix_fullness_group(class, zspage); if (fullness == ZS_INUSE_RATIO_0) free_zspage(pool, class, zspage); spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { struct page *s_page, *d_page; unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; int s_size, d_size, size; int written = 0; s_size = d_size = class->size; obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); s_off = offset_in_page(class->size * s_objidx); d_off = offset_in_page(class->size * d_objidx); if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); while (1) { size = min(s_size, d_size); memcpy(d_addr + d_off, s_addr + s_off, size); written += size; if (written == class->size) break; s_off += size; s_size -= size; d_off += size; d_size -= size; /* * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() * calls must occurs in reverse order of calls to kmap_atomic(). * So, to call kunmap_atomic(s_addr) we should first call * kunmap_atomic(d_addr). For more details see * Documentation/mm/highmem.rst. */ if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); s_page = get_next_page(s_page); s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { kunmap_atomic(d_addr); d_page = get_next_page(d_page); d_addr = kmap_atomic(d_page); d_size = class->size - written; d_off = 0; } } kunmap_atomic(d_addr); kunmap_atomic(s_addr); } /* * Find alloced object in zspage from index object and * return handle. 
*/ static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); offset = get_first_obj_offset(page); offset += class->size * index; while (offset < PAGE_SIZE) { if (obj_allocated(page, addr + offset, &handle)) break; offset += class->size; index++; } kunmap_atomic(addr); *obj_idx = index; return handle; } static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, struct zspage *dst_zspage) { unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; struct page *s_page = get_first_page(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { handle = find_alloced_obj(class, s_page, &obj_idx); if (!handle) { s_page = get_next_page(s_page); if (!s_page) break; obj_idx = 0; continue; } used_obj = handle_to_obj(handle); free_obj = obj_malloc(pool, dst_zspage, handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); obj_free(class->size, used_obj); /* Stop if there is no more space */ if (zspage_full(class, dst_zspage)) break; /* Stop if there are no more objects to migrate */ if (zspage_empty(src_zspage)) break; } } static struct zspage *isolate_src_zspage(struct size_class *class) { struct zspage *zspage; int fg; for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) { zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { remove_zspage(class, zspage); return zspage; } } return zspage; } static struct zspage *isolate_dst_zspage(struct size_class *class) { struct zspage *zspage; int fg; for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) { zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { remove_zspage(class, zspage); return zspage; } } return zspage; } /* * putback_zspage - add @zspage into right class's fullness list * @class: destination class * @zspage: target page * * Return @zspage's fullness status */ static int putback_zspage(struct size_class *class, struct zspage *zspage) { int fullness; fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); return fullness; } #ifdef CONFIG_COMPACTION /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. */ static void lock_zspage(struct zspage *zspage) { struct page *curr_page, *page; /* * Pages we haven't locked yet can be migrated off the list while we're * trying to lock them, so we need to be careful and only attempt to * lock each page under migrate_read_lock(). Otherwise, the page we lock * may no longer belong to the zspage. This means that we may wait for * the wrong page to unlock, so we must take a reference to the page * prior to waiting for it to unlock outside migrate_read_lock(). 
*/ while (1) { migrate_read_lock(zspage); page = get_first_page(zspage); if (trylock_page(page)) break; get_page(page); migrate_read_unlock(zspage); wait_on_page_locked(page); put_page(page); } curr_page = page; while ((page = get_next_page(curr_page))) { if (trylock_page(page)) { curr_page = page; } else { get_page(page); migrate_read_unlock(zspage); wait_on_page_locked(page); put_page(page); migrate_read_lock(zspage); } } migrate_read_unlock(zspage); } #endif /* CONFIG_COMPACTION */ static void migrate_lock_init(struct zspage *zspage) { rwlock_init(&zspage->lock); } static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) { read_lock(&zspage->lock); } static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) { read_unlock(&zspage->lock); } static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); } static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); } #ifdef CONFIG_COMPACTION static const struct movable_operations zsmalloc_mops; static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { struct page *page; struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; int idx = 0; page = get_first_page(zspage); do { if (page == oldpage) pages[idx] = newpage; else pages[idx] = page; idx++; } while ((page = get_next_page(page)) != NULL); create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { /* * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ VM_BUG_ON_PAGE(PageIsolated(page), page); return true; } static int zs_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; /* * We cannot support the _NO_COPY case here, because copy needs to * happen under the zs lock, which does not work with * MIGRATE_SYNC_NO_COPY workflow. */ if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; VM_BUG_ON_PAGE(!PageIsolated(page), page); /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(page); pool = zspage->pool; /* * The pool's lock protects the race between zpage migration * and zs_free. */ spin_lock(&pool->lock); class = zspage_class(pool, zspage); /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); offset = get_first_obj_offset(page); s_addr = kmap_atomic(page); /* * Here, any user cannot access all objects in the zspage so let's move. */ d_addr = kmap_atomic(newpage); copy_page(d_addr, s_addr); kunmap_atomic(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { if (obj_allocated(page, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); record_obj(handle, new_obj); } } kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release the pool's lock. 
*/ spin_unlock(&pool->lock); migrate_write_unlock(zspage); get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); } reset_page(page); put_page(page); return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) { VM_BUG_ON_PAGE(!PageIsolated(page), page); } static const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, }; /* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data. */ static void async_free_zspage(struct work_struct *work) { int i; struct size_class *class; struct zspage *zspage, *tmp; LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool, free_work); for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) continue; spin_lock(&pool->lock); list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0], &free_pages); spin_unlock(&pool->lock); } list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); spin_lock(&pool->lock); class = zspage_class(pool, zspage); __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } }; static void kick_deferred_free(struct zs_pool *pool) { schedule_work(&pool->free_work); } static void zs_flush_migration(struct zs_pool *pool) { flush_work(&pool->free_work); } static void init_deferred_free(struct zs_pool *pool) { INIT_WORK(&pool->free_work, async_free_zspage); } static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) { struct page *page = get_first_page(zspage); do { WARN_ON(!trylock_page(page)); __SetPageMovable(page, &zsmalloc_mops); unlock_page(page); } while ((page = get_next_page(page)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } #endif /* * * Based on the number of unused allocated objects calculate * and return the number of pages that we can free. 
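 *
 * For instance (numbers purely illustrative): with 1000 allocated but only
 * 600 used objects in a class whose zspages hold 100 objects across 3 pages
 * each, obj_wasted = (1000 - 600) / 100 = 4 spare zspages, so 4 * 3 = 12
 * pages could be freed by compaction.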
*/ static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED); unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE); if (obj_allocated <= obj_used) return 0; obj_wasted = obj_allocated - obj_used; obj_wasted /= class->objs_per_zspage; return obj_wasted * class->pages_per_zspage; } static unsigned long __zs_compact(struct zs_pool *pool, struct size_class *class) { struct zspage *src_zspage = NULL; struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; /* * protect the race between zpage migration and zs_free * as well as zpage allocation/free */ spin_lock(&pool->lock); while (zs_can_compact(class)) { int fg; if (!dst_zspage) { dst_zspage = isolate_dst_zspage(class); if (!dst_zspage) break; } src_zspage = isolate_src_zspage(class); if (!src_zspage) break; migrate_write_lock(src_zspage); migrate_zspage(pool, src_zspage, dst_zspage); migrate_write_unlock(src_zspage); fg = putback_zspage(class, src_zspage); if (fg == ZS_INUSE_RATIO_0) { free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; } src_zspage = NULL; if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 || spin_is_contended(&pool->lock)) { putback_zspage(class, dst_zspage); dst_zspage = NULL; spin_unlock(&pool->lock); cond_resched(); spin_lock(&pool->lock); } } if (src_zspage) putback_zspage(class, src_zspage); if (dst_zspage) putback_zspage(class, dst_zspage); spin_unlock(&pool->lock); return pages_freed; } unsigned long zs_compact(struct zs_pool *pool) { int i; struct size_class *class; unsigned long pages_freed = 0; /* * Pool compaction is performed under pool->lock so it is basically * single-threaded. Having more than one thread in __zs_compact() * will increase pool->lock contention, which will impact other * zsmalloc operations that need pool->lock. */ if (atomic_xchg(&pool->compaction_in_progress, 1)) return 0; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (class->index != i) continue; pages_freed += __zs_compact(pool, class); } atomic_long_add(pages_freed, &pool->stats.pages_compacted); atomic_set(&pool->compaction_in_progress, 0); return pages_freed; } EXPORT_SYMBOL_GPL(zs_compact); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) { memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); } EXPORT_SYMBOL_GPL(zs_pool_stats); static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long pages_freed; struct zs_pool *pool = shrinker->private_data; /* * Compact classes and calculate compaction delta. * Can run concurrently with a manually triggered * (by user) compaction. */ pages_freed = zs_compact(pool); return pages_freed ? 
pages_freed : SHRINK_STOP; } static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { int i; struct size_class *class; unsigned long pages_to_free = 0; struct zs_pool *pool = shrinker->private_data; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (class->index != i) continue; pages_to_free += zs_can_compact(class); } return pages_to_free; } static void zs_unregister_shrinker(struct zs_pool *pool) { shrinker_free(pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) { pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name); if (!pool->shrinker) return -ENOMEM; pool->shrinker->scan_objects = zs_shrinker_scan; pool->shrinker->count_objects = zs_shrinker_count; pool->shrinker->batch = 0; pool->shrinker->private_data = pool; shrinker_register(pool->shrinker); return 0; } static int calculate_zspage_chain_size(int class_size) { int i, min_waste = INT_MAX; int chain_size = 1; if (is_power_of_2(class_size)) return chain_size; for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { int waste; waste = (i * PAGE_SIZE) % class_size; if (waste < min_waste) { min_waste = waste; chain_size = i; } } return chain_size; } /** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created * * This function must be called before anything when using * the zsmalloc allocator. * * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; struct size_class *prev_class = NULL; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; init_deferred_free(pool); spin_lock_init(&pool->lock); atomic_set(&pool->compaction_in_progress, 0); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) goto err; if (create_cache(pool)) goto err; /* * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. */ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { int size; int pages_per_zspage; int objs_per_zspage; struct size_class *class; int fullness; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; pages_per_zspage = calculate_zspage_chain_size(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* * We iterate from biggest down to smallest classes, * so huge_class_size holds the size of the first huge * class. Any object bigger than or equal to that will * endup in the huge class. */ if (pages_per_zspage != 1 && objs_per_zspage != 1 && !huge_class_size) { huge_class_size = size; /* * The object uses ZS_HANDLE_SIZE bytes to store the * handle. We need to subtract it, because zs_malloc() * unconditionally adds handle size before it performs * size class search - so object may be smaller than * huge class size, yet it still can end up in the huge * class because it grows by ZS_HANDLE_SIZE extra bytes * right before class lookup. */ huge_class_size -= (ZS_HANDLE_SIZE - 1); } /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we * have one size_class for each size, there is a chance that we * can get more memory utilization if we use one size_class for * many different sizes whose size_class have same * characteristics. So, we makes size_class point to * previous size_class if possible. 
*/ if (prev_class) { if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { pool->size_class[i] = prev_class; continue; } } class = kzalloc(sizeof(struct size_class), GFP_KERNEL); if (!class) goto err; class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; class->objs_per_zspage = objs_per_zspage; pool->size_class[i] = class; fullness = ZS_INUSE_RATIO_0; while (fullness < NR_FULLNESS_GROUPS) { INIT_LIST_HEAD(&class->fullness_list[fullness]); fullness++; } prev_class = class; } /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); /* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. If * registration fails we still can use the pool normally and user can * trigger compaction manually. Thus, ignore return code. */ zs_register_shrinker(pool); return pool; err: zs_destroy_pool(pool); return NULL; } EXPORT_SYMBOL_GPL(zs_create_pool); void zs_destroy_pool(struct zs_pool *pool) { int i; zs_unregister_shrinker(pool); zs_flush_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < ZS_SIZE_CLASSES; i++) { int fg; struct size_class *class = pool->size_class[i]; if (!class) continue; if (class->index != i) continue; for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) { if (list_empty(&class->fullness_list[fg])) continue; pr_err("Class-%d fullness group %d is not empty\n", class->size, fg); } kfree(class); } destroy_cache(pool); kfree(pool->name); kfree(pool); } EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { int ret; ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", zs_cpu_prepare, zs_cpu_dead); if (ret) goto out; #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif zs_stat_init(); return 0; out: return ret; } static void __exit zs_exit(void) { #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); } module_init(zs_init); module_exit(zs_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
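/*
 * Illustrative usage sketch (not part of zsmalloc itself): how a client such
 * as a compressed-memory backend would typically drive the pool API exported
 * above. The function and pool names here are hypothetical; only the zs_*
 * calls, ZS_MM_RW and the header are the real interface, and the
 * IS_ERR_VALUE() check on the returned handle follows the convention used by
 * in-tree callers on recent kernels.
 */
#include <linux/zsmalloc.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/string.h>

static int zsmalloc_usage_sketch(void)
{
	static const char payload[] = "compressed data";	/* hypothetical payload */
	struct zs_pool *pool;
	unsigned long handle;
	void *dst;

	pool = zs_create_pool("demo-pool");
	if (!pool)
		return -ENOMEM;

	/* Allocate an object; the returned handle is opaque, not a pointer. */
	handle = zs_malloc(pool, sizeof(payload), GFP_KERNEL);
	if (IS_ERR_VALUE(handle)) {
		zs_destroy_pool(pool);
		return -ENOMEM;
	}

	/* Objects must be mapped before access and unmapped right afterwards. */
	dst = zs_map_object(pool, handle, ZS_MM_RW);
	memcpy(dst, payload, sizeof(payload));
	zs_unmap_object(pool, handle);

	/* Optionally ask the pool to compact fragmented size classes. */
	zs_compact(pool);

	zs_free(pool, handle);
	zs_destroy_pool(pool);
	return 0;
}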
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2006 Silicon Graphics, Inc. * Copyright (c) 2016-2018 Christoph Hellwig. * All Rights Reserved.
*/ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_inode.h" #include "xfs_btree.h" #include "xfs_bmap_btree.h" #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_trans_space.h" #include "xfs_inode_item.h" #include "xfs_iomap.h" #include "xfs_trace.h" #include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" #include "xfs_reflink.h" #include "xfs_health.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) static int xfs_alert_fsblock_zero( xfs_inode_t *ip, xfs_bmbt_irec_t *imap) { xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, "Access to block zero in inode %llu " "start_block: %llx start_off: %llx " "blkcnt: %llx extent-state: %x", (unsigned long long)ip->i_ino, (unsigned long long)imap->br_startblock, (unsigned long long)imap->br_startoff, (unsigned long long)imap->br_blockcount, imap->br_state); xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return -EFSCORRUPTED; } u64 xfs_iomap_inode_sequence( struct xfs_inode *ip, u16 iomap_flags) { u64 cookie = 0; if (iomap_flags & IOMAP_F_XATTR) return READ_ONCE(ip->i_af.if_seq); if ((iomap_flags & IOMAP_F_SHARED) && ip->i_cowfp) cookie = (u64)READ_ONCE(ip->i_cowfp->if_seq) << 32; return cookie | READ_ONCE(ip->i_df.if_seq); } /* * Check that the iomap passed to us is still valid for the given offset and * length. */ static bool xfs_iomap_valid( struct inode *inode, const struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); if (iomap->validity_cookie != xfs_iomap_inode_sequence(ip, iomap->flags)) { trace_xfs_iomap_invalid(ip, iomap); return false; } XFS_ERRORTAG_DELAY(ip->i_mount, XFS_ERRTAG_WRITE_DELAY_MS); return true; } static const struct iomap_folio_ops xfs_iomap_folio_ops = { .iomap_valid = xfs_iomap_valid, }; int xfs_bmbt_to_iomap( struct xfs_inode *ip, struct iomap *iomap, struct xfs_bmbt_irec *imap, unsigned int mapping_flags, u16 iomap_flags, u64 sequence_cookie) { struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, imap); } if (imap->br_startblock == HOLESTARTBLOCK) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else if (imap->br_startblock == DELAYSTARTBLOCK || isnullstartblock(imap->br_startblock)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_DELALLOC; } else { iomap->addr = BBTOB(xfs_fsb_to_db(ip, imap->br_startblock)); if (mapping_flags & IOMAP_DAX) iomap->addr += target->bt_dax_part_off; if (imap->br_state == XFS_EXT_UNWRITTEN) iomap->type = IOMAP_UNWRITTEN; else iomap->type = IOMAP_MAPPED; } iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff); iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); if (mapping_flags & IOMAP_DAX) iomap->dax_dev = target->bt_daxdev; else iomap->bdev = target->bt_bdev; iomap->flags = iomap_flags; if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP)) iomap->flags |= IOMAP_F_DIRTY; iomap->validity_cookie = sequence_cookie; iomap->folio_ops = &xfs_iomap_folio_ops; return 0; } static void xfs_hole_to_iomap( struct xfs_inode *ip, struct iomap *iomap, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_buftarg *target = xfs_inode_buftarg(ip); iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; iomap->offset = 
XFS_FSB_TO_B(ip->i_mount, offset_fsb); iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb); iomap->bdev = target->bt_bdev; iomap->dax_dev = target->bt_daxdev; } static inline xfs_fileoff_t xfs_iomap_end_fsb( struct xfs_mount *mp, loff_t offset, loff_t count) { ASSERT(offset <= mp->m_super->s_maxbytes); return min(XFS_B_TO_FSB(mp, offset + count), XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); } static xfs_extlen_t xfs_eof_alignment( struct xfs_inode *ip) { struct xfs_mount *mp = ip->i_mount; xfs_extlen_t align = 0; if (!XFS_IS_REALTIME_INODE(ip)) { /* * Round up the allocation request to a stripe unit * (m_dalign) boundary if the file size is >= stripe unit * size, and we are allocating past the allocation eof. * * If mounted with the "-o swalloc" option the alignment is * increased from the strip unit size to the stripe width. */ if (mp->m_swidth && xfs_has_swalloc(mp)) align = mp->m_swidth; else if (mp->m_dalign) align = mp->m_dalign; if (align && XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, align)) align = 0; } return align; } /* * Check if last_fsb is outside the last extent, and if so grow it to the next * stripe unit boundary. */ xfs_fileoff_t xfs_iomap_eof_align_last_fsb( struct xfs_inode *ip, xfs_fileoff_t end_fsb) { struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK); xfs_extlen_t extsz = xfs_get_extsz_hint(ip); xfs_extlen_t align = xfs_eof_alignment(ip); struct xfs_bmbt_irec irec; struct xfs_iext_cursor icur; ASSERT(!xfs_need_iread_extents(ifp)); /* * Always round up the allocation request to the extent hint boundary. */ if (extsz) { if (align) align = roundup_64(align, extsz); else align = extsz; } if (align) { xfs_fileoff_t aligned_end_fsb = roundup_64(end_fsb, align); xfs_iext_last(ifp, &icur); if (!xfs_iext_get_extent(ifp, &icur, &irec) || aligned_end_fsb >= irec.br_startoff + irec.br_blockcount) return aligned_end_fsb; } return end_fsb; } int xfs_iomap_write_direct( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, struct xfs_bmbt_irec *imap, u64 *seq) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; xfs_filblks_t resaligned; int nimaps; unsigned int dblocks, rblocks; bool force = false; int error; int bmapi_flags = XFS_BMAPI_PREALLOC; int nr_exts = XFS_IEXT_ADD_NOSPLIT_CNT; ASSERT(count_fsb > 0); resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, xfs_get_extsz_hint(ip)); if (unlikely(XFS_IS_REALTIME_INODE(ip))) { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); rblocks = resaligned; } else { dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); rblocks = 0; } error = xfs_qm_dqattach(ip); if (error) return error; /* * For DAX, we do not allocate unwritten extents, but instead we zero * the block before we commit the transaction. Ideally we'd like to do * this outside the transaction context, but if we commit and then crash * we may not have zeroed the blocks and this will be exposed on * recovery of the allocation. Hence we must zero before commit. * * Further, if we are mapping unwritten extents here, we need to zero * and convert them to written so that we don't need an unwritten extent * callback for DAX. This also means that we need to be able to dip into * the reserve block pool for bmbt block allocation if there is no space * left but we need to do unwritten extent conversion. 
*/ if (flags & IOMAP_DAX) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; if (imap->br_state == XFS_EXT_UNWRITTEN) { force = true; nr_exts = XFS_IEXT_WRITE_UNWRITTEN_CNT; dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } } error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, rblocks, force, &tp); if (error) return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, nr_exts); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, nr_exts); if (error) goto out_trans_cancel; /* * From this point onwards we overwrite the imap pointer that the * caller gave to us. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, bmapi_flags, 0, imap, &nimaps); if (error) goto out_trans_cancel; /* * Complete the transaction */ error = xfs_trans_commit(tp); if (error) goto out_unlock; /* * Copy any maps to caller's array and return any error. */ if (nimaps == 0) { error = -ENOSPC; goto out_unlock; } if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = xfs_alert_fsblock_zero(ip, imap); } out_unlock: *seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; } STATIC bool xfs_quota_need_throttle( struct xfs_inode *ip, xfs_dqtype_t type, xfs_fsblock_t alloc_blocks) { struct xfs_dquot *dq = xfs_inode_dquot(ip, type); if (!dq || !xfs_this_quota_on(ip->i_mount, type)) return false; /* no hi watermark, no throttle */ if (!dq->q_prealloc_hi_wmark) return false; /* under the lo watermark, no throttle */ if (dq->q_blk.reserved + alloc_blocks < dq->q_prealloc_lo_wmark) return false; return true; } STATIC void xfs_quota_calc_throttle( struct xfs_inode *ip, xfs_dqtype_t type, xfs_fsblock_t *qblocks, int *qshift, int64_t *qfreesp) { struct xfs_dquot *dq = xfs_inode_dquot(ip, type); int64_t freesp; int shift = 0; /* no dq, or over hi wmark, squash the prealloc completely */ if (!dq || dq->q_blk.reserved >= dq->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; } freesp = dq->q_prealloc_hi_wmark - dq->q_blk.reserved; if (freesp < dq->q_low_space[XFS_QLOWSP_5_PCNT]) { shift = 2; if (freesp < dq->q_low_space[XFS_QLOWSP_3_PCNT]) shift += 2; if (freesp < dq->q_low_space[XFS_QLOWSP_1_PCNT]) shift += 2; } if (freesp < *qfreesp) *qfreesp = freesp; /* only overwrite the throttle values if we are more aggressive */ if ((freesp >> shift) < (*qblocks >> *qshift)) { *qblocks = freesp; *qshift = shift; } } /* * If we don't have a user specified preallocation size, dynamically increase * the preallocation size as the size of the file grows. Cap the maximum size * at a single extent or less if the filesystem is near full. The closer the * filesystem is to being full, the smaller the maximum preallocation. */ STATIC xfs_fsblock_t xfs_iomap_prealloc_size( struct xfs_inode *ip, int whichfork, loff_t offset, loff_t count, struct xfs_iext_cursor *icur) { struct xfs_iext_cursor ncur = *icur; struct xfs_bmbt_irec prev, got; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); int64_t freesp; xfs_fsblock_t qblocks; xfs_fsblock_t alloc_blocks = 0; xfs_extlen_t plen; int shift = 0; int qshift = 0; /* * As an exception we don't do any preallocation at all if the file is * smaller than the minimum preallocation and we are using the default * dynamic preallocation scheme, as it is likely this is the only write * to the file that is going to be done. 
*/ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_allocsize_blocks)) return 0; /* * Use the minimum preallocation size for small files or if we are * writing right after a hole. */ if (XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || !xfs_iext_prev_extent(ifp, &ncur, &prev) || prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_allocsize_blocks; /* * Take the size of the preceding data extents as the basis for the * preallocation size. Note that we don't care if the previous extents * are written or not. */ plen = prev.br_blockcount; while (xfs_iext_prev_extent(ifp, &ncur, &got)) { if (plen > XFS_MAX_BMBT_EXTLEN / 2 || isnullstartblock(got.br_startblock) || got.br_startoff + got.br_blockcount != prev.br_startoff || got.br_startblock + got.br_blockcount != prev.br_startblock) break; plen += got.br_blockcount; prev = got; } /* * If the size of the extents is greater than half the maximum extent * length, then use the current offset as the basis. This ensures that * for large files the preallocation size always extends to * XFS_BMBT_MAX_EXTLEN rather than falling short due to things like stripe * unit/width alignment of real extents. */ alloc_blocks = plen * 2; if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) alloc_blocks = XFS_B_TO_FSB(mp, offset); qblocks = alloc_blocks; /* * XFS_BMBT_MAX_EXTLEN is not a power of two value but we round the prealloc * down to the nearest power of two value after throttling. To prevent * the round down from unconditionally reducing the maximum supported * prealloc size, we round up first, apply appropriate throttling, round * down and cap the value to XFS_BMBT_MAX_EXTLEN. */ alloc_blocks = XFS_FILEOFF_MIN(roundup_pow_of_two(XFS_MAX_BMBT_EXTLEN), alloc_blocks); freesp = percpu_counter_read_positive(&mp->m_fdblocks); if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) { shift = 2; if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT]) shift++; if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT]) shift++; if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT]) shift++; if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT]) shift++; } /* * Check each quota to cap the prealloc size, provide a shift value to * throttle with and adjust amount of available space. */ if (xfs_quota_need_throttle(ip, XFS_DQTYPE_USER, alloc_blocks)) xfs_quota_calc_throttle(ip, XFS_DQTYPE_USER, &qblocks, &qshift, &freesp); if (xfs_quota_need_throttle(ip, XFS_DQTYPE_GROUP, alloc_blocks)) xfs_quota_calc_throttle(ip, XFS_DQTYPE_GROUP, &qblocks, &qshift, &freesp); if (xfs_quota_need_throttle(ip, XFS_DQTYPE_PROJ, alloc_blocks)) xfs_quota_calc_throttle(ip, XFS_DQTYPE_PROJ, &qblocks, &qshift, &freesp); /* * The final prealloc size is set to the minimum of free space available * in each of the quotas and the overall filesystem. * * The shift throttle value is set to the maximum value as determined by * the global low free space values and per-quota low free space values. */ alloc_blocks = min(alloc_blocks, qblocks); shift = max(shift, qshift); if (shift) alloc_blocks >>= shift; /* * rounddown_pow_of_two() returns an undefined result if we pass in * alloc_blocks = 0. */ if (alloc_blocks) alloc_blocks = rounddown_pow_of_two(alloc_blocks); if (alloc_blocks > XFS_MAX_BMBT_EXTLEN) alloc_blocks = XFS_MAX_BMBT_EXTLEN; /* * If we are still trying to allocate more space than is * available, squash the prealloc hard. This can happen if we * have a large file on a small filesystem and the above * lowspace thresholds are smaller than XFS_BMBT_MAX_EXTLEN. 
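 *
 * Illustrative example (hypothetical numbers): if the computed prealloc is
 * alloc_blocks = 262144 blocks while only freesp = 50000 blocks remain,
 * one pass of the loop below (262144 >> 4 = 16384 < 50000) squashes the
 * preallocation to 16384 blocks, before the m_allocsize_blocks floor is
 * applied.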
*/ while (alloc_blocks && alloc_blocks >= freesp) alloc_blocks >>= 4; if (alloc_blocks < mp->m_allocsize_blocks) alloc_blocks = mp->m_allocsize_blocks; trace_xfs_iomap_prealloc_size(ip, alloc_blocks, shift, mp->m_allocsize_blocks); return alloc_blocks; } int xfs_iomap_write_unwritten( xfs_inode_t *ip, xfs_off_t offset, xfs_off_t count, bool update_isize) { xfs_mount_t *mp = ip->i_mount; xfs_fileoff_t offset_fsb; xfs_filblks_t count_fsb; xfs_filblks_t numblks_fsb; int nimaps; xfs_trans_t *tp; xfs_bmbt_irec_t imap; struct inode *inode = VFS_I(ip); xfs_fsize_t i_size; uint resblks; int error; trace_xfs_unwritten_convert(ip, offset, count); offset_fsb = XFS_B_TO_FSBT(mp, offset); count_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count); count_fsb = (xfs_filblks_t)(count_fsb - offset_fsb); /* * Reserve enough blocks in this transaction for two complete extent * btree splits. We may be converting the middle part of an unwritten * extent and in this case we will insert two new extents in the btree * each of which could cause a full split. * * This reservation amount will be used in the first call to * xfs_bmbt_split() to select an AG with enough space to satisfy the * rest of the operation. */ resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; /* Attach dquots so that bmbt splits are accounted correctly. */ error = xfs_qm_dqattach(ip); if (error) return error; do { /* * Set up a transaction to convert the range of extents * from unwritten to real. Do allocations in a loop until * we have covered the range passed in. * * Note that we can't risk to recursing back into the filesystem * here as we might be asked to write out the same inode that we * complete here and might deadlock on the iolock. */ error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0, true, &tp); if (error) return error; error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error == -EFBIG) error = xfs_iext_count_upgrade(tp, ip, XFS_IEXT_WRITE_UNWRITTEN_CNT); if (error) goto error_on_bmapi_transaction; /* * Modify the unwritten extent state of the buffer. */ nimaps = 1; error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, XFS_BMAPI_CONVERT, resblks, &imap, &nimaps); if (error) goto error_on_bmapi_transaction; /* * Log the updated inode size as we go. We have to be careful * to only log it up to the actual write offset if it is * halfway into a block. */ i_size = XFS_FSB_TO_B(mp, offset_fsb + count_fsb); if (i_size > offset + count) i_size = offset + count; if (update_isize && i_size > i_size_read(inode)) i_size_write(inode, i_size); i_size = xfs_new_eof(ip, i_size); if (i_size) { ip->i_disk_size = i_size; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; if (unlikely(!xfs_valid_startblock(ip, imap.br_startblock))) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); return xfs_alert_fsblock_zero(ip, &imap); } if ((numblks_fsb = imap.br_blockcount) == 0) { /* * The numblks_fsb value should always get * smaller, otherwise the loop is stuck. 
*/ ASSERT(imap.br_blockcount); break; } offset_fsb += numblks_fsb; count_fsb -= numblks_fsb; } while (count_fsb > 0); return 0; error_on_bmapi_transaction: xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } static inline bool imap_needs_alloc( struct inode *inode, unsigned flags, struct xfs_bmbt_irec *imap, int nimaps) { /* don't allocate blocks when just zeroing */ if (flags & IOMAP_ZERO) return false; if (!nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_startblock == DELAYSTARTBLOCK) return true; /* we convert unwritten extents before copying the data for DAX */ if ((flags & IOMAP_DAX) && imap->br_state == XFS_EXT_UNWRITTEN) return true; return false; } static inline bool imap_needs_cow( struct xfs_inode *ip, unsigned int flags, struct xfs_bmbt_irec *imap, int nimaps) { if (!xfs_is_cow_inode(ip)) return false; /* when zeroing we don't have to COW holes or unwritten extents */ if (flags & IOMAP_ZERO) { if (!nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_state == XFS_EXT_UNWRITTEN) return false; } return true; } static int xfs_ilock_for_iomap( struct xfs_inode *ip, unsigned flags, unsigned *lockmode) { unsigned int mode = *lockmode; bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. */ if (xfs_is_cow_inode(ip) && is_write) mode = XFS_ILOCK_EXCL; /* * Extents not yet cached requires exclusive access, don't block. This * is an opencoded xfs_ilock_data_map_shared() call but with * non-blocking behaviour. */ if (xfs_need_iread_extents(&ip->i_df)) { if (flags & IOMAP_NOWAIT) return -EAGAIN; mode = XFS_ILOCK_EXCL; } relock: if (flags & IOMAP_NOWAIT) { if (!xfs_ilock_nowait(ip, mode)) return -EAGAIN; } else { xfs_ilock(ip, mode); } /* * The reflink iflag could have changed since the earlier unlocked * check, so if we got ILOCK_SHARED for a write and but we're now a * reflink inode we have to switch to ILOCK_EXCL and relock. */ if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) { xfs_iunlock(ip, mode); mode = XFS_ILOCK_EXCL; goto relock; } *lockmode = mode; return 0; } /* * Check that the imap we are going to return to the caller spans the entire * range that the caller requested for the IO. */ static bool imap_spans_range( struct xfs_bmbt_irec *imap, xfs_fileoff_t offset_fsb, xfs_fileoff_t end_fsb) { if (imap->br_startoff > offset_fsb) return false; if (imap->br_startoff + imap->br_blockcount < end_fsb) return false; return true; } static int xfs_direct_write_iomap_begin( struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap, cmap; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; unsigned int lockmode = XFS_ILOCK_SHARED; u64 seq; ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); if (xfs_is_shutdown(mp)) return -EIO; /* * Writes that span EOF might trigger an IO size update on completion, * so consider them to be dirty for the purposes of O_DSYNC even if * there is no other metadata changes pending or have been made here. 
*/ if (offset + length > i_size_read(inode)) iomap_flags |= IOMAP_F_DIRTY; error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; if (imap_needs_cow(ip, flags, &imap, nimaps)) { error = -EAGAIN; if (flags & IOMAP_NOWAIT) goto out_unlock; /* may drop and re-acquire the ilock */ error = xfs_reflink_allocate_cow(ip, &imap, &cmap, &shared, &lockmode, (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; if (shared) goto out_found_cow; end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } if (imap_needs_alloc(inode, flags, &imap, nimaps)) goto allocate_blocks; /* * NOWAIT and OVERWRITE I/O needs to span the entire requested I/O with * a single map so that we avoid partial IO failures due to the rest of * the I/O range not covered by this map triggering an EAGAIN condition * when it is subsequently mapped and aborting the I/O. */ if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) { error = -EAGAIN; if (!imap_spans_range(&imap, offset_fsb, end_fsb)) goto out_unlock; } /* * For overwrite only I/O, we cannot convert unwritten extents without * requiring sub-block zeroing. This can only be done under an * exclusive IOLOCK, hence return -EAGAIN if this is not a written * extent to tell the caller to try again. */ if (flags & IOMAP_OVERWRITE_ONLY) { error = -EAGAIN; if (imap.br_state != XFS_EXT_NORM && ((offset | length) & mp->m_blockmask)) goto out_unlock; } seq = xfs_iomap_inode_sequence(ip, iomap_flags); xfs_iunlock(ip, lockmode); trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags, seq); allocate_blocks: error = -EAGAIN; if (flags & (IOMAP_NOWAIT | IOMAP_OVERWRITE_ONLY)) goto out_unlock; /* * We cap the maximum length we map to a sane size to keep the chunks * of work done where somewhat symmetric with the work writeback does. * This is a completely arbitrary number pulled out of thin air as a * best guess for initial testing. * * Note that the values needs to be less than 32-bits wide until the * lower level functions are updated. 
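 *
 * For example (illustrative): with 4 KiB pages the cap applied below comes
 * to 1024 * 4096 bytes = 4 MiB of file range per allocation attempt.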
*/ length = min_t(loff_t, length, 1024 * PAGE_SIZE); end_fsb = xfs_iomap_end_fsb(mp, offset, length); if (offset + length > XFS_ISIZE(ip)) end_fsb = xfs_iomap_eof_align_last_fsb(ip, end_fsb); else if (nimaps && imap.br_startblock == HOLESTARTBLOCK) end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); xfs_iunlock(ip, lockmode); error = xfs_iomap_write_direct(ip, offset_fsb, end_fsb - offset_fsb, flags, &imap, &seq); if (error) return error; trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, iomap_flags | IOMAP_F_NEW, seq); out_found_cow: length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); if (imap.br_startblock != HOLESTARTBLOCK) { seq = xfs_iomap_inode_sequence(ip, 0); error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) goto out_unlock; } seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); xfs_iunlock(ip, lockmode); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); out_unlock: if (lockmode) xfs_iunlock(ip, lockmode); return error; } const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; static int xfs_dax_write_iomap_end( struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { struct xfs_inode *ip = XFS_I(inode); if (!xfs_is_cow_inode(ip)) return 0; if (!written) { xfs_reflink_cancel_cow_range(ip, pos, length, true); return 0; } return xfs_reflink_end_cow(ip, pos, written); } const struct iomap_ops xfs_dax_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, .iomap_end = xfs_dax_write_iomap_end, }; static int xfs_buffered_write_iomap_begin( struct inode *inode, loff_t offset, loff_t count, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); struct xfs_bmbt_irec imap, cmap; struct xfs_iext_cursor icur, ccur; xfs_fsblock_t prealloc_blocks = 0; bool eof = false, cow_eof = false, shared = false; int allocfork = XFS_DATA_FORK; int error = 0; unsigned int lockmode = XFS_ILOCK_EXCL; u64 seq; if (xfs_is_shutdown(mp)) return -EIO; /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, flags, iomap, srcmap); ASSERT(!XFS_IS_REALTIME_INODE(ip)); error = xfs_qm_dqattach(ip); if (error) return error; error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { xfs_bmap_mark_sick(ip, XFS_DATA_FORK); error = -EFSCORRUPTED; goto out_unlock; } XFS_STATS_INC(mp, xs_blk_mapw); error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; /* * Search the data fork first to look up our source mapping. We * always need the data fork map, as we have to return it to the * iomap code so that the higher level write code can read data in to * perform read-modify-write cycles for unaligned writes. */ eof = !xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap); if (eof) imap.br_startoff = end_fsb; /* fake hole until the end */ /* We never need to allocate blocks for zeroing or unsharing a hole. 
*/ if ((flags & (IOMAP_UNSHARE | IOMAP_ZERO)) && imap.br_startoff > offset_fsb) { xfs_hole_to_iomap(ip, iomap, offset_fsb, imap.br_startoff); goto out_unlock; } /* * Search the COW fork extent list even if we did not find a data fork * extent. This serves two purposes: first this implements the * speculative preallocation using cowextsize, so that we also unshare * block adjacent to shared blocks instead of just the shared blocks * themselves. Second the lookup in the extent list is generally faster * than going out to the shared extent tree. */ if (xfs_is_cow_inode(ip)) { if (!ip->i_cowfp) { ASSERT(!xfs_is_reflink_inode(ip)); xfs_ifork_init_cow(ip); } cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &ccur, &cmap); if (!cow_eof && cmap.br_startoff <= offset_fsb) { trace_xfs_reflink_cow_found(ip, &cmap); goto found_cow; } } if (imap.br_startoff <= offset_fsb) { /* * For reflink files we may need a delalloc reservation when * overwriting shared extents. This includes zeroing of * existing extents that contain data. */ if (!xfs_is_cow_inode(ip) || ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) { trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &imap); goto found_imap; } xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_bmap_trim_cow(ip, &imap, &shared); if (error) goto out_unlock; /* Not shared? Just report the (potentially capped) extent. */ if (!shared) { trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK, &imap); goto found_imap; } /* * Fork all the shared blocks from our write offset until the * end of the extent. */ allocfork = XFS_COW_FORK; end_fsb = imap.br_startoff + imap.br_blockcount; } else { /* * We cap the maximum length we map here to MAX_WRITEBACK_PAGES * pages to keep the chunks of work done where somewhat * symmetric with the work writeback does. This is a completely * arbitrary number pulled out of thin air. * * Note that the values needs to be less than 32-bits wide until * the lower level functions are updated. */ count = min_t(loff_t, count, 1024 * PAGE_SIZE); end_fsb = xfs_iomap_end_fsb(mp, offset, count); if (xfs_is_always_cow_inode(ip)) allocfork = XFS_COW_FORK; } if (eof && offset + count > XFS_ISIZE(ip)) { /* * Determine the initial size of the preallocation. * We clean up any extra preallocation when the file is closed. */ if (xfs_has_allocsize(mp)) prealloc_blocks = mp->m_allocsize_blocks; else if (allocfork == XFS_DATA_FORK) prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, count, &icur); else prealloc_blocks = xfs_iomap_prealloc_size(ip, allocfork, offset, count, &ccur); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; xfs_fileoff_t p_end_fsb; end_offset = XFS_ALLOC_ALIGN(mp, offset + count - 1); p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + prealloc_blocks; align = xfs_eof_alignment(ip); if (align) p_end_fsb = roundup_64(p_end_fsb, align); p_end_fsb = min(p_end_fsb, XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); ASSERT(p_end_fsb > offset_fsb); prealloc_blocks = p_end_fsb - end_fsb; } } retry: error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb, end_fsb - offset_fsb, prealloc_blocks, allocfork == XFS_DATA_FORK ? &imap : &cmap, allocfork == XFS_DATA_FORK ? &icur : &ccur, allocfork == XFS_DATA_FORK ? 
eof : cow_eof); switch (error) { case 0: break; case -ENOSPC: case -EDQUOT: /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); if (prealloc_blocks) { prealloc_blocks = 0; goto retry; } fallthrough; default: goto out_unlock; } if (allocfork == XFS_COW_FORK) { trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap); goto found_cow; } /* * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch * them out if the write happens to fail. */ seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW); xfs_iunlock(ip, XFS_ILOCK_EXCL); trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq); found_imap: seq = xfs_iomap_inode_sequence(ip, 0); xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); found_cow: seq = xfs_iomap_inode_sequence(ip, 0); if (imap.br_startoff <= offset_fsb) { error = xfs_bmbt_to_iomap(ip, srcmap, &imap, flags, 0, seq); if (error) goto out_unlock; seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); } xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb); xfs_iunlock(ip, XFS_ILOCK_EXCL); return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } static int xfs_buffered_write_delalloc_punch( struct inode *inode, loff_t offset, loff_t length) { return xfs_bmap_punch_delalloc_range(XFS_I(inode), offset, offset + length); } static int xfs_buffered_write_iomap_end( struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { struct xfs_mount *mp = XFS_M(inode->i_sb); int error; error = iomap_file_buffered_write_punch_delalloc(inode, iomap, offset, length, written, &xfs_buffered_write_delalloc_punch); if (error && !xfs_is_shutdown(mp)) { xfs_alert(mp, "%s: unable to clean up ino 0x%llx", __func__, XFS_I(inode)->i_ino); return error; } return 0; } const struct iomap_ops xfs_buffered_write_iomap_ops = { .iomap_begin = xfs_buffered_write_iomap_begin, .iomap_end = xfs_buffered_write_iomap_end, }; /* * iomap_page_mkwrite() will never fail in a way that requires delalloc extents * that it allocated to be revoked. Hence we do not need an .iomap_end method * for this operation. */ const struct iomap_ops xfs_page_mkwrite_iomap_ops = { .iomap_begin = xfs_buffered_write_iomap_begin, }; static int xfs_read_iomap_begin( struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; struct xfs_bmbt_irec imap; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); int nimaps = 1, error = 0; bool shared = false; unsigned int lockmode = XFS_ILOCK_SHARED; u64 seq; ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); if (xfs_is_shutdown(mp)) return -EIO; error = xfs_ilock_for_iomap(ip, flags, &lockmode); if (error) return error; error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (!error && ((flags & IOMAP_REPORT) || IS_DAX(inode))) error = xfs_reflink_trim_around_shared(ip, &imap, &shared); seq = xfs_iomap_inode_sequence(ip, shared ? IOMAP_F_SHARED : 0); xfs_iunlock(ip, lockmode); if (error) return error; trace_xfs_iomap_found(ip, offset, length, XFS_DATA_FORK, &imap); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, shared ? 
IOMAP_F_SHARED : 0, seq); } const struct iomap_ops xfs_read_iomap_ops = { .iomap_begin = xfs_read_iomap_begin, }; static int xfs_seek_iomap_begin( struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; struct xfs_iext_cursor icur; struct xfs_bmbt_irec imap, cmap; int error = 0; unsigned lockmode; u64 seq; if (xfs_is_shutdown(mp)) return -EIO; lockmode = xfs_ilock_data_map_shared(ip); error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); if (error) goto out_unlock; if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { /* * If we found a data extent we are done. */ if (imap.br_startoff <= offset_fsb) goto done; data_fsb = imap.br_startoff; } else { /* * Fake a hole until the end of the file. */ data_fsb = xfs_iomap_end_fsb(mp, offset, length); } /* * If a COW fork extent covers the hole, report it - capped to the next * data fork extent: */ if (xfs_inode_has_cow_data(ip) && xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) cow_fsb = cmap.br_startoff; if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { if (data_fsb < cow_fsb + cmap.br_blockcount) end_fsb = min(end_fsb, data_fsb); xfs_trim_extent(&cmap, offset_fsb, end_fsb - offset_fsb); seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); error = xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); /* * This is a COW extent, so we must probe the page cache * because there could be dirty page cache being backed * by this extent. */ iomap->type = IOMAP_UNWRITTEN; goto out_unlock; } /* * Else report a hole, capped to the next found data or COW extent. 
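 *
 * Illustrative example (hypothetical block numbers): seeking from
 * offset_fsb = 100 with the next data extent at data_fsb = 150 and a COW
 * extent starting at cow_fsb = 130 reports a 30-block hole covering
 * [100, 130).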
*/ if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) imap.br_blockcount = cow_fsb - offset_fsb; else imap.br_blockcount = data_fsb - offset_fsb; imap.br_startoff = offset_fsb; imap.br_startblock = HOLESTARTBLOCK; imap.br_state = XFS_EXT_NORM; done: seq = xfs_iomap_inode_sequence(ip, 0); xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb); error = xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq); out_unlock: xfs_iunlock(ip, lockmode); return error; } const struct iomap_ops xfs_seek_iomap_ops = { .iomap_begin = xfs_seek_iomap_begin, }; static int xfs_xattr_iomap_begin( struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); struct xfs_bmbt_irec imap; int nimaps = 1, error = 0; unsigned lockmode; int seq; if (xfs_is_shutdown(mp)) return -EIO; lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ if (!xfs_inode_has_attr_fork(ip) || !ip->i_af.if_nextents) { error = -ENOENT; goto out_unlock; } ASSERT(ip->i_af.if_format != XFS_DINODE_FMT_LOCAL); error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, XFS_BMAPI_ATTRFORK); out_unlock: seq = xfs_iomap_inode_sequence(ip, IOMAP_F_XATTR); xfs_iunlock(ip, lockmode); if (error) return error; ASSERT(nimaps); return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_XATTR, seq); } const struct iomap_ops xfs_xattr_iomap_ops = { .iomap_begin = xfs_xattr_iomap_begin, }; int xfs_zero_range( struct xfs_inode *ip, loff_t pos, loff_t len, bool *did_zero) { struct inode *inode = VFS_I(ip); if (IS_DAX(inode)) return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, &xfs_buffered_write_iomap_ops); } int xfs_truncate_page( struct xfs_inode *ip, loff_t pos, bool *did_zero) { struct inode *inode = VFS_I(ip); if (IS_DAX(inode)) return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, &xfs_buffered_write_iomap_ops); }
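/*
 * Illustrative sketch (not part of XFS): the minimal shape of an
 * ->iomap_begin implementation for a hypothetical filesystem that keeps no
 * mapping metadata and therefore reports every requested range as a hole.
 * It only touches the iomap fields also filled in by the XFS helpers above;
 * the resulting ops table would be handed to generic helpers such as
 * iomap_zero_range(), as xfs_zero_range() does.
 */
#include <linux/fs.h>
#include <linux/iomap.h>

static int demo_hole_iomap_begin(struct inode *inode, loff_t offset,
		loff_t length, unsigned flags, struct iomap *iomap,
		struct iomap *srcmap)
{
	/* No backing extent: describe the whole requested range as a hole. */
	iomap->addr = IOMAP_NULL_ADDR;
	iomap->type = IOMAP_HOLE;
	iomap->offset = offset;
	iomap->length = length;
	iomap->bdev = inode->i_sb->s_bdev;
	return 0;
}

static const struct iomap_ops demo_hole_iomap_ops = {
	.iomap_begin = demo_hole_iomap_begin,
};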
// SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2019 HUAWEI, Inc. * https://www.huawei.com/ */ #include "compress.h" #include <linux/lz4.h> #ifndef LZ4_DISTANCE_MAX /* history window size */ #define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ #endif #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) #ifndef LZ4_DECOMPRESS_INPLACE_MARGIN #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) #endif struct z_erofs_lz4_decompress_ctx { struct z_erofs_decompress_req *rq; /* # of encoded, decoded pages */ unsigned int inpages, outpages; /* decoded block total length (used for in-place decompression) */ unsigned int oend; }; static int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct z_erofs_lz4_cfgs *lz4 = data; u16 distance; if (lz4) { if (size < sizeof(struct z_erofs_lz4_cfgs)) { erofs_err(sb, "invalid lz4 cfgs, size=%u", size); return -EINVAL; } distance = le16_to_cpu(lz4->max_distance); sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks); if (!sbi->lz4.max_pclusterblks) { sbi->lz4.max_pclusterblks = 1; /* reserved case */ } else if (sbi->lz4.max_pclusterblks > erofs_blknr(sb, Z_EROFS_PCLUSTER_MAX_SIZE)) { erofs_err(sb, "too large lz4 pclusterblks %u", sbi->lz4.max_pclusterblks); return -EINVAL; } } else { distance = le16_to_cpu(dsb->u1.lz4_max_distance); sbi->lz4.max_pclusterblks = 1; } sbi->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } /* * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR.
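 *
 * Illustrative example: with 4 KiB pages and the default LZ4_DISTANCE_MAX
 * of 65535, LZ4_MAX_DISTANCE_PAGES works out to
 * DIV_ROUND_UP(65535, 4096) + 1 = 17, i.e. up to 17 trailing output pages
 * must stay addressable as LZ4 match history.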
*/ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, struct page **pagepool) { struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; unsigned int lz4_max_distance_pages = EROFS_SB(rq->sb)->lz4.max_distance_pages; void *kaddr = NULL; unsigned int i, j, top; top = 0; for (i = j = 0; i < ctx->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; if (j >= lz4_max_distance_pages) j = 0; /* 'valid' bounced can only be tested after a complete round */ if (!rq->fillgaps && test_bit(j, bounced)) { DBG_BUGON(i < lz4_max_distance_pages); DBG_BUGON(top >= lz4_max_distance_pages); availables[top++] = rq->out[i - lz4_max_distance_pages]; } if (page) { __clear_bit(j, bounced); if (!PageHighMem(page)) { if (!i) { kaddr = page_address(page); continue; } if (kaddr && kaddr + PAGE_SIZE == page_address(page)) { kaddr += PAGE_SIZE; continue; } } kaddr = NULL; continue; } kaddr = NULL; __set_bit(j, bounced); if (top) { victim = availables[--top]; get_page(victim); } else { victim = erofs_allocpage(pagepool, rq->gfp); if (!victim) return -ENOMEM; set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; } return kaddr ? 1 : 0; } static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, void *inpage, void *out, unsigned int *inputmargin, int *maptype, bool may_inplace) { struct z_erofs_decompress_req *rq = ctx->rq; unsigned int omargin, total, i; struct page **in; void *src, *tmp; if (rq->inplace_io) { omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; if (rq->partial_decoding || !may_inplace || omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; for (i = 0; i < ctx->inpages; ++i) if (rq->out[ctx->outpages - ctx->inpages + i] != rq->in[i]) goto docopy; kunmap_local(inpage); *maptype = 3; return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); } if (ctx->inpages <= 1) { *maptype = 0; return inpage; } kunmap_local(inpage); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; return src; docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); return ERR_PTR(-EFAULT); } tmp = src; total = rq->inputsize; while (total) { unsigned int page_copycnt = min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; ++in; *inputmargin = 0; } *maptype = 2; return src; } /* * Get the exact inputsize with zero_padding feature. * - For LZ4, it should work if zero_padding feature is on (5.3+); * - For MicroLZMA, it'd be enabled all the time. 
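 *
 * Illustrative example (hypothetical numbers): if the first 12 bytes of
 * padbuf are zero, memchr_inv() below returns padbuf + 12, so inputsize
 * shrinks by 12 bytes and pageofs_in advances by 12 to skip the padding.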
*/ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize) { const char *padend; padend = memchr_inv(padbuf, 0, padbufsize); if (!padend) return -EFSCORRUPTED; rq->inputsize -= padend - padbuf; rq->pageofs_in += padend - padbuf; return 0; } static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, u8 *dst) { struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; int ret, maptype; DBG_BUGON(*rq->in == NULL); headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { support_0padding = true; ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & (rq->sb->s_blocksize - 1)); } inputmargin = rq->pageofs_in; src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); out = dst + rq->pageofs_out; /* legacy format could compress extra data in a pcluster. */ if (rq->partial_decoding || !support_0padding) ret = LZ4_decompress_safe_partial(src + inputmargin, out, rq->inputsize, rq->outputsize, rq->outputsize); else ret = LZ4_decompress_safe(src + inputmargin, out, rq->inputsize, rq->outputsize); if (ret != rq->outputsize) { erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, rq->inputsize, inputmargin, rq->outputsize); if (ret >= 0) memset(out + ret, 0, rq->outputsize - ret); ret = -EFSCORRUPTED; } else { ret = 0; } if (maptype == 0) { kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { erofs_put_pcpubuf(src); } else if (maptype != 3) { DBG_BUGON(1); return -EFAULT; } return ret; } static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; ctx.rq = rq; ctx.oend = rq->pageofs_out + rq->outputsize; ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } /* general decoding path which can be used for all cases */ ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); if (ret < 0) { return ret; } else if (ret > 0) { dst = page_address(*rq->out); dst_maptype = 1; } else { dst = erofs_vm_map_ram(rq->out, ctx.outpages); if (!dst) return -ENOMEM; dst_maptype = 2; } dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; } static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { const unsigned int nrpages_in = PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; const unsigned int nrpages_out = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; const unsigned int bs = rq->sb->s_blocksize; unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; if (rq->outputsize > rq->inputsize) return -EOPNOTSUPP; if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) { cur = bs - (rq->pageofs_out & (bs - 1)); 
pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK; cur = min(cur, rq->outputsize); if (cur && rq->out[0]) { kin = kmap_local_page(rq->in[nrpages_in - 1]); if (rq->out[0] == rq->in[nrpages_in - 1]) { memmove(kin + rq->pageofs_out, kin + pi, cur); flush_dcache_page(rq->out[0]); } else { memcpy_to_page(rq->out[0], rq->pageofs_out, kin + pi, cur); } kunmap_local(kin); } rq->outputsize -= cur; } for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); rq->outputsize -= insz; if (!rq->in[ni]) continue; kin = kmap_local_page(rq->in[ni]); pi = 0; do { no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT; po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK; DBG_BUGON(no >= nrpages_out); cnt = min(insz - pi, PAGE_SIZE - po); if (rq->out[no] == rq->in[ni]) { memmove(kin + po, kin + rq->pageofs_in + pi, cnt); flush_dcache_page(rq->out[no]); } else if (rq->out[no]) { memcpy_to_page(rq->out[no], po, kin + rq->pageofs_in + pi, cnt); } pi += cnt; } while (pi < insz); kunmap_local(kin); } DBG_BUGON(ni > nrpages_in); return 0; } const struct z_erofs_decompressor erofs_decompressors[] = { [Z_EROFS_COMPRESSION_SHIFTED] = { .decompress = z_erofs_transform_plain, .name = "shifted" }, [Z_EROFS_COMPRESSION_INTERLACED] = { .decompress = z_erofs_transform_plain, .name = "interlaced" }, [Z_EROFS_COMPRESSION_LZ4] = { .config = z_erofs_load_lz4_config, .decompress = z_erofs_lz4_decompress, .name = "lz4" }, #ifdef CONFIG_EROFS_FS_ZIP_LZMA [Z_EROFS_COMPRESSION_LZMA] = { .config = z_erofs_load_lzma_config, .decompress = z_erofs_lzma_decompress, .name = "lzma" }, #endif #ifdef CONFIG_EROFS_FS_ZIP_DEFLATE [Z_EROFS_COMPRESSION_DEFLATE] = { .config = z_erofs_load_deflate_config, .decompress = z_erofs_deflate_decompress, .name = "deflate" }, #endif }; int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_buf buf = __EROFS_BUF_INITIALIZER; unsigned int algs, alg; erofs_off_t offset; int size, ret = 0; if (!erofs_sb_has_compr_cfgs(sbi)) { sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4; return z_erofs_load_lz4_config(sb, dsb, NULL, 0); } sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { erofs_err(sb, "unidentified algorithms %x, please upgrade kernel", sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); return -EOPNOTSUPP; } erofs_init_metabuf(&buf, sb); offset = EROFS_SUPER_OFFSET + sbi->sb_size; alg = 0; for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { void *data; if (!(algs & 1)) continue; data = erofs_read_metadata(sb, &buf, &offset, &size); if (IS_ERR(data)) { ret = PTR_ERR(data); break; } if (alg >= ARRAY_SIZE(erofs_decompressors) || !erofs_decompressors[alg].config) { erofs_err(sb, "algorithm %d isn't enabled on this kernel", alg); ret = -EOPNOTSUPP; } else { ret = erofs_decompressors[alg].config(sb, dsb, data, size); } kfree(data); if (ret) break; } erofs_put_metabuf(&buf); return ret; }
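As an illustrative aside, the stand-alone sketch below (user-space C, hypothetical names, not part of the driver above) mirrors what z_erofs_fixup_insize() does: with the zero_padding feature the compressed payload is right-aligned in the block, so scanning past the leading zero bytes yields the exact input size and the adjusted in-page offset.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for the kernel's memchr_inv(): return the first
 * non-zero byte in buf[0..len), or NULL if every byte is zero. */
static const unsigned char *first_nonzero(const unsigned char *buf, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		if (buf[i] != 0)
			return buf + i;
	return NULL;
}

int main(void)
{
	/* Toy "block": compressed payload right-aligned behind zero padding. */
	unsigned char blk[16] = { 0, 0, 0, 0, 0, 0xf1, 0x22, 0x33,
				  0x44, 0x55, 0x66, 0x77, 0x88, 0x99, 0xaa, 0xbb };
	unsigned int inputsize = sizeof(blk);	/* worst-case size */
	unsigned int pageofs_in = 0;
	const unsigned char *padend = first_nonzero(blk, sizeof(blk));

	if (!padend) {
		fprintf(stderr, "corrupted: block is all zeroes\n");
		return 1;
	}
	/* Mirror of z_erofs_fixup_insize(): shrink the input by the padding
	 * length and advance the in-page offset by the same amount. */
	inputsize -= padend - blk;
	pageofs_in += padend - blk;
	printf("exact inputsize=%u, pageofs_in=%u\n", inputsize, pageofs_in);
	return 0;
}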
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *  Misc and compatibility things
 *  Copyright (c) by Jaroslav Kysela <perex@perex.cz>
 */

#include <linux/init.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/time.h>
#include <linux/slab.h>
#include <linux/ioport.h>
#include <linux/fs.h>
#include <sound/core.h>

#ifdef CONFIG_SND_DEBUG

#ifdef CONFIG_SND_DEBUG_VERBOSE
#define DEFAULT_DEBUG_LEVEL	2
#else
#define DEFAULT_DEBUG_LEVEL	1
#endif

static int debug = DEFAULT_DEBUG_LEVEL;
module_param(debug, int, 0644);
MODULE_PARM_DESC(debug, "Debug level (0 = disable)");

#endif /* CONFIG_SND_DEBUG */

void release_and_free_resource(struct resource *res)
{
	if (res) {
		release_resource(res);
		kfree(res);
	}
}
EXPORT_SYMBOL(release_and_free_resource);

#ifdef CONFIG_SND_VERBOSE_PRINTK
/* strip the leading path if the given path is absolute */
static const char *sanity_file_name(const char *path)
{
	if (*path == '/')
		return strrchr(path, '/') + 1;
	else
		return path;
}
#endif

#if defined(CONFIG_SND_DEBUG) || defined(CONFIG_SND_VERBOSE_PRINTK)
void __snd_printk(unsigned int level, const char *path, int line,
		  const char *format, ...)
{
	va_list args;
#ifdef CONFIG_SND_VERBOSE_PRINTK
	int kern_level;
	struct va_format vaf;
	char verbose_fmt[] = KERN_DEFAULT "ALSA %s:%d %pV";
	bool level_found = false;
#endif

#ifdef CONFIG_SND_DEBUG
	if (debug < level)
		return;
#endif

	va_start(args, format);
#ifdef CONFIG_SND_VERBOSE_PRINTK
	vaf.fmt = format;
	vaf.va = &args;

	while ((kern_level = printk_get_level(vaf.fmt)) != 0) {
		const char *end_of_header = printk_skip_level(vaf.fmt);

		/* Ignore KERN_CONT. We print filename:line for each piece. */
		if (kern_level >= '0' && kern_level <= '7') {
			memcpy(verbose_fmt, vaf.fmt, end_of_header - vaf.fmt);
			level_found = true;
		}

		vaf.fmt = end_of_header;
	}

	if (!level_found && level)
		memcpy(verbose_fmt, KERN_DEBUG, sizeof(KERN_DEBUG) - 1);

	printk(verbose_fmt, sanity_file_name(path), line, &vaf);
#else
	vprintk(format, args);
#endif
	va_end(args);
}
EXPORT_SYMBOL_GPL(__snd_printk);
#endif

#ifdef CONFIG_PCI
#include <linux/pci.h>
/**
 * snd_pci_quirk_lookup_id - look up a PCI SSID quirk list
 * @vendor: PCI SSV id
 * @device: PCI SSD id
 * @list: quirk list, terminated by a null entry
 *
 * Look through the given quirk list and finds a matching entry
 * with the same PCI SSID.  When subdevice is 0, all subdevice
 * values may match.
 *
 * Returns the matched entry pointer, or NULL if nothing matched.
*/ const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list) { const struct snd_pci_quirk *q; for (q = list; q->subvendor || q->subdevice; q++) { if (q->subvendor != vendor) continue; if (!q->subdevice || (device & q->subdevice_mask) == q->subdevice) return q; } return NULL; } EXPORT_SYMBOL(snd_pci_quirk_lookup_id); /** * snd_pci_quirk_lookup - look up a PCI SSID quirk list * @pci: pci_dev handle * @list: quirk list, terminated by a null entry * * Look through the given quirk list and finds a matching entry * with the same PCI SSID. When subdevice is 0, all subdevice * values may match. * * Returns the matched entry pointer, or NULL if nothing matched. */ const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list) { if (!pci) return NULL; return snd_pci_quirk_lookup_id(pci->subsystem_vendor, pci->subsystem_device, list); } EXPORT_SYMBOL(snd_pci_quirk_lookup); #endif /* * Deferred async signal helpers * * Below are a few helper functions to wrap the async signal handling * in the deferred work. The main purpose is to avoid the messy deadlock * around tasklist_lock and co at the kill_fasync() invocation. * fasync_helper() and kill_fasync() are replaced with snd_fasync_helper() * and snd_kill_fasync(), respectively. In addition, snd_fasync_free() has * to be called at releasing the relevant file object. */ struct snd_fasync { struct fasync_struct *fasync; int signal; int poll; int on; struct list_head list; }; static DEFINE_SPINLOCK(snd_fasync_lock); static LIST_HEAD(snd_fasync_list); static void snd_fasync_work_fn(struct work_struct *work) { struct snd_fasync *fasync; spin_lock_irq(&snd_fasync_lock); while (!list_empty(&snd_fasync_list)) { fasync = list_first_entry(&snd_fasync_list, struct snd_fasync, list); list_del_init(&fasync->list); spin_unlock_irq(&snd_fasync_lock); if (fasync->on) kill_fasync(&fasync->fasync, fasync->signal, fasync->poll); spin_lock_irq(&snd_fasync_lock); } spin_unlock_irq(&snd_fasync_lock); } static DECLARE_WORK(snd_fasync_work, snd_fasync_work_fn); int snd_fasync_helper(int fd, struct file *file, int on, struct snd_fasync **fasyncp) { struct snd_fasync *fasync = NULL; if (on) { fasync = kzalloc(sizeof(*fasync), GFP_KERNEL); if (!fasync) return -ENOMEM; INIT_LIST_HEAD(&fasync->list); } spin_lock_irq(&snd_fasync_lock); if (*fasyncp) { kfree(fasync); fasync = *fasyncp; } else { if (!fasync) { spin_unlock_irq(&snd_fasync_lock); return 0; } *fasyncp = fasync; } fasync->on = on; spin_unlock_irq(&snd_fasync_lock); return fasync_helper(fd, file, on, &fasync->fasync); } EXPORT_SYMBOL_GPL(snd_fasync_helper); void snd_kill_fasync(struct snd_fasync *fasync, int signal, int poll) { unsigned long flags; if (!fasync || !fasync->on) return; spin_lock_irqsave(&snd_fasync_lock, flags); fasync->signal = signal; fasync->poll = poll; list_move(&fasync->list, &snd_fasync_list); schedule_work(&snd_fasync_work); spin_unlock_irqrestore(&snd_fasync_lock, flags); } EXPORT_SYMBOL_GPL(snd_kill_fasync); void snd_fasync_free(struct snd_fasync *fasync) { if (!fasync) return; fasync->on = 0; flush_work(&snd_fasync_work); kfree(fasync); } EXPORT_SYMBOL_GPL(snd_fasync_free);
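The deferred-fasync helpers above replace fasync_helper()/kill_fasync() so the signal is raised from a workqueue rather than under the caller's locks. A minimal, hedged sketch of how a driver might wire them up follows; the mydev structure and callbacks are hypothetical, only the snd_fasync_* calls come from the code above.

#include <linux/fs.h>
#include <sound/core.h>

struct mydev {
	struct snd_fasync *fasync;	/* hypothetical per-device state */
};

/* .fasync file operation: delegate to the deferred helper. */
static int mydev_fasync(int fd, struct file *file, int on)
{
	struct mydev *dev = file->private_data;

	return snd_fasync_helper(fd, file, on, &dev->fasync);
}

/* Called from some event path (possibly atomic context); the signal is
 * actually delivered later from the snd_fasync work item. */
static void mydev_notify(struct mydev *dev)
{
	snd_kill_fasync(dev->fasync, SIGIO, POLL_IN);
}

/* Teardown when the file object goes away. */
static void mydev_release(struct mydev *dev)
{
	snd_fasync_free(dev->fasync);
	dev->fasync = NULL;
}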
// SPDX-License-Identifier: GPL-2.0-or-later
/* L2TPv3 IP encapsulation support
 *
 * Copyright (c) 2008,2009,2010 Katalix Systems Ltd
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <asm/ioctls.h>
#include <linux/icmp.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/random.h>
#include <linux/socket.h>
#include <linux/l2tp.h>
#include <linux/in.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/tcp_states.h>
#include <net/protocol.h>
#include <net/xfrm.h>

#include "l2tp_core.h"

struct l2tp_ip_sock {
	/* inet_sock has to be the first member of l2tp_ip_sock */
	struct inet_sock	inet;

	u32			conn_id;
	u32			peer_conn_id;
};

static
DEFINE_RWLOCK(l2tp_ip_lock); static struct hlist_head l2tp_ip_table; static struct hlist_head l2tp_ip_bind_table; static inline struct l2tp_ip_sock *l2tp_ip_sk(const struct sock *sk) { return (struct l2tp_ip_sock *)sk; } static struct sock *__l2tp_ip_bind_lookup(const struct net *net, __be32 laddr, __be32 raddr, int dif, u32 tunnel_id) { struct sock *sk; sk_for_each_bound(sk, &l2tp_ip_bind_table) { const struct l2tp_ip_sock *l2tp = l2tp_ip_sk(sk); const struct inet_sock *inet = inet_sk(sk); int bound_dev_if; if (!net_eq(sock_net(sk), net)) continue; bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); if (bound_dev_if && dif && bound_dev_if != dif) continue; if (inet->inet_rcv_saddr && laddr && inet->inet_rcv_saddr != laddr) continue; if (inet->inet_daddr && raddr && inet->inet_daddr != raddr) continue; if (l2tp->conn_id != tunnel_id) continue; goto found; } sk = NULL; found: return sk; } /* When processing receive frames, there are two cases to * consider. Data frames consist of a non-zero session-id and an * optional cookie. Control frames consist of a regular L2TP header * preceded by 32-bits of zeros. * * L2TPv3 Session Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Session ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Cookie (optional, maximum 64 bits)... * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * L2TPv3 Control Message Header Over IP * * 0 1 2 3 * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | (32 bits of zeros) | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |T|L|x|x|S|x|x|x|x|x|x|x| Ver | Length | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Control Connection ID | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * | Ns | Nr | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * * All control frames are passed to userspace. */ static int l2tp_ip_recv(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sock *sk; u32 session_id; u32 tunnel_id; unsigned char *ptr, *optr; struct l2tp_session *session; struct l2tp_tunnel *tunnel = NULL; struct iphdr *iph; if (!pskb_may_pull(skb, 4)) goto discard; /* Point to L2TP header */ optr = skb->data; ptr = skb->data; session_id = ntohl(*((__be32 *)ptr)); ptr += 4; /* RFC3931: L2TP/IP packets have the first 4 bytes containing * the session_id. If it is 0, the packet is a L2TP control * frame and the session_id value can be discarded. */ if (session_id == 0) { __skb_pull(skb, 4); goto pass_up; } /* Ok, this is a data packet. Lookup the session. 
*/ session = l2tp_session_get(net, session_id); if (!session) goto discard; tunnel = session->tunnel; if (!tunnel) goto discard_sess; if (l2tp_v3_ensure_opt_in_linear(session, skb, &ptr, &optr)) goto discard_sess; l2tp_recv_common(session, skb, ptr, optr, 0, skb->len); l2tp_session_dec_refcount(session); return 0; pass_up: /* Get the tunnel_id from the L2TP header */ if (!pskb_may_pull(skb, 12)) goto discard; if ((skb->data[0] & 0xc0) != 0xc0) goto discard; tunnel_id = ntohl(*(__be32 *)&skb->data[4]); iph = (struct iphdr *)skb_network_header(skb); read_lock_bh(&l2tp_ip_lock); sk = __l2tp_ip_bind_lookup(net, iph->daddr, iph->saddr, inet_iif(skb), tunnel_id); if (!sk) { read_unlock_bh(&l2tp_ip_lock); goto discard; } sock_hold(sk); read_unlock_bh(&l2tp_ip_lock); if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_put; nf_reset_ct(skb); return sk_receive_skb(sk, skb, 1); discard_sess: l2tp_session_dec_refcount(session); goto discard; discard_put: sock_put(sk); discard: kfree_skb(skb); return 0; } static int l2tp_ip_hash(struct sock *sk) { if (sk_unhashed(sk)) { write_lock_bh(&l2tp_ip_lock); sk_add_node(sk, &l2tp_ip_table); write_unlock_bh(&l2tp_ip_lock); } return 0; } static void l2tp_ip_unhash(struct sock *sk) { if (sk_unhashed(sk)) return; write_lock_bh(&l2tp_ip_lock); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); } static int l2tp_ip_open(struct sock *sk) { /* Prevent autobind. We don't have ports. */ inet_sk(sk)->inet_num = IPPROTO_L2TP; l2tp_ip_hash(sk); return 0; } static void l2tp_ip_close(struct sock *sk, long timeout) { write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); sk_common_release(sk); } static void l2tp_ip_destroy_sock(struct sock *sk) { struct l2tp_tunnel *tunnel = l2tp_sk_to_tunnel(sk); struct sk_buff *skb; while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) kfree_skb(skb); if (tunnel) l2tp_tunnel_delete(tunnel); } static int l2tp_ip_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct sockaddr_l2tpip *addr = (struct sockaddr_l2tpip *)uaddr; struct net *net = sock_net(sk); int ret; int chk_addr_ret; if (addr_len < sizeof(struct sockaddr_l2tpip)) return -EINVAL; if (addr->l2tp_family != AF_INET) return -EINVAL; lock_sock(sk); ret = -EINVAL; if (!sock_flag(sk, SOCK_ZAPPED)) goto out; if (sk->sk_state != TCP_CLOSE) goto out; chk_addr_ret = inet_addr_type(net, addr->l2tp_addr.s_addr); ret = -EADDRNOTAVAIL; if (addr->l2tp_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) goto out; if (addr->l2tp_addr.s_addr) { inet->inet_rcv_saddr = addr->l2tp_addr.s_addr; inet->inet_saddr = addr->l2tp_addr.s_addr; } if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ write_lock_bh(&l2tp_ip_lock); if (__l2tp_ip_bind_lookup(net, addr->l2tp_addr.s_addr, 0, sk->sk_bound_dev_if, addr->l2tp_conn_id)) { write_unlock_bh(&l2tp_ip_lock); ret = -EADDRINUSE; goto out; } sk_dst_reset(sk); l2tp_ip_sk(sk)->conn_id = addr->l2tp_conn_id; sk_add_bind_node(sk, &l2tp_ip_bind_table); sk_del_node_init(sk); write_unlock_bh(&l2tp_ip_lock); ret = 0; sock_reset_flag(sk, SOCK_ZAPPED); out: release_sock(sk); return ret; } static int l2tp_ip_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; int rc; if (addr_len < sizeof(*lsa)) return -EINVAL; if 
(ipv4_is_multicast(lsa->l2tp_addr.s_addr)) return -EINVAL; lock_sock(sk); /* Must bind first - autobinding does not work */ if (sock_flag(sk, SOCK_ZAPPED)) { rc = -EINVAL; goto out_sk; } rc = __ip4_datagram_connect(sk, uaddr, addr_len); if (rc < 0) goto out_sk; l2tp_ip_sk(sk)->peer_conn_id = lsa->l2tp_conn_id; write_lock_bh(&l2tp_ip_lock); hlist_del_init(&sk->sk_bind_node); sk_add_bind_node(sk, &l2tp_ip_bind_table); write_unlock_bh(&l2tp_ip_lock); out_sk: release_sock(sk); return rc; } static int l2tp_ip_disconnect(struct sock *sk, int flags) { if (sock_flag(sk, SOCK_ZAPPED)) return 0; return __udp_disconnect(sk, flags); } static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); struct l2tp_ip_sock *lsk = l2tp_ip_sk(sk); struct sockaddr_l2tpip *lsa = (struct sockaddr_l2tpip *)uaddr; memset(lsa, 0, sizeof(*lsa)); lsa->l2tp_family = AF_INET; if (peer) { if (!inet->inet_dport) return -ENOTCONN; lsa->l2tp_conn_id = lsk->peer_conn_id; lsa->l2tp_addr.s_addr = inet->inet_daddr; } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; lsa->l2tp_conn_id = lsk->conn_id; lsa->l2tp_addr.s_addr = addr; } return sizeof(*lsa); } static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) goto drop; return 0; drop: IP_INC_STATS(sock_net(sk), IPSTATS_MIB_INDISCARDS); kfree_skb(skb); return 0; } /* Userspace will call sendmsg() on the tunnel socket to send L2TP * control frames. */ static int l2tp_ip_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct sk_buff *skb; int rc; struct inet_sock *inet = inet_sk(sk); struct rtable *rt = NULL; struct flowi4 *fl4; int connected = 0; __be32 daddr; lock_sock(sk); rc = -ENOTCONN; if (sock_flag(sk, SOCK_DEAD)) goto out; /* Get and verify the address. */ if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_l2tpip *, lip, msg->msg_name); rc = -EINVAL; if (msg->msg_namelen < sizeof(*lip)) goto out; if (lip->l2tp_family != AF_INET) { rc = -EAFNOSUPPORT; if (lip->l2tp_family != AF_UNSPEC) goto out; } daddr = lip->l2tp_addr.s_addr; } else { rc = -EDESTADDRREQ; if (sk->sk_state != TCP_ESTABLISHED) goto out; daddr = inet->inet_daddr; connected = 1; } /* Allocate a socket buffer */ rc = -ENOMEM; skb = sock_wmalloc(sk, 2 + NET_SKB_PAD + sizeof(struct iphdr) + 4 + len, 0, GFP_KERNEL); if (!skb) goto error; /* Reserve space for headers, putting IP header on 4-byte boundary. */ skb_reserve(skb, 2 + NET_SKB_PAD); skb_reset_network_header(skb); skb_reserve(skb, sizeof(struct iphdr)); skb_reset_transport_header(skb); /* Insert 0 session_id */ *((__be32 *)skb_put(skb, 4)) = 0; /* Copy user data into skb */ rc = memcpy_from_msg(skb_put(skb, len), msg, len); if (rc < 0) { kfree_skb(skb); goto error; } fl4 = &inet->cork.fl.u.ip4; if (connected) rt = (struct rtable *)__sk_dst_check(sk, 0); rcu_read_lock(); if (!rt) { const struct ip_options_rcu *inet_opt; inet_opt = rcu_dereference(inet->inet_opt); /* Use correct destination address if we have options. */ if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times * itself out. 
*/ rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, ip_sock_rt_tos(sk), sk->sk_bound_dev_if); if (IS_ERR(rt)) goto no_route; if (connected) { sk_setup_caps(sk, &rt->dst); } else { skb_dst_set(skb, &rt->dst); goto xmit; } } /* We don't need to clone dst here, it is guaranteed to not disappear. * __dev_xmit_skb() might force a refcount if needed. */ skb_dst_set_noref(skb, &rt->dst); xmit: /* Queue the packet to IP for output */ rc = ip_queue_xmit(sk, skb, &inet->cork.fl); rcu_read_unlock(); error: if (rc >= 0) rc = len; out: release_sock(sk); return rc; no_route: rcu_read_unlock(); IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); kfree_skb(skb); rc = -EHOSTUNREACH; goto out; } static int l2tp_ip_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct inet_sock *inet = inet_sk(sk); size_t copied = 0; int err = -EOPNOTSUPP; DECLARE_SOCKADDR(struct sockaddr_in *, sin, msg->msg_name); struct sk_buff *skb; if (flags & MSG_OOB) goto out; skb = skb_recv_datagram(sk, flags, &err); if (!skb) goto out; copied = skb->len; if (len < copied) { msg->msg_flags |= MSG_TRUNC; copied = len; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; sock_recv_timestamp(msg, sk, skb); /* Copy the address. */ if (sin) { sin->sin_family = AF_INET; sin->sin_addr.s_addr = ip_hdr(skb)->saddr; sin->sin_port = 0; memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); } if (inet_cmsg_flags(inet)) ip_cmsg_recv(msg, skb); if (flags & MSG_TRUNC) copied = skb->len; done: skb_free_datagram(sk, skb); out: return err ? err : copied; } int l2tp_ioctl(struct sock *sk, int cmd, int *karg) { struct sk_buff *skb; switch (cmd) { case SIOCOUTQ: *karg = sk_wmem_alloc_get(sk); break; case SIOCINQ: spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); *karg = skb ? 
skb->len : 0; spin_unlock_bh(&sk->sk_receive_queue.lock); break; default: return -ENOIOCTLCMD; } return 0; } EXPORT_SYMBOL_GPL(l2tp_ioctl); static struct proto l2tp_ip_prot = { .name = "L2TP/IP", .owner = THIS_MODULE, .init = l2tp_ip_open, .close = l2tp_ip_close, .bind = l2tp_ip_bind, .connect = l2tp_ip_connect, .disconnect = l2tp_ip_disconnect, .ioctl = l2tp_ioctl, .destroy = l2tp_ip_destroy_sock, .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, .sendmsg = l2tp_ip_sendmsg, .recvmsg = l2tp_ip_recvmsg, .backlog_rcv = l2tp_ip_backlog_recv, .hash = l2tp_ip_hash, .unhash = l2tp_ip_unhash, .obj_size = sizeof(struct l2tp_ip_sock), }; static const struct proto_ops l2tp_ip_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = l2tp_ip_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = sock_common_recvmsg, .mmap = sock_no_mmap, }; static struct inet_protosw l2tp_ip_protosw = { .type = SOCK_DGRAM, .protocol = IPPROTO_L2TP, .prot = &l2tp_ip_prot, .ops = &l2tp_ip_ops, }; static struct net_protocol l2tp_ip_protocol __read_mostly = { .handler = l2tp_ip_recv, }; static int __init l2tp_ip_init(void) { int err; pr_info("L2TP IP encapsulation support (L2TPv3)\n"); err = proto_register(&l2tp_ip_prot, 1); if (err != 0) goto out; err = inet_add_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); if (err) goto out1; inet_register_protosw(&l2tp_ip_protosw); return 0; out1: proto_unregister(&l2tp_ip_prot); out: return err; } static void __exit l2tp_ip_exit(void) { inet_unregister_protosw(&l2tp_ip_protosw); inet_del_protocol(&l2tp_ip_protocol, IPPROTO_L2TP); proto_unregister(&l2tp_ip_prot); } module_init(l2tp_ip_init); module_exit(l2tp_ip_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("James Chapman <jchapman@katalix.com>"); MODULE_DESCRIPTION("L2TP over IP"); MODULE_VERSION("1.0"); /* Use the values of SOCK_DGRAM (2) as type and IPPROTO_L2TP (115) as protocol, * because __stringify doesn't like enums */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 115, 2); MODULE_ALIAS_NET_PF_PROTO(PF_INET, 115);
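As the header-format comment in l2tp_ip_recv() describes, the first four bytes after the IP header are enough to tell a data packet (non-zero session ID) from a control packet (32 bits of zeros). The self-contained user-space sketch below illustrates that dispatch decision only; the buffer contents are made up for the example.

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

/* Classify an L2TPv3-over-IP payload the same way l2tp_ip_recv() does:
 * a zero first word means "control frame", anything else is the session ID. */
static void classify(const unsigned char *payload, size_t len)
{
	uint32_t session_id;

	if (len < 4) {
		printf("too short, discard\n");
		return;
	}
	memcpy(&session_id, payload, 4);
	session_id = ntohl(session_id);
	if (session_id == 0)
		printf("control frame -> pass up to the tunnel socket\n");
	else
		printf("data frame for session %u\n", session_id);
}

int main(void)
{
	const unsigned char ctrl[12] = { 0, 0, 0, 0, 0xc8, 0x03 };	/* T and L bits set, Ver=3 */
	const unsigned char data[8]  = { 0x00, 0x00, 0x04, 0xd2 };	/* session 1234 */

	classify(ctrl, sizeof(ctrl));
	classify(data, sizeof(data));
	return 0;
}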
// SPDX-License-Identifier: GPL-2.0-only
/*
 * vivid-kthread-out.h - video/vbi output thread support functions.
 *
 * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
 */

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/font.h>
#include <linux/mutex.h>
#include <linux/videodev2.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/random.h>
#include <linux/v4l2-dv-timings.h>
#include <linux/jiffies.h>
#include <asm/div64.h>
#include <media/videobuf2-vmalloc.h>
#include <media/v4l2-dv-timings.h>
#include <media/v4l2-ioctl.h>
#include <media/v4l2-fh.h>
#include <media/v4l2-event.h>

#include "vivid-core.h"
#include "vivid-vid-common.h"
#include "vivid-vid-cap.h"
#include "vivid-vid-out.h"
#include "vivid-radio-common.h"
#include "vivid-radio-rx.h"
#include "vivid-radio-tx.h"
#include "vivid-sdr-cap.h"
#include "vivid-vbi-cap.h"
#include "vivid-vbi-out.h"
#include "vivid-osd.h"
#include "vivid-ctrls.h"
#include "vivid-kthread-out.h"
#include "vivid-meta-out.h"

static void vivid_thread_vid_out_tick(struct vivid_dev *dev)
{
	struct vivid_buffer *vid_out_buf = NULL;
	struct vivid_buffer *vbi_out_buf = NULL;
	struct vivid_buffer *meta_out_buf = NULL;

	dprintk(dev, 1, "Video Output Thread Tick\n");

	/* Drop a certain percentage of buffers. */
	if (dev->perc_dropped_buffers &&
	    get_random_u32_below(100) < dev->perc_dropped_buffers)
		return;

	spin_lock(&dev->slock);
	/*
	 * Only dequeue buffer if there is at least one more pending.
	 * This makes video loopback possible.
*/ if (!list_empty(&dev->vid_out_active) && !list_is_singular(&dev->vid_out_active)) { vid_out_buf = list_entry(dev->vid_out_active.next, struct vivid_buffer, list); list_del(&vid_out_buf->list); } if (!list_empty(&dev->vbi_out_active) && (dev->field_out != V4L2_FIELD_ALTERNATE || (dev->vbi_out_seq_count & 1))) { vbi_out_buf = list_entry(dev->vbi_out_active.next, struct vivid_buffer, list); list_del(&vbi_out_buf->list); } if (!list_empty(&dev->meta_out_active)) { meta_out_buf = list_entry(dev->meta_out_active.next, struct vivid_buffer, list); list_del(&meta_out_buf->list); } spin_unlock(&dev->slock); if (!vid_out_buf && !vbi_out_buf && !meta_out_buf) return; if (vid_out_buf) { v4l2_ctrl_request_setup(vid_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); v4l2_ctrl_request_complete(vid_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); vid_out_buf->vb.sequence = dev->vid_out_seq_count; if (dev->field_out == V4L2_FIELD_ALTERNATE) { /* * The sequence counter counts frames, not fields. * So divide by two. */ vid_out_buf->vb.sequence /= 2; } vid_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&vid_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vid_out buffer %d done\n", vid_out_buf->vb.vb2_buf.index); } if (vbi_out_buf) { v4l2_ctrl_request_setup(vbi_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); v4l2_ctrl_request_complete(vbi_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); if (dev->stream_sliced_vbi_out) vivid_sliced_vbi_out_process(dev, vbi_out_buf); vbi_out_buf->vb.sequence = dev->vbi_out_seq_count; vbi_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&vbi_out_buf->vb.vb2_buf, dev->dqbuf_error ? VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "vbi_out buffer %d done\n", vbi_out_buf->vb.vb2_buf.index); } if (meta_out_buf) { v4l2_ctrl_request_setup(meta_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); v4l2_ctrl_request_complete(meta_out_buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); vivid_meta_out_process(dev, meta_out_buf); meta_out_buf->vb.sequence = dev->meta_out_seq_count; meta_out_buf->vb.vb2_buf.timestamp = ktime_get_ns() + dev->time_wrap_offset; vb2_buffer_done(&meta_out_buf->vb.vb2_buf, dev->dqbuf_error ? 
VB2_BUF_STATE_ERROR : VB2_BUF_STATE_DONE); dprintk(dev, 2, "meta_out buffer %d done\n", meta_out_buf->vb.vb2_buf.index); } dev->dqbuf_error = false; } static int vivid_thread_vid_out(void *data) { struct vivid_dev *dev = data; u64 numerators_since_start; u64 buffers_since_start; u64 next_jiffies_since_start; unsigned long jiffies_since_start; unsigned long cur_jiffies; unsigned wait_jiffies; unsigned numerator; unsigned denominator; dprintk(dev, 1, "Video Output Thread Start\n"); set_freezable(); /* Resets frame counters */ dev->out_seq_offset = 0; dev->out_seq_count = 0; dev->jiffies_vid_out = jiffies; dev->out_seq_resync = false; if (dev->time_wrap) dev->time_wrap_offset = dev->time_wrap - ktime_get_ns(); else dev->time_wrap_offset = 0; for (;;) { try_to_freeze(); if (kthread_should_stop()) break; if (!mutex_trylock(&dev->mutex)) { schedule(); continue; } cur_jiffies = jiffies; if (dev->out_seq_resync) { dev->jiffies_vid_out = cur_jiffies; dev->out_seq_offset = dev->out_seq_count + 1; dev->out_seq_count = 0; dev->out_seq_resync = false; } numerator = dev->timeperframe_vid_out.numerator; denominator = dev->timeperframe_vid_out.denominator; if (dev->field_out == V4L2_FIELD_ALTERNATE) denominator *= 2; /* Calculate the number of jiffies since we started streaming */ jiffies_since_start = cur_jiffies - dev->jiffies_vid_out; /* Get the number of buffers streamed since the start */ buffers_since_start = (u64)jiffies_since_start * denominator + (HZ * numerator) / 2; do_div(buffers_since_start, HZ * numerator); /* * After more than 0xf0000000 (rounded down to a multiple of * 'jiffies-per-day' to ease jiffies_to_msecs calculation) * jiffies have passed since we started streaming reset the * counters and keep track of the sequence offset. */ if (jiffies_since_start > JIFFIES_RESYNC) { dev->jiffies_vid_out = cur_jiffies; dev->out_seq_offset = buffers_since_start; buffers_since_start = 0; } dev->out_seq_count = buffers_since_start + dev->out_seq_offset; dev->vid_out_seq_count = dev->out_seq_count - dev->vid_out_seq_start; dev->vbi_out_seq_count = dev->out_seq_count - dev->vbi_out_seq_start; dev->meta_out_seq_count = dev->out_seq_count - dev->meta_out_seq_start; vivid_thread_vid_out_tick(dev); mutex_unlock(&dev->mutex); /* * Calculate the number of 'numerators' streamed since we started, * not including the current buffer. */ numerators_since_start = buffers_since_start * numerator; /* And the number of jiffies since we started */ jiffies_since_start = jiffies - dev->jiffies_vid_out; /* Increase by the 'numerator' of one buffer */ numerators_since_start += numerator; /* * Calculate when that next buffer is supposed to start * in jiffies since we started streaming. 
*/ next_jiffies_since_start = numerators_since_start * HZ + denominator / 2; do_div(next_jiffies_since_start, denominator); /* If it is in the past, then just schedule asap */ if (next_jiffies_since_start < jiffies_since_start) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && !kthread_should_stop()) schedule(); } dprintk(dev, 1, "Video Output Thread End\n"); return 0; } static void vivid_grab_controls(struct vivid_dev *dev, bool grab) { v4l2_ctrl_grab(dev->ctrl_has_crop_out, grab); v4l2_ctrl_grab(dev->ctrl_has_compose_out, grab); v4l2_ctrl_grab(dev->ctrl_has_scaler_out, grab); v4l2_ctrl_grab(dev->ctrl_tx_mode, grab); v4l2_ctrl_grab(dev->ctrl_tx_rgb_range, grab); } int vivid_start_generating_vid_out(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_out) { u32 seq_count = dev->out_seq_count + dev->seq_wrap * 128; if (pstreaming == &dev->vid_out_streaming) dev->vid_out_seq_start = seq_count; else if (pstreaming == &dev->vbi_out_streaming) dev->vbi_out_seq_start = seq_count; else dev->meta_out_seq_start = seq_count; *pstreaming = true; return 0; } /* Resets frame counters */ dev->jiffies_vid_out = jiffies; dev->vid_out_seq_start = dev->seq_wrap * 128; dev->vbi_out_seq_start = dev->seq_wrap * 128; dev->meta_out_seq_start = dev->seq_wrap * 128; dev->kthread_vid_out = kthread_run(vivid_thread_vid_out, dev, "%s-vid-out", dev->v4l2_dev.name); if (IS_ERR(dev->kthread_vid_out)) { int err = PTR_ERR(dev->kthread_vid_out); dev->kthread_vid_out = NULL; v4l2_err(&dev->v4l2_dev, "kernel_thread() failed\n"); return err; } *pstreaming = true; vivid_grab_controls(dev, true); dprintk(dev, 1, "returning from %s\n", __func__); return 0; } void vivid_stop_generating_vid_out(struct vivid_dev *dev, bool *pstreaming) { dprintk(dev, 1, "%s\n", __func__); if (dev->kthread_vid_out == NULL) return; *pstreaming = false; if (pstreaming == &dev->vid_out_streaming) { /* Release all active buffers */ while (!list_empty(&dev->vid_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vid_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vid_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vid_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->vbi_out_streaming) { while (!list_empty(&dev->vbi_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->vbi_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_vbi_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "vbi_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (pstreaming == &dev->meta_out_streaming) { while (!list_empty(&dev->meta_out_active)) { struct vivid_buffer *buf; buf = list_entry(dev->meta_out_active.next, struct vivid_buffer, list); list_del(&buf->list); v4l2_ctrl_request_complete(buf->vb.vb2_buf.req_obj.req, &dev->ctrl_hdl_meta_out); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_ERROR); dprintk(dev, 2, "meta_out buffer %d done\n", buf->vb.vb2_buf.index); } } if (dev->vid_out_streaming || dev->vbi_out_streaming || dev->meta_out_streaming) return; /* shutdown control thread */ vivid_grab_controls(dev, false); kthread_stop(dev->kthread_vid_out); dev->kthread_vid_out = NULL; }
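The pacing math in vivid_thread_vid_out() is all integer arithmetic with explicit rounding. The stand-alone sketch below (plain C, hypothetical HZ and frame-rate values) reproduces the two divisions: how many buffers should have been produced after a given number of jiffies, and when the next buffer is due.

#include <stdio.h>
#include <stdint.h>

#define HZ 250	/* assumed jiffies rate, for this example only */

int main(void)
{
	/* 30 fps => timeperframe = 1/30: numerator 1, denominator 30 */
	uint64_t numerator = 1, denominator = 30;
	uint64_t jiffies_since_start = 1000;	/* 4 seconds at HZ=250 */

	/* Same rounded division as the thread:
	 * buffers = jiffies * denominator / (HZ * numerator) */
	uint64_t buffers_since_start =
		(jiffies_since_start * denominator + (HZ * numerator) / 2) /
		(HZ * numerator);

	/* Schedule the next buffer: one more 'numerator' worth of time. */
	uint64_t numerators_since_start = buffers_since_start * numerator + numerator;
	uint64_t next_jiffies_since_start =
		(numerators_since_start * HZ + denominator / 2) / denominator;

	printf("buffers so far: %llu, next buffer due at jiffie %llu\n",
	       (unsigned long long)buffers_since_start,
	       (unsigned long long)next_jiffies_since_start);
	return 0;
}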
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __NET_TC_MIR_H
#define __NET_TC_MIR_H

#include <net/act_api.h>
#include <linux/tc_act/tc_mirred.h>

struct tcf_mirred {
	struct tc_action	common;
	int			tcfm_eaction;
	u32			tcfm_blockid;
	bool			tcfm_mac_header_xmit;
	struct net_device __rcu	*tcfm_dev;
	netdevice_tracker	tcfm_dev_tracker;
	struct list_head	tcfm_list;
};
#define to_mirred(a) ((struct tcf_mirred *)a)

static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a)
{
#ifdef CONFIG_NET_CLS_ACT
	if (a->ops && a->ops->id == TCA_ID_MIRRED)
		return to_mirred(a)->tcfm_eaction == TCA_EGRESS_REDIR;
#endif
	return false;
}

static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a)
{
#ifdef CONFIG_NET_CLS_ACT
	if (a->ops && a->ops->id == TCA_ID_MIRRED)
		return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR;
#endif
	return false;
}

static inline bool is_tcf_mirred_ingress_redirect(const struct tc_action *a)
{
#ifdef CONFIG_NET_CLS_ACT
	if (a->ops && a->ops->id == TCA_ID_MIRRED)
		return to_mirred(a)->tcfm_eaction == TCA_INGRESS_REDIR;
#endif
	return false;
}

static inline bool is_tcf_mirred_ingress_mirror(const struct tc_action *a)
{
#ifdef CONFIG_NET_CLS_ACT
	if (a->ops && a->ops->id == TCA_ID_MIRRED)
		return to_mirred(a)->tcfm_eaction == TCA_INGRESS_MIRROR;
#endif
	return false;
}

static inline struct net_device *tcf_mirred_dev(const struct tc_action *a)
{
	return rtnl_dereference(to_mirred(a)->tcfm_dev);
}

#endif /* __NET_TC_MIR_H */
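These inline helpers are what hardware-offload drivers typically use to recognise a mirred action during TC offload. A hedged sketch follows; the function and the flow around it are hypothetical, only the is_tcf_mirred_* helpers and tcf_mirred_dev() come from the header above, and the caller is assumed to hold rtnl since tcf_mirred_dev() uses rtnl_dereference().

#include <net/tc_act/tc_mirred.h>

/* Hypothetical offload hook: accept only egress mirred actions that point
 * at a netdev the hardware can actually forward to. */
static int mydrv_parse_mirred(const struct tc_action *act)
{
	struct net_device *dest;

	if (!is_tcf_mirred_egress_redirect(act) &&
	    !is_tcf_mirred_egress_mirror(act))
		return -EOPNOTSUPP;	/* ingress variants not offloaded here */

	dest = tcf_mirred_dev(act);	/* requires rtnl to be held */
	if (!dest)
		return -EINVAL;

	/* ... program the redirect/mirror towards 'dest' in hardware ... */
	return 0;
}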
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Definitions for the IP protocol.
 *
 * Version:	@(#)ip.h	1.0.2	04/28/93
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 */
#ifndef _LINUX_IP_H
#define _LINUX_IP_H

#include <linux/skbuff.h>
#include <uapi/linux/ip.h>

static inline struct iphdr *ip_hdr(const struct sk_buff *skb)
{
	return (struct iphdr *)skb_network_header(skb);
}

static inline struct iphdr *inner_ip_hdr(const struct sk_buff *skb)
{
	return (struct iphdr *)skb_inner_network_header(skb);
}

static inline struct iphdr *ipip_hdr(const struct sk_buff *skb)
{
	return (struct iphdr *)skb_transport_header(skb);
}

static inline unsigned int ip_transport_len(const struct sk_buff *skb)
{
	return ntohs(ip_hdr(skb)->tot_len) - skb_network_header_len(skb);
}

static inline unsigned int iph_totlen(const struct sk_buff *skb, const struct iphdr *iph)
{
	u32 len = ntohs(iph->tot_len);

	return (len || !skb_is_gso(skb) || !skb_is_gso_tcp(skb)) ?
	       len : skb->len - skb_network_offset(skb);
}

static inline unsigned int skb_ip_totlen(const struct sk_buff *skb)
{
	return iph_totlen(skb, ip_hdr(skb));
}

/* IPv4 datagram length is stored into 16bit field (tot_len) */
#define IP_MAX_MTU	0xFFFFU

static inline void iph_set_totlen(struct iphdr *iph, unsigned int len)
{
	iph->tot_len = len <= IP_MAX_MTU ? htons(len) : 0;
}

#endif /* _LINUX_IP_H */
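iph_totlen() exists because oversized GSO TCP packets can carry tot_len == 0 when the real length no longer fits the 16-bit field; in that case the length has to come from the skb itself. The tiny user-space sketch below restates the same decision rule with the inputs made explicit (no sk_buff involved; the helper name and parameters are invented for the example).

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* Mirror of the iph_totlen() rule: trust tot_len unless it is 0 on a GSO TCP
 * packet, in which case derive the length from the overall packet size. */
static unsigned int totlen(uint16_t tot_len_host, bool gso_tcp,
			   unsigned int len_from_network_header)
{
	if (tot_len_host || !gso_tcp)
		return tot_len_host;
	return len_from_network_header;
}

int main(void)
{
	printf("%u\n", totlen(1500, false, 1500));	/* normal packet: 1500 */
	printf("%u\n", totlen(0, true, 70000));		/* oversized GSO TCP: 70000 */
	return 0;
}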
/*
 * videobuf2-v4l2.h - V4L2 driver helper framework
 *
 * Copyright (C) 2010 Samsung Electronics
 *
 * Author: Pawel Osciak <pawel@osciak.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation.
 */
#ifndef _MEDIA_VIDEOBUF2_V4L2_H
#define _MEDIA_VIDEOBUF2_V4L2_H

#include <linux/videodev2.h>
#include <media/videobuf2-core.h>

#if VB2_MAX_FRAME != VIDEO_MAX_FRAME
#error VB2_MAX_FRAME != VIDEO_MAX_FRAME
#endif

#if VB2_MAX_PLANES != VIDEO_MAX_PLANES
#error VB2_MAX_PLANES != VIDEO_MAX_PLANES
#endif

struct video_device;

/**
 * struct vb2_v4l2_buffer - video buffer information for v4l2.
 *
 * @vb2_buf:	embedded struct &vb2_buffer.
 * @flags:	buffer informational flags.
 * @field:	field order of the image in the buffer, as defined by
 *		&enum v4l2_field.
 * @timecode:	frame timecode.
 * @sequence:	sequence count of this frame.
 * @request_fd:	the request_fd associated with this buffer
 * @is_held:	if true, then this capture buffer was held
 * @planes:	plane information (userptr/fd, length, bytesused, data_offset).
 *
 * Should contain enough information to be able to cover all the fields
 * of &struct v4l2_buffer at ``videodev2.h``.
 */
struct vb2_v4l2_buffer {
	struct vb2_buffer	vb2_buf;

	__u32			flags;
	__u32			field;
	struct v4l2_timecode	timecode;
	__u32			sequence;
	__s32			request_fd;
	bool			is_held;
	struct vb2_plane	planes[VB2_MAX_PLANES];
};

/* VB2 V4L2 flags as set in vb2_queue.subsystem_flags */
#define VB2_V4L2_FL_SUPPORTS_M2M_HOLD_CAPTURE_BUF (1 << 0)

/*
 * to_vb2_v4l2_buffer() - cast struct vb2_buffer * to struct vb2_v4l2_buffer *
 */
#define to_vb2_v4l2_buffer(vb) \
	container_of(vb, struct vb2_v4l2_buffer, vb2_buf)

/**
 * vb2_find_buffer() - Find a buffer with given timestamp
 *
 * @q:		pointer to &struct vb2_queue with videobuf2 queue.
 * @timestamp:	the timestamp to find.
 *
 * Returns the buffer with the given @timestamp, or NULL if not found.
*/ struct vb2_buffer *vb2_find_buffer(struct vb2_queue *q, u64 timestamp); int vb2_querybuf(struct vb2_queue *q, struct v4l2_buffer *b); /** * vb2_reqbufs() - Wrapper for vb2_core_reqbufs() that also verifies * the memory and type values. * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @req: &struct v4l2_requestbuffers passed from userspace to * &v4l2_ioctl_ops->vidioc_reqbufs handler in driver. */ int vb2_reqbufs(struct vb2_queue *q, struct v4l2_requestbuffers *req); /** * vb2_create_bufs() - Wrapper for vb2_core_create_bufs() that also verifies * the memory and type values. * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @create: creation parameters, passed from userspace to * &v4l2_ioctl_ops->vidioc_create_bufs handler in driver */ int vb2_create_bufs(struct vb2_queue *q, struct v4l2_create_buffers *create); /** * vb2_prepare_buf() - Pass ownership of a buffer from userspace to the kernel * * @q: pointer to &struct vb2_queue with videobuf2 queue. * @mdev: pointer to &struct media_device, may be NULL. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver * * Should be called from &v4l2_ioctl_ops->vidioc_prepare_buf ioctl handler * of a driver. * * This function: * * #) verifies the passed buffer, * #) calls &vb2_ops->buf_prepare callback in the driver (if provided), * in which driver-specific buffer initialization can be performed. * #) if @b->request_fd is non-zero and @mdev->ops->req_queue is set, * then bind the prepared buffer to the request. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_prepare_buf handler in driver. */ int vb2_prepare_buf(struct vb2_queue *q, struct media_device *mdev, struct v4l2_buffer *b); /** * vb2_qbuf() - Queue a buffer from userspace * @q: pointer to &struct vb2_queue with videobuf2 queue. * @mdev: pointer to &struct media_device, may be NULL. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_qbuf handler in driver * * Should be called from &v4l2_ioctl_ops->vidioc_qbuf handler of a driver. * * This function: * * #) verifies the passed buffer; * #) if @b->request_fd is non-zero and @mdev->ops->req_queue is set, * then bind the buffer to the request. * #) if necessary, calls &vb2_ops->buf_prepare callback in the driver * (if provided), in which driver-specific buffer initialization can * be performed; * #) if streaming is on, queues the buffer in driver by the means of * &vb2_ops->buf_queue callback for processing. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_qbuf handler in driver. */ int vb2_qbuf(struct vb2_queue *q, struct media_device *mdev, struct v4l2_buffer *b); /** * vb2_expbuf() - Export a buffer as a file descriptor * @q: pointer to &struct vb2_queue with videobuf2 queue. * @eb: export buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_expbuf handler in driver * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_expbuf handler in driver. */ int vb2_expbuf(struct vb2_queue *q, struct v4l2_exportbuffer *eb); /** * vb2_dqbuf() - Dequeue a buffer to the userspace * @q: pointer to &struct vb2_queue with videobuf2 queue. * @b: buffer structure passed from userspace to * &v4l2_ioctl_ops->vidioc_dqbuf handler in driver * @nonblocking: if true, this call will not sleep waiting for a buffer if no * buffers ready for dequeuing are present. 
Normally the driver * would be passing (&file->f_flags & %O_NONBLOCK) here * * Should be called from &v4l2_ioctl_ops->vidioc_dqbuf ioctl handler * of a driver. * * This function: * * #) verifies the passed buffer; * #) calls &vb2_ops->buf_finish callback in the driver (if provided), in which * driver can perform any additional operations that may be required before * returning the buffer to userspace, such as cache sync; * #) the buffer struct members are filled with relevant information for * the userspace. * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_dqbuf handler in driver. */ int vb2_dqbuf(struct vb2_queue *q, struct v4l2_buffer *b, bool nonblocking); /** * vb2_streamon - start streaming * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: type argument passed from userspace to vidioc_streamon handler, * as defined by &enum v4l2_buf_type. * * Should be called from &v4l2_ioctl_ops->vidioc_streamon handler of a driver. * * This function: * * 1) verifies current state * 2) passes any previously queued buffers to the driver and starts streaming * * The return values from this function are intended to be directly returned * from &v4l2_ioctl_ops->vidioc_streamon handler in the driver. */ int vb2_streamon(struct vb2_queue *q, enum v4l2_buf_type type); /** * vb2_streamoff - stop streaming * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: type argument passed from userspace to vidioc_streamoff handler * * Should be called from vidioc_streamoff handler of a driver. * * This function: * * #) verifies current state, * #) stop streaming and dequeues any queued buffers, including those previously * passed to the driver (after waiting for the driver to finish). * * This call can be used for pausing playback. * The return values from this function are intended to be directly returned * from vidioc_streamoff handler in the driver */ int vb2_streamoff(struct vb2_queue *q, enum v4l2_buf_type type); /** * vb2_queue_init() - initialize a videobuf2 queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * * The vb2_queue structure should be allocated by the driver. The driver is * responsible of clearing it's content and setting initial values for some * required entries before calling this function. * q->ops, q->mem_ops, q->type and q->io_modes are mandatory. Please refer * to the struct vb2_queue description in include/media/videobuf2-core.h * for more information. */ int __must_check vb2_queue_init(struct vb2_queue *q); /** * vb2_queue_init_name() - initialize a videobuf2 queue with a name * @q: pointer to &struct vb2_queue with videobuf2 queue. * @name: the queue name * * This function initializes the vb2_queue exactly like vb2_queue_init(), * and additionally sets the queue name. The queue name is used for logging * purpose, and should uniquely identify the queue within the context of the * device it belongs to. This is useful to attribute kernel log messages to the * right queue for m2m devices or other devices that handle multiple queues. */ int __must_check vb2_queue_init_name(struct vb2_queue *q, const char *name); /** * vb2_queue_release() - stop streaming, release the queue and free memory * @q: pointer to &struct vb2_queue with videobuf2 queue. * * This function stops streaming and performs necessary clean ups, including * freeing video buffer memory. The driver is responsible for freeing * the vb2_queue structure itself. 
*/ void vb2_queue_release(struct vb2_queue *q); /** * vb2_queue_change_type() - change the type of an inactive vb2_queue * @q: pointer to &struct vb2_queue with videobuf2 queue. * @type: the type to change to (V4L2_BUF_TYPE_VIDEO_*) * * This function changes the type of the vb2_queue. This is only possible * if the queue is not busy (i.e. no buffers have been allocated). * * vb2_queue_change_type() can be used to support multiple buffer types using * the same queue. The driver can implement v4l2_ioctl_ops.vidioc_reqbufs and * v4l2_ioctl_ops.vidioc_create_bufs functions and call vb2_queue_change_type() * before calling vb2_ioctl_reqbufs() or vb2_ioctl_create_bufs(), and thus * "lock" the buffer type until the buffers have been released. */ int vb2_queue_change_type(struct vb2_queue *q, unsigned int type); /** * vb2_poll() - implements poll userspace operation * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: file argument passed to the poll file operation handler * @wait: wait argument passed to the poll file operation handler * * This function implements poll file operation handler for a driver. * For CAPTURE queues, if a buffer is ready to be dequeued, the userspace will * be informed that the file descriptor of a video device is available for * reading. * For OUTPUT queues, if a buffer is ready to be dequeued, the file descriptor * will be reported as available for writing. * * If the driver uses struct v4l2_fh, then vb2_poll() will also check for any * pending events. * * The return values from this function are intended to be directly returned * from poll handler in driver. */ __poll_t vb2_poll(struct vb2_queue *q, struct file *file, poll_table *wait); /* * The following functions are not part of the vb2 core API, but are simple * helper functions that you can use in your struct v4l2_file_operations, * struct v4l2_ioctl_ops and struct vb2_ops. They will serialize if vb2_queue->lock * or video_device->lock is set, and they will set and test the queue owner * (vb2_queue->owner) to check if the calling filehandle is permitted to do the * queuing operation. */ /** * vb2_queue_is_busy() - check if the queue is busy * @q: pointer to &struct vb2_queue with videobuf2 queue. * @file: file through which the vb2 queue access is performed * * The queue is considered busy if it has an owner and the owner is not the * @file. * * Queue ownership is acquired and checked by some of the v4l2_ioctl_ops helpers * below. Drivers can also use this function directly when they need to * open-code ioctl handlers, for instance to add additional checks between the * queue ownership test and the call to the corresponding vb2 operation. 
 */
static inline bool vb2_queue_is_busy(struct vb2_queue *q, struct file *file)
{
	return q->owner && q->owner != file->private_data;
}

/* struct v4l2_ioctl_ops helpers */

int vb2_ioctl_reqbufs(struct file *file, void *priv,
			  struct v4l2_requestbuffers *p);
int vb2_ioctl_create_bufs(struct file *file, void *priv,
			  struct v4l2_create_buffers *p);
int vb2_ioctl_prepare_buf(struct file *file, void *priv,
			  struct v4l2_buffer *p);
int vb2_ioctl_querybuf(struct file *file, void *priv, struct v4l2_buffer *p);
int vb2_ioctl_qbuf(struct file *file, void *priv, struct v4l2_buffer *p);
int vb2_ioctl_dqbuf(struct file *file, void *priv, struct v4l2_buffer *p);
int vb2_ioctl_streamon(struct file *file, void *priv, enum v4l2_buf_type i);
int vb2_ioctl_streamoff(struct file *file, void *priv, enum v4l2_buf_type i);
int vb2_ioctl_expbuf(struct file *file, void *priv,
	struct v4l2_exportbuffer *p);

/* struct v4l2_file_operations helpers */

int vb2_fop_mmap(struct file *file, struct vm_area_struct *vma);
int vb2_fop_release(struct file *file);
int _vb2_fop_release(struct file *file, struct mutex *lock);
ssize_t vb2_fop_write(struct file *file, const char __user *buf,
		size_t count, loff_t *ppos);
ssize_t vb2_fop_read(struct file *file, char __user *buf,
		size_t count, loff_t *ppos);
__poll_t vb2_fop_poll(struct file *file, poll_table *wait);
#ifndef CONFIG_MMU
unsigned long vb2_fop_get_unmapped_area(struct file *file, unsigned long addr,
		unsigned long len, unsigned long pgoff, unsigned long flags);
#endif

/**
 * vb2_video_unregister_device - unregister the video device and release queue
 *
 * @vdev: pointer to &struct video_device
 *
 * If the driver uses vb2_fop_release()/_vb2_fop_release(), then it should use
 * vb2_video_unregister_device() instead of video_unregister_device().
 *
 * This function will call video_unregister_device() and then release the
 * vb2_queue if streaming is in progress. This will stop streaming and
 * simplify the unbind sequence, since after this call all subdevs
 * will have stopped streaming as well.
 */
void vb2_video_unregister_device(struct video_device *vdev);

/**
 * vb2_ops_wait_prepare - helper function to lock a struct &vb2_queue
 *
 * @vq: pointer to &struct vb2_queue
 *
 * .. note:: only use if vq->lock is non-NULL.
 */
void vb2_ops_wait_prepare(struct vb2_queue *vq);

/**
 * vb2_ops_wait_finish - helper function to unlock a struct &vb2_queue
 *
 * @vq: pointer to &struct vb2_queue
 *
 * .. note:: only use if vq->lock is non-NULL.
 */
void vb2_ops_wait_finish(struct vb2_queue *vq);

struct media_request;
int vb2_request_validate(struct media_request *req);
void vb2_request_queue(struct media_request *req);

#endif /* _MEDIA_VIDEOBUF2_V4L2_H */
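/*
 * Illustrative usage sketch (not part of the header above): a minimal
 * capture driver wiring a vb2_queue to the vb2_ioctl_*() and vb2_fop_*()
 * helpers declared in videobuf2-v4l2.h. All mydrv_* names are hypothetical
 * placeholders; the driver's vb2_ops callbacks (queue_setup, buf_queue,
 * start/stop_streaming, ...) are assumed to be implemented elsewhere.
 */
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <media/videobuf2-v4l2.h>
#include <media/videobuf2-vmalloc.h>
#include <media/v4l2-ioctl.h>
#include <media/v4l2-fh.h>

struct mydrv_dev {
	struct mutex lock;		/* serializes queue operations */
	struct vb2_queue queue;
};

/* vb2_ops implemented elsewhere in this hypothetical driver */
extern const struct vb2_ops mydrv_qops;

static int mydrv_init_queue(struct mydrv_dev *dev)
{
	struct vb2_queue *q = &dev->queue;

	mutex_init(&dev->lock);

	/* the driver clears the structure and fills the mandatory fields */
	memset(q, 0, sizeof(*q));
	q->type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	q->io_modes = VB2_MMAP | VB2_DMABUF | VB2_READ;
	q->drv_priv = dev;
	q->buf_struct_size = sizeof(struct vb2_v4l2_buffer);
	q->ops = &mydrv_qops;
	q->mem_ops = &vb2_vmalloc_memops;
	q->timestamp_flags = V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC;
	q->lock = &dev->lock;	/* lets the vb2 helpers serialize for us */

	return vb2_queue_init(q);
}

/* the helpers above can then be plugged in directly */
static const struct v4l2_ioctl_ops mydrv_ioctl_ops = {
	.vidioc_reqbufs		= vb2_ioctl_reqbufs,
	.vidioc_create_bufs	= vb2_ioctl_create_bufs,
	.vidioc_prepare_buf	= vb2_ioctl_prepare_buf,
	.vidioc_querybuf	= vb2_ioctl_querybuf,
	.vidioc_qbuf		= vb2_ioctl_qbuf,
	.vidioc_dqbuf		= vb2_ioctl_dqbuf,
	.vidioc_expbuf		= vb2_ioctl_expbuf,
	.vidioc_streamon	= vb2_ioctl_streamon,
	.vidioc_streamoff	= vb2_ioctl_streamoff,
};

static const struct v4l2_file_operations mydrv_fops = {
	.owner		= THIS_MODULE,
	.open		= v4l2_fh_open,
	.release	= vb2_fop_release,
	.read		= vb2_fop_read,
	.poll		= vb2_fop_poll,
	.mmap		= vb2_fop_mmap,
	.unlocked_ioctl	= video_ioctl2,
};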
/*
 * Cryptographic API.
 *
 * T10 Data Integrity Field CRC16 Crypto Transform using PCLMULQDQ Instructions
 *
 * Copyright (C) 2013 Intel Corporation
 * Author: Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/types.h>
#include <linux/module.h>
#include <linux/crc-t10dif.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/simd.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/kernel.h>
#include <asm/cpufeatures.h>
#include <asm/cpu_device_id.h>
#include <asm/simd.h>

asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);

struct chksum_desc_ctx {
	__u16 crc;
};

static int chksum_init(struct shash_desc *desc)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	ctx->crc = 0;

	return 0;
}

static int chksum_update(struct shash_desc *desc, const u8 *data,
			 unsigned int length)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	if (length >= 16 && crypto_simd_usable()) {
		kernel_fpu_begin();
		ctx->crc = crc_t10dif_pcl(ctx->crc, data, length);
		kernel_fpu_end();
	} else
		ctx->crc = crc_t10dif_generic(ctx->crc, data, length);
	return 0;
}

static int chksum_final(struct shash_desc *desc, u8 *out)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	*(__u16 *)out = ctx->crc;
	return 0;
}

static int __chksum_finup(__u16 crc, const u8 *data, unsigned int len, u8 *out)
{
	if (len >= 16 && crypto_simd_usable()) {
		kernel_fpu_begin();
		*(__u16 *)out = crc_t10dif_pcl(crc, data, len);
		kernel_fpu_end();
	} else
		*(__u16 *)out = crc_t10dif_generic(crc, data, len);
	return 0;
}

static int chksum_finup(struct shash_desc *desc, const u8 *data,
			unsigned int len, u8 *out)
{
	struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);

	return __chksum_finup(ctx->crc, data, len, out);
}

static int chksum_digest(struct shash_desc *desc, const u8 *data,
			 unsigned int length, u8 *out)
{
	return __chksum_finup(0, data, length, out);
}

static struct shash_alg alg = {
	.digestsize	= CRC_T10DIF_DIGEST_SIZE,
	.init		= chksum_init,
	.update		= chksum_update,
	.final		= chksum_final,
	.finup		= chksum_finup,
	.digest		= chksum_digest,
	.descsize	= sizeof(struct chksum_desc_ctx),
	.base		= {
		.cra_name		= "crct10dif",
		.cra_driver_name	= "crct10dif-pclmul",
		.cra_priority		= 200,
		.cra_blocksize		= CRC_T10DIF_BLOCK_SIZE,
		.cra_module		= THIS_MODULE,
	}
};

static const struct x86_cpu_id crct10dif_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, crct10dif_cpu_id);

static int __init crct10dif_intel_mod_init(void)
{
	if (!x86_match_cpu(crct10dif_cpu_id))
		return -ENODEV;

	return crypto_register_shash(&alg);
}

static void __exit crct10dif_intel_mod_fini(void)
{
	crypto_unregister_shash(&alg);
}

module_init(crct10dif_intel_mod_init);
module_exit(crct10dif_intel_mod_fini);

MODULE_AUTHOR("Tim Chen <tim.c.chen@linux.intel.com>");
MODULE_DESCRIPTION("T10 DIF CRC calculation accelerated with PCLMULQDQ.");
MODULE_LICENSE("GPL");

MODULE_ALIAS_CRYPTO("crct10dif");
MODULE_ALIAS_CRYPTO("crct10dif-pclmul");
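/*
 * Illustrative sketch (not part of the file above): how an in-kernel user
 * might compute a T10 DIF CRC through the generic crypto API. Requesting
 * the "crct10dif" algorithm lets the crypto core pick the highest-priority
 * implementation, i.e. "crct10dif-pclmul" on CPUs with PCLMULQDQ. The
 * example_crct10dif() name is hypothetical and error handling is minimal.
 */
#include <linux/err.h>
#include <linux/types.h>
#include <crypto/hash.h>

static int example_crct10dif(const u8 *data, unsigned int len, __u16 *result)
{
	struct crypto_shash *tfm;
	int err;

	tfm = crypto_alloc_shash("crct10dif", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	{
		SHASH_DESC_ON_STACK(desc, tfm);

		desc->tfm = tfm;
		/* init + update + final in one call; writes 2 bytes to *result */
		err = crypto_shash_digest(desc, data, len, (u8 *)result);
		shash_desc_zero(desc);
	}

	crypto_free_shash(tfm);
	return err;
}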
// SPDX-License-Identifier: GPL-2.0-only
/*
 * This is a module which is used for queueing packets and communicating with
 * userspace via nfnetlink.
 *
 * (C) 2005 by Harald Welte <laforge@netfilter.org>
 * (C) 2007 by Patrick McHardy <kaber@trash.net>
 *
 * Based on the old ipv4-only ip_queue.c:
 * (C) 2000-2002 James Morris <jmorris@intercode.com.au>
 * (C) 2003-2005 Netfilter Core Team <coreteam@netfilter.org>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/notifier.h>
#include <linux/netdevice.h>
#include <linux/netfilter.h>
#include <linux/proc_fs.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nfnetlink_queue.h>
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <linux/cgroup-defs.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
#include <net/netfilter/nf_queue.h>
#include <net/netns/generic.h>

#include <linux/atomic.h>

#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
#include "../bridge/br_private.h"
#endif

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif

#define NFQNL_QMAX_DEFAULT 1024

/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
 * includes the header length. Thus, the maximum packet length that we
 * support is 65531 bytes. We send truncated packets if the specified length
 * is larger than that. Userspace can check for presence of NFQA_CAP_LEN
 * attribute to detect truncation.
 */
#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)

struct nfqnl_instance {
	struct hlist_node hlist;		/* global list of queues */
	struct rcu_head rcu;

	u32 peer_portid;
	unsigned int queue_maxlen;
	unsigned int copy_range;
	unsigned int queue_dropped;
	unsigned int queue_user_dropped;

	u_int16_t queue_num;			/* number of this queue */
	u_int8_t copy_mode;
	u_int32_t flags;			/* Set using NFQA_CFG_FLAGS */
/*
 * Following fields are dirtied for each queued packet,
 * keep them in same cache line if possible.
*/ spinlock_t lock ____cacheline_aligned_in_smp; unsigned int queue_total; unsigned int id_sequence; /* 'sequence' of pkt ids */ struct list_head queue_list; /* packets in queue */ }; typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); static unsigned int nfnl_queue_net_id __read_mostly; #define INSTANCE_BUCKETS 16 struct nfnl_queue_net { spinlock_t instances_lock; struct hlist_head instance_table[INSTANCE_BUCKETS]; }; static struct nfnl_queue_net *nfnl_queue_pernet(struct net *net) { return net_generic(net, nfnl_queue_net_id); } static inline u_int8_t instance_hashfn(u_int16_t queue_num) { return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } static struct nfqnl_instance * instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { struct hlist_head *head; struct nfqnl_instance *inst; head = &q->instance_table[instance_hashfn(queue_num)]; hlist_for_each_entry_rcu(inst, head, hlist) { if (inst->queue_num == queue_num) return inst; } return NULL; } static struct nfqnl_instance * instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) { struct nfqnl_instance *inst; unsigned int h; int err; spin_lock(&q->instances_lock); if (instance_lookup(q, queue_num)) { err = -EEXIST; goto out_unlock; } inst = kzalloc(sizeof(*inst), GFP_ATOMIC); if (!inst) { err = -ENOMEM; goto out_unlock; } inst->queue_num = queue_num; inst->peer_portid = portid; inst->queue_maxlen = NFQNL_QMAX_DEFAULT; inst->copy_range = NFQNL_MAX_COPY_RANGE; inst->copy_mode = NFQNL_COPY_NONE; spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) { err = -EAGAIN; goto out_free; } h = instance_hashfn(queue_num); hlist_add_head_rcu(&inst->hlist, &q->instance_table[h]); spin_unlock(&q->instances_lock); return inst; out_free: kfree(inst); out_unlock: spin_unlock(&q->instances_lock); return ERR_PTR(err); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data); static void instance_destroy_rcu(struct rcu_head *head) { struct nfqnl_instance *inst = container_of(head, struct nfqnl_instance, rcu); nfqnl_flush(inst, NULL, 0); kfree(inst); module_put(THIS_MODULE); } static void __instance_destroy(struct nfqnl_instance *inst) { hlist_del_rcu(&inst->hlist); call_rcu(&inst->rcu, instance_destroy_rcu); } static void instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) { spin_lock(&q->instances_lock); __instance_destroy(inst); spin_unlock(&q->instances_lock); } static inline void __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_add_tail(&entry->list, &queue->queue_list); queue->queue_total++; } static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) { struct nf_queue_entry *entry = NULL, *i; spin_lock_bh(&queue->lock); list_for_each_entry(i, &queue->queue_list, list) { if (i->id == id) { entry = i; break; } } if (entry) __dequeue_entry(queue, entry); spin_unlock_bh(&queue->lock); return entry; } static unsigned int nf_iterate(struct sk_buff *skb, struct nf_hook_state *state, const struct nf_hook_entries *hooks, unsigned int *index) { const struct nf_hook_entry *hook; unsigned int verdict, i = *index; while (i < hooks->num_hook_entries) { hook = &hooks->hooks[i]; repeat: verdict = nf_hook_entry_hookfn(hook, skb, state); if (verdict != NF_ACCEPT) { *index = i; if (verdict != NF_REPEAT) return 
verdict; goto repeat; } i++; } *index = i; return NF_ACCEPT; } static struct nf_hook_entries *nf_hook_entries_head(const struct net *net, u8 pf, u8 hooknum) { switch (pf) { #ifdef CONFIG_NETFILTER_FAMILY_BRIDGE case NFPROTO_BRIDGE: return rcu_dereference(net->nf.hooks_bridge[hooknum]); #endif case NFPROTO_IPV4: return rcu_dereference(net->nf.hooks_ipv4[hooknum]); case NFPROTO_IPV6: return rcu_dereference(net->nf.hooks_ipv6[hooknum]); default: WARN_ON_ONCE(1); return NULL; } return NULL; } static int nf_ip_reroute(struct sk_buff *skb, const struct nf_queue_entry *entry) { #ifdef CONFIG_INET const struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); if (!(iph->tos == rt_info->tos && skb->mark == rt_info->mark && iph->daddr == rt_info->daddr && iph->saddr == rt_info->saddr)) return ip_route_me_harder(entry->state.net, entry->state.sk, skb, RTN_UNSPEC); } #endif return 0; } static int nf_reroute(struct sk_buff *skb, struct nf_queue_entry *entry) { const struct nf_ipv6_ops *v6ops; int ret = 0; switch (entry->state.pf) { case AF_INET: ret = nf_ip_reroute(skb, entry); break; case AF_INET6: v6ops = rcu_dereference(nf_ipv6_ops); if (v6ops) ret = v6ops->reroute(skb, entry); break; } return ret; } /* caller must hold rcu read-side lock */ static void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_hook_entry *hook_entry; const struct nf_hook_entries *hooks; struct sk_buff *skb = entry->skb; const struct net *net; unsigned int i; int err; u8 pf; net = entry->state.net; pf = entry->state.pf; hooks = nf_hook_entries_head(net, pf, entry->state.hook); i = entry->hook_index; if (WARN_ON_ONCE(!hooks || i >= hooks->num_hook_entries)) { kfree_skb_reason(skb, SKB_DROP_REASON_NETFILTER_DROP); nf_queue_entry_free(entry); return; } hook_entry = &hooks->hooks[i]; /* Continue traversal iff userspace said ok... 
*/ if (verdict == NF_REPEAT) verdict = nf_hook_entry_hookfn(hook_entry, skb, &entry->state); if (verdict == NF_ACCEPT) { if (nf_reroute(skb, entry) < 0) verdict = NF_DROP; } if (verdict == NF_ACCEPT) { next_hook: ++i; verdict = nf_iterate(skb, &entry->state, hooks, &i); } switch (verdict & NF_VERDICT_MASK) { case NF_ACCEPT: case NF_STOP: local_bh_disable(); entry->state.okfn(entry->state.net, entry->state.sk, skb); local_bh_enable(); break; case NF_QUEUE: err = nf_queue(skb, &entry->state, i, verdict); if (err == 1) goto next_hook; break; case NF_STOLEN: break; default: kfree_skb(skb); } nf_queue_entry_free(entry); } static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; if (verdict == NF_ACCEPT || verdict == NF_REPEAT || verdict == NF_STOP) { unsigned int ct_verdict = verdict; rcu_read_lock(); ct_hook = rcu_dereference(nf_ct_hook); if (ct_hook) ct_verdict = ct_hook->update(entry->state.net, entry->skb); rcu_read_unlock(); switch (ct_verdict & NF_VERDICT_MASK) { case NF_ACCEPT: /* follow userspace verdict, could be REPEAT */ break; case NF_STOLEN: nf_queue_entry_free(entry); return; default: verdict = ct_verdict & NF_VERDICT_MASK; break; } } nf_reinject(entry, verdict); } static void nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) { struct nf_queue_entry *entry, *next; spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, next, &queue->queue_list, list) { if (!cmpfn || cmpfn(entry, data)) { list_del(&entry->list); queue->queue_total--; nfqnl_reinject(entry, NF_DROP); } } spin_unlock_bh(&queue->lock); } static int nfqnl_put_packet_info(struct sk_buff *nlskb, struct sk_buff *packet, bool csum_verify) { __u32 flags = 0; if (packet->ip_summed == CHECKSUM_PARTIAL) flags = NFQA_SKB_CSUMNOTREADY; else if (csum_verify) flags = NFQA_SKB_CSUM_NOTVERIFIED; if (skb_is_gso(packet)) flags |= NFQA_SKB_GSO; return flags ? 
nla_put_be32(nlskb, NFQA_SKB_INFO, htonl(flags)) : 0; } static int nfqnl_put_sk_uidgid(struct sk_buff *skb, struct sock *sk) { const struct cred *cred; if (!sk_fullsock(sk)) return 0; read_lock_bh(&sk->sk_callback_lock); if (sk->sk_socket && sk->sk_socket->file) { cred = sk->sk_socket->file->f_cred; if (nla_put_be32(skb, NFQA_UID, htonl(from_kuid_munged(&init_user_ns, cred->fsuid)))) goto nla_put_failure; if (nla_put_be32(skb, NFQA_GID, htonl(from_kgid_munged(&init_user_ns, cred->fsgid)))) goto nla_put_failure; } read_unlock_bh(&sk->sk_callback_lock); return 0; nla_put_failure: read_unlock_bh(&sk->sk_callback_lock); return -1; } static int nfqnl_put_sk_classid(struct sk_buff *skb, struct sock *sk) { #if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) if (sk && sk_fullsock(sk)) { u32 classid = sock_cgroup_classid(&sk->sk_cgrp_data); if (classid && nla_put_be32(skb, NFQA_CGROUP_CLASSID, htonl(classid))) return -1; } #endif return 0; } static u32 nfqnl_get_sk_secctx(struct sk_buff *skb, char **secdata) { u32 seclen = 0; #if IS_ENABLED(CONFIG_NETWORK_SECMARK) if (!skb || !sk_fullsock(skb->sk)) return 0; read_lock_bh(&skb->sk->sk_callback_lock); if (skb->secmark) security_secid_to_secctx(skb->secmark, secdata, &seclen); read_unlock_bh(&skb->sk->sk_callback_lock); #endif return seclen; } static u32 nfqnl_get_bridge_size(struct nf_queue_entry *entry) { struct sk_buff *entskb = entry->skb; u32 nlalen = 0; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) nlalen += nla_total_size(nla_total_size(sizeof(__be16)) + nla_total_size(sizeof(__be16))); if (entskb->network_header > entskb->mac_header) nlalen += nla_total_size((entskb->network_header - entskb->mac_header)); return nlalen; } static int nfqnl_put_bridge(struct nf_queue_entry *entry, struct sk_buff *skb) { struct sk_buff *entskb = entry->skb; if (entry->state.pf != PF_BRIDGE || !skb_mac_header_was_set(entskb)) return 0; if (skb_vlan_tag_present(entskb)) { struct nlattr *nest; nest = nla_nest_start(skb, NFQA_VLAN); if (!nest) goto nla_put_failure; if (nla_put_be16(skb, NFQA_VLAN_TCI, htons(entskb->vlan_tci)) || nla_put_be16(skb, NFQA_VLAN_PROTO, entskb->vlan_proto)) goto nla_put_failure; nla_nest_end(skb, nest); } if (entskb->mac_header < entskb->network_header) { int len = (int)(entskb->network_header - entskb->mac_header); if (nla_put(skb, NFQA_L2HDR, len, skb_mac_header(entskb))) goto nla_put_failure; } return 0; nla_put_failure: return -1; } static struct sk_buff * nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry, __be32 **packet_id_ptr) { size_t size; size_t data_len = 0, cap_len = 0; unsigned int hlen = 0; struct sk_buff *skb; struct nlattr *nla; struct nfqnl_msg_packet_hdr *pmsg; struct nlmsghdr *nlh; struct sk_buff *entskb = entry->skb; struct net_device *indev; struct net_device *outdev; struct nf_conn *ct = NULL; enum ip_conntrack_info ctinfo = 0; const struct nfnl_ct_hook *nfnl_ct; bool csum_verify; char *secdata = NULL; u32 seclen = 0; ktime_t tstamp; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(sizeof(struct nfqnl_msg_packet_hdr)) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) + nla_total_size(sizeof(u_int32_t)) /* ifindex */ + nla_total_size(sizeof(u_int32_t)) /* ifindex */ #endif + nla_total_size(sizeof(u_int32_t)) /* mark */ + nla_total_size(sizeof(u_int32_t)) /* priority */ + nla_total_size(sizeof(struct 
nfqnl_msg_packet_hw)) + nla_total_size(sizeof(u_int32_t)) /* skbinfo */ #if IS_ENABLED(CONFIG_CGROUP_NET_CLASSID) + nla_total_size(sizeof(u_int32_t)) /* classid */ #endif + nla_total_size(sizeof(u_int32_t)); /* cap_len */ tstamp = skb_tstamp_cond(entskb, false); if (tstamp) size += nla_total_size(sizeof(struct nfqnl_msg_packet_timestamp)); size += nfqnl_get_bridge_size(entry); if (entry->state.hook <= NF_INET_FORWARD || (entry->state.hook == NF_INET_POST_ROUTING && entskb->sk == NULL)) csum_verify = !skb_csum_unnecessary(entskb); else csum_verify = false; outdev = entry->state.out; switch ((enum nfqnl_config_mode)READ_ONCE(queue->copy_mode)) { case NFQNL_COPY_META: case NFQNL_COPY_NONE: break; case NFQNL_COPY_PACKET: if (!(queue->flags & NFQA_CFG_F_GSO) && entskb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(entskb)) return NULL; data_len = READ_ONCE(queue->copy_range); if (data_len > entskb->len) data_len = entskb->len; hlen = skb_zerocopy_headlen(entskb); hlen = min_t(unsigned int, hlen, data_len); size += sizeof(struct nlattr) + hlen; cap_len = entskb->len; break; } nfnl_ct = rcu_dereference(nfnl_ct_hook); #if IS_ENABLED(CONFIG_NF_CONNTRACK) if (queue->flags & NFQA_CFG_F_CONNTRACK) { if (nfnl_ct != NULL) { ct = nf_ct_get(entskb, &ctinfo); if (ct != NULL) size += nfnl_ct->build_size(ct); } } #endif if (queue->flags & NFQA_CFG_F_UID_GID) { size += (nla_total_size(sizeof(u_int32_t)) /* uid */ + nla_total_size(sizeof(u_int32_t))); /* gid */ } if ((queue->flags & NFQA_CFG_F_SECCTX) && entskb->sk) { seclen = nfqnl_get_sk_secctx(entskb, &secdata); if (seclen) size += nla_total_size(seclen); } skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { skb_tx_error(entskb); goto nlmsg_failure; } nlh = nfnl_msg_put(skb, 0, 0, nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), 0, entry->state.pf, NFNETLINK_V0, htons(queue->queue_num)); if (!nlh) { skb_tx_error(entskb); kfree_skb(skb); goto nlmsg_failure; } nla = __nla_reserve(skb, NFQA_PACKET_HDR, sizeof(*pmsg)); pmsg = nla_data(nla); pmsg->hw_protocol = entskb->protocol; pmsg->hook = entry->state.hook; *packet_id_ptr = &pmsg->packet_id; indev = entry->state.in; if (indev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: indev is physical input device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(indev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(br_port_get_rcu(indev)->br->dev->ifindex))) goto nla_put_failure; } else { int physinif; /* Case 2: indev is bridge group, we need to look for * physical device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_INDEV, htonl(indev->ifindex))) goto nla_put_failure; physinif = nf_bridge_get_physinif(entskb); if (physinif && nla_put_be32(skb, NFQA_IFINDEX_PHYSINDEV, htonl(physinif))) goto nla_put_failure; } #endif } if (outdev) { #if !IS_ENABLED(CONFIG_BRIDGE_NETFILTER) if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; #else if (entry->state.pf == PF_BRIDGE) { /* Case 1: outdev is physical output device, we need to * look for bridge group (when called from * netfilter_bridge) */ if (nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(outdev->ifindex)) || /* this is the bridge group "brX" */ /* rcu_read_lock()ed by __nf_queue */ nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, 
htonl(br_port_get_rcu(outdev)->br->dev->ifindex))) goto nla_put_failure; } else { int physoutif; /* Case 2: outdev is bridge group, we need to look for * physical output device (when called from ipv4) */ if (nla_put_be32(skb, NFQA_IFINDEX_OUTDEV, htonl(outdev->ifindex))) goto nla_put_failure; physoutif = nf_bridge_get_physoutif(entskb); if (physoutif && nla_put_be32(skb, NFQA_IFINDEX_PHYSOUTDEV, htonl(physoutif))) goto nla_put_failure; } #endif } if (entskb->mark && nla_put_be32(skb, NFQA_MARK, htonl(entskb->mark))) goto nla_put_failure; if (entskb->priority && nla_put_be32(skb, NFQA_PRIORITY, htonl(entskb->priority))) goto nla_put_failure; if (indev && entskb->dev && skb_mac_header_was_set(entskb) && skb_mac_header_len(entskb) != 0) { struct nfqnl_msg_packet_hw phw; int len; memset(&phw, 0, sizeof(phw)); len = dev_parse_header(entskb, phw.hw_addr); if (len) { phw.hw_addrlen = htons(len); if (nla_put(skb, NFQA_HWADDR, sizeof(phw), &phw)) goto nla_put_failure; } } if (nfqnl_put_bridge(entry, skb) < 0) goto nla_put_failure; if (entry->state.hook <= NF_INET_FORWARD && tstamp) { struct nfqnl_msg_packet_timestamp ts; struct timespec64 kts = ktime_to_timespec64(tstamp); ts.sec = cpu_to_be64(kts.tv_sec); ts.usec = cpu_to_be64(kts.tv_nsec / NSEC_PER_USEC); if (nla_put(skb, NFQA_TIMESTAMP, sizeof(ts), &ts)) goto nla_put_failure; } if ((queue->flags & NFQA_CFG_F_UID_GID) && entskb->sk && nfqnl_put_sk_uidgid(skb, entskb->sk) < 0) goto nla_put_failure; if (nfqnl_put_sk_classid(skb, entskb->sk) < 0) goto nla_put_failure; if (seclen && nla_put(skb, NFQA_SECCTX, seclen, secdata)) goto nla_put_failure; if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0) goto nla_put_failure; if (cap_len > data_len && nla_put_be32(skb, NFQA_CAP_LEN, htonl(cap_len))) goto nla_put_failure; if (nfqnl_put_packet_info(skb, entskb, csum_verify)) goto nla_put_failure; if (data_len) { struct nlattr *nla; if (skb_tailroom(skb) < sizeof(*nla) + hlen) goto nla_put_failure; nla = skb_put(skb, sizeof(*nla)); nla->nla_type = NFQA_PAYLOAD; nla->nla_len = nla_attr_size(data_len); if (skb_zerocopy(skb, entskb, data_len, hlen)) goto nla_put_failure; } nlh->nlmsg_len = skb->len; if (seclen) security_release_secctx(secdata, seclen); return skb; nla_put_failure: skb_tx_error(entskb); kfree_skb(skb); net_err_ratelimited("nf_queue: error creating packet message\n"); nlmsg_failure: if (seclen) security_release_secctx(secdata, seclen); return NULL; } static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; const struct nf_conn *ct = (void *)skb_nfct(entry->skb); if (ct && ((ct->status & flags) == IPS_DYING)) return true; #endif return false; } static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) { struct sk_buff *nskb; int err = -ENOBUFS; __be32 *packet_id_ptr; int failopen = 0; nskb = nfqnl_build_packet_message(net, queue, entry, &packet_id_ptr); if (nskb == NULL) { err = -ENOMEM; goto err_out; } spin_lock_bh(&queue->lock); if (nf_ct_drop_unconfirmed(entry)) goto err_out_free_nskb; if (queue->queue_total >= queue->queue_maxlen) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_dropped++; net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", queue->queue_total); } goto err_out_free_nskb; } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); /* nfnetlink_unicast will either free the 
nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; } else { queue->queue_user_dropped++; } goto err_out_unlock; } __enqueue_entry(queue, entry); spin_unlock_bh(&queue->lock); return 0; err_out_free_nskb: kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); if (failopen) nfqnl_reinject(entry, NF_ACCEPT); err_out: return err; } static struct nf_queue_entry * nf_queue_entry_dup(struct nf_queue_entry *e) { struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC); if (!entry) return NULL; if (nf_queue_entry_get_refs(entry)) return entry; kfree(entry); return NULL; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) /* When called from bridge netfilter, skb->data must point to MAC header * before calling skb_gso_segment(). Else, original MAC header is lost * and segmented skbs will be sent to wrong destination. */ static void nf_bridge_adjust_skb_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_push(skb, skb->network_header - skb->mac_header); } static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) { if (nf_bridge_info_get(skb)) __skb_pull(skb, skb->network_header - skb->mac_header); } #else #define nf_bridge_adjust_skb_data(s) do {} while (0) #define nf_bridge_adjust_segmented_data(s) do {} while (0) #endif static int __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, struct sk_buff *skb, struct nf_queue_entry *entry) { int ret = -ENOMEM; struct nf_queue_entry *entry_seg; nf_bridge_adjust_segmented_data(skb); if (skb->next == NULL) { /* last packet, no need to copy entry */ struct sk_buff *gso_skb = entry->skb; entry->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry); if (ret) entry->skb = gso_skb; return ret; } skb_mark_not_on_list(skb); entry_seg = nf_queue_entry_dup(entry); if (entry_seg) { entry_seg->skb = skb; ret = __nfqnl_enqueue_packet(net, queue, entry_seg); if (ret) nf_queue_entry_free(entry_seg); } return ret; } static int nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { unsigned int queued; struct nfqnl_instance *queue; struct sk_buff *skb, *segs, *nskb; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); /* rcu_read_lock()ed by nf_hook_thresh */ queue = instance_lookup(q, queuenum); if (!queue) return -ESRCH; if (queue->copy_mode == NFQNL_COPY_NONE) return -EINVAL; skb = entry->skb; switch (entry->state.pf) { case NFPROTO_IPV4: skb->protocol = htons(ETH_P_IP); break; case NFPROTO_IPV6: skb->protocol = htons(ETH_P_IPV6); break; } if ((queue->flags & NFQA_CFG_F_GSO) || !skb_is_gso(skb)) return __nfqnl_enqueue_packet(net, queue, entry); nf_bridge_adjust_skb_data(skb); segs = skb_gso_segment(skb, 0); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ESRCH to * mean 'ignore this hook'. 
*/ if (IS_ERR_OR_NULL(segs)) goto out_err; queued = 0; err = 0; skb_list_walk_safe(segs, segs, nskb) { if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); if (err == 0) queued++; else kfree_skb(segs); } if (queued) { if (err) /* some segments are already queued */ nf_queue_entry_free(entry); kfree_skb(skb); return 0; } out_err: nf_bridge_adjust_segmented_data(skb); return err; } static int nfqnl_mangle(void *data, unsigned int data_len, struct nf_queue_entry *e, int diff) { struct sk_buff *nskb; if (diff < 0) { unsigned int min_len = skb_transport_offset(e->skb); if (data_len < min_len) return -EINVAL; if (pskb_trim(e->skb, data_len)) return -ENOMEM; } else if (diff > 0) { if (data_len > 0xFFFF) return -EINVAL; if (diff > skb_tailroom(e->skb)) { nskb = skb_copy_expand(e->skb, skb_headroom(e->skb), diff, GFP_ATOMIC); if (!nskb) return -ENOMEM; kfree_skb(e->skb); e->skb = nskb; } skb_put(e->skb, diff); } if (skb_ensure_writable(e->skb, data_len)) return -ENOMEM; skb_copy_to_linear_data(e->skb, data, data_len); e->skb->ip_summed = CHECKSUM_NONE; return 0; } static int nfqnl_set_mode(struct nfqnl_instance *queue, unsigned char mode, unsigned int range) { int status = 0; spin_lock_bh(&queue->lock); switch (mode) { case NFQNL_COPY_NONE: case NFQNL_COPY_META: queue->copy_mode = mode; queue->copy_range = 0; break; case NFQNL_COPY_PACKET: queue->copy_mode = mode; if (range == 0 || range > NFQNL_MAX_COPY_RANGE) queue->copy_range = NFQNL_MAX_COPY_RANGE; else queue->copy_range = range; break; default: status = -EINVAL; } spin_unlock_bh(&queue->lock); return status; } static int dev_cmp(struct nf_queue_entry *entry, unsigned long ifindex) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) int physinif, physoutif; physinif = nf_bridge_get_physinif(entry->skb); physoutif = nf_bridge_get_physoutif(entry->skb); if (physinif == ifindex || physoutif == ifindex) return 1; #endif if (entry->state.in) if (entry->state.in->ifindex == ifindex) return 1; if (entry->state.out) if (entry->state.out->ifindex == ifindex) return 1; return 0; } /* drop all packets with either indev or outdev == ifindex from all queue * instances */ static void nfqnl_dev_drop(struct net *net, int ifindex) { int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); rcu_read_lock(); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, dev_cmp, ifindex); } rcu_read_unlock(); } static int nfqnl_rcv_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); /* Drop any packets associated with the downed device */ if (event == NETDEV_DOWN) nfqnl_dev_drop(dev_net(dev), dev->ifindex); return NOTIFY_DONE; } static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; static void nfqnl_nf_hook_drop(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); int i; /* This function is also called on net namespace error unwind, * when pernet_ops->init() failed and ->exit() functions of the * previous pernet_ops gets called. * * This may result in a call to nfqnl_nf_hook_drop() before * struct nfnl_queue_net was allocated. 
*/ if (!q) return; for (i = 0; i < INSTANCE_BUCKETS; i++) { struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_rcu(inst, head, hlist) nfqnl_flush(inst, NULL, 0); } } static int nfqnl_rcv_nl_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netlink_notify *n = ptr; struct nfnl_queue_net *q = nfnl_queue_pernet(n->net); if (event == NETLINK_URELEASE && n->protocol == NETLINK_NETFILTER) { int i; /* destroy all instances for this portid */ spin_lock(&q->instances_lock); for (i = 0; i < INSTANCE_BUCKETS; i++) { struct hlist_node *t2; struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; hlist_for_each_entry_safe(inst, t2, head, hlist) { if (n->portid == inst->peer_portid) __instance_destroy(inst); } } spin_unlock(&q->instances_lock); } return NOTIFY_DONE; } static struct notifier_block nfqnl_rtnl_notifier = { .notifier_call = nfqnl_rcv_nl_event, }; static const struct nla_policy nfqa_vlan_policy[NFQA_VLAN_MAX + 1] = { [NFQA_VLAN_TCI] = { .type = NLA_U16}, [NFQA_VLAN_PROTO] = { .type = NLA_U16}, }; static const struct nla_policy nfqa_verdict_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PAYLOAD] = { .type = NLA_UNSPEC }, [NFQA_CT] = { .type = NLA_UNSPEC }, [NFQA_EXP] = { .type = NLA_UNSPEC }, [NFQA_VLAN] = { .type = NLA_NESTED }, [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static const struct nla_policy nfqa_verdict_batch_policy[NFQA_MAX+1] = { [NFQA_VERDICT_HDR] = { .len = sizeof(struct nfqnl_msg_verdict_hdr) }, [NFQA_MARK] = { .type = NLA_U32 }, [NFQA_PRIORITY] = { .type = NLA_U32 }, }; static struct nfqnl_instance * verdict_instance_lookup(struct nfnl_queue_net *q, u16 queue_num, u32 nlportid) { struct nfqnl_instance *queue; queue = instance_lookup(q, queue_num); if (!queue) return ERR_PTR(-ENODEV); if (queue->peer_portid != nlportid) return ERR_PTR(-EPERM); return queue; } static struct nfqnl_msg_verdict_hdr* verdicthdr_get(const struct nlattr * const nfqa[]) { struct nfqnl_msg_verdict_hdr *vhdr; unsigned int verdict; if (!nfqa[NFQA_VERDICT_HDR]) return NULL; vhdr = nla_data(nfqa[NFQA_VERDICT_HDR]); verdict = ntohl(vhdr->verdict) & NF_VERDICT_MASK; if (verdict > NF_MAX_VERDICT || verdict == NF_STOLEN) return NULL; return vhdr; } static int nfq_id_after(unsigned int id, unsigned int max) { return (int)(id - max) > 0; } static int nfqnl_recv_verdict_batch(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u16 queue_num = ntohs(info->nfmsg->res_id); struct nf_queue_entry *entry, *tmp; struct nfqnl_msg_verdict_hdr *vhdr; struct nfqnl_instance *queue; unsigned int verdict, maxid; LIST_HEAD(batch_list); queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); maxid = ntohl(vhdr->id); spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, tmp, &queue->queue_list, list) { if (nfq_id_after(entry->id, maxid)) break; __dequeue_entry(queue, entry); list_add_tail(&entry->list, &batch_list); } spin_unlock_bh(&queue->lock); if (list_empty(&batch_list)) return -ENOENT; list_for_each_entry_safe(entry, tmp, &batch_list, list) { if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = 
ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); } return 0; } static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct, const struct nlmsghdr *nlh, const struct nlattr * const nfqa[], struct nf_queue_entry *entry, enum ip_conntrack_info *ctinfo) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) struct nf_conn *ct; ct = nf_ct_get(entry->skb, ctinfo); if (ct == NULL) return NULL; if (nfnl_ct->parse(nfqa[NFQA_CT], ct) < 0) return NULL; if (nfqa[NFQA_EXP]) nfnl_ct->attach_expect(nfqa[NFQA_EXP], ct, NETLINK_CB(entry->skb).portid, nlmsg_report(nlh)); return ct; #else return NULL; #endif } static int nfqa_parse_bridge(struct nf_queue_entry *entry, const struct nlattr * const nfqa[]) { if (nfqa[NFQA_VLAN]) { struct nlattr *tb[NFQA_VLAN_MAX + 1]; int err; err = nla_parse_nested_deprecated(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], nfqa_vlan_policy, NULL); if (err < 0) return err; if (!tb[NFQA_VLAN_TCI] || !tb[NFQA_VLAN_PROTO]) return -EINVAL; __vlan_hwaccel_put_tag(entry->skb, nla_get_be16(tb[NFQA_VLAN_PROTO]), ntohs(nla_get_be16(tb[NFQA_VLAN_TCI]))); } if (nfqa[NFQA_L2HDR]) { int mac_header_len = entry->skb->network_header - entry->skb->mac_header; if (mac_header_len != nla_len(nfqa[NFQA_L2HDR])) return -EINVAL; else if (mac_header_len > 0) memcpy(skb_mac_header(entry->skb), nla_data(nfqa[NFQA_L2HDR]), mac_header_len); } return 0; } static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); const struct nfnl_ct_hook *nfnl_ct; struct nfqnl_msg_verdict_hdr *vhdr; enum ip_conntrack_info ctinfo; struct nfqnl_instance *queue; struct nf_queue_entry *entry; struct nf_conn *ct = NULL; unsigned int verdict; int err; queue = verdict_instance_lookup(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) return PTR_ERR(queue); vhdr = verdicthdr_get(nfqa); if (!vhdr) return -EINVAL; verdict = ntohl(vhdr->verdict); entry = find_dequeue_entry(queue, ntohl(vhdr->id)); if (entry == NULL) return -ENOENT; /* rcu lock already held from nfnl->call_rcu. 
*/ nfnl_ct = rcu_dereference(nfnl_ct_hook); if (nfqa[NFQA_CT]) { if (nfnl_ct != NULL) ct = nfqnl_ct_parse(nfnl_ct, info->nlh, nfqa, entry, &ctinfo); } if (entry->state.pf == PF_BRIDGE) { err = nfqa_parse_bridge(entry, nfqa); if (err < 0) return err; } if (nfqa[NFQA_PAYLOAD]) { u16 payload_len = nla_len(nfqa[NFQA_PAYLOAD]); int diff = payload_len - entry->skb->len; if (nfqnl_mangle(nla_data(nfqa[NFQA_PAYLOAD]), payload_len, entry, diff) < 0) verdict = NF_DROP; if (ct && diff) nfnl_ct->seq_adjust(entry->skb, ct, ctinfo, diff); } if (nfqa[NFQA_MARK]) entry->skb->mark = ntohl(nla_get_be32(nfqa[NFQA_MARK])); if (nfqa[NFQA_PRIORITY]) entry->skb->priority = ntohl(nla_get_be32(nfqa[NFQA_PRIORITY])); nfqnl_reinject(entry, verdict); return 0; } static int nfqnl_recv_unsupp(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const cda[]) { return -ENOTSUPP; } static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { [NFQA_CFG_CMD] = { .len = sizeof(struct nfqnl_msg_config_cmd) }, [NFQA_CFG_PARAMS] = { .len = sizeof(struct nfqnl_msg_config_params) }, [NFQA_CFG_QUEUE_MAXLEN] = { .type = NLA_U32 }, [NFQA_CFG_MASK] = { .type = NLA_U32 }, [NFQA_CFG_FLAGS] = { .type = NLA_U32 }, }; static const struct nf_queue_handler nfqh = { .outfn = nfqnl_enqueue_packet, .nf_hook_drop = nfqnl_nf_hook_drop, }; static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, const struct nlattr * const nfqa[]) { struct nfnl_queue_net *q = nfnl_queue_pernet(info->net); u_int16_t queue_num = ntohs(info->nfmsg->res_id); struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; int ret = 0; if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); /* Obsolete commands without queue context */ switch (cmd->command) { case NFQNL_CFG_CMD_PF_BIND: return 0; case NFQNL_CFG_CMD_PF_UNBIND: return 0; } } /* Check if we support these flags in first place, dependencies should * be there too not to break atomicity. */ if (nfqa[NFQA_CFG_FLAGS]) { if (!nfqa[NFQA_CFG_MASK]) { /* A mask is needed to specify which flags are being * changed. 
*/ return -EINVAL; } flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS])); mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK])); if (flags >= NFQA_CFG_F_MAX) return -EOPNOTSUPP; #if !IS_ENABLED(CONFIG_NETWORK_SECMARK) if (flags & mask & NFQA_CFG_F_SECCTX) return -EOPNOTSUPP; #endif if ((flags & mask & NFQA_CFG_F_CONNTRACK) && !rcu_access_pointer(nfnl_ct_hook)) { #ifdef CONFIG_MODULES nfnl_unlock(NFNL_SUBSYS_QUEUE); request_module("ip_conntrack_netlink"); nfnl_lock(NFNL_SUBSYS_QUEUE); if (rcu_access_pointer(nfnl_ct_hook)) return -EAGAIN; #endif return -EOPNOTSUPP; } } rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { ret = -EPERM; goto err_out_unlock; } if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: if (queue) { ret = -EBUSY; goto err_out_unlock; } queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); if (IS_ERR(queue)) { ret = PTR_ERR(queue); goto err_out_unlock; } break; case NFQNL_CFG_CMD_UNBIND: if (!queue) { ret = -ENODEV; goto err_out_unlock; } instance_destroy(q, queue); goto err_out_unlock; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: ret = -ENOTSUPP; goto err_out_unlock; } } if (!queue) { ret = -ENODEV; goto err_out_unlock; } if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = nla_data(nfqa[NFQA_CFG_PARAMS]); nfqnl_set_mode(queue, params->copy_mode, ntohl(params->copy_range)); } if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) { __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]); spin_lock_bh(&queue->lock); queue->queue_maxlen = ntohl(*queue_maxlen); spin_unlock_bh(&queue->lock); } if (nfqa[NFQA_CFG_FLAGS]) { spin_lock_bh(&queue->lock); queue->flags &= ~mask; queue->flags |= flags & mask; spin_unlock_bh(&queue->lock); } err_out_unlock: rcu_read_unlock(); return ret; } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { [NFQNL_MSG_PACKET] = { .call = nfqnl_recv_unsupp, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, }, [NFQNL_MSG_VERDICT] = { .call = nfqnl_recv_verdict, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_policy }, [NFQNL_MSG_CONFIG] = { .call = nfqnl_recv_config, .type = NFNL_CB_MUTEX, .attr_count = NFQA_CFG_MAX, .policy = nfqa_cfg_policy }, [NFQNL_MSG_VERDICT_BATCH] = { .call = nfqnl_recv_verdict_batch, .type = NFNL_CB_RCU, .attr_count = NFQA_MAX, .policy = nfqa_verdict_batch_policy }, }; static const struct nfnetlink_subsystem nfqnl_subsys = { .name = "nf_queue", .subsys_id = NFNL_SUBSYS_QUEUE, .cb_count = NFQNL_MSG_MAX, .cb = nfqnl_cb, }; #ifdef CONFIG_PROC_FS struct iter_state { struct seq_net_private p; unsigned int bucket; }; static struct hlist_node *get_first(struct seq_file *seq) { struct iter_state *st = seq->private; struct net *net; struct nfnl_queue_net *q; if (!st) return NULL; net = seq_file_net(seq); q = nfnl_queue_pernet(net); for (st->bucket = 0; st->bucket < INSTANCE_BUCKETS; st->bucket++) { if (!hlist_empty(&q->instance_table[st->bucket])) return q->instance_table[st->bucket].first; } return NULL; } static struct hlist_node *get_next(struct seq_file *seq, struct hlist_node *h) { struct iter_state *st = seq->private; struct net *net = seq_file_net(seq); h = h->next; while (!h) { struct nfnl_queue_net *q; if (++st->bucket >= INSTANCE_BUCKETS) return NULL; q = nfnl_queue_pernet(net); h = q->instance_table[st->bucket].first; } return h; } static struct hlist_node *get_idx(struct seq_file *seq, loff_t pos) { struct hlist_node *head; head = get_first(seq); if (head) while (pos && (head = get_next(seq, 
head))) pos--; return pos ? NULL : head; } static void *seq_start(struct seq_file *s, loff_t *pos) __acquires(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_lock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); return get_idx(s, *pos); } static void *seq_next(struct seq_file *s, void *v, loff_t *pos) { (*pos)++; return get_next(s, v); } static void seq_stop(struct seq_file *s, void *v) __releases(nfnl_queue_pernet(seq_file_net(s))->instances_lock) { spin_unlock(&nfnl_queue_pernet(seq_file_net(s))->instances_lock); } static int seq_show(struct seq_file *s, void *v) { const struct nfqnl_instance *inst = v; seq_printf(s, "%5u %6u %5u %1u %5u %5u %5u %8u %2d\n", inst->queue_num, inst->peer_portid, inst->queue_total, inst->copy_mode, inst->copy_range, inst->queue_dropped, inst->queue_user_dropped, inst->id_sequence, 1); return 0; } static const struct seq_operations nfqnl_seq_ops = { .start = seq_start, .next = seq_next, .stop = seq_stop, .show = seq_show, }; #endif /* PROC_FS */ static int __net_init nfnl_queue_net_init(struct net *net) { unsigned int i; struct nfnl_queue_net *q = nfnl_queue_pernet(net); for (i = 0; i < INSTANCE_BUCKETS; i++) INIT_HLIST_HEAD(&q->instance_table[i]); spin_lock_init(&q->instances_lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("nfnetlink_queue", 0440, net->nf.proc_netfilter, &nfqnl_seq_ops, sizeof(struct iter_state))) return -ENOMEM; #endif return 0; } static void __net_exit nfnl_queue_net_exit(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); unsigned int i; #ifdef CONFIG_PROC_FS remove_proc_entry("nfnetlink_queue", net->nf.proc_netfilter); #endif for (i = 0; i < INSTANCE_BUCKETS; i++) WARN_ON_ONCE(!hlist_empty(&q->instance_table[i])); } static struct pernet_operations nfnl_queue_net_ops = { .init = nfnl_queue_net_init, .exit = nfnl_queue_net_exit, .id = &nfnl_queue_net_id, .size = sizeof(struct nfnl_queue_net), }; static int __init nfnetlink_queue_init(void) { int status; status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); goto out; } netlink_register_notifier(&nfqnl_rtnl_notifier); status = nfnetlink_subsys_register(&nfqnl_subsys); if (status < 0) { pr_err("failed to create netlink socket\n"); goto cleanup_netlink_notifier; } status = register_netdevice_notifier(&nfqnl_dev_notifier); if (status < 0) { pr_err("failed to register netdevice notifier\n"); goto cleanup_netlink_subsys; } nf_register_queue_handler(&nfqh); return status; cleanup_netlink_subsys: nfnetlink_subsys_unregister(&nfqnl_subsys); cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); out: return status; } static void __exit nfnetlink_queue_fini(void) { nf_unregister_queue_handler(); unregister_netdevice_notifier(&nfqnl_dev_notifier); nfnetlink_subsys_unregister(&nfqnl_subsys); netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ } MODULE_DESCRIPTION("netfilter packet queue handler"); MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_QUEUE); module_init(nfnetlink_queue_init); module_exit(nfnetlink_queue_fini);
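/*
 * Illustrative userspace counterpart (not part of the kernel module above):
 * a minimal libnetfilter_queue program that binds queue 0, asks for
 * NFQNL_COPY_PACKET mode and accepts every packet, which exercises the
 * nfqnl_recv_config()/nfqnl_enqueue_packet()/nfqnl_recv_verdict() paths
 * above. It assumes a netfilter rule such as "iptables -A INPUT -j NFQUEUE
 * --queue-num 0" is installed; the explicit PF bind/unbind calls of older
 * examples are omitted since, as the config handler above shows, they are
 * obsolete no-ops on current kernels.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/netfilter.h>		/* NF_ACCEPT */
#include <libnetfilter_queue/libnetfilter_queue.h>

static int cb(struct nfq_q_handle *qh, struct nfgenmsg *nfmsg,
	      struct nfq_data *nfa, void *data)
{
	struct nfqnl_msg_packet_hdr *ph = nfq_get_msg_packet_hdr(nfa);
	uint32_t id = ph ? ntohl(ph->packet_id) : 0;
	unsigned char *payload;
	int len = nfq_get_payload(nfa, &payload);

	if (len >= 0)
		printf("packet id=%u len=%d\n", (unsigned int)id, len);

	/* hand the verdict back; this ends up in nfqnl_recv_verdict() */
	return nfq_set_verdict(qh, id, NF_ACCEPT, 0, NULL);
}

int main(void)
{
	struct nfq_handle *h;
	struct nfq_q_handle *qh;
	char buf[65536] __attribute__((aligned));
	int fd, rv;

	h = nfq_open();
	if (!h)
		exit(1);
	qh = nfq_create_queue(h, 0, &cb, NULL);	/* NFQNL_CFG_CMD_BIND, queue 0 */
	if (!qh)
		exit(1);
	/* NFQNL_COPY_PACKET with maximum copy range, cf. nfqnl_set_mode() */
	nfq_set_mode(qh, NFQNL_COPY_PACKET, 0xffff);

	fd = nfq_fd(h);
	while ((rv = recv(fd, buf, sizeof(buf), 0)) >= 0)
		nfq_handle_packet(h, buf, rv);

	nfq_destroy_queue(qh);
	nfq_close(h);
	return 0;
}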
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * include/linux/if_team.h - Network team device driver header
 * Copyright (c) 2011 Jiri Pirko <jpirko@redhat.com>
 */
#ifndef _LINUX_IF_TEAM_H_
#define _LINUX_IF_TEAM_H_

#include <linux/netpoll.h>
#include <net/sch_generic.h>
#include <linux/types.h>
#include <uapi/linux/if_team.h>

struct team_pcpu_stats {
	u64_stats_t		rx_packets;
	u64_stats_t		rx_bytes;
	u64_stats_t		rx_multicast;
	u64_stats_t		tx_packets;
	u64_stats_t		tx_bytes;
	struct u64_stats_sync	syncp;
	u32			rx_dropped;
	u32			tx_dropped;
	u32			rx_nohandler;
};

struct team;

struct team_port {
	struct net_device *dev;
	struct hlist_node hlist; /* node in enabled ports hash list */
	struct list_head list; /* node in ordinary list */
	struct team *team;
	int index; /* index of enabled port. If disabled, it's set to -1 */

	bool linkup; /* either state.linkup or user.linkup */

	struct {
		bool linkup;
		u32 speed;
		u8 duplex;
	} state;

	/* Values set by userspace */
	struct {
		bool linkup;
		bool linkup_enabled;
	} user;

	/* Custom gennetlink interface related flags */
	bool changed;
	bool removed;

	/*
	 * A place for storing original values of the device before it
	 * becomes a port.
	 */
	struct {
		unsigned char dev_addr[MAX_ADDR_LEN];
		unsigned int mtu;
	} orig;

#ifdef CONFIG_NET_POLL_CONTROLLER
	struct netpoll *np;
#endif

	s32 priority; /* lower number ~ higher priority */
	u16 queue_id;
	struct list_head qom_list; /* node in queue override mapping list */
	struct rcu_head rcu;
	long mode_priv[];
};

static inline struct team_port *team_port_get_rcu(const struct net_device *dev)
{
	return rcu_dereference(dev->rx_handler_data);
}

static inline bool team_port_enabled(struct team_port *port)
{
	return port->index != -1;
}

static inline bool team_port_txable(struct team_port *port)
{
	return port->linkup && team_port_enabled(port);
}

static inline bool team_port_dev_txable(const struct net_device *port_dev)
{
	struct team_port *port;
	bool txable;

	rcu_read_lock();
	port = team_port_get_rcu(port_dev);
	txable = port ?
team_port_txable(port) : false; rcu_read_unlock(); return txable; } #ifdef CONFIG_NET_POLL_CONTROLLER static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { } #endif struct team_mode_ops { int (*init)(struct team *team); void (*exit)(struct team *team); rx_handler_result_t (*receive)(struct team *team, struct team_port *port, struct sk_buff *skb); bool (*transmit)(struct team *team, struct sk_buff *skb); int (*port_enter)(struct team *team, struct team_port *port); void (*port_leave)(struct team *team, struct team_port *port); void (*port_change_dev_addr)(struct team *team, struct team_port *port); void (*port_enabled)(struct team *team, struct team_port *port); void (*port_disabled)(struct team *team, struct team_port *port); }; extern int team_modeop_port_enter(struct team *team, struct team_port *port); extern void team_modeop_port_change_dev_addr(struct team *team, struct team_port *port); enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, TEAM_OPTION_TYPE_BINARY, TEAM_OPTION_TYPE_BOOL, TEAM_OPTION_TYPE_S32, }; struct team_option_inst_info { u32 array_index; struct team_port *port; /* != NULL if per-port */ }; struct team_gsetter_ctx { union { u32 u32_val; const char *str_val; struct { const void *ptr; u32 len; } bin_val; bool bool_val; s32 s32_val; } data; struct team_option_inst_info *info; }; struct team_option { struct list_head list; const char *name; bool per_port; unsigned int array_size; /* != 0 means the option is array */ enum team_option_type type; void (*init)(struct team *team, struct team_option_inst_info *info); void (*getter)(struct team *team, struct team_gsetter_ctx *ctx); int (*setter)(struct team *team, struct team_gsetter_ctx *ctx); }; extern void team_option_inst_set_change(struct team_option_inst_info *opt_inst_info); extern void team_options_change_check(struct team *team); struct team_mode { const char *kind; struct module *owner; size_t priv_size; size_t port_priv_size; const struct team_mode_ops *ops; enum netdev_lag_tx_type lag_tx_type; }; #define TEAM_PORT_HASHBITS 4 #define TEAM_PORT_HASHENTRIES (1 << TEAM_PORT_HASHBITS) #define TEAM_MODE_PRIV_LONGS 4 #define TEAM_MODE_PRIV_SIZE (sizeof(long) * TEAM_MODE_PRIV_LONGS) struct team { struct net_device *dev; /* associated netdevice */ struct team_pcpu_stats __percpu *pcpu_stats; const struct header_ops *header_ops_cache; struct mutex lock; /* used for overall locking, e.g. 
port lists write */ /* * List of enabled ports and their count */ int en_port_count; struct hlist_head en_port_hlist[TEAM_PORT_HASHENTRIES]; struct list_head port_list; /* list of all ports */ struct list_head option_list; struct list_head option_inst_list; /* list of option instances */ const struct team_mode *mode; struct team_mode_ops ops; bool user_carrier_enabled; bool queue_override_enabled; struct list_head *qom_lists; /* array of queue override mapping lists */ bool port_mtu_change_allowed; bool notifier_ctx; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } notify_peers; struct { unsigned int count; unsigned int interval; /* in ms */ atomic_t count_pending; struct delayed_work dw; } mcast_rejoin; struct lock_class_key team_lock_key; long mode_priv[TEAM_MODE_PRIV_LONGS]; }; static inline int team_dev_queue_xmit(struct team *team, struct team_port *port, struct sk_buff *skb) { BUILD_BUG_ON(sizeof(skb->queue_mapping) != sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping)); skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping); skb->dev = port->dev; if (unlikely(netpoll_tx_running(team->dev))) { team_netpoll_send_skb(port, skb); return 0; } return dev_queue_xmit(skb); } static inline struct hlist_head *team_port_index_hash(struct team *team, int port_index) { return &team->en_port_hlist[port_index & (TEAM_PORT_HASHENTRIES - 1)]; } static inline struct team_port *team_get_port_by_index(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline int team_num_to_port_index(struct team *team, unsigned int num) { int en_port_count = READ_ONCE(team->en_port_count); if (unlikely(!en_port_count)) return 0; return num % en_port_count; } static inline struct team_port *team_get_port_by_index_rcu(struct team *team, int port_index) { struct team_port *port; struct hlist_head *head = team_port_index_hash(team, port_index); hlist_for_each_entry_rcu(port, head, hlist) if (port->index == port_index) return port; return NULL; } static inline struct team_port * team_get_first_port_txable_rcu(struct team *team, struct team_port *port) { struct team_port *cur; if (likely(team_port_txable(port))) return port; cur = port; list_for_each_entry_continue_rcu(cur, &team->port_list, list) if (team_port_txable(cur)) return cur; list_for_each_entry_rcu(cur, &team->port_list, list) { if (cur == port) break; if (team_port_txable(cur)) return cur; } return NULL; } extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); extern void team_options_unregister(struct team *team, const struct team_option *option, size_t option_count); extern int team_mode_register(const struct team_mode *mode); extern void team_mode_unregister(const struct team_mode *mode); #define TEAM_DEFAULT_NUM_TX_QUEUES 16 #define TEAM_DEFAULT_NUM_RX_QUEUES 16 #define MODULE_ALIAS_TEAM_MODE(kind) MODULE_ALIAS("team-mode-" kind) #endif /* _LINUX_IF_TEAM_H_ */
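/*
 * Illustrative sketch (not part of if_team.h): a minimal tx-only team mode
 * built on the helpers declared above, loosely modelled on the in-tree
 * round-robin mode. The mode name "example_rr" and its private counter are
 * hypothetical; a real mode would normally also wire up port_enter and
 * port_change_dev_addr via the team_modeop_* helpers.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/if_team.h>

struct example_rr_priv {
	unsigned int sent;	/* packets handed to ports so far */
};

static struct example_rr_priv *example_rr_priv(struct team *team)
{
	return (struct example_rr_priv *)&team->mode_priv;
}

static bool example_rr_transmit(struct team *team, struct sk_buff *skb)
{
	struct team_port *port;
	int port_index;

	/* Pick the next enabled port, skipping ports that cannot transmit. */
	port_index = team_num_to_port_index(team, example_rr_priv(team)->sent++);
	port = team_get_port_by_index_rcu(team, port_index);
	if (unlikely(!port))
		goto drop;
	port = team_get_first_port_txable_rcu(team, port);
	if (unlikely(!port))
		goto drop;
	if (team_dev_queue_xmit(team, port, skb))
		return false;
	return true;

drop:
	dev_kfree_skb_any(skb);
	return false;
}

static const struct team_mode_ops example_rr_mode_ops = {
	.transmit	= example_rr_transmit,
};

static const struct team_mode example_rr_mode = {
	.kind		= "example_rr",
	.owner		= THIS_MODULE,
	.priv_size	= sizeof(struct example_rr_priv),
	.ops		= &example_rr_mode_ops,
	.lag_tx_type	= NETDEV_LAG_TX_TYPE_ROUNDROBIN,
};

static int __init example_rr_init(void)
{
	return team_mode_register(&example_rr_mode);
}

static void __exit example_rr_exit(void)
{
	team_mode_unregister(&example_rr_mode);
}

module_init(example_rr_init);
module_exit(example_rr_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_TEAM_MODE("example_rr");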
// SPDX-License-Identifier: GPL-2.0-only /* * vDPA bus. * * Copyright (c) 2020, Red Hat. All rights reserved.
* Author: Jason Wang <jasowang@redhat.com> * */ #include <linux/module.h> #include <linux/idr.h> #include <linux/slab.h> #include <linux/vdpa.h> #include <uapi/linux/vdpa.h> #include <net/genetlink.h> #include <linux/mod_devicetable.h> #include <linux/virtio_ids.h> static LIST_HEAD(mdev_head); /* A global mutex that protects vdpa management device and device level operations. */ static DECLARE_RWSEM(vdpa_dev_lock); static DEFINE_IDA(vdpa_index_ida); void vdpa_set_status(struct vdpa_device *vdev, u8 status) { down_write(&vdev->cf_lock); vdev->config->set_status(vdev, status); up_write(&vdev->cf_lock); } EXPORT_SYMBOL(vdpa_set_status); static struct genl_family vdpa_nl_family; static int vdpa_dev_probe(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); const struct vdpa_config_ops *ops = vdev->config; u32 max_num, min_num = 1; int ret = 0; d->dma_mask = &d->coherent_dma_mask; ret = dma_set_mask_and_coherent(d, DMA_BIT_MASK(64)); if (ret) return ret; max_num = ops->get_vq_num_max(vdev); if (ops->get_vq_num_min) min_num = ops->get_vq_num_min(vdev); if (max_num < min_num) return -EINVAL; if (drv && drv->probe) ret = drv->probe(vdev); return ret; } static void vdpa_dev_remove(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); struct vdpa_driver *drv = drv_to_vdpa(vdev->dev.driver); if (drv && drv->remove) drv->remove(vdev); } static int vdpa_dev_match(struct device *dev, struct device_driver *drv) { struct vdpa_device *vdev = dev_to_vdpa(dev); /* Check override first, and if set, only use the named driver */ if (vdev->driver_override) return strcmp(vdev->driver_override, drv->name) == 0; /* Currently devices must be supported by all vDPA bus drivers */ return 1; } static ssize_t driver_override_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct vdpa_device *vdev = dev_to_vdpa(dev); int ret; ret = driver_set_override(dev, &vdev->driver_override, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct vdpa_device *vdev = dev_to_vdpa(dev); ssize_t len; device_lock(dev); len = snprintf(buf, PAGE_SIZE, "%s\n", vdev->driver_override); device_unlock(dev); return len; } static DEVICE_ATTR_RW(driver_override); static struct attribute *vdpa_dev_attrs[] = { &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group vdpa_dev_group = { .attrs = vdpa_dev_attrs, }; __ATTRIBUTE_GROUPS(vdpa_dev); static const struct bus_type vdpa_bus = { .name = "vdpa", .dev_groups = vdpa_dev_groups, .match = vdpa_dev_match, .probe = vdpa_dev_probe, .remove = vdpa_dev_remove, }; static void vdpa_release_dev(struct device *d) { struct vdpa_device *vdev = dev_to_vdpa(d); const struct vdpa_config_ops *ops = vdev->config; if (ops->free) ops->free(vdev); ida_free(&vdpa_index_ida, vdev->index); kfree(vdev->driver_override); kfree(vdev); } /** * __vdpa_alloc_device - allocate and initilaize a vDPA device * This allows driver to some prepartion after device is * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device * @ngroups: number of groups supported by this device * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * @name: name of the vdpa device; optional. 
* @use_va: indicate whether virtual address must be used by this device * * Driver should use vdpa_alloc_device() wrapper macro instead of * using this directly. * * Return: Returns an error when parent/config/dma_dev is not set or fail to get * ida. */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, unsigned int ngroups, unsigned int nas, size_t size, const char *name, bool use_va) { struct vdpa_device *vdev; int err = -EINVAL; if (!config) goto err; if (!!config->dma_map != !!config->dma_unmap) goto err; /* It should only work for the device that use on-chip IOMMU */ if (use_va && !(config->dma_map || config->set_map)) goto err; err = -ENOMEM; vdev = kzalloc(size, GFP_KERNEL); if (!vdev) goto err; err = ida_alloc(&vdpa_index_ida, GFP_KERNEL); if (err < 0) goto err_ida; vdev->dev.bus = &vdpa_bus; vdev->dev.parent = parent; vdev->dev.release = vdpa_release_dev; vdev->index = err; vdev->config = config; vdev->features_valid = false; vdev->use_va = use_va; vdev->ngroups = ngroups; vdev->nas = nas; if (name) err = dev_set_name(&vdev->dev, "%s", name); else err = dev_set_name(&vdev->dev, "vdpa%u", vdev->index); if (err) goto err_name; init_rwsem(&vdev->cf_lock); device_initialize(&vdev->dev); return vdev; err_name: ida_free(&vdpa_index_ida, vdev->index); err_ida: kfree(vdev); err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(__vdpa_alloc_device); static int vdpa_name_match(struct device *dev, const void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); return (strcmp(dev_name(&vdev->dev), data) == 0); } static int __vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { struct device *dev; vdev->nvqs = nvqs; lockdep_assert_held(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, dev_name(&vdev->dev), vdpa_name_match); if (dev) { put_device(dev); return -EEXIST; } return device_add(&vdev->dev); } /** * _vdpa_register_device - register a vDPA device with vdpa lock held * Caller must have a succeed call of vdpa_alloc_device() before. * Caller must invoke this routine in the management device dev_add() * callback after setting up valid mgmtdev for this vdpa device. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add device to vDPA bus */ int _vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { if (!vdev->mdev) return -EINVAL; return __vdpa_register_device(vdev, nvqs); } EXPORT_SYMBOL_GPL(_vdpa_register_device); /** * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. * @vdev: the vdpa device to be registered to vDPA bus * @nvqs: number of virtqueues supported by this device * * Return: Returns an error when fail to add to vDPA bus */ int vdpa_register_device(struct vdpa_device *vdev, u32 nvqs) { int err; down_write(&vdpa_dev_lock); err = __vdpa_register_device(vdev, nvqs); up_write(&vdpa_dev_lock); return err; } EXPORT_SYMBOL_GPL(vdpa_register_device); /** * _vdpa_unregister_device - unregister a vDPA device * Caller must invoke this routine as part of management device dev_del() * callback. 
* @vdev: the vdpa device to be unregisted from vDPA bus */ void _vdpa_unregister_device(struct vdpa_device *vdev) { lockdep_assert_held(&vdpa_dev_lock); WARN_ON(!vdev->mdev); device_unregister(&vdev->dev); } EXPORT_SYMBOL_GPL(_vdpa_unregister_device); /** * vdpa_unregister_device - unregister a vDPA device * @vdev: the vdpa device to be unregisted from vDPA bus */ void vdpa_unregister_device(struct vdpa_device *vdev) { down_write(&vdpa_dev_lock); device_unregister(&vdev->dev); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_unregister_device); /** * __vdpa_register_driver - register a vDPA device driver * @drv: the vdpa device driver to be registered * @owner: module owner of the driver * * Return: Returns an err when fail to do the registration */ int __vdpa_register_driver(struct vdpa_driver *drv, struct module *owner) { drv->driver.bus = &vdpa_bus; drv->driver.owner = owner; return driver_register(&drv->driver); } EXPORT_SYMBOL_GPL(__vdpa_register_driver); /** * vdpa_unregister_driver - unregister a vDPA device driver * @drv: the vdpa device driver to be unregistered */ void vdpa_unregister_driver(struct vdpa_driver *drv) { driver_unregister(&drv->driver); } EXPORT_SYMBOL_GPL(vdpa_unregister_driver); /** * vdpa_mgmtdev_register - register a vdpa management device * * @mdev: Pointer to vdpa management device * vdpa_mgmtdev_register() register a vdpa management device which supports * vdpa device management. * Return: Returns 0 on success or failure when required callback ops are not * initialized. */ int vdpa_mgmtdev_register(struct vdpa_mgmt_dev *mdev) { if (!mdev->device || !mdev->ops || !mdev->ops->dev_add || !mdev->ops->dev_del) return -EINVAL; INIT_LIST_HEAD(&mdev->list); down_write(&vdpa_dev_lock); list_add_tail(&mdev->list, &mdev_head); up_write(&vdpa_dev_lock); return 0; } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_register); static int vdpa_match_remove(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_mgmt_dev *mdev = vdev->mdev; if (mdev == data) mdev->ops->dev_del(mdev, vdev); return 0; } void vdpa_mgmtdev_unregister(struct vdpa_mgmt_dev *mdev) { down_write(&vdpa_dev_lock); list_del(&mdev->list); /* Filter out all the entries belong to this management device and delete it. */ bus_for_each_dev(&vdpa_bus, NULL, mdev, vdpa_match_remove); up_write(&vdpa_dev_lock); } EXPORT_SYMBOL_GPL(vdpa_mgmtdev_unregister); static void vdpa_get_config_unlocked(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { const struct vdpa_config_ops *ops = vdev->config; /* * Config accesses aren't supposed to trigger before features are set. * If it does happen we assume a legacy guest. */ if (!vdev->features_valid) vdpa_set_features_unlocked(vdev, 0); ops->get_config(vdev, offset, buf, len); } /** * vdpa_get_config - Get one or more device configuration fields. * @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read to * @len: length of the configuration fields in bytes */ void vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf, unsigned int len) { down_read(&vdev->cf_lock); vdpa_get_config_unlocked(vdev, offset, buf, len); up_read(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_get_config); /** * vdpa_set_config - Set one or more device configuration fields. 
* @vdev: vdpa device to operate on * @offset: starting byte offset of the field * @buf: buffer pointer to read from * @length: length of the configuration fields in bytes */ void vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int length) { down_write(&vdev->cf_lock); vdev->config->set_config(vdev, offset, buf, length); up_write(&vdev->cf_lock); } EXPORT_SYMBOL_GPL(vdpa_set_config); static bool mgmtdev_handle_match(const struct vdpa_mgmt_dev *mdev, const char *busname, const char *devname) { /* Bus name is optional for simulated management device, so ignore the * device with bus if bus attribute is provided. */ if ((busname && !mdev->device->bus) || (!busname && mdev->device->bus)) return false; if (!busname && strcmp(dev_name(mdev->device), devname) == 0) return true; if (busname && (strcmp(mdev->device->bus->name, busname) == 0) && (strcmp(dev_name(mdev->device), devname) == 0)) return true; return false; } static struct vdpa_mgmt_dev *vdpa_mgmtdev_get_from_attr(struct nlattr **attrs) { struct vdpa_mgmt_dev *mdev; const char *busname = NULL; const char *devname; if (!attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]) return ERR_PTR(-EINVAL); devname = nla_data(attrs[VDPA_ATTR_MGMTDEV_DEV_NAME]); if (attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]) busname = nla_data(attrs[VDPA_ATTR_MGMTDEV_BUS_NAME]); list_for_each_entry(mdev, &mdev_head, list) { if (mgmtdev_handle_match(mdev, busname, devname)) return mdev; } return ERR_PTR(-ENODEV); } static int vdpa_nl_mgmtdev_handle_fill(struct sk_buff *msg, const struct vdpa_mgmt_dev *mdev) { if (mdev->device->bus && nla_put_string(msg, VDPA_ATTR_MGMTDEV_BUS_NAME, mdev->device->bus->name)) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_MGMTDEV_DEV_NAME, dev_name(mdev->device))) return -EMSGSIZE; return 0; } static u64 vdpa_mgmtdev_get_classes(const struct vdpa_mgmt_dev *mdev, unsigned int *nclasses) { u64 supported_classes = 0; unsigned int n = 0; for (int i = 0; mdev->id_table[i].device; i++) { if (mdev->id_table[i].device > 63) continue; supported_classes |= BIT_ULL(mdev->id_table[i].device); n++; } if (nclasses) *nclasses = n; return supported_classes; } static int vdpa_mgmtdev_fill(const struct vdpa_mgmt_dev *mdev, struct sk_buff *msg, u32 portid, u32 seq, int flags) { void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_MGMTDEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, mdev); if (err) goto msg_err; if (nla_put_u64_64bit(msg, VDPA_ATTR_MGMTDEV_SUPPORTED_CLASSES, vdpa_mgmtdev_get_classes(mdev, NULL), VDPA_ATTR_UNSPEC)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u32(msg, VDPA_ATTR_DEV_MGMTDEV_MAX_VQS, mdev->max_supported_vqs)) { err = -EMSGSIZE; goto msg_err; } if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_SUPPORTED_FEATURES, mdev->supported_features, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_mgmtdev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct sk_buff *msg; int err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { up_read(&vdpa_dev_lock); NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified mgmt device"); err = PTR_ERR(mdev); goto out; } err = vdpa_mgmtdev_fill(mdev, msg, info->snd_portid, info->snd_seq, 0); up_read(&vdpa_dev_lock); if (err) goto out; err = genlmsg_reply(msg, info); 
return err; out: nlmsg_free(msg); return err; } static int vdpa_nl_cmd_mgmtdev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_mgmt_dev *mdev; int start = cb->args[0]; int idx = 0; int err; down_read(&vdpa_dev_lock); list_for_each_entry(mdev, &mdev_head, list) { if (idx < start) { idx++; continue; } err = vdpa_mgmtdev_fill(mdev, msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); if (err) goto out; idx++; } out: up_read(&vdpa_dev_lock); cb->args[0] = idx; return msg->len; } #define VDPA_DEV_NET_ATTRS_MASK (BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU) | \ BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) /* * Bitmask for all per-device features: feature bits VIRTIO_TRANSPORT_F_START * through VIRTIO_TRANSPORT_F_END are unset, i.e. 0xfffffc000fffffff for * all 64bit features. If the features are extended beyond 64 bits, or new * "holes" are reserved for other type of features than per-device, this * macro would have to be updated. */ #define VIRTIO_DEVICE_F_MASK (~0ULL << (VIRTIO_TRANSPORT_F_END + 1) | \ ((1ULL << VIRTIO_TRANSPORT_F_START) - 1)) static int vdpa_nl_cmd_dev_add_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_dev_set_config config = {}; struct nlattr **nl_attrs = info->attrs; struct vdpa_mgmt_dev *mdev; unsigned int ncls = 0; const u8 *macaddr; const char *name; u64 classes; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]) { macaddr = nla_data(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR]); memcpy(config.net.mac, macaddr, sizeof(config.net.mac)); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]) { config.net.mtu = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU]); config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU); } if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]) { config.net.max_vq_pairs = nla_get_u16(nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP]); if (!config.net.max_vq_pairs) { NL_SET_ERR_MSG_MOD(info->extack, "At least one pair of VQs is required"); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP); } if (nl_attrs[VDPA_ATTR_DEV_FEATURES]) { u64 missing = 0x0ULL; config.device_features = nla_get_u64(nl_attrs[VDPA_ATTR_DEV_FEATURES]); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MACADDR] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MAC))) missing |= BIT_ULL(VIRTIO_NET_F_MAC); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MTU] && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MTU))) missing |= BIT_ULL(VIRTIO_NET_F_MTU); if (nl_attrs[VDPA_ATTR_DEV_NET_CFG_MAX_VQP] && config.net.max_vq_pairs > 1 && !(config.device_features & BIT_ULL(VIRTIO_NET_F_MQ))) missing |= BIT_ULL(VIRTIO_NET_F_MQ); if (missing) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Missing features 0x%llx for provided attributes", missing); return -EINVAL; } config.mask |= BIT_ULL(VDPA_ATTR_DEV_FEATURES); } /* Skip checking capability if user didn't prefer to configure any * device networking attributes. It is likely that user might have used * a device specific method to configure such attributes or using device * default attributes. 
*/ if ((config.mask & VDPA_DEV_NET_ATTRS_MASK) && !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; down_write(&vdpa_dev_lock); mdev = vdpa_mgmtdev_get_from_attr(info->attrs); if (IS_ERR(mdev)) { NL_SET_ERR_MSG_MOD(info->extack, "Fail to find the specified management device"); err = PTR_ERR(mdev); goto err; } if ((config.mask & mdev->config_attr_mask) != config.mask) { NL_SET_ERR_MSG_FMT_MOD(info->extack, "Some provided attributes are not supported: 0x%llx", config.mask & ~mdev->config_attr_mask); err = -EOPNOTSUPP; goto err; } classes = vdpa_mgmtdev_get_classes(mdev, &ncls); if (config.mask & VDPA_DEV_NET_ATTRS_MASK && !(classes & BIT_ULL(VIRTIO_ID_NET))) { NL_SET_ERR_MSG_MOD(info->extack, "Network class attributes provided on unsupported management device"); err = -EINVAL; goto err; } if (!(config.mask & VDPA_DEV_NET_ATTRS_MASK) && config.mask & BIT_ULL(VDPA_ATTR_DEV_FEATURES) && classes & BIT_ULL(VIRTIO_ID_NET) && ncls > 1 && config.device_features & VIRTIO_DEVICE_F_MASK) { NL_SET_ERR_MSG_MOD(info->extack, "Management device supports multi-class while device features specified are ambiguous"); err = -EINVAL; goto err; } err = mdev->ops->dev_add(mdev, name, &config); err: up_write(&vdpa_dev_lock); return err; } static int vdpa_nl_cmd_dev_del_set_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_mgmt_dev *mdev; struct vdpa_device *vdev; struct device *dev; const char *name; int err = 0; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; name = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); down_write(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, name, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "Only user created device can be deleted by user"); err = -EINVAL; goto mdev_err; } mdev = vdev->mdev; mdev->ops->dev_del(mdev, vdev); mdev_err: put_device(dev); dev_err: up_write(&vdpa_dev_lock); return err; } static int vdpa_dev_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u16 max_vq_size; u16 min_vq_size = 1; u32 device_id; u32 vendor_id; void *hdr; int err; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_NEW); if (!hdr) return -EMSGSIZE; err = vdpa_nl_mgmtdev_handle_fill(msg, vdev->mdev); if (err) goto msg_err; device_id = vdev->config->get_device_id(vdev); vendor_id = vdev->config->get_vendor_id(vdev); max_vq_size = vdev->config->get_vq_num_max(vdev); if (vdev->config->get_vq_num_min) min_vq_size = vdev->config->get_vq_num_min(vdev); err = -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_VENDOR_ID, vendor_id)) goto msg_err; if (nla_put_u32(msg, VDPA_ATTR_DEV_MAX_VQS, vdev->nvqs)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MAX_VQ_SIZE, max_vq_size)) goto msg_err; if (nla_put_u16(msg, VDPA_ATTR_DEV_MIN_VQ_SIZE, min_vq_size)) goto msg_err; genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if 
(!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { err = -EINVAL; goto mdev_err; } err = vdpa_dev_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); err: up_read(&vdpa_dev_lock); nlmsg_free(msg); return err; } struct vdpa_dev_dump_info { struct sk_buff *msg; struct netlink_callback *cb; int start_idx; int idx; }; static int vdpa_dev_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_dev_net_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MQ)) == 0 && (features & BIT_ULL(VIRTIO_NET_F_RSS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->max_virtqueue_pairs); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MAX_VQP, val_u16); } static int vdpa_dev_net_mtu_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_MTU)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->mtu); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_CFG_MTU, val_u16); } static int vdpa_dev_net_mac_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { if ((features & BIT_ULL(VIRTIO_NET_F_MAC)) == 0) return 0; return nla_put(msg, VDPA_ATTR_DEV_NET_CFG_MACADDR, sizeof(config->mac), config->mac); } static int vdpa_dev_net_status_config_fill(struct sk_buff *msg, u64 features, const struct virtio_net_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_NET_F_STATUS)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->status); return nla_put_u16(msg, VDPA_ATTR_DEV_NET_STATUS, val_u16); } static int vdpa_dev_net_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_net_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_net_mtu_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_mac_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_net_status_config_fill(msg, features_device, &config)) return -EMSGSIZE; return vdpa_dev_net_mq_config_fill(msg, features_device, &config); } static int vdpa_dev_blk_capacity_config_fill(struct sk_buff *msg, const struct virtio_blk_config *config) { u64 val_u64; val_u64 = 
__virtio64_to_cpu(true, config->capacity); return nla_put_u64_64bit(msg, VDPA_ATTR_DEV_BLK_CFG_CAPACITY, val_u64, VDPA_ATTR_PAD); } static int vdpa_dev_blk_seg_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SIZE_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->size_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_SIZE, val_u32); } /* fill the block size*/ static int vdpa_dev_blk_block_size_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_BLK_SIZE)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->blk_size); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_BLK_SIZE, val_u32); } static int vdpa_dev_blk_seg_max_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_SEG_MAX)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->seg_max); return nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_SEG_MAX, val_u32); } static int vdpa_dev_blk_mq_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 val_u16; if ((features & BIT_ULL(VIRTIO_BLK_F_MQ)) == 0) return 0; val_u16 = __virtio16_to_cpu(true, config->num_queues); return nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_NUM_QUEUES, val_u16); } static int vdpa_dev_blk_topology_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u16 min_io_size; u32 opt_io_size; if ((features & BIT_ULL(VIRTIO_BLK_F_TOPOLOGY)) == 0) return 0; min_io_size = __virtio16_to_cpu(true, config->min_io_size); opt_io_size = __virtio32_to_cpu(true, config->opt_io_size); if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_PHY_BLK_EXP, config->physical_block_exp)) return -EMSGSIZE; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_ALIGN_OFFSET, config->alignment_offset)) return -EMSGSIZE; if (nla_put_u16(msg, VDPA_ATTR_DEV_BLK_CFG_MIN_IO_SIZE, min_io_size)) return -EMSGSIZE; if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_OPT_IO_SIZE, opt_io_size)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_discard_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_DISCARD)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_discard_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_discard_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_DISCARD_SEG, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->discard_sector_alignment); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_DISCARD_SEC_ALIGN, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_write_zeroes_config_fill(struct sk_buff *msg, u64 features, const struct virtio_blk_config *config) { u32 val_u32; if ((features & BIT_ULL(VIRTIO_BLK_F_WRITE_ZEROES)) == 0) return 0; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_sectors); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEC, val_u32)) return -EMSGSIZE; val_u32 = __virtio32_to_cpu(true, config->max_write_zeroes_seg); if (nla_put_u32(msg, VDPA_ATTR_DEV_BLK_CFG_MAX_WRITE_ZEROES_SEG, val_u32)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_ro_config_fill(struct sk_buff *msg, u64 features) { u8 ro; ro = ((features & BIT_ULL(VIRTIO_BLK_F_RO)) == 0) ? 
0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_READ_ONLY, ro)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_flush_config_fill(struct sk_buff *msg, u64 features) { u8 flush; flush = ((features & BIT_ULL(VIRTIO_BLK_F_FLUSH)) == 0) ? 0 : 1; if (nla_put_u8(msg, VDPA_ATTR_DEV_BLK_CFG_FLUSH, flush)) return -EMSGSIZE; return 0; } static int vdpa_dev_blk_config_fill(struct vdpa_device *vdev, struct sk_buff *msg) { struct virtio_blk_config config = {}; u64 features_device; vdev->config->get_config(vdev, 0, &config, sizeof(config)); features_device = vdev->config->get_device_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_FEATURES, features_device, VDPA_ATTR_PAD)) return -EMSGSIZE; if (vdpa_dev_blk_capacity_config_fill(msg, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_block_size_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_seg_max_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_mq_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_topology_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_discard_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_write_zeroes_config_fill(msg, features_device, &config)) return -EMSGSIZE; if (vdpa_dev_blk_ro_config_fill(msg, features_device)) return -EMSGSIZE; if (vdpa_dev_blk_flush_config_fill(msg, features_device)) return -EMSGSIZE; return 0; } static int vdpa_dev_config_fill(struct vdpa_device *vdev, struct sk_buff *msg, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { u64 features_driver; u8 status = 0; u32 device_id; void *hdr; int err; down_read(&vdev->cf_lock); hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_CONFIG_GET); if (!hdr) { err = -EMSGSIZE; goto out; } if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto msg_err; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto msg_err; } /* only read driver features after the feature negotiation is done */ status = vdev->config->get_status(vdev); if (status & VIRTIO_CONFIG_S_FEATURES_OK) { features_driver = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features_driver, VDPA_ATTR_PAD)) { err = -EMSGSIZE; goto msg_err; } } switch (device_id) { case VIRTIO_ID_NET: err = vdpa_dev_net_config_fill(vdev, msg); break; case VIRTIO_ID_BLOCK: err = vdpa_dev_blk_config_fill(vdev, msg); break; default: err = -EOPNOTSUPP; break; } if (err) goto msg_err; up_read(&vdev->cf_lock); genlmsg_end(msg, hdr); return 0; msg_err: genlmsg_cancel(msg, hdr); out: up_read(&vdev->cf_lock); return err; } static int vdpa_fill_stats_rec(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { struct virtio_net_config config = {}; u64 features; u8 status; int err; status = vdev->config->get_status(vdev); if (!(status & VIRTIO_CONFIG_S_FEATURES_OK)) { NL_SET_ERR_MSG_MOD(info->extack, "feature negotiation not complete"); return -EAGAIN; } vdpa_get_config_unlocked(vdev, 0, &config, sizeof(config)); features = vdev->config->get_driver_features(vdev); if (nla_put_u64_64bit(msg, VDPA_ATTR_DEV_NEGOTIATED_FEATURES, features, VDPA_ATTR_PAD)) return -EMSGSIZE; err = vdpa_dev_net_mq_config_fill(msg, features, &config); if (err) return err; if (nla_put_u32(msg, 
VDPA_ATTR_DEV_QUEUE_INDEX, index)) return -EMSGSIZE; err = vdev->config->get_vendor_vq_stats(vdev, index, msg, info->extack); if (err) return err; return 0; } static int vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { int err; down_read(&vdev->cf_lock); if (!vdev->config->get_vendor_vq_stats) { err = -EOPNOTSUPP; goto out; } err = vdpa_fill_stats_rec(vdev, msg, info, index); out: up_read(&vdev->cf_lock); return err; } static int vdpa_dev_vendor_stats_fill(struct vdpa_device *vdev, struct sk_buff *msg, struct genl_info *info, u32 index) { u32 device_id; void *hdr; int err; u32 portid = info->snd_portid; u32 seq = info->snd_seq; u32 flags = 0; hdr = genlmsg_put(msg, portid, seq, &vdpa_nl_family, flags, VDPA_CMD_DEV_VSTATS_GET); if (!hdr) return -EMSGSIZE; if (nla_put_string(msg, VDPA_ATTR_DEV_NAME, dev_name(&vdev->dev))) { err = -EMSGSIZE; goto undo_msg; } device_id = vdev->config->get_device_id(vdev); if (nla_put_u32(msg, VDPA_ATTR_DEV_ID, device_id)) { err = -EMSGSIZE; goto undo_msg; } switch (device_id) { case VIRTIO_ID_NET: if (index > VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MAX) { NL_SET_ERR_MSG_MOD(info->extack, "queue index exceeds max value"); err = -ERANGE; break; } err = vendor_stats_fill(vdev, msg, info, index); break; default: err = -EOPNOTSUPP; break; } genlmsg_end(msg, hdr); return err; undo_msg: genlmsg_cancel(msg, hdr); return err; } static int vdpa_nl_cmd_dev_config_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_config_fill(vdev, msg, info->snd_portid, info->snd_seq, 0, info->extack); if (!err) err = genlmsg_reply(msg, info); mdev_err: put_device(dev); dev_err: up_read(&vdpa_dev_lock); if (err) nlmsg_free(msg); return err; } static int vdpa_dev_config_dump(struct device *dev, void *data) { struct vdpa_device *vdev = container_of(dev, struct vdpa_device, dev); struct vdpa_dev_dump_info *info = data; int err; if (!vdev->mdev) return 0; if (info->idx < info->start_idx) { info->idx++; return 0; } err = vdpa_dev_config_fill(vdev, info->msg, NETLINK_CB(info->cb->skb).portid, info->cb->nlh->nlmsg_seq, NLM_F_MULTI, info->cb->extack); if (err) return err; info->idx++; return 0; } static int vdpa_nl_cmd_dev_config_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { struct vdpa_dev_dump_info info; info.msg = msg; info.cb = cb; info.start_idx = cb->args[0]; info.idx = 0; down_read(&vdpa_dev_lock); bus_for_each_dev(&vdpa_bus, NULL, &info, vdpa_dev_config_dump); up_read(&vdpa_dev_lock); cb->args[0] = info.idx; return msg->len; } static int vdpa_nl_cmd_dev_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct vdpa_device *vdev; struct sk_buff *msg; const char *devname; struct device *dev; u32 index; int err; if (!info->attrs[VDPA_ATTR_DEV_NAME]) return -EINVAL; if (!info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]) return -EINVAL; devname = nla_data(info->attrs[VDPA_ATTR_DEV_NAME]); msg = 
nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; index = nla_get_u32(info->attrs[VDPA_ATTR_DEV_QUEUE_INDEX]); down_read(&vdpa_dev_lock); dev = bus_find_device(&vdpa_bus, NULL, devname, vdpa_name_match); if (!dev) { NL_SET_ERR_MSG_MOD(info->extack, "device not found"); err = -ENODEV; goto dev_err; } vdev = container_of(dev, struct vdpa_device, dev); if (!vdev->mdev) { NL_SET_ERR_MSG_MOD(info->extack, "unmanaged vdpa device"); err = -EINVAL; goto mdev_err; } err = vdpa_dev_vendor_stats_fill(vdev, msg, info, index); if (err) goto mdev_err; err = genlmsg_reply(msg, info); put_device(dev); up_read(&vdpa_dev_lock); return err; mdev_err: put_device(dev); dev_err: nlmsg_free(msg); up_read(&vdpa_dev_lock); return err; } static const struct nla_policy vdpa_nl_policy[VDPA_ATTR_MAX + 1] = { [VDPA_ATTR_MGMTDEV_BUS_NAME] = { .type = NLA_NUL_STRING }, [VDPA_ATTR_MGMTDEV_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NAME] = { .type = NLA_STRING }, [VDPA_ATTR_DEV_NET_CFG_MACADDR] = NLA_POLICY_ETH_ADDR, [VDPA_ATTR_DEV_NET_CFG_MAX_VQP] = { .type = NLA_U16 }, /* virtio spec 1.1 section 5.1.4.1 for valid MTU range */ [VDPA_ATTR_DEV_NET_CFG_MTU] = NLA_POLICY_MIN(NLA_U16, 68), [VDPA_ATTR_DEV_QUEUE_INDEX] = { .type = NLA_U32 }, [VDPA_ATTR_DEV_FEATURES] = { .type = NLA_U64 }, }; static const struct genl_ops vdpa_nl_ops[] = { { .cmd = VDPA_CMD_MGMTDEV_GET, .doit = vdpa_nl_cmd_mgmtdev_get_doit, .dumpit = vdpa_nl_cmd_mgmtdev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_NEW, .doit = vdpa_nl_cmd_dev_add_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_DEL, .doit = vdpa_nl_cmd_dev_del_set_doit, .flags = GENL_ADMIN_PERM, }, { .cmd = VDPA_CMD_DEV_GET, .doit = vdpa_nl_cmd_dev_get_doit, .dumpit = vdpa_nl_cmd_dev_get_dumpit, }, { .cmd = VDPA_CMD_DEV_CONFIG_GET, .doit = vdpa_nl_cmd_dev_config_get_doit, .dumpit = vdpa_nl_cmd_dev_config_get_dumpit, }, { .cmd = VDPA_CMD_DEV_VSTATS_GET, .doit = vdpa_nl_cmd_dev_stats_get_doit, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family vdpa_nl_family __ro_after_init = { .name = VDPA_GENL_NAME, .version = VDPA_GENL_VERSION, .maxattr = VDPA_ATTR_MAX, .policy = vdpa_nl_policy, .netnsok = false, .module = THIS_MODULE, .ops = vdpa_nl_ops, .n_ops = ARRAY_SIZE(vdpa_nl_ops), .resv_start_op = VDPA_CMD_DEV_VSTATS_GET + 1, }; static int vdpa_init(void) { int err; err = bus_register(&vdpa_bus); if (err) return err; err = genl_register_family(&vdpa_nl_family); if (err) goto err; return 0; err: bus_unregister(&vdpa_bus); return err; } static void __exit vdpa_exit(void) { genl_unregister_family(&vdpa_nl_family); bus_unregister(&vdpa_bus); ida_destroy(&vdpa_index_ida); } core_initcall(vdpa_init); module_exit(vdpa_exit); MODULE_AUTHOR("Jason Wang <jasowang@redhat.com>"); MODULE_LICENSE("GPL v2");
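/*
 * Illustrative sketch (not part of vdpa.c): a minimal vDPA bus driver that
 * binds to devices registered on the bus above. The driver name and the
 * empty probe/remove bodies are hypothetical; a real bus driver would set
 * up a virtio frontend or a vhost backend here.
 */
#include <linux/module.h>
#include <linux/device.h>
#include <linux/vdpa.h>

static int example_vdpa_probe(struct vdpa_device *vdev)
{
	dev_info(&vdev->dev, "example vdpa bus driver bound\n");
	return 0;
}

static void example_vdpa_remove(struct vdpa_device *vdev)
{
	dev_info(&vdev->dev, "example vdpa bus driver unbound\n");
}

static struct vdpa_driver example_vdpa_driver = {
	.driver = {
		.name = "example_vdpa",
	},
	.probe	= example_vdpa_probe,
	.remove	= example_vdpa_remove,
};

/* Registers via vdpa_register_driver()/vdpa_unregister_driver(). */
module_vdpa_driver(example_vdpa_driver);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Sketch of a vDPA bus driver (illustrative only)");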
// SPDX-License-Identifier: GPL-2.0 /* * Silicon Laboratories CP210x USB to RS232 serial adaptor driver * * Copyright (C) 2005 Craig Shelley (craig@microtron.org.uk) * Copyright (C) 2010-2021 Johan Hovold (johan@kernel.org) * * Support to set flow control line levels using TIOCMGET and TIOCMSET * thanks to Karl Hiramoto karl@hiramoto.org.
RTSCTS hardware flow * control thanks to Munir Nassar nassarmu@real-time.com * */ #include <linux/kernel.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/tty.h> #include <linux/tty_flip.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/serial.h> #include <linux/gpio/driver.h> #include <linux/bitops.h> #include <linux/mutex.h> #define DRIVER_DESC "Silicon Labs CP210x RS232 serial adaptor driver" /* * Function Prototypes */ static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *); static void cp210x_close(struct usb_serial_port *); static void cp210x_change_speed(struct tty_struct *, struct usb_serial_port *, const struct ktermios *); static void cp210x_set_termios(struct tty_struct *, struct usb_serial_port *, const struct ktermios *); static bool cp210x_tx_empty(struct usb_serial_port *port); static int cp210x_tiocmget(struct tty_struct *); static int cp210x_tiocmset(struct tty_struct *, unsigned int, unsigned int); static int cp210x_tiocmset_port(struct usb_serial_port *port, unsigned int, unsigned int); static int cp210x_break_ctl(struct tty_struct *, int); static int cp210x_attach(struct usb_serial *); static void cp210x_disconnect(struct usb_serial *); static void cp210x_release(struct usb_serial *); static int cp210x_port_probe(struct usb_serial_port *); static void cp210x_port_remove(struct usb_serial_port *); static void cp210x_dtr_rts(struct usb_serial_port *port, int on); static void cp210x_process_read_urb(struct urb *urb); static void cp210x_enable_event_mode(struct usb_serial_port *port); static void cp210x_disable_event_mode(struct usb_serial_port *port); static const struct usb_device_id id_table[] = { { USB_DEVICE(0x0404, 0x034C) }, /* NCR Retail IO Box */ { USB_DEVICE(0x045B, 0x0053) }, /* Renesas RX610 RX-Stick */ { USB_DEVICE(0x0471, 0x066A) }, /* AKTAKOM ACE-1001 cable */ { USB_DEVICE(0x0489, 0xE000) }, /* Pirelli Broadband S.p.A, DP-L10 SIP/GSM Mobile */ { USB_DEVICE(0x0489, 0xE003) }, /* Pirelli Broadband S.p.A, DP-L10 SIP/GSM Mobile */ { USB_DEVICE(0x04BF, 0x1301) }, /* TDK Corporation NC0110013M - Network Controller */ { USB_DEVICE(0x04BF, 0x1303) }, /* TDK Corporation MM0110113M - i3 Micro Module */ { USB_DEVICE(0x0745, 0x1000) }, /* CipherLab USB CCD Barcode Scanner 1000 */ { USB_DEVICE(0x0846, 0x1100) }, /* NetGear Managed Switch M4100 series, M5300 series, M7100 series */ { USB_DEVICE(0x08e6, 0x5501) }, /* Gemalto Prox-PU/CU contactless smartcard reader */ { USB_DEVICE(0x08FD, 0x000A) }, /* Digianswer A/S , ZigBee/802.15.4 MAC Device */ { USB_DEVICE(0x0908, 0x0070) }, /* Siemens SCALANCE LPE-9000 USB Serial Console */ { USB_DEVICE(0x0908, 0x01FF) }, /* Siemens RUGGEDCOM USB Serial Console */ { USB_DEVICE(0x0988, 0x0578) }, /* Teraoka AD2000 */ { USB_DEVICE(0x0B00, 0x3070) }, /* Ingenico 3070 */ { USB_DEVICE(0x0BED, 0x1100) }, /* MEI (TM) Cashflow-SC Bill/Voucher Acceptor */ { USB_DEVICE(0x0BED, 0x1101) }, /* MEI series 2000 Combo Acceptor */ { USB_DEVICE(0x0FCF, 0x1003) }, /* Dynastream ANT development board */ { USB_DEVICE(0x0FCF, 0x1004) }, /* Dynastream ANT2USB */ { USB_DEVICE(0x0FCF, 0x1006) }, /* Dynastream ANT development board */ { USB_DEVICE(0x0FDE, 0xCA05) }, /* OWL Wireless Electricity Monitor CM-160 */ { USB_DEVICE(0x106F, 0x0003) }, /* CPI / Money Controls Bulk Coin Recycler */ { USB_DEVICE(0x10A6, 0xAA26) }, /* Knock-off DCU-11 cable */ { USB_DEVICE(0x10AB, 0x10C5) }, /* Siemens MC60 Cable */ { USB_DEVICE(0x10B5, 0xAC70) }, /* Nokia CA-42 USB */ { USB_DEVICE(0x10C4, 0x0F91) }, /* Vstabi */ { 
USB_DEVICE(0x10C4, 0x1101) }, /* Arkham Technology DS101 Bus Monitor */ { USB_DEVICE(0x10C4, 0x1601) }, /* Arkham Technology DS101 Adapter */ { USB_DEVICE(0x10C4, 0x800A) }, /* SPORTident BSM7-D-USB main station */ { USB_DEVICE(0x10C4, 0x803B) }, /* Pololu USB-serial converter */ { USB_DEVICE(0x10C4, 0x8044) }, /* Cygnal Debug Adapter */ { USB_DEVICE(0x10C4, 0x804E) }, /* Software Bisque Paramount ME build-in converter */ { USB_DEVICE(0x10C4, 0x8053) }, /* Enfora EDG1228 */ { USB_DEVICE(0x10C4, 0x8054) }, /* Enfora GSM2228 */ { USB_DEVICE(0x10C4, 0x8056) }, /* Lorenz Messtechnik devices */ { USB_DEVICE(0x10C4, 0x8066) }, /* Argussoft In-System Programmer */ { USB_DEVICE(0x10C4, 0x806F) }, /* IMS USB to RS422 Converter Cable */ { USB_DEVICE(0x10C4, 0x807A) }, /* Crumb128 board */ { USB_DEVICE(0x10C4, 0x80C4) }, /* Cygnal Integrated Products, Inc., Optris infrared thermometer */ { USB_DEVICE(0x10C4, 0x80CA) }, /* Degree Controls Inc */ { USB_DEVICE(0x10C4, 0x80DD) }, /* Tracient RFID */ { USB_DEVICE(0x10C4, 0x80F6) }, /* Suunto sports instrument */ { USB_DEVICE(0x10C4, 0x8115) }, /* Arygon NFC/Mifare Reader */ { USB_DEVICE(0x10C4, 0x813D) }, /* Burnside Telecom Deskmobile */ { USB_DEVICE(0x10C4, 0x813F) }, /* Tams Master Easy Control */ { USB_DEVICE(0x10C4, 0x814A) }, /* West Mountain Radio RIGblaster P&P */ { USB_DEVICE(0x10C4, 0x814B) }, /* West Mountain Radio RIGtalk */ { USB_DEVICE(0x2405, 0x0003) }, /* West Mountain Radio RIGblaster Advantage */ { USB_DEVICE(0x10C4, 0x8156) }, /* B&G H3000 link cable */ { USB_DEVICE(0x10C4, 0x815E) }, /* Helicomm IP-Link 1220-DVM */ { USB_DEVICE(0x10C4, 0x815F) }, /* Timewave HamLinkUSB */ { USB_DEVICE(0x10C4, 0x817C) }, /* CESINEL MEDCAL N Power Quality Monitor */ { USB_DEVICE(0x10C4, 0x817D) }, /* CESINEL MEDCAL NT Power Quality Monitor */ { USB_DEVICE(0x10C4, 0x817E) }, /* CESINEL MEDCAL S Power Quality Monitor */ { USB_DEVICE(0x10C4, 0x818B) }, /* AVIT Research USB to TTL */ { USB_DEVICE(0x10C4, 0x819F) }, /* MJS USB Toslink Switcher */ { USB_DEVICE(0x10C4, 0x81A6) }, /* ThinkOptics WavIt */ { USB_DEVICE(0x10C4, 0x81A9) }, /* Multiplex RC Interface */ { USB_DEVICE(0x10C4, 0x81AC) }, /* MSD Dash Hawk */ { USB_DEVICE(0x10C4, 0x81AD) }, /* INSYS USB Modem */ { USB_DEVICE(0x10C4, 0x81C8) }, /* Lipowsky Industrie Elektronik GmbH, Baby-JTAG */ { USB_DEVICE(0x10C4, 0x81D7) }, /* IAI Corp. 
RCB-CV-USB USB to RS485 Adaptor */ { USB_DEVICE(0x10C4, 0x81E2) }, /* Lipowsky Industrie Elektronik GmbH, Baby-LIN */ { USB_DEVICE(0x10C4, 0x81E7) }, /* Aerocomm Radio */ { USB_DEVICE(0x10C4, 0x81E8) }, /* Zephyr Bioharness */ { USB_DEVICE(0x10C4, 0x81F2) }, /* C1007 HF band RFID controller */ { USB_DEVICE(0x10C4, 0x8218) }, /* Lipowsky Industrie Elektronik GmbH, HARP-1 */ { USB_DEVICE(0x10C4, 0x822B) }, /* Modem EDGE(GSM) Comander 2 */ { USB_DEVICE(0x10C4, 0x826B) }, /* Cygnal Integrated Products, Inc., Fasttrax GPS demonstration module */ { USB_DEVICE(0x10C4, 0x8281) }, /* Nanotec Plug & Drive */ { USB_DEVICE(0x10C4, 0x8293) }, /* Telegesis ETRX2USB */ { USB_DEVICE(0x10C4, 0x82AA) }, /* Silicon Labs IFS-USB-DATACABLE used with Quint UPS */ { USB_DEVICE(0x10C4, 0x82EF) }, /* CESINEL FALCO 6105 AC Power Supply */ { USB_DEVICE(0x10C4, 0x82F1) }, /* CESINEL MEDCAL EFD Earth Fault Detector */ { USB_DEVICE(0x10C4, 0x82F2) }, /* CESINEL MEDCAL ST Network Analyzer */ { USB_DEVICE(0x10C4, 0x82F4) }, /* Starizona MicroTouch */ { USB_DEVICE(0x10C4, 0x82F9) }, /* Procyon AVS */ { USB_DEVICE(0x10C4, 0x8341) }, /* Siemens MC35PU GPRS Modem */ { USB_DEVICE(0x10C4, 0x8382) }, /* Cygnal Integrated Products, Inc. */ { USB_DEVICE(0x10C4, 0x83A8) }, /* Amber Wireless AMB2560 */ { USB_DEVICE(0x10C4, 0x83AA) }, /* Mark-10 Digital Force Gauge */ { USB_DEVICE(0x10C4, 0x83D8) }, /* DekTec DTA Plus VHF/UHF Booster/Attenuator */ { USB_DEVICE(0x10C4, 0x8411) }, /* Kyocera GPS Module */ { USB_DEVICE(0x10C4, 0x8414) }, /* Decagon USB Cable Adapter */ { USB_DEVICE(0x10C4, 0x8418) }, /* IRZ Automation Teleport SG-10 GSM/GPRS Modem */ { USB_DEVICE(0x10C4, 0x846E) }, /* BEI USB Sensor Interface (VCP) */ { USB_DEVICE(0x10C4, 0x8470) }, /* Juniper Networks BX Series System Console */ { USB_DEVICE(0x10C4, 0x8477) }, /* Balluff RFID */ { USB_DEVICE(0x10C4, 0x84B6) }, /* Starizona Hyperion */ { USB_DEVICE(0x10C4, 0x851E) }, /* CESINEL MEDCAL PT Network Analyzer */ { USB_DEVICE(0x10C4, 0x85A7) }, /* LifeScan OneTouch Verio IQ */ { USB_DEVICE(0x10C4, 0x85B8) }, /* CESINEL ReCon T Energy Logger */ { USB_DEVICE(0x10C4, 0x85EA) }, /* AC-Services IBUS-IF */ { USB_DEVICE(0x10C4, 0x85EB) }, /* AC-Services CIS-IBUS */ { USB_DEVICE(0x10C4, 0x85F8) }, /* Virtenio Preon32 */ { USB_DEVICE(0x10C4, 0x863C) }, /* MGP Instruments PDS100 */ { USB_DEVICE(0x10C4, 0x8664) }, /* AC-Services CAN-IF */ { USB_DEVICE(0x10C4, 0x8665) }, /* AC-Services OBD-IF */ { USB_DEVICE(0x10C4, 0x87ED) }, /* IMST USB-Stick for Smart Meter */ { USB_DEVICE(0x10C4, 0x8856) }, /* CEL EM357 ZigBee USB Stick - LR */ { USB_DEVICE(0x10C4, 0x8857) }, /* CEL EM357 ZigBee USB Stick */ { USB_DEVICE(0x10C4, 0x88A4) }, /* MMB Networks ZigBee USB Device */ { USB_DEVICE(0x10C4, 0x88A5) }, /* Planet Innovation Ingeni ZigBee USB Device */ { USB_DEVICE(0x10C4, 0x88D8) }, /* Acuity Brands nLight Air Adapter */ { USB_DEVICE(0x10C4, 0x88FB) }, /* CESINEL MEDCAL STII Network Analyzer */ { USB_DEVICE(0x10C4, 0x8938) }, /* CESINEL MEDCAL S II Network Analyzer */ { USB_DEVICE(0x10C4, 0x8946) }, /* Ketra N1 Wireless Interface */ { USB_DEVICE(0x10C4, 0x8962) }, /* Brim Brothers charging dock */ { USB_DEVICE(0x10C4, 0x8977) }, /* CEL MeshWorks DevKit Device */ { USB_DEVICE(0x10C4, 0x8998) }, /* KCF Technologies PRN */ { USB_DEVICE(0x10C4, 0x89A4) }, /* CESINEL FTBC Flexible Thyristor Bridge Controller */ { USB_DEVICE(0x10C4, 0x89FB) }, /* Qivicon ZigBee USB Radio Stick */ { USB_DEVICE(0x10C4, 0x8A2A) }, /* HubZ dual ZigBee and Z-Wave dongle */ { USB_DEVICE(0x10C4, 0x8A5B) }, /* CEL EM3588 
ZigBee USB Stick */ { USB_DEVICE(0x10C4, 0x8A5E) }, /* CEL EM3588 ZigBee USB Stick Long Range */ { USB_DEVICE(0x10C4, 0x8B34) }, /* Qivicon ZigBee USB Radio Stick */ { USB_DEVICE(0x10C4, 0xEA60) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA61) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA63) }, /* Silicon Labs Windows Update (CP2101-4/CP2102N) */ { USB_DEVICE(0x10C4, 0xEA70) }, /* Silicon Labs factory default */ { USB_DEVICE(0x10C4, 0xEA71) }, /* Infinity GPS-MIC-1 Radio Monophone */ { USB_DEVICE(0x10C4, 0xEA7A) }, /* Silicon Labs Windows Update (CP2105) */ { USB_DEVICE(0x10C4, 0xEA7B) }, /* Silicon Labs Windows Update (CP2108) */ { USB_DEVICE(0x10C4, 0xF001) }, /* Elan Digital Systems USBscope50 */ { USB_DEVICE(0x10C4, 0xF002) }, /* Elan Digital Systems USBwave12 */ { USB_DEVICE(0x10C4, 0xF003) }, /* Elan Digital Systems USBpulse100 */ { USB_DEVICE(0x10C4, 0xF004) }, /* Elan Digital Systems USBcount50 */ { USB_DEVICE(0x10C5, 0xEA61) }, /* Silicon Labs MobiData GPRS USB Modem */ { USB_DEVICE(0x10CE, 0xEA6A) }, /* Silicon Labs MobiData GPRS USB Modem 100EU */ { USB_DEVICE(0x11CA, 0x0212) }, /* Verifone USB to Printer (UART, CP2102) */ { USB_DEVICE(0x12B8, 0xEC60) }, /* Link G4 ECU */ { USB_DEVICE(0x12B8, 0xEC62) }, /* Link G4+ ECU */ { USB_DEVICE(0x13AD, 0x9999) }, /* Baltech card reader */ { USB_DEVICE(0x1555, 0x0004) }, /* Owen AC4 USB-RS485 Converter */ { USB_DEVICE(0x155A, 0x1006) }, /* ELDAT Easywave RX09 */ { USB_DEVICE(0x166A, 0x0201) }, /* Clipsal 5500PACA C-Bus Pascal Automation Controller */ { USB_DEVICE(0x166A, 0x0301) }, /* Clipsal 5800PC C-Bus Wireless PC Interface */ { USB_DEVICE(0x166A, 0x0303) }, /* Clipsal 5500PCU C-Bus USB interface */ { USB_DEVICE(0x166A, 0x0304) }, /* Clipsal 5000CT2 C-Bus Black and White Touchscreen */ { USB_DEVICE(0x166A, 0x0305) }, /* Clipsal C-5000CT2 C-Bus Spectrum Colour Touchscreen */ { USB_DEVICE(0x166A, 0x0401) }, /* Clipsal L51xx C-Bus Architectural Dimmer */ { USB_DEVICE(0x166A, 0x0101) }, /* Clipsal 5560884 C-Bus Multi-room Audio Matrix Switcher */ { USB_DEVICE(0x16C0, 0x09B0) }, /* Lunatico Seletek */ { USB_DEVICE(0x16C0, 0x09B1) }, /* Lunatico Seletek */ { USB_DEVICE(0x16D6, 0x0001) }, /* Jablotron serial interface */ { USB_DEVICE(0x16DC, 0x0010) }, /* W-IE-NE-R Plein & Baus GmbH PL512 Power Supply */ { USB_DEVICE(0x16DC, 0x0011) }, /* W-IE-NE-R Plein & Baus GmbH RCM Remote Control for MARATON Power Supply */ { USB_DEVICE(0x16DC, 0x0012) }, /* W-IE-NE-R Plein & Baus GmbH MPOD Multi Channel Power Supply */ { USB_DEVICE(0x16DC, 0x0015) }, /* W-IE-NE-R Plein & Baus GmbH CML Control, Monitoring and Data Logger */ { USB_DEVICE(0x17A8, 0x0001) }, /* Kamstrup Optical Eye/3-wire */ { USB_DEVICE(0x17A8, 0x0005) }, /* Kamstrup M-Bus Master MultiPort 250D */ { USB_DEVICE(0x17A8, 0x0011) }, /* Kamstrup 444 MHz RF sniffer */ { USB_DEVICE(0x17A8, 0x0013) }, /* Kamstrup 870 MHz RF sniffer */ { USB_DEVICE(0x17A8, 0x0101) }, /* Kamstrup 868 MHz wM-Bus C-Mode Meter Reader (Int Ant) */ { USB_DEVICE(0x17A8, 0x0102) }, /* Kamstrup 868 MHz wM-Bus C-Mode Meter Reader (Ext Ant) */ { USB_DEVICE(0x17F4, 0xAAAA) }, /* Wavesense Jazz blood glucose meter */ { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */ { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */ { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */ { USB_DEVICE(0x18EF, 0xE030) }, /* ELV ALC 8xxx Battery Charger */ { USB_DEVICE(0x18EF, 0xE032) }, /* ELV TFD500 Data Logger */ { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder 
interface */ { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */ { USB_DEVICE(0x1901, 0x0194) }, /* GE Healthcare Remote Alarm Box */ { USB_DEVICE(0x1901, 0x0195) }, /* GE B850/B650/B450 CP2104 DP UART interface */ { USB_DEVICE(0x1901, 0x0196) }, /* GE B850 CP2105 DP UART interface */ { USB_DEVICE(0x1901, 0x0197) }, /* GE CS1000 M.2 Key E serial interface */ { USB_DEVICE(0x1901, 0x0198) }, /* GE CS1000 Display serial interface */ { USB_DEVICE(0x199B, 0xBA30) }, /* LORD WSDA-200-USB */ { USB_DEVICE(0x19CF, 0x3000) }, /* Parrot NMEA GPS Flight Recorder */ { USB_DEVICE(0x1ADB, 0x0001) }, /* Schweitzer Engineering C662 Cable */ { USB_DEVICE(0x1B1C, 0x1C00) }, /* Corsair USB Dongle */ { USB_DEVICE(0x1BA4, 0x0002) }, /* Silicon Labs 358x factory default */ { USB_DEVICE(0x1BE3, 0x07A6) }, /* WAGO 750-923 USB Service Cable */ { USB_DEVICE(0x1D6F, 0x0010) }, /* Seluxit ApS RF Dongle */ { USB_DEVICE(0x1E29, 0x0102) }, /* Festo CPX-USB */ { USB_DEVICE(0x1E29, 0x0501) }, /* Festo CMSP */ { USB_DEVICE(0x1FB9, 0x0100) }, /* Lake Shore Model 121 Current Source */ { USB_DEVICE(0x1FB9, 0x0200) }, /* Lake Shore Model 218A Temperature Monitor */ { USB_DEVICE(0x1FB9, 0x0201) }, /* Lake Shore Model 219 Temperature Monitor */ { USB_DEVICE(0x1FB9, 0x0202) }, /* Lake Shore Model 233 Temperature Transmitter */ { USB_DEVICE(0x1FB9, 0x0203) }, /* Lake Shore Model 235 Temperature Transmitter */ { USB_DEVICE(0x1FB9, 0x0300) }, /* Lake Shore Model 335 Temperature Controller */ { USB_DEVICE(0x1FB9, 0x0301) }, /* Lake Shore Model 336 Temperature Controller */ { USB_DEVICE(0x1FB9, 0x0302) }, /* Lake Shore Model 350 Temperature Controller */ { USB_DEVICE(0x1FB9, 0x0303) }, /* Lake Shore Model 371 AC Bridge */ { USB_DEVICE(0x1FB9, 0x0400) }, /* Lake Shore Model 411 Handheld Gaussmeter */ { USB_DEVICE(0x1FB9, 0x0401) }, /* Lake Shore Model 425 Gaussmeter */ { USB_DEVICE(0x1FB9, 0x0402) }, /* Lake Shore Model 455A Gaussmeter */ { USB_DEVICE(0x1FB9, 0x0403) }, /* Lake Shore Model 475A Gaussmeter */ { USB_DEVICE(0x1FB9, 0x0404) }, /* Lake Shore Model 465 Three Axis Gaussmeter */ { USB_DEVICE(0x1FB9, 0x0600) }, /* Lake Shore Model 625A Superconducting MPS */ { USB_DEVICE(0x1FB9, 0x0601) }, /* Lake Shore Model 642A Magnet Power Supply */ { USB_DEVICE(0x1FB9, 0x0602) }, /* Lake Shore Model 648 Magnet Power Supply */ { USB_DEVICE(0x1FB9, 0x0700) }, /* Lake Shore Model 737 VSM Controller */ { USB_DEVICE(0x1FB9, 0x0701) }, /* Lake Shore Model 776 Hall Matrix */ { USB_DEVICE(0x2184, 0x0030) }, /* GW Instek GDM-834x Digital Multimeter */ { USB_DEVICE(0x2626, 0xEA60) }, /* Aruba Networks 7xxx USB Serial Console */ { USB_DEVICE(0x3195, 0xF190) }, /* Link Instruments MSO-19 */ { USB_DEVICE(0x3195, 0xF280) }, /* Link Instruments MSO-28 */ { USB_DEVICE(0x3195, 0xF281) }, /* Link Instruments MSO-28 */ { USB_DEVICE(0x3923, 0x7A0B) }, /* National Instruments USB Serial Console */ { USB_DEVICE(0x413C, 0x9500) }, /* DW700 GPS USB interface */ { } /* Terminating Entry */ }; MODULE_DEVICE_TABLE(usb, id_table); struct cp210x_serial_private { #ifdef CONFIG_GPIOLIB struct gpio_chip gc; bool gpio_registered; u16 gpio_pushpull; u16 gpio_altfunc; u16 gpio_input; #endif u8 partnum; u32 fw_version; speed_t min_speed; speed_t max_speed; bool use_actual_rate; bool no_flow_control; bool no_event_mode; }; enum cp210x_event_state { ES_DATA, ES_ESCAPE, ES_LSR, ES_LSR_DATA_0, ES_LSR_DATA_1, ES_MSR }; struct cp210x_port_private { u8 bInterfaceNumber; bool event_mode; enum cp210x_event_state event_state; u8 lsr; struct mutex mutex; bool crtscts; 
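	/*
	 * crtscts above and the dtr/rts flags below cache the last requested
	 * flow-control and modem-line state; cp210x_set_flow_control() and
	 * cp210x_tiocmset_port() rebuild the SET_FLOW/SET_MHS requests from
	 * them while holding the mutex.
	 */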
bool dtr; bool rts; }; static struct usb_serial_driver cp210x_device = { .driver = { .owner = THIS_MODULE, .name = "cp210x", }, .id_table = id_table, .num_ports = 1, .bulk_in_size = 256, .bulk_out_size = 256, .open = cp210x_open, .close = cp210x_close, .break_ctl = cp210x_break_ctl, .set_termios = cp210x_set_termios, .tx_empty = cp210x_tx_empty, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .tiocmget = cp210x_tiocmget, .tiocmset = cp210x_tiocmset, .get_icount = usb_serial_generic_get_icount, .attach = cp210x_attach, .disconnect = cp210x_disconnect, .release = cp210x_release, .port_probe = cp210x_port_probe, .port_remove = cp210x_port_remove, .dtr_rts = cp210x_dtr_rts, .process_read_urb = cp210x_process_read_urb, }; static struct usb_serial_driver * const serial_drivers[] = { &cp210x_device, NULL }; /* Config request types */ #define REQTYPE_HOST_TO_INTERFACE 0x41 #define REQTYPE_INTERFACE_TO_HOST 0xc1 #define REQTYPE_HOST_TO_DEVICE 0x40 #define REQTYPE_DEVICE_TO_HOST 0xc0 /* Config request codes */ #define CP210X_IFC_ENABLE 0x00 #define CP210X_SET_BAUDDIV 0x01 #define CP210X_GET_BAUDDIV 0x02 #define CP210X_SET_LINE_CTL 0x03 #define CP210X_GET_LINE_CTL 0x04 #define CP210X_SET_BREAK 0x05 #define CP210X_IMM_CHAR 0x06 #define CP210X_SET_MHS 0x07 #define CP210X_GET_MDMSTS 0x08 #define CP210X_SET_XON 0x09 #define CP210X_SET_XOFF 0x0A #define CP210X_SET_EVENTMASK 0x0B #define CP210X_GET_EVENTMASK 0x0C #define CP210X_SET_CHAR 0x0D #define CP210X_GET_CHARS 0x0E #define CP210X_GET_PROPS 0x0F #define CP210X_GET_COMM_STATUS 0x10 #define CP210X_RESET 0x11 #define CP210X_PURGE 0x12 #define CP210X_SET_FLOW 0x13 #define CP210X_GET_FLOW 0x14 #define CP210X_EMBED_EVENTS 0x15 #define CP210X_GET_EVENTSTATE 0x16 #define CP210X_SET_CHARS 0x19 #define CP210X_GET_BAUDRATE 0x1D #define CP210X_SET_BAUDRATE 0x1E #define CP210X_VENDOR_SPECIFIC 0xFF /* CP210X_IFC_ENABLE */ #define UART_ENABLE 0x0001 #define UART_DISABLE 0x0000 /* CP210X_(SET|GET)_BAUDDIV */ #define BAUD_RATE_GEN_FREQ 0x384000 /* CP210X_(SET|GET)_LINE_CTL */ #define BITS_DATA_MASK 0X0f00 #define BITS_DATA_5 0X0500 #define BITS_DATA_6 0X0600 #define BITS_DATA_7 0X0700 #define BITS_DATA_8 0X0800 #define BITS_DATA_9 0X0900 #define BITS_PARITY_MASK 0x00f0 #define BITS_PARITY_NONE 0x0000 #define BITS_PARITY_ODD 0x0010 #define BITS_PARITY_EVEN 0x0020 #define BITS_PARITY_MARK 0x0030 #define BITS_PARITY_SPACE 0x0040 #define BITS_STOP_MASK 0x000f #define BITS_STOP_1 0x0000 #define BITS_STOP_1_5 0x0001 #define BITS_STOP_2 0x0002 /* CP210X_SET_BREAK */ #define BREAK_ON 0x0001 #define BREAK_OFF 0x0000 /* CP210X_(SET_MHS|GET_MDMSTS) */ #define CONTROL_DTR 0x0001 #define CONTROL_RTS 0x0002 #define CONTROL_CTS 0x0010 #define CONTROL_DSR 0x0020 #define CONTROL_RING 0x0040 #define CONTROL_DCD 0x0080 #define CONTROL_WRITE_DTR 0x0100 #define CONTROL_WRITE_RTS 0x0200 /* CP210X_(GET|SET)_CHARS */ struct cp210x_special_chars { u8 bEofChar; u8 bErrorChar; u8 bBreakChar; u8 bEventChar; u8 bXonChar; u8 bXoffChar; }; /* CP210X_VENDOR_SPECIFIC values */ #define CP210X_GET_FW_VER 0x000E #define CP210X_READ_2NCONFIG 0x000E #define CP210X_GET_FW_VER_2N 0x0010 #define CP210X_READ_LATCH 0x00C2 #define CP210X_GET_PARTNUM 0x370B #define CP210X_GET_PORTCONFIG 0x370C #define CP210X_GET_DEVICEMODE 0x3711 #define CP210X_WRITE_LATCH 0x37E1 /* Part number definitions */ #define CP210X_PARTNUM_CP2101 0x01 #define CP210X_PARTNUM_CP2102 0x02 #define CP210X_PARTNUM_CP2103 0x03 #define CP210X_PARTNUM_CP2104 0x04 #define CP210X_PARTNUM_CP2105 0x05 #define 
CP210X_PARTNUM_CP2108 0x08 #define CP210X_PARTNUM_CP2102N_QFN28 0x20 #define CP210X_PARTNUM_CP2102N_QFN24 0x21 #define CP210X_PARTNUM_CP2102N_QFN20 0x22 #define CP210X_PARTNUM_UNKNOWN 0xFF /* CP210X_GET_COMM_STATUS returns these 0x13 bytes */ struct cp210x_comm_status { __le32 ulErrors; __le32 ulHoldReasons; __le32 ulAmountInInQueue; __le32 ulAmountInOutQueue; u8 bEofReceived; u8 bWaitForImmediate; u8 bReserved; } __packed; /* * CP210X_PURGE - 16 bits passed in wValue of USB request. * SiLabs app note AN571 gives a strange description of the 4 bits: * bit 0 or bit 2 clears the transmit queue and 1 or 3 receive. * writing 1 to all, however, purges cp2108 well enough to avoid the hang. */ #define PURGE_ALL 0x000f /* CP210X_EMBED_EVENTS */ #define CP210X_ESCCHAR 0xec #define CP210X_LSR_OVERRUN BIT(1) #define CP210X_LSR_PARITY BIT(2) #define CP210X_LSR_FRAME BIT(3) #define CP210X_LSR_BREAK BIT(4) /* CP210X_GET_FLOW/CP210X_SET_FLOW read/write these 0x10 bytes */ struct cp210x_flow_ctl { __le32 ulControlHandshake; __le32 ulFlowReplace; __le32 ulXonLimit; __le32 ulXoffLimit; }; /* cp210x_flow_ctl::ulControlHandshake */ #define CP210X_SERIAL_DTR_MASK GENMASK(1, 0) #define CP210X_SERIAL_DTR_INACTIVE (0 << 0) #define CP210X_SERIAL_DTR_ACTIVE (1 << 0) #define CP210X_SERIAL_DTR_FLOW_CTL (2 << 0) #define CP210X_SERIAL_CTS_HANDSHAKE BIT(3) #define CP210X_SERIAL_DSR_HANDSHAKE BIT(4) #define CP210X_SERIAL_DCD_HANDSHAKE BIT(5) #define CP210X_SERIAL_DSR_SENSITIVITY BIT(6) /* cp210x_flow_ctl::ulFlowReplace */ #define CP210X_SERIAL_AUTO_TRANSMIT BIT(0) #define CP210X_SERIAL_AUTO_RECEIVE BIT(1) #define CP210X_SERIAL_ERROR_CHAR BIT(2) #define CP210X_SERIAL_NULL_STRIPPING BIT(3) #define CP210X_SERIAL_BREAK_CHAR BIT(4) #define CP210X_SERIAL_RTS_MASK GENMASK(7, 6) #define CP210X_SERIAL_RTS_INACTIVE (0 << 6) #define CP210X_SERIAL_RTS_ACTIVE (1 << 6) #define CP210X_SERIAL_RTS_FLOW_CTL (2 << 6) #define CP210X_SERIAL_XOFF_CONTINUE BIT(31) /* CP210X_VENDOR_SPECIFIC, CP210X_GET_DEVICEMODE call reads these 0x2 bytes. */ struct cp210x_pin_mode { u8 eci; u8 sci; }; #define CP210X_PIN_MODE_MODEM 0 #define CP210X_PIN_MODE_GPIO BIT(0) /* * CP210X_VENDOR_SPECIFIC, CP210X_GET_PORTCONFIG call reads these 0xf bytes * on a CP2105 chip. Structure needs padding due to unused/unspecified bytes. */ struct cp210x_dual_port_config { __le16 gpio_mode; u8 __pad0[2]; __le16 reset_state; u8 __pad1[4]; __le16 suspend_state; u8 sci_cfg; u8 eci_cfg; u8 device_cfg; } __packed; /* * CP210X_VENDOR_SPECIFIC, CP210X_GET_PORTCONFIG call reads these 0xd bytes * on a CP2104 chip. Structure needs padding due to unused/unspecified bytes. 
*/ struct cp210x_single_port_config { __le16 gpio_mode; u8 __pad0[2]; __le16 reset_state; u8 __pad1[4]; __le16 suspend_state; u8 device_cfg; } __packed; /* GPIO modes */ #define CP210X_SCI_GPIO_MODE_OFFSET 9 #define CP210X_SCI_GPIO_MODE_MASK GENMASK(11, 9) #define CP210X_ECI_GPIO_MODE_OFFSET 2 #define CP210X_ECI_GPIO_MODE_MASK GENMASK(3, 2) #define CP210X_GPIO_MODE_OFFSET 8 #define CP210X_GPIO_MODE_MASK GENMASK(11, 8) /* CP2105 port configuration values */ #define CP2105_GPIO0_TXLED_MODE BIT(0) #define CP2105_GPIO1_RXLED_MODE BIT(1) #define CP2105_GPIO1_RS485_MODE BIT(2) /* CP2104 port configuration values */ #define CP2104_GPIO0_TXLED_MODE BIT(0) #define CP2104_GPIO1_RXLED_MODE BIT(1) #define CP2104_GPIO2_RS485_MODE BIT(2) struct cp210x_quad_port_state { __le16 gpio_mode_pb0; __le16 gpio_mode_pb1; __le16 gpio_mode_pb2; __le16 gpio_mode_pb3; __le16 gpio_mode_pb4; __le16 gpio_lowpower_pb0; __le16 gpio_lowpower_pb1; __le16 gpio_lowpower_pb2; __le16 gpio_lowpower_pb3; __le16 gpio_lowpower_pb4; __le16 gpio_latch_pb0; __le16 gpio_latch_pb1; __le16 gpio_latch_pb2; __le16 gpio_latch_pb3; __le16 gpio_latch_pb4; }; /* * CP210X_VENDOR_SPECIFIC, CP210X_GET_PORTCONFIG call reads these 0x49 bytes * on a CP2108 chip. * * See https://www.silabs.com/documents/public/application-notes/an978-cp210x-usb-to-uart-api-specification.pdf */ struct cp210x_quad_port_config { struct cp210x_quad_port_state reset_state; struct cp210x_quad_port_state suspend_state; u8 ipdelay_ifc[4]; u8 enhancedfxn_ifc[4]; u8 enhancedfxn_device; u8 extclkfreq[4]; } __packed; #define CP2108_EF_IFC_GPIO_TXLED 0x01 #define CP2108_EF_IFC_GPIO_RXLED 0x02 #define CP2108_EF_IFC_GPIO_RS485 0x04 #define CP2108_EF_IFC_GPIO_RS485_LOGIC 0x08 #define CP2108_EF_IFC_GPIO_CLOCK 0x10 #define CP2108_EF_IFC_DYNAMIC_SUSPEND 0x40 /* CP2102N configuration array indices */ #define CP210X_2NCONFIG_CONFIG_VERSION_IDX 2 #define CP210X_2NCONFIG_GPIO_MODE_IDX 581 #define CP210X_2NCONFIG_GPIO_RSTLATCH_IDX 587 #define CP210X_2NCONFIG_GPIO_CONTROL_IDX 600 /* CP2102N QFN20 port configuration values */ #define CP2102N_QFN20_GPIO2_TXLED_MODE BIT(2) #define CP2102N_QFN20_GPIO3_RXLED_MODE BIT(3) #define CP2102N_QFN20_GPIO1_RS485_MODE BIT(4) #define CP2102N_QFN20_GPIO0_CLK_MODE BIT(6) /* * CP210X_VENDOR_SPECIFIC, CP210X_WRITE_LATCH call writes these 0x02 bytes * for CP2102N, CP2103, CP2104 and CP2105. */ struct cp210x_gpio_write { u8 mask; u8 state; }; /* * CP210X_VENDOR_SPECIFIC, CP210X_WRITE_LATCH call writes these 0x04 bytes * for CP2108. */ struct cp210x_gpio_write16 { __le16 mask; __le16 state; }; /* * Helper to get interface number when we only have struct usb_serial. */ static u8 cp210x_interface_num(struct usb_serial *serial) { struct usb_host_interface *cur_altsetting; cur_altsetting = serial->interface->cur_altsetting; return cur_altsetting->desc.bInterfaceNumber; } /* * Reads a variable-sized block of CP210X_ registers, identified by req. * Returns data into buf in native USB byte order. */ static int cp210x_read_reg_block(struct usb_serial_port *port, u8 req, void *buf, int bufsize) { struct usb_serial *serial = port->serial; struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int result; result = usb_control_msg_recv(serial->dev, 0, req, REQTYPE_INTERFACE_TO_HOST, 0, port_priv->bInterfaceNumber, buf, bufsize, USB_CTRL_SET_TIMEOUT, GFP_KERNEL); if (result) { dev_err(&port->dev, "failed get req 0x%x size %d status: %d\n", req, bufsize, result); return result; } return 0; } /* * Reads any 8-bit CP210X_ register identified by req. 
*/ static int cp210x_read_u8_reg(struct usb_serial_port *port, u8 req, u8 *val) { return cp210x_read_reg_block(port, req, val, sizeof(*val)); } /* * Reads a variable-sized vendor block of CP210X_ registers, identified by val. * Returns data into buf in native USB byte order. */ static int cp210x_read_vendor_block(struct usb_serial *serial, u8 type, u16 val, void *buf, int bufsize) { int result; result = usb_control_msg_recv(serial->dev, 0, CP210X_VENDOR_SPECIFIC, type, val, cp210x_interface_num(serial), buf, bufsize, USB_CTRL_GET_TIMEOUT, GFP_KERNEL); if (result) { dev_err(&serial->interface->dev, "failed to get vendor val 0x%04x size %d: %d\n", val, bufsize, result); return result; } return 0; } /* * Writes any 16-bit CP210X_ register (req) whose value is passed * entirely in the wValue field of the USB request. */ static int cp210x_write_u16_reg(struct usb_serial_port *port, u8 req, u16 val) { struct usb_serial *serial = port->serial; struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int result; result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), req, REQTYPE_HOST_TO_INTERFACE, val, port_priv->bInterfaceNumber, NULL, 0, USB_CTRL_SET_TIMEOUT); if (result < 0) { dev_err(&port->dev, "failed set request 0x%x status: %d\n", req, result); } return result; } /* * Writes a variable-sized block of CP210X_ registers, identified by req. * Data in buf must be in native USB byte order. */ static int cp210x_write_reg_block(struct usb_serial_port *port, u8 req, void *buf, int bufsize) { struct usb_serial *serial = port->serial; struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int result; result = usb_control_msg_send(serial->dev, 0, req, REQTYPE_HOST_TO_INTERFACE, 0, port_priv->bInterfaceNumber, buf, bufsize, USB_CTRL_SET_TIMEOUT, GFP_KERNEL); if (result) { dev_err(&port->dev, "failed set req 0x%x size %d status: %d\n", req, bufsize, result); return result; } return 0; } /* * Writes any 32-bit CP210X_ register identified by req. */ static int cp210x_write_u32_reg(struct usb_serial_port *port, u8 req, u32 val) { __le32 le32_val; le32_val = cpu_to_le32(val); return cp210x_write_reg_block(port, req, &le32_val, sizeof(le32_val)); } #ifdef CONFIG_GPIOLIB /* * Writes a variable-sized vendor block of CP210X_ registers, identified by val. * Data in buf must be in native USB byte order. 
*/ static int cp210x_write_vendor_block(struct usb_serial *serial, u8 type, u16 val, void *buf, int bufsize) { int result; result = usb_control_msg_send(serial->dev, 0, CP210X_VENDOR_SPECIFIC, type, val, cp210x_interface_num(serial), buf, bufsize, USB_CTRL_SET_TIMEOUT, GFP_KERNEL); if (result) { dev_err(&serial->interface->dev, "failed to set vendor val 0x%04x size %d: %d\n", val, bufsize, result); return result; } return 0; } #endif static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *port) { struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int result; result = cp210x_write_u16_reg(port, CP210X_IFC_ENABLE, UART_ENABLE); if (result) { dev_err(&port->dev, "%s - Unable to enable UART\n", __func__); return result; } if (tty) cp210x_set_termios(tty, port, NULL); result = usb_serial_generic_open(tty, port); if (result) goto err_disable; return 0; err_disable: cp210x_write_u16_reg(port, CP210X_IFC_ENABLE, UART_DISABLE); port_priv->event_mode = false; return result; } static void cp210x_close(struct usb_serial_port *port) { struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); usb_serial_generic_close(port); /* Clear both queues; cp2108 needs this to avoid an occasional hang */ cp210x_write_u16_reg(port, CP210X_PURGE, PURGE_ALL); cp210x_write_u16_reg(port, CP210X_IFC_ENABLE, UART_DISABLE); /* Disabling the interface disables event-insertion mode. */ port_priv->event_mode = false; } static void cp210x_process_lsr(struct usb_serial_port *port, unsigned char lsr, char *flag) { if (lsr & CP210X_LSR_BREAK) { port->icount.brk++; *flag = TTY_BREAK; } else if (lsr & CP210X_LSR_PARITY) { port->icount.parity++; *flag = TTY_PARITY; } else if (lsr & CP210X_LSR_FRAME) { port->icount.frame++; *flag = TTY_FRAME; } if (lsr & CP210X_LSR_OVERRUN) { port->icount.overrun++; tty_insert_flip_char(&port->port, 0, TTY_OVERRUN); } } static bool cp210x_process_char(struct usb_serial_port *port, unsigned char *ch, char *flag) { struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); switch (port_priv->event_state) { case ES_DATA: if (*ch == CP210X_ESCCHAR) { port_priv->event_state = ES_ESCAPE; break; } return false; case ES_ESCAPE: switch (*ch) { case 0: dev_dbg(&port->dev, "%s - escape char\n", __func__); *ch = CP210X_ESCCHAR; port_priv->event_state = ES_DATA; return false; case 1: port_priv->event_state = ES_LSR_DATA_0; break; case 2: port_priv->event_state = ES_LSR; break; case 3: port_priv->event_state = ES_MSR; break; default: dev_err(&port->dev, "malformed event 0x%02x\n", *ch); port_priv->event_state = ES_DATA; break; } break; case ES_LSR_DATA_0: port_priv->lsr = *ch; port_priv->event_state = ES_LSR_DATA_1; break; case ES_LSR_DATA_1: dev_dbg(&port->dev, "%s - lsr = 0x%02x, data = 0x%02x\n", __func__, port_priv->lsr, *ch); cp210x_process_lsr(port, port_priv->lsr, flag); port_priv->event_state = ES_DATA; return false; case ES_LSR: dev_dbg(&port->dev, "%s - lsr = 0x%02x\n", __func__, *ch); port_priv->lsr = *ch; cp210x_process_lsr(port, port_priv->lsr, flag); port_priv->event_state = ES_DATA; break; case ES_MSR: dev_dbg(&port->dev, "%s - msr = 0x%02x\n", __func__, *ch); /* unimplemented */ port_priv->event_state = ES_DATA; break; } return true; } static void cp210x_process_read_urb(struct urb *urb) { struct usb_serial_port *port = urb->context; struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); unsigned char *ch = urb->transfer_buffer; char flag; int i; if (!urb->actual_length) return; if (port_priv->event_mode) { for (i = 0; 
i < urb->actual_length; i++, ch++) { flag = TTY_NORMAL; if (cp210x_process_char(port, ch, &flag)) continue; tty_insert_flip_char(&port->port, *ch, flag); } } else { tty_insert_flip_string(&port->port, ch, urb->actual_length); } tty_flip_buffer_push(&port->port); } /* * Read how many bytes are waiting in the TX queue. */ static int cp210x_get_tx_queue_byte_count(struct usb_serial_port *port, u32 *count) { struct usb_serial *serial = port->serial; struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); struct cp210x_comm_status sts; int result; result = usb_control_msg_recv(serial->dev, 0, CP210X_GET_COMM_STATUS, REQTYPE_INTERFACE_TO_HOST, 0, port_priv->bInterfaceNumber, &sts, sizeof(sts), USB_CTRL_GET_TIMEOUT, GFP_KERNEL); if (result) { dev_err(&port->dev, "failed to get comm status: %d\n", result); return result; } *count = le32_to_cpu(sts.ulAmountInOutQueue); return 0; } static bool cp210x_tx_empty(struct usb_serial_port *port) { int err; u32 count; err = cp210x_get_tx_queue_byte_count(port, &count); if (err) return true; return !count; } struct cp210x_rate { speed_t rate; speed_t high; }; static const struct cp210x_rate cp210x_an205_table1[] = { { 300, 300 }, { 600, 600 }, { 1200, 1200 }, { 1800, 1800 }, { 2400, 2400 }, { 4000, 4000 }, { 4800, 4803 }, { 7200, 7207 }, { 9600, 9612 }, { 14400, 14428 }, { 16000, 16062 }, { 19200, 19250 }, { 28800, 28912 }, { 38400, 38601 }, { 51200, 51558 }, { 56000, 56280 }, { 57600, 58053 }, { 64000, 64111 }, { 76800, 77608 }, { 115200, 117028 }, { 128000, 129347 }, { 153600, 156868 }, { 230400, 237832 }, { 250000, 254234 }, { 256000, 273066 }, { 460800, 491520 }, { 500000, 567138 }, { 576000, 670254 }, { 921600, UINT_MAX } }; /* * Quantises the baud rate as per AN205 Table 1 */ static speed_t cp210x_get_an205_rate(speed_t baud) { int i; for (i = 0; i < ARRAY_SIZE(cp210x_an205_table1); ++i) { if (baud <= cp210x_an205_table1[i].high) break; } return cp210x_an205_table1[i].rate; } static speed_t cp210x_get_actual_rate(speed_t baud) { unsigned int prescale = 1; unsigned int div; if (baud <= 365) prescale = 4; div = DIV_ROUND_CLOSEST(48000000, 2 * prescale * baud); baud = 48000000 / (2 * prescale * div); return baud; } /* * CP2101 supports the following baud rates: * * 300, 600, 1200, 1800, 2400, 4800, 7200, 9600, 14400, 19200, 28800, * 38400, 56000, 57600, 115200, 128000, 230400, 460800, 921600 * * CP2102 and CP2103 support the following additional rates: * * 4000, 16000, 51200, 64000, 76800, 153600, 250000, 256000, 500000, * 576000 * * The device will map a requested rate to a supported one, but the result * of requests for rates greater than 1053257 is undefined (see AN205). * * CP2104, CP2105 and CP2110 support most rates up to 2M, 921k and 1M baud, * respectively, with an error less than 1%. The actual rates are determined * by * * div = round(freq / (2 x prescale x request)) * actual = freq / (2 x prescale x div) * * For CP2104 and CP2105 freq is 48Mhz and prescale is 4 for request <= 365bps * or 1 otherwise. * For CP2110 freq is 24Mhz and prescale is 4 for request <= 300bps or 1 * otherwise. */ static void cp210x_change_speed(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct usb_serial *serial = port->serial; struct cp210x_serial_private *priv = usb_get_serial_data(serial); u32 baud; if (tty->termios.c_ospeed == 0) return; /* * This maps the requested rate to the actual rate, a valid rate on * cp2102 or cp2103, or to an arbitrary rate in [1M, max_speed]. 
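	 * As a worked example of the formula above: on CP2104 (48 MHz clock,
	 * prescale 1) a request for 115200 gives
	 * div = round(48000000 / (2 * 115200)) = 208, so the actual rate is
	 * 48000000 / (2 * 208) = 115384 baud, roughly 0.16% high.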
*/ baud = clamp(tty->termios.c_ospeed, priv->min_speed, priv->max_speed); if (priv->use_actual_rate) baud = cp210x_get_actual_rate(baud); else if (baud < 1000000) baud = cp210x_get_an205_rate(baud); dev_dbg(&port->dev, "%s - setting baud rate to %u\n", __func__, baud); if (cp210x_write_u32_reg(port, CP210X_SET_BAUDRATE, baud)) { dev_warn(&port->dev, "failed to set baud rate to %u\n", baud); if (old_termios) baud = old_termios->c_ospeed; else baud = 9600; } tty_encode_baud_rate(tty, baud, baud); } static void cp210x_enable_event_mode(struct usb_serial_port *port) { struct cp210x_serial_private *priv = usb_get_serial_data(port->serial); struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int ret; if (port_priv->event_mode) return; if (priv->no_event_mode) return; port_priv->event_state = ES_DATA; port_priv->event_mode = true; ret = cp210x_write_u16_reg(port, CP210X_EMBED_EVENTS, CP210X_ESCCHAR); if (ret) { dev_err(&port->dev, "failed to enable events: %d\n", ret); port_priv->event_mode = false; } } static void cp210x_disable_event_mode(struct usb_serial_port *port) { struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); int ret; if (!port_priv->event_mode) return; ret = cp210x_write_u16_reg(port, CP210X_EMBED_EVENTS, 0); if (ret) { dev_err(&port->dev, "failed to disable events: %d\n", ret); return; } port_priv->event_mode = false; } static bool cp210x_termios_change(const struct ktermios *a, const struct ktermios *b) { bool iflag_change, cc_change; iflag_change = ((a->c_iflag ^ b->c_iflag) & (INPCK | IXON | IXOFF)); cc_change = a->c_cc[VSTART] != b->c_cc[VSTART] || a->c_cc[VSTOP] != b->c_cc[VSTOP]; return tty_termios_hw_change(a, b) || iflag_change || cc_change; } static void cp210x_set_flow_control(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct cp210x_serial_private *priv = usb_get_serial_data(port->serial); struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); struct cp210x_special_chars chars; struct cp210x_flow_ctl flow_ctl; u32 flow_repl; u32 ctl_hs; bool crtscts; int ret; /* * Some CP2102N interpret ulXonLimit as ulFlowReplace (erratum * CP2102N_E104). Report back that flow control is not supported. 
*/ if (priv->no_flow_control) { tty->termios.c_cflag &= ~CRTSCTS; tty->termios.c_iflag &= ~(IXON | IXOFF); } if (tty->termios.c_ospeed != 0 && old_termios && old_termios->c_ospeed != 0 && C_CRTSCTS(tty) == (old_termios->c_cflag & CRTSCTS) && I_IXON(tty) == (old_termios->c_iflag & IXON) && I_IXOFF(tty) == (old_termios->c_iflag & IXOFF) && START_CHAR(tty) == old_termios->c_cc[VSTART] && STOP_CHAR(tty) == old_termios->c_cc[VSTOP]) { return; } if (I_IXON(tty) || I_IXOFF(tty)) { memset(&chars, 0, sizeof(chars)); chars.bXonChar = START_CHAR(tty); chars.bXoffChar = STOP_CHAR(tty); ret = cp210x_write_reg_block(port, CP210X_SET_CHARS, &chars, sizeof(chars)); if (ret) { dev_err(&port->dev, "failed to set special chars: %d\n", ret); } } mutex_lock(&port_priv->mutex); if (tty->termios.c_ospeed == 0) { port_priv->dtr = false; port_priv->rts = false; } else if (old_termios && old_termios->c_ospeed == 0) { port_priv->dtr = true; port_priv->rts = true; } ret = cp210x_read_reg_block(port, CP210X_GET_FLOW, &flow_ctl, sizeof(flow_ctl)); if (ret) goto out_unlock; ctl_hs = le32_to_cpu(flow_ctl.ulControlHandshake); flow_repl = le32_to_cpu(flow_ctl.ulFlowReplace); ctl_hs &= ~CP210X_SERIAL_DSR_HANDSHAKE; ctl_hs &= ~CP210X_SERIAL_DCD_HANDSHAKE; ctl_hs &= ~CP210X_SERIAL_DSR_SENSITIVITY; ctl_hs &= ~CP210X_SERIAL_DTR_MASK; if (port_priv->dtr) ctl_hs |= CP210X_SERIAL_DTR_ACTIVE; else ctl_hs |= CP210X_SERIAL_DTR_INACTIVE; flow_repl &= ~CP210X_SERIAL_RTS_MASK; if (C_CRTSCTS(tty)) { ctl_hs |= CP210X_SERIAL_CTS_HANDSHAKE; if (port_priv->rts) flow_repl |= CP210X_SERIAL_RTS_FLOW_CTL; else flow_repl |= CP210X_SERIAL_RTS_INACTIVE; crtscts = true; } else { ctl_hs &= ~CP210X_SERIAL_CTS_HANDSHAKE; if (port_priv->rts) flow_repl |= CP210X_SERIAL_RTS_ACTIVE; else flow_repl |= CP210X_SERIAL_RTS_INACTIVE; crtscts = false; } if (I_IXOFF(tty)) { flow_repl |= CP210X_SERIAL_AUTO_RECEIVE; flow_ctl.ulXonLimit = cpu_to_le32(128); flow_ctl.ulXoffLimit = cpu_to_le32(128); } else { flow_repl &= ~CP210X_SERIAL_AUTO_RECEIVE; } if (I_IXON(tty)) flow_repl |= CP210X_SERIAL_AUTO_TRANSMIT; else flow_repl &= ~CP210X_SERIAL_AUTO_TRANSMIT; dev_dbg(&port->dev, "%s - ctrl = 0x%02x, flow = 0x%02x\n", __func__, ctl_hs, flow_repl); flow_ctl.ulControlHandshake = cpu_to_le32(ctl_hs); flow_ctl.ulFlowReplace = cpu_to_le32(flow_repl); ret = cp210x_write_reg_block(port, CP210X_SET_FLOW, &flow_ctl, sizeof(flow_ctl)); if (ret) goto out_unlock; port_priv->crtscts = crtscts; out_unlock: mutex_unlock(&port_priv->mutex); } static void cp210x_set_termios(struct tty_struct *tty, struct usb_serial_port *port, const struct ktermios *old_termios) { struct cp210x_serial_private *priv = usb_get_serial_data(port->serial); u16 bits; int ret; if (old_termios && !cp210x_termios_change(&tty->termios, old_termios) && tty->termios.c_ospeed != 0) return; if (!old_termios || tty->termios.c_ospeed != old_termios->c_ospeed) cp210x_change_speed(tty, port, old_termios); /* CP2101 only supports CS8, 1 stop bit and non-stick parity. 
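	 * The unsupported bits are therefore cleared from termios here so
	 * that userspace reads back the line settings actually in effect.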
*/ if (priv->partnum == CP210X_PARTNUM_CP2101) { tty->termios.c_cflag &= ~(CSIZE | CSTOPB | CMSPAR); tty->termios.c_cflag |= CS8; } bits = 0; switch (C_CSIZE(tty)) { case CS5: bits |= BITS_DATA_5; break; case CS6: bits |= BITS_DATA_6; break; case CS7: bits |= BITS_DATA_7; break; case CS8: default: bits |= BITS_DATA_8; break; } if (C_PARENB(tty)) { if (C_CMSPAR(tty)) { if (C_PARODD(tty)) bits |= BITS_PARITY_MARK; else bits |= BITS_PARITY_SPACE; } else { if (C_PARODD(tty)) bits |= BITS_PARITY_ODD; else bits |= BITS_PARITY_EVEN; } } if (C_CSTOPB(tty)) bits |= BITS_STOP_2; else bits |= BITS_STOP_1; ret = cp210x_write_u16_reg(port, CP210X_SET_LINE_CTL, bits); if (ret) dev_err(&port->dev, "failed to set line control: %d\n", ret); cp210x_set_flow_control(tty, port, old_termios); /* * Enable event-insertion mode only if input parity checking is * enabled for now. */ if (I_INPCK(tty)) cp210x_enable_event_mode(port); else cp210x_disable_event_mode(port); } static int cp210x_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear) { struct usb_serial_port *port = tty->driver_data; return cp210x_tiocmset_port(port, set, clear); } static int cp210x_tiocmset_port(struct usb_serial_port *port, unsigned int set, unsigned int clear) { struct cp210x_port_private *port_priv = usb_get_serial_port_data(port); struct cp210x_flow_ctl flow_ctl; u32 ctl_hs, flow_repl; u16 control = 0; int ret; mutex_lock(&port_priv->mutex); if (set & TIOCM_RTS) { port_priv->rts = true; control |= CONTROL_RTS; control |= CONTROL_WRITE_RTS; } if (set & TIOCM_DTR) { port_priv->dtr = true; control |= CONTROL_DTR; control |= CONTROL_WRITE_DTR; } if (clear & TIOCM_RTS) { port_priv->rts = false; control &= ~CONTROL_RTS; control |= CONTROL_WRITE_RTS; } if (clear & TIOCM_DTR) { port_priv->dtr = false; control &= ~CONTROL_DTR; control |= CONTROL_WRITE_DTR; } /* * Use SET_FLOW to set DTR and enable/disable auto-RTS when hardware * flow control is enabled. */ if (port_priv->crtscts && control & CONTROL_WRITE_RTS) { ret = cp210x_read_reg_block(port, CP210X_GET_FLOW, &flow_ctl, sizeof(flow_ctl)); if (ret) goto out_unlock; ctl_hs = le32_to_cpu(flow_ctl.ulControlHandshake); flow_repl = le32_to_cpu(flow_ctl.ulFlowReplace); ctl_hs &= ~CP210X_SERIAL_DTR_MASK; if (port_priv->dtr) ctl_hs |= CP210X_SERIAL_DTR_ACTIVE; else ctl_hs |= CP210X_SERIAL_DTR_INACTIVE; flow_repl &= ~CP210X_SERIAL_RTS_MASK; if (port_priv->rts) flow_repl |= CP210X_SERIAL_RTS_FLOW_CTL; else flow_repl |= CP210X_SERIAL_RTS_INACTIVE; flow_ctl.ulControlHandshake = cpu_to_le32(ctl_hs); flow_ctl.ulFlowReplace = cpu_to_le32(flow_repl); dev_dbg(&port->dev, "%s - ctrl = 0x%02x, flow = 0x%02x\n", __func__, ctl_hs, flow_repl); ret = cp210x_write_reg_block(port, CP210X_SET_FLOW, &flow_ctl, sizeof(flow_ctl)); } else { dev_dbg(&port->dev, "%s - control = 0x%04x\n", __func__, control); ret = cp210x_write_u16_reg(port, CP210X_SET_MHS, control); } out_unlock: mutex_unlock(&port_priv->mutex); return ret; } static void cp210x_dtr_rts(struct usb_serial_port *port, int on) { if (on) cp210x_tiocmset_port(port, TIOCM_DTR | TIOCM_RTS, 0); else cp210x_tiocmset_port(port, 0, TIOCM_DTR | TIOCM_RTS); } static int cp210x_tiocmget(struct tty_struct *tty) { struct usb_serial_port *port = tty->driver_data; u8 control; int result; result = cp210x_read_u8_reg(port, CP210X_GET_MDMSTS, &control); if (result) return result; result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0) |((control & CONTROL_RTS) ? TIOCM_RTS : 0) |((control & CONTROL_CTS) ? TIOCM_CTS : 0) |((control & CONTROL_DSR) ? 
TIOCM_DSR : 0) |((control & CONTROL_RING)? TIOCM_RI : 0) |((control & CONTROL_DCD) ? TIOCM_CD : 0); dev_dbg(&port->dev, "%s - control = 0x%02x\n", __func__, control); return result; } static int cp210x_break_ctl(struct tty_struct *tty, int break_state) { struct usb_serial_port *port = tty->driver_data; struct cp210x_serial_private *priv = usb_get_serial_data(port->serial); u16 state; if (priv->partnum == CP210X_PARTNUM_CP2105) { if (cp210x_interface_num(port->serial) == 1) return -ENOTTY; } if (break_state == 0) state = BREAK_OFF; else state = BREAK_ON; dev_dbg(&port->dev, "%s - turning break %s\n", __func__, state == BREAK_OFF ? "off" : "on"); return cp210x_write_u16_reg(port, CP210X_SET_BREAK, state); } #ifdef CONFIG_GPIOLIB static int cp210x_gpio_get(struct gpio_chip *gc, unsigned int gpio) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); u8 req_type; u16 mask; int result; int len; result = usb_autopm_get_interface(serial->interface); if (result) return result; switch (priv->partnum) { case CP210X_PARTNUM_CP2105: req_type = REQTYPE_INTERFACE_TO_HOST; len = 1; break; case CP210X_PARTNUM_CP2108: req_type = REQTYPE_INTERFACE_TO_HOST; len = 2; break; default: req_type = REQTYPE_DEVICE_TO_HOST; len = 1; break; } mask = 0; result = cp210x_read_vendor_block(serial, req_type, CP210X_READ_LATCH, &mask, len); usb_autopm_put_interface(serial->interface); if (result < 0) return result; le16_to_cpus(&mask); return !!(mask & BIT(gpio)); } static void cp210x_gpio_set(struct gpio_chip *gc, unsigned int gpio, int value) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); struct cp210x_gpio_write16 buf16; struct cp210x_gpio_write buf; u16 mask, state; u16 wIndex; int result; if (value == 1) state = BIT(gpio); else state = 0; mask = BIT(gpio); result = usb_autopm_get_interface(serial->interface); if (result) goto out; switch (priv->partnum) { case CP210X_PARTNUM_CP2105: buf.mask = (u8)mask; buf.state = (u8)state; result = cp210x_write_vendor_block(serial, REQTYPE_HOST_TO_INTERFACE, CP210X_WRITE_LATCH, &buf, sizeof(buf)); break; case CP210X_PARTNUM_CP2108: buf16.mask = cpu_to_le16(mask); buf16.state = cpu_to_le16(state); result = cp210x_write_vendor_block(serial, REQTYPE_HOST_TO_INTERFACE, CP210X_WRITE_LATCH, &buf16, sizeof(buf16)); break; default: wIndex = state << 8 | mask; result = usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), CP210X_VENDOR_SPECIFIC, REQTYPE_HOST_TO_DEVICE, CP210X_WRITE_LATCH, wIndex, NULL, 0, USB_CTRL_SET_TIMEOUT); break; } usb_autopm_put_interface(serial->interface); out: if (result < 0) { dev_err(&serial->interface->dev, "failed to set GPIO value: %d\n", result); } } static int cp210x_gpio_direction_get(struct gpio_chip *gc, unsigned int gpio) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); return priv->gpio_input & BIT(gpio); } static int cp210x_gpio_direction_input(struct gpio_chip *gc, unsigned int gpio) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); if (priv->partnum == CP210X_PARTNUM_CP2105) { /* hardware does not support an input mode */ return -ENOTSUPP; } /* push-pull pins cannot be changed to be inputs */ if (priv->gpio_pushpull & BIT(gpio)) return -EINVAL; /* make sure to release pin if it is being driven low */ cp210x_gpio_set(gc, gpio, 1); priv->gpio_input |= BIT(gpio); return 0; } 
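/*
 * For illustration only (not part of the driver): a minimal standalone
 * userspace sketch of driving one of the GPIO lines registered by this
 * driver, assuming the libgpiod 1.x C API and that the chip is found under
 * the label "cp210x" (the label set in cp210x_gpio_init()). The line offset
 * and error handling are kept to a bare minimum.
 */
#include <gpiod.h>
#include <stdio.h>

int main(void)
{
	struct gpiod_chip *chip;
	struct gpiod_line *line;
	int ret;

	chip = gpiod_chip_open_by_label("cp210x");	/* find the CP210x gpiochip */
	if (!chip) {
		perror("gpiod_chip_open_by_label");
		return 1;
	}

	line = gpiod_chip_get_line(chip, 0);		/* GPIO.0 of the adapter */
	if (!line) {
		gpiod_chip_close(chip);
		return 1;
	}

	/* Request the line as an output driven high, then drive it low. */
	ret = gpiod_line_request_output(line, "cp210x-demo", 1);
	if (ret == 0)
		ret = gpiod_line_set_value(line, 0);

	gpiod_line_release(line);
	gpiod_chip_close(chip);
	return ret ? 1 : 0;
}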
static int cp210x_gpio_direction_output(struct gpio_chip *gc, unsigned int gpio, int value) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); priv->gpio_input &= ~BIT(gpio); cp210x_gpio_set(gc, gpio, value); return 0; } static int cp210x_gpio_set_config(struct gpio_chip *gc, unsigned int gpio, unsigned long config) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); enum pin_config_param param = pinconf_to_config_param(config); /* Succeed only if in correct mode (this can't be set at runtime) */ if ((param == PIN_CONFIG_DRIVE_PUSH_PULL) && (priv->gpio_pushpull & BIT(gpio))) return 0; if ((param == PIN_CONFIG_DRIVE_OPEN_DRAIN) && !(priv->gpio_pushpull & BIT(gpio))) return 0; return -ENOTSUPP; } static int cp210x_gpio_init_valid_mask(struct gpio_chip *gc, unsigned long *valid_mask, unsigned int ngpios) { struct usb_serial *serial = gpiochip_get_data(gc); struct cp210x_serial_private *priv = usb_get_serial_data(serial); struct device *dev = &serial->interface->dev; unsigned long altfunc_mask = priv->gpio_altfunc; bitmap_complement(valid_mask, &altfunc_mask, ngpios); if (bitmap_empty(valid_mask, ngpios)) dev_dbg(dev, "no pin configured for GPIO\n"); else dev_dbg(dev, "GPIO.%*pbl configured for GPIO\n", ngpios, valid_mask); return 0; } /* * This function is for configuring GPIO using shared pins, where other signals * are made unavailable by configuring the use of GPIO. This is believed to be * only applicable to the cp2105 at this point, the other devices supported by * this driver that provide GPIO do so in a way that does not impact other * signals and are thus expected to have very different initialisation. */ static int cp2105_gpioconf_init(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); struct cp210x_pin_mode mode; struct cp210x_dual_port_config config; u8 intf_num = cp210x_interface_num(serial); u8 iface_config; int result; result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_GET_DEVICEMODE, &mode, sizeof(mode)); if (result < 0) return result; result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_GET_PORTCONFIG, &config, sizeof(config)); if (result < 0) return result; /* 2 banks of GPIO - One for the pins taken from each serial port */ if (intf_num == 0) { priv->gc.ngpio = 2; if (mode.eci == CP210X_PIN_MODE_MODEM) { /* mark all GPIOs of this interface as reserved */ priv->gpio_altfunc = 0xff; return 0; } iface_config = config.eci_cfg; priv->gpio_pushpull = (u8)((le16_to_cpu(config.gpio_mode) & CP210X_ECI_GPIO_MODE_MASK) >> CP210X_ECI_GPIO_MODE_OFFSET); } else if (intf_num == 1) { priv->gc.ngpio = 3; if (mode.sci == CP210X_PIN_MODE_MODEM) { /* mark all GPIOs of this interface as reserved */ priv->gpio_altfunc = 0xff; return 0; } iface_config = config.sci_cfg; priv->gpio_pushpull = (u8)((le16_to_cpu(config.gpio_mode) & CP210X_SCI_GPIO_MODE_MASK) >> CP210X_SCI_GPIO_MODE_OFFSET); } else { return -ENODEV; } /* mark all pins which are not in GPIO mode */ if (iface_config & CP2105_GPIO0_TXLED_MODE) /* GPIO 0 */ priv->gpio_altfunc |= BIT(0); if (iface_config & (CP2105_GPIO1_RXLED_MODE | /* GPIO 1 */ CP2105_GPIO1_RS485_MODE)) priv->gpio_altfunc |= BIT(1); /* driver implementation for CP2105 only supports outputs */ priv->gpio_input = 0; return 0; } static int cp2104_gpioconf_init(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); 
struct cp210x_single_port_config config; u8 iface_config; u8 gpio_latch; int result; u8 i; result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_GET_PORTCONFIG, &config, sizeof(config)); if (result < 0) return result; priv->gc.ngpio = 4; iface_config = config.device_cfg; priv->gpio_pushpull = (u8)((le16_to_cpu(config.gpio_mode) & CP210X_GPIO_MODE_MASK) >> CP210X_GPIO_MODE_OFFSET); gpio_latch = (u8)((le16_to_cpu(config.reset_state) & CP210X_GPIO_MODE_MASK) >> CP210X_GPIO_MODE_OFFSET); /* mark all pins which are not in GPIO mode */ if (iface_config & CP2104_GPIO0_TXLED_MODE) /* GPIO 0 */ priv->gpio_altfunc |= BIT(0); if (iface_config & CP2104_GPIO1_RXLED_MODE) /* GPIO 1 */ priv->gpio_altfunc |= BIT(1); if (iface_config & CP2104_GPIO2_RS485_MODE) /* GPIO 2 */ priv->gpio_altfunc |= BIT(2); /* * Like CP2102N, CP2104 has also no strict input and output pin * modes. * Do the same input mode emulation as CP2102N. */ for (i = 0; i < priv->gc.ngpio; ++i) { /* * Set direction to "input" iff pin is open-drain and reset * value is 1. */ if (!(priv->gpio_pushpull & BIT(i)) && (gpio_latch & BIT(i))) priv->gpio_input |= BIT(i); } return 0; } static int cp2108_gpio_init(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); struct cp210x_quad_port_config config; u16 gpio_latch; int result; u8 i; result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_GET_PORTCONFIG, &config, sizeof(config)); if (result < 0) return result; priv->gc.ngpio = 16; priv->gpio_pushpull = le16_to_cpu(config.reset_state.gpio_mode_pb1); gpio_latch = le16_to_cpu(config.reset_state.gpio_latch_pb1); /* * Mark all pins which are not in GPIO mode. * * Refer to table 9.1 "GPIO Mode alternate Functions" in the datasheet: * https://www.silabs.com/documents/public/data-sheets/cp2108-datasheet.pdf * * Alternate functions of GPIO0 to GPIO3 are determine by enhancedfxn_ifc[0] * and the similarly for the other pins; enhancedfxn_ifc[1]: GPIO4 to GPIO7, * enhancedfxn_ifc[2]: GPIO8 to GPIO11, enhancedfxn_ifc[3]: GPIO12 to GPIO15. */ for (i = 0; i < 4; i++) { if (config.enhancedfxn_ifc[i] & CP2108_EF_IFC_GPIO_TXLED) priv->gpio_altfunc |= BIT(i * 4); if (config.enhancedfxn_ifc[i] & CP2108_EF_IFC_GPIO_RXLED) priv->gpio_altfunc |= BIT((i * 4) + 1); if (config.enhancedfxn_ifc[i] & CP2108_EF_IFC_GPIO_RS485) priv->gpio_altfunc |= BIT((i * 4) + 2); if (config.enhancedfxn_ifc[i] & CP2108_EF_IFC_GPIO_CLOCK) priv->gpio_altfunc |= BIT((i * 4) + 3); } /* * Like CP2102N, CP2108 has also no strict input and output pin * modes. Do the same input mode emulation as CP2102N. */ for (i = 0; i < priv->gc.ngpio; ++i) { /* * Set direction to "input" iff pin is open-drain and reset * value is 1. */ if (!(priv->gpio_pushpull & BIT(i)) && (gpio_latch & BIT(i))) priv->gpio_input |= BIT(i); } return 0; } static int cp2102n_gpioconf_init(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); const u16 config_size = 0x02a6; u8 gpio_rst_latch; u8 config_version; u8 gpio_pushpull; u8 *config_buf; u8 gpio_latch; u8 gpio_ctrl; int result; u8 i; /* * Retrieve device configuration from the device. * The array received contains all customization settings done at the * factory/manufacturer. 
Format of the array is documented at the * time of writing at: * https://www.silabs.com/community/interface/knowledge-base.entry.html/2017/03/31/cp2102n_setconfig-xsfa */ config_buf = kmalloc(config_size, GFP_KERNEL); if (!config_buf) return -ENOMEM; result = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_READ_2NCONFIG, config_buf, config_size); if (result < 0) { kfree(config_buf); return result; } config_version = config_buf[CP210X_2NCONFIG_CONFIG_VERSION_IDX]; gpio_pushpull = config_buf[CP210X_2NCONFIG_GPIO_MODE_IDX]; gpio_ctrl = config_buf[CP210X_2NCONFIG_GPIO_CONTROL_IDX]; gpio_rst_latch = config_buf[CP210X_2NCONFIG_GPIO_RSTLATCH_IDX]; kfree(config_buf); /* Make sure this is a config format we understand. */ if (config_version != 0x01) return -ENOTSUPP; priv->gc.ngpio = 4; /* * Get default pin states after reset. Needed so we can determine * the direction of an open-drain pin. */ gpio_latch = (gpio_rst_latch >> 3) & 0x0f; /* 0 indicates open-drain mode, 1 is push-pull */ priv->gpio_pushpull = (gpio_pushpull >> 3) & 0x0f; /* 0 indicates GPIO mode, 1 is alternate function */ if (priv->partnum == CP210X_PARTNUM_CP2102N_QFN20) { /* QFN20 is special... */ if (gpio_ctrl & CP2102N_QFN20_GPIO0_CLK_MODE) /* GPIO 0 */ priv->gpio_altfunc |= BIT(0); if (gpio_ctrl & CP2102N_QFN20_GPIO1_RS485_MODE) /* GPIO 1 */ priv->gpio_altfunc |= BIT(1); if (gpio_ctrl & CP2102N_QFN20_GPIO2_TXLED_MODE) /* GPIO 2 */ priv->gpio_altfunc |= BIT(2); if (gpio_ctrl & CP2102N_QFN20_GPIO3_RXLED_MODE) /* GPIO 3 */ priv->gpio_altfunc |= BIT(3); } else { priv->gpio_altfunc = (gpio_ctrl >> 2) & 0x0f; } if (priv->partnum == CP210X_PARTNUM_CP2102N_QFN28) { /* * For the QFN28 package, GPIO4-6 are controlled by * the low three bits of the mode/latch fields. * Contrary to the document linked above, the bits for * the SUSPEND pins are elsewhere. No alternate * function is available for these pins. */ priv->gc.ngpio = 7; gpio_latch |= (gpio_rst_latch & 7) << 4; priv->gpio_pushpull |= (gpio_pushpull & 7) << 4; } /* * The CP2102N does not strictly has input and output pin modes, * it only knows open-drain and push-pull modes which is set at * factory. An open-drain pin can function both as an * input or an output. We emulate input mode for open-drain pins * by making sure they are not driven low, and we do not allow * push-pull pins to be set as an input. */ for (i = 0; i < priv->gc.ngpio; ++i) { /* * Set direction to "input" iff pin is open-drain and reset * value is 1. */ if (!(priv->gpio_pushpull & BIT(i)) && (gpio_latch & BIT(i))) priv->gpio_input |= BIT(i); } return 0; } static int cp210x_gpio_init(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); int result; switch (priv->partnum) { case CP210X_PARTNUM_CP2104: result = cp2104_gpioconf_init(serial); break; case CP210X_PARTNUM_CP2105: result = cp2105_gpioconf_init(serial); break; case CP210X_PARTNUM_CP2108: /* * The GPIOs are not tied to any specific port so only register * once for interface 0. 
*/ if (cp210x_interface_num(serial) != 0) return 0; result = cp2108_gpio_init(serial); break; case CP210X_PARTNUM_CP2102N_QFN28: case CP210X_PARTNUM_CP2102N_QFN24: case CP210X_PARTNUM_CP2102N_QFN20: result = cp2102n_gpioconf_init(serial); break; default: return 0; } if (result < 0) return result; priv->gc.label = "cp210x"; priv->gc.get_direction = cp210x_gpio_direction_get; priv->gc.direction_input = cp210x_gpio_direction_input; priv->gc.direction_output = cp210x_gpio_direction_output; priv->gc.get = cp210x_gpio_get; priv->gc.set = cp210x_gpio_set; priv->gc.set_config = cp210x_gpio_set_config; priv->gc.init_valid_mask = cp210x_gpio_init_valid_mask; priv->gc.owner = THIS_MODULE; priv->gc.parent = &serial->interface->dev; priv->gc.base = -1; priv->gc.can_sleep = true; result = gpiochip_add_data(&priv->gc, serial); if (!result) priv->gpio_registered = true; return result; } static void cp210x_gpio_remove(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); if (priv->gpio_registered) { gpiochip_remove(&priv->gc); priv->gpio_registered = false; } } #else static int cp210x_gpio_init(struct usb_serial *serial) { return 0; } static void cp210x_gpio_remove(struct usb_serial *serial) { /* Nothing to do */ } #endif static int cp210x_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; struct cp210x_port_private *port_priv; port_priv = kzalloc(sizeof(*port_priv), GFP_KERNEL); if (!port_priv) return -ENOMEM; port_priv->bInterfaceNumber = cp210x_interface_num(serial); mutex_init(&port_priv->mutex); usb_set_serial_port_data(port, port_priv); return 0; } static void cp210x_port_remove(struct usb_serial_port *port) { struct cp210x_port_private *port_priv; port_priv = usb_get_serial_port_data(port); kfree(port_priv); } static void cp210x_init_max_speed(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); bool use_actual_rate = false; speed_t min = 300; speed_t max; switch (priv->partnum) { case CP210X_PARTNUM_CP2101: max = 921600; break; case CP210X_PARTNUM_CP2102: case CP210X_PARTNUM_CP2103: max = 1000000; break; case CP210X_PARTNUM_CP2104: use_actual_rate = true; max = 2000000; break; case CP210X_PARTNUM_CP2108: max = 2000000; break; case CP210X_PARTNUM_CP2105: if (cp210x_interface_num(serial) == 0) { use_actual_rate = true; max = 2000000; /* ECI */ } else { min = 2400; max = 921600; /* SCI */ } break; case CP210X_PARTNUM_CP2102N_QFN28: case CP210X_PARTNUM_CP2102N_QFN24: case CP210X_PARTNUM_CP2102N_QFN20: use_actual_rate = true; max = 3000000; break; default: max = 2000000; break; } priv->min_speed = min; priv->max_speed = max; priv->use_actual_rate = use_actual_rate; } static void cp2102_determine_quirks(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); u8 *buf; int ret; buf = kmalloc(2, GFP_KERNEL); if (!buf) return; /* * Some (possibly counterfeit) CP2102 do not support event-insertion * mode and respond differently to malformed vendor requests. * Specifically, they return one instead of two bytes when sent a * two-byte part-number request. 
*/ ret = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0), CP210X_VENDOR_SPECIFIC, REQTYPE_DEVICE_TO_HOST, CP210X_GET_PARTNUM, 0, buf, 2, USB_CTRL_GET_TIMEOUT); if (ret == 1) { dev_dbg(&serial->interface->dev, "device does not support event-insertion mode\n"); priv->no_event_mode = true; } kfree(buf); } static int cp210x_get_fw_version(struct usb_serial *serial, u16 value) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); u8 ver[3]; int ret; ret = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, value, ver, sizeof(ver)); if (ret) return ret; dev_dbg(&serial->interface->dev, "%s - %d.%d.%d\n", __func__, ver[0], ver[1], ver[2]); priv->fw_version = ver[0] << 16 | ver[1] << 8 | ver[2]; return 0; } static void cp210x_determine_type(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); int ret; ret = cp210x_read_vendor_block(serial, REQTYPE_DEVICE_TO_HOST, CP210X_GET_PARTNUM, &priv->partnum, sizeof(priv->partnum)); if (ret < 0) { dev_warn(&serial->interface->dev, "querying part number failed\n"); priv->partnum = CP210X_PARTNUM_UNKNOWN; return; } dev_dbg(&serial->interface->dev, "partnum = 0x%02x\n", priv->partnum); switch (priv->partnum) { case CP210X_PARTNUM_CP2102: cp2102_determine_quirks(serial); break; case CP210X_PARTNUM_CP2105: case CP210X_PARTNUM_CP2108: cp210x_get_fw_version(serial, CP210X_GET_FW_VER); break; case CP210X_PARTNUM_CP2102N_QFN28: case CP210X_PARTNUM_CP2102N_QFN24: case CP210X_PARTNUM_CP2102N_QFN20: ret = cp210x_get_fw_version(serial, CP210X_GET_FW_VER_2N); if (ret) break; if (priv->fw_version <= 0x10004) priv->no_flow_control = true; break; default: break; } } static int cp210x_attach(struct usb_serial *serial) { int result; struct cp210x_serial_private *priv; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) return -ENOMEM; usb_set_serial_data(serial, priv); cp210x_determine_type(serial); cp210x_init_max_speed(serial); result = cp210x_gpio_init(serial); if (result < 0) { dev_err(&serial->interface->dev, "GPIO initialisation failed: %d\n", result); } return 0; } static void cp210x_disconnect(struct usb_serial *serial) { cp210x_gpio_remove(serial); } static void cp210x_release(struct usb_serial *serial) { struct cp210x_serial_private *priv = usb_get_serial_data(serial); cp210x_gpio_remove(serial); kfree(priv); } module_usb_serial_driver(serial_drivers, id_table); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL v2");
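The cp2102n_gpioconf_init() path above is mostly bit surgery on three bytes pulled out of the CP2102N configuration blob: push-pull mask, reset latch and alternate-function mask each live in a nibble, the QFN28 package grows the GPIO count to 7 using the low three bits, and "input" direction is emulated for open-drain pins whose reset value is 1. As a reading aid, here is a minimal standalone sketch (not driver code) of those same shifts and masks; the sample byte values and the toy main() are invented for illustration, only the decoding mirrors the function above.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical bytes as read from the CP210X_READ_2NCONFIG blob */
	uint8_t gpio_pushpull  = 0x58;	/* bits 3..6: push-pull vs open-drain */
	uint8_t gpio_rst_latch = 0x78;	/* bits 3..6: latch value after reset */
	uint8_t gpio_ctrl      = 0x24;	/* bits 2..5: alternate-function mask */
	int qfn28 = 1;			/* pretend this is the QFN28 package */

	uint8_t pushpull = (gpio_pushpull >> 3) & 0x0f;	/* 1 = push-pull  */
	uint8_t latch    = (gpio_rst_latch >> 3) & 0x0f;	/* reset latch    */
	uint8_t altfunc  = (gpio_ctrl >> 2) & 0x0f;		/* 1 = not a GPIO */
	unsigned int ngpio = 4;

	if (qfn28) {
		/* GPIO4-6 live in the low three bits of the same fields */
		ngpio = 7;
		latch    |= (gpio_rst_latch & 7) << 4;
		pushpull |= (gpio_pushpull & 7) << 4;
	}

	/* Emulated "input" direction: open-drain pin whose reset value is 1 */
	uint8_t input = 0;
	for (unsigned int i = 0; i < ngpio; i++) {
		if (!(pushpull & (1u << i)) && (latch & (1u << i)))
			input |= 1u << i;
	}

	printf("pushpull=0x%02x latch=0x%02x altfunc=0x%02x input=0x%02x\n",
	       pushpull, latch, altfunc, input);
	return 0;
}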
// SPDX-License-Identifier: GPL-2.0-only /* Kernel module to match ROUTING parameters. */ /* (C) 2001-2002 Andras Kis-Szabo <kisza@sch.bme.hu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/checksum.h> #include <net/ipv6.h> #include <asm/byteorder.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6_tables.h> #include <linux/netfilter_ipv6/ip6t_rt.h> MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Routing Header match"); MODULE_AUTHOR("Andras Kis-Szabo <kisza@sch.bme.hu>"); /* Returns 1 if the id is matched by the range, 0 otherwise */ static inline bool segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, bool invert) { return (id >= min && id <= max) ^ invert; } static bool rt_mt6(const struct sk_buff *skb, struct xt_action_param *par) { struct ipv6_rt_hdr _route; const struct ipv6_rt_hdr *rh; const struct ip6t_rt *rtinfo = par->matchinfo; unsigned int temp; unsigned int ptr = 0; unsigned int hdrlen = 0; bool ret = false; struct in6_addr _addr; const struct in6_addr *ap; int err; err = ipv6_find_hdr(skb, &ptr, NEXTHDR_ROUTING, NULL, NULL); if (err < 0) { if (err != -ENOENT) par->hotdrop = true; return false; } rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); if (rh == NULL) { par->hotdrop = true; return false; } hdrlen = ipv6_optlen(rh); if (skb->len - ptr < hdrlen) { /* Packet smaller than its length field */ return false; } ret = (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], rh->segments_left, !!(rtinfo->invflags & IP6T_RT_INV_SGS))) && (!(rtinfo->flags & IP6T_RT_LEN) || ((rtinfo->hdrlen == hdrlen) ^ !!(rtinfo->invflags & IP6T_RT_INV_LEN))) && (!(rtinfo->flags & IP6T_RT_TYP) || ((rtinfo->rt_type == rh->type) ^ !!(rtinfo->invflags & IP6T_RT_INV_TYP))); if (ret && (rtinfo->flags & IP6T_RT_RES)) { const u_int32_t *rp; u_int32_t _reserved; rp = skb_header_pointer(skb, ptr + offsetof(struct rt0_hdr, reserved), sizeof(_reserved), &_reserved); if (!rp) { par->hotdrop = true; return false; } ret = (*rp == 0); } if (!(rtinfo->flags & IP6T_RT_FST)) { return ret; } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { return false; } else { unsigned int i = 0; for (temp = 0; temp < (unsigned int)((hdrlen - 8) / 16); temp++) { ap = skb_header_pointer(skb, ptr + sizeof(struct rt0_hdr) + temp * sizeof(_addr), sizeof(_addr), &_addr); if (ap == NULL) { par->hotdrop = true; return false; } if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) i++; if (i == rtinfo->addrnr) break; } if (i == rtinfo->addrnr) return ret; else return false; } } else { if (rtinfo->addrnr > (unsigned int)((hdrlen - 8) / 16)) { return false; } else { for (temp = 0; temp < rtinfo->addrnr; temp++) { ap = skb_header_pointer(skb, ptr + sizeof(struct rt0_hdr) + 
temp * sizeof(_addr), sizeof(_addr), &_addr); if (ap == NULL) { par->hotdrop = true; return false; } if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) break; } if (temp == rtinfo->addrnr && temp == (unsigned int)((hdrlen - 8) / 16)) return ret; else return false; } } return false; } static int rt_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_rt *rtinfo = par->matchinfo; if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { pr_debug("unknown flags %X\n", rtinfo->invflags); return -EINVAL; } if ((rtinfo->flags & (IP6T_RT_RES | IP6T_RT_FST_MASK)) && (!(rtinfo->flags & IP6T_RT_TYP) || (rtinfo->rt_type != 0) || (rtinfo->invflags & IP6T_RT_INV_TYP))) { pr_debug("`--rt-type 0' required before `--rt-0-*'"); return -EINVAL; } return 0; } static struct xt_match rt_mt6_reg __read_mostly = { .name = "rt", .family = NFPROTO_IPV6, .match = rt_mt6, .matchsize = sizeof(struct ip6t_rt), .checkentry = rt_mt6_check, .me = THIS_MODULE, }; static int __init rt_mt6_init(void) { return xt_register_match(&rt_mt6_reg); } static void __exit rt_mt6_exit(void) { xt_unregister_match(&rt_mt6_reg); } module_init(rt_mt6_init); module_exit(rt_mt6_exit);
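A detail worth calling out in rt_mt6() above is how each optional inversion flag is folded into a single expression by XOR-ing the raw comparison with the invert bit, so one line covers both "--rt-segsleft" and "! --rt-segsleft". The short standalone sketch below (test values invented) just walks the truth table of segsleft_match().

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool segsleft_match(uint32_t min, uint32_t max, uint32_t id, bool invert)
{
	/* same expression as the match above: range test XOR invert flag */
	return (id >= min && id <= max) ^ invert;
}

int main(void)
{
	assert(segsleft_match(0, 5, 3, false));   /* in range, not inverted  */
	assert(!segsleft_match(0, 5, 9, false));  /* out of range            */
	assert(!segsleft_match(0, 5, 3, true));   /* in range, inverted      */
	assert(segsleft_match(0, 5, 9, true));    /* out of range, inverted  */
	printf("all segsleft_match cases behave as expected\n");
	return 0;
}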
// SPDX-License-Identifier: GPL-2.0 /* * Wakeup statistics in sysfs * * Copyright (c) 2019 Linux Foundation * Copyright (c) 2019 Greg Kroah-Hartman <gregkh@linuxfoundation.org> * Copyright (c) 2019 Google Inc. */ #include <linux/device.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/kobject.h> #include <linux/slab.h> #include <linux/timekeeping.h> #include "power.h" static struct class *wakeup_class; #define wakeup_attr(_name) \ static ssize_t _name##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ { \ struct wakeup_source *ws = dev_get_drvdata(dev); \ \ return sysfs_emit(buf, "%lu\n", ws->_name); \ } \ static DEVICE_ATTR_RO(_name) wakeup_attr(active_count); wakeup_attr(event_count); wakeup_attr(wakeup_count); wakeup_attr(expire_count); static ssize_t active_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time = ws->active ? 
ktime_sub(ktime_get(), ws->last_time) : 0; return sysfs_emit(buf, "%lld\n", ktime_to_ms(active_time)); } static DEVICE_ATTR_RO(active_time_ms); static ssize_t total_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t total_time = ws->total_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); total_time = ktime_add(total_time, active_time); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(total_time)); } static DEVICE_ATTR_RO(total_time_ms); static ssize_t max_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t active_time; ktime_t max_time = ws->max_time; if (ws->active) { active_time = ktime_sub(ktime_get(), ws->last_time); if (active_time > max_time) max_time = active_time; } return sysfs_emit(buf, "%lld\n", ktime_to_ms(max_time)); } static DEVICE_ATTR_RO(max_time_ms); static ssize_t last_change_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%lld\n", ktime_to_ms(ws->last_time)); } static DEVICE_ATTR_RO(last_change_ms); static ssize_t name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); return sysfs_emit(buf, "%s\n", ws->name); } static DEVICE_ATTR_RO(name); static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) { struct wakeup_source *ws = dev_get_drvdata(dev); ktime_t prevent_sleep_time = ws->prevent_sleep_time; if (ws->active && ws->autosleep_enabled) { prevent_sleep_time = ktime_add(prevent_sleep_time, ktime_sub(ktime_get(), ws->start_prevent_time)); } return sysfs_emit(buf, "%lld\n", ktime_to_ms(prevent_sleep_time)); } static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { &dev_attr_name.attr, &dev_attr_active_count.attr, &dev_attr_event_count.attr, &dev_attr_wakeup_count.attr, &dev_attr_expire_count.attr, &dev_attr_active_time_ms.attr, &dev_attr_total_time_ms.attr, &dev_attr_max_time_ms.attr, &dev_attr_last_change_ms.attr, &dev_attr_prevent_suspend_time_ms.attr, NULL, }; ATTRIBUTE_GROUPS(wakeup_source); static void device_create_release(struct device *dev) { kfree(dev); } static struct device *wakeup_source_device_create(struct device *parent, struct wakeup_source *ws) { struct device *dev = NULL; int retval; dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) { retval = -ENOMEM; goto error; } device_initialize(dev); dev->devt = MKDEV(0, 0); dev->class = wakeup_class; dev->parent = parent; dev->groups = wakeup_source_groups; dev->release = device_create_release; dev_set_drvdata(dev, ws); device_set_pm_not_required(dev); retval = dev_set_name(dev, "wakeup%d", ws->id); if (retval) goto error; retval = device_add(dev); if (retval) goto error; return dev; error: put_device(dev); return ERR_PTR(retval); } /** * wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs. * @parent: Device given wakeup source is associated with (or NULL if virtual). * @ws: Wakeup source to be added in sysfs. */ int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws) { struct device *dev; dev = wakeup_source_device_create(parent, ws); if (IS_ERR(dev)) return PTR_ERR(dev); ws->dev = dev; return 0; } /** * pm_wakeup_source_sysfs_add - Add wakeup_source attributes to sysfs * for a device if they're missing. 
* @parent: Device given wakeup source is associated with */ int pm_wakeup_source_sysfs_add(struct device *parent) { if (!parent->power.wakeup || parent->power.wakeup->dev) return 0; return wakeup_source_sysfs_add(parent, parent->power.wakeup); } /** * wakeup_source_sysfs_remove - Remove wakeup_source attributes from sysfs. * @ws: Wakeup source to be removed from sysfs. */ void wakeup_source_sysfs_remove(struct wakeup_source *ws) { device_unregister(ws->dev); } static int __init wakeup_sources_sysfs_init(void) { wakeup_class = class_create("wakeup"); return PTR_ERR_OR_ZERO(wakeup_class); } postcore_initcall(wakeup_sources_sysfs_init);
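total_time_ms_show() and max_time_ms_show() above share one accounting rule: if the wakeup source is currently active, the interval accumulated since last_time is folded in before the value is reported, so readers never see a stale total. Below is a simplified sketch of that rule only, with int64_t nanoseconds standing in for ktime_t and a toy struct instead of struct wakeup_source.

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct toy_wakeup_source {
	bool active;
	int64_t last_time;	/* when the source last became active, in ns      */
	int64_t total_time;	/* time accumulated by earlier activations, in ns */
};

/* Mirror of the rule in total_time_ms_show(): fold in the running interval */
static int64_t report_total_ms(const struct toy_wakeup_source *ws, int64_t now)
{
	int64_t total = ws->total_time;

	if (ws->active)
		total += now - ws->last_time;	/* the in-progress activation */

	return total / 1000000;	/* ns -> ms, as ktime_to_ms() does */
}

int main(void)
{
	struct toy_wakeup_source ws = {
		.active = true,
		.last_time = 1000000000,	/* became active at t = 1.0 s      */
		.total_time = 250000000,	/* 250 ms from earlier activations */
	};

	/* Reading the attribute at t = 1.5 s reports 250 + 500 = 750 ms */
	printf("total_time_ms = %" PRId64 "\n", report_total_ms(&ws, 1500000000));
	return 0;
}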
// SPDX-License-Identifier: GPL-2.0 /****************************************************************************** * rtl871x_sta_mgt.c * * Copyright(c) 2007 - 2010 Realtek Corporation. All rights reserved. * Linux device driver for RTL8192SU * * Modifications for inclusion into the Linux staging tree are * Copyright(c) 2010 Larry Finger. All rights reserved. * * Contact information: * WLAN FAE <wlanfae@realtek.com> * Larry Finger <Larry.Finger@lwfinger.net> * ******************************************************************************/ #define _RTL871X_STA_MGT_C_ #include "osdep_service.h" #include "drv_types.h" #include "recv_osdep.h" #include "xmit_osdep.h" #include "sta_info.h" static void _init_stainfo(struct sta_info *psta) { memset((u8 *)psta, 0, sizeof(struct sta_info)); spin_lock_init(&psta->lock); INIT_LIST_HEAD(&psta->list); INIT_LIST_HEAD(&psta->hash_list); _r8712_init_sta_xmit_priv(&psta->sta_xmitpriv); _r8712_init_sta_recv_priv(&psta->sta_recvpriv); INIT_LIST_HEAD(&psta->asoc_list); INIT_LIST_HEAD(&psta->auth_list); } int _r8712_init_sta_priv(struct sta_priv *pstapriv) { struct sta_info *psta; s32 i; pstapriv->pallocated_stainfo_buf = kmalloc(sizeof(struct sta_info) * NUM_STA + 4, GFP_ATOMIC); if (!pstapriv->pallocated_stainfo_buf) return -ENOMEM; pstapriv->pstainfo_buf = pstapriv->pallocated_stainfo_buf + 4 - ((addr_t)(pstapriv->pallocated_stainfo_buf) & 3); _init_queue(&pstapriv->free_sta_queue); spin_lock_init(&pstapriv->sta_hash_lock); pstapriv->asoc_sta_count = 0; _init_queue(&pstapriv->sleep_q); _init_queue(&pstapriv->wakeup_q); psta = (struct sta_info *)(pstapriv->pstainfo_buf); for (i = 0; i < NUM_STA; i++) { _init_stainfo(psta); INIT_LIST_HEAD(&(pstapriv->sta_hash[i])); list_add_tail(&psta->list, &pstapriv->free_sta_queue.queue); psta++; } INIT_LIST_HEAD(&pstapriv->asoc_list); INIT_LIST_HEAD(&pstapriv->auth_list); return 0; } /* this function is used to free the memory of lock || sema for all stainfos */ static void mfree_all_stainfo(struct sta_priv *pstapriv) { unsigned long irqL; struct list_head *plist, *phead; spin_lock_irqsave(&pstapriv->sta_hash_lock, irqL); phead = &pstapriv->free_sta_queue.queue; plist = phead->next; while (!end_of_queue_search(phead, plist)) plist = plist->next; spin_unlock_irqrestore(&pstapriv->sta_hash_lock, irqL); } void _r8712_free_sta_priv(struct sta_priv *pstapriv) { if (pstapriv) { /* be done before free sta_hash_lock */ mfree_all_stainfo(pstapriv); kfree(pstapriv->pallocated_stainfo_buf); } } struct sta_info *r8712_alloc_stainfo(struct sta_priv *pstapriv, u8 *hwaddr) { s32 index; 
struct list_head *phash_list; struct sta_info *psta; struct __queue *pfree_sta_queue; struct recv_reorder_ctrl *preorder_ctrl; int i = 0; u16 wRxSeqInitialValue = 0xffff; unsigned long flags; pfree_sta_queue = &pstapriv->free_sta_queue; spin_lock_irqsave(&pfree_sta_queue->lock, flags); psta = list_first_entry_or_null(&pfree_sta_queue->queue, struct sta_info, list); if (psta) { list_del_init(&psta->list); _init_stainfo(psta); memcpy(psta->hwaddr, hwaddr, ETH_ALEN); index = wifi_mac_hash(hwaddr); if (index >= NUM_STA) { psta = NULL; goto exit; } phash_list = &pstapriv->sta_hash[index]; list_add_tail(&psta->hash_list, phash_list); pstapriv->asoc_sta_count++; /* For the SMC router, the sequence number of first packet of WPS handshake * will be 0. In this case, this packet will be dropped by recv_decache function * if we use the 0x00 as the default value for tid_rxseq variable. So, we * initialize the tid_rxseq variable as the 0xffff. */ for (i = 0; i < 16; i++) memcpy(&psta->sta_recvpriv.rxcache.tid_rxseq[i], &wRxSeqInitialValue, 2); /* for A-MPDU Rx reordering buffer control */ for (i = 0; i < 16; i++) { preorder_ctrl = &psta->recvreorder_ctrl[i]; preorder_ctrl->padapter = pstapriv->padapter; preorder_ctrl->indicate_seq = 0xffff; preorder_ctrl->wend_b = 0xffff; preorder_ctrl->wsize_b = 64; _init_queue(&preorder_ctrl->pending_recvframe_queue); r8712_init_recv_timer(preorder_ctrl); } } exit: spin_unlock_irqrestore(&pfree_sta_queue->lock, flags); return psta; } /* using pstapriv->sta_hash_lock to protect */ void r8712_free_stainfo(struct _adapter *padapter, struct sta_info *psta) { int i; unsigned long irqL0; struct __queue *pfree_sta_queue; struct recv_reorder_ctrl *preorder_ctrl; struct sta_xmit_priv *pstaxmitpriv; struct xmit_priv *pxmitpriv = &padapter->xmitpriv; struct sta_priv *pstapriv = &padapter->stapriv; if (!psta) return; pfree_sta_queue = &pstapriv->free_sta_queue; pstaxmitpriv = &psta->sta_xmitpriv; spin_lock_irqsave(&(pxmitpriv->vo_pending.lock), irqL0); r8712_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->vo_q.sta_pending); list_del_init(&(pstaxmitpriv->vo_q.tx_pending)); spin_unlock_irqrestore(&(pxmitpriv->vo_pending.lock), irqL0); spin_lock_irqsave(&(pxmitpriv->vi_pending.lock), irqL0); r8712_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->vi_q.sta_pending); list_del_init(&(pstaxmitpriv->vi_q.tx_pending)); spin_unlock_irqrestore(&(pxmitpriv->vi_pending.lock), irqL0); spin_lock_irqsave(&(pxmitpriv->bk_pending.lock), irqL0); r8712_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->bk_q.sta_pending); list_del_init(&(pstaxmitpriv->bk_q.tx_pending)); spin_unlock_irqrestore(&(pxmitpriv->bk_pending.lock), irqL0); spin_lock_irqsave(&(pxmitpriv->be_pending.lock), irqL0); r8712_free_xmitframe_queue(pxmitpriv, &pstaxmitpriv->be_q.sta_pending); list_del_init(&(pstaxmitpriv->be_q.tx_pending)); spin_unlock_irqrestore(&(pxmitpriv->be_pending.lock), irqL0); list_del_init(&psta->hash_list); pstapriv->asoc_sta_count--; /* re-init sta_info; 20061114 */ _r8712_init_sta_xmit_priv(&psta->sta_xmitpriv); _r8712_init_sta_recv_priv(&psta->sta_recvpriv); /* for A-MPDU Rx reordering buffer control, * cancel reordering_ctrl_timer */ for (i = 0; i < 16; i++) { preorder_ctrl = &psta->recvreorder_ctrl[i]; del_timer(&preorder_ctrl->reordering_ctrl_timer); } spin_lock(&(pfree_sta_queue->lock)); /* insert into free_sta_queue; 20061114 */ list_add_tail(&psta->list, &pfree_sta_queue->queue); spin_unlock(&(pfree_sta_queue->lock)); } /* free all stainfo which in sta_hash[all] */ void r8712_free_all_stainfo(struct _adapter 
*padapter) { unsigned long irqL; struct list_head *plist, *phead; s32 index; struct sta_info *psta = NULL; struct sta_priv *pstapriv = &padapter->stapriv; struct sta_info *pbcmc_stainfo = r8712_get_bcmc_stainfo(padapter); if (pstapriv->asoc_sta_count == 1) return; spin_lock_irqsave(&pstapriv->sta_hash_lock, irqL); for (index = 0; index < NUM_STA; index++) { phead = &(pstapriv->sta_hash[index]); plist = phead->next; while (!end_of_queue_search(phead, plist)) { psta = container_of(plist, struct sta_info, hash_list); plist = plist->next; if (pbcmc_stainfo != psta) r8712_free_stainfo(padapter, psta); } } spin_unlock_irqrestore(&pstapriv->sta_hash_lock, irqL); } /* any station allocated can be searched by hash list */ struct sta_info *r8712_get_stainfo(struct sta_priv *pstapriv, u8 *hwaddr) { unsigned long irqL; struct list_head *plist, *phead; struct sta_info *psta = NULL; u32 index; if (!hwaddr) return NULL; index = wifi_mac_hash(hwaddr); spin_lock_irqsave(&pstapriv->sta_hash_lock, irqL); phead = &(pstapriv->sta_hash[index]); plist = phead->next; while (!end_of_queue_search(phead, plist)) { psta = container_of(plist, struct sta_info, hash_list); if ((!memcmp(psta->hwaddr, hwaddr, ETH_ALEN))) { /* if found the matched address */ break; } psta = NULL; plist = plist->next; } spin_unlock_irqrestore(&pstapriv->sta_hash_lock, irqL); return psta; } void r8712_init_bcmc_stainfo(struct _adapter *padapter) { unsigned char bcast_addr[6] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; struct sta_priv *pstapriv = &padapter->stapriv; r8712_alloc_stainfo(pstapriv, bcast_addr); } struct sta_info *r8712_get_bcmc_stainfo(struct _adapter *padapter) { struct sta_priv *pstapriv = &padapter->stapriv; u8 bc_addr[ETH_ALEN] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; return r8712_get_stainfo(pstapriv, bc_addr); } u8 r8712_access_ctrl(struct wlan_acl_pool *pacl_list, u8 *mac_addr) { return true; }
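r8712_get_stainfo() above is a textbook hashed lookup: wifi_mac_hash() picks a bucket in sta_hash[], and that bucket's list is walked comparing the 6-byte address with memcmp(). The standalone sketch below mirrors only that shape; the XOR-fold hash, the singly linked buckets and all toy_* names are invented stand-ins, not the driver's implementation.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6
#define NUM_STA  32

struct toy_sta {
	uint8_t hwaddr[ETH_ALEN];
	struct toy_sta *next;	/* next entry in the same hash bucket */
};

static struct toy_sta *sta_hash[NUM_STA];

/* Illustrative hash: XOR-fold the address into a bucket index */
static unsigned int toy_mac_hash(const uint8_t *mac)
{
	unsigned int h = 0;

	for (int i = 0; i < ETH_ALEN; i++)
		h ^= mac[i];
	return h % NUM_STA;
}

static void toy_add_sta(struct toy_sta *sta)
{
	unsigned int idx = toy_mac_hash(sta->hwaddr);

	sta->next = sta_hash[idx];	/* push onto the bucket's list */
	sta_hash[idx] = sta;
}

static struct toy_sta *toy_get_sta(const uint8_t *hwaddr)
{
	/* walk only the bucket the address hashes to, like r8712_get_stainfo() */
	for (struct toy_sta *s = sta_hash[toy_mac_hash(hwaddr)]; s; s = s->next) {
		if (!memcmp(s->hwaddr, hwaddr, ETH_ALEN))
			return s;
	}
	return NULL;
}

int main(void)
{
	struct toy_sta sta = { .hwaddr = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 } };
	uint8_t lookup[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	toy_add_sta(&sta);
	printf("lookup %s\n", toy_get_sta(lookup) ? "found" : "missed");
	return 0;
}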
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* audit.h -- Auditing support * * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. * All Rights Reserved. * * Written by Rickard E. 
(Rik) Faith <faith@redhat.com> */ #ifndef _LINUX_AUDIT_H_ #define _LINUX_AUDIT_H_ #include <linux/sched.h> #include <linux/ptrace.h> #include <linux/audit_arch.h> #include <uapi/linux/audit.h> #include <uapi/linux/netfilter/nf_tables.h> #include <uapi/linux/fanotify.h> #define AUDIT_INO_UNSET ((unsigned long)-1) #define AUDIT_DEV_UNSET ((dev_t)-1) struct audit_sig_info { uid_t uid; pid_t pid; char ctx[]; }; struct audit_buffer; struct audit_context; struct inode; struct netlink_skb_parms; struct path; struct linux_binprm; struct mq_attr; struct mqstat; struct audit_watch; struct audit_tree; struct sk_buff; struct kern_ipc_perm; struct audit_krule { u32 pflags; u32 flags; u32 listnr; u32 action; u32 mask[AUDIT_BITMASK_SIZE]; u32 buflen; /* for data alloc on list rules */ u32 field_count; char *filterkey; /* ties events to rules */ struct audit_field *fields; struct audit_field *arch_f; /* quick access to arch field */ struct audit_field *inode_f; /* quick access to an inode field */ struct audit_watch *watch; /* associated watch */ struct audit_tree *tree; /* associated watched tree */ struct audit_fsnotify_mark *exe; struct list_head rlist; /* entry in audit_{watch,tree}.rules list */ struct list_head list; /* for AUDIT_LIST* purposes only */ u64 prio; }; /* Flag to indicate legacy AUDIT_LOGINUID unset usage */ #define AUDIT_LOGINUID_LEGACY 0x1 struct audit_field { u32 type; union { u32 val; kuid_t uid; kgid_t gid; struct { char *lsm_str; void *lsm_rule; }; }; u32 op; }; enum audit_ntp_type { AUDIT_NTP_OFFSET, AUDIT_NTP_FREQ, AUDIT_NTP_STATUS, AUDIT_NTP_TAI, AUDIT_NTP_TICK, AUDIT_NTP_ADJUST, AUDIT_NTP_NVALS /* count */ }; #ifdef CONFIG_AUDITSYSCALL struct audit_ntp_val { long long oldval, newval; }; struct audit_ntp_data { struct audit_ntp_val vals[AUDIT_NTP_NVALS]; }; #else struct audit_ntp_data {}; #endif enum audit_nfcfgop { AUDIT_XT_OP_REGISTER, AUDIT_XT_OP_REPLACE, AUDIT_XT_OP_UNREGISTER, AUDIT_NFT_OP_TABLE_REGISTER, AUDIT_NFT_OP_TABLE_UNREGISTER, AUDIT_NFT_OP_CHAIN_REGISTER, AUDIT_NFT_OP_CHAIN_UNREGISTER, AUDIT_NFT_OP_RULE_REGISTER, AUDIT_NFT_OP_RULE_UNREGISTER, AUDIT_NFT_OP_SET_REGISTER, AUDIT_NFT_OP_SET_UNREGISTER, AUDIT_NFT_OP_SETELEM_REGISTER, AUDIT_NFT_OP_SETELEM_UNREGISTER, AUDIT_NFT_OP_GEN_REGISTER, AUDIT_NFT_OP_OBJ_REGISTER, AUDIT_NFT_OP_OBJ_UNREGISTER, AUDIT_NFT_OP_OBJ_RESET, AUDIT_NFT_OP_FLOWTABLE_REGISTER, AUDIT_NFT_OP_FLOWTABLE_UNREGISTER, AUDIT_NFT_OP_SETELEM_RESET, AUDIT_NFT_OP_RULE_RESET, AUDIT_NFT_OP_INVALID, }; extern int __init audit_register_class(int class, unsigned *list); extern int audit_classify_syscall(int abi, unsigned syscall); extern int audit_classify_arch(int arch); /* only for compat system calls */ extern unsigned compat_write_class[]; extern unsigned compat_read_class[]; extern unsigned compat_dir_class[]; extern unsigned compat_chattr_class[]; extern unsigned compat_signal_class[]; /* audit_names->type values */ #define AUDIT_TYPE_UNKNOWN 0 /* we don't know yet */ #define AUDIT_TYPE_NORMAL 1 /* a "normal" audit record */ #define AUDIT_TYPE_PARENT 2 /* a parent audit record */ #define AUDIT_TYPE_CHILD_DELETE 3 /* a child being deleted */ #define AUDIT_TYPE_CHILD_CREATE 4 /* a child being created */ /* maximized args number that audit_socketcall can process */ #define AUDITSC_ARGS 6 /* bit values for ->signal->audit_tty */ #define AUDIT_TTY_ENABLE BIT(0) #define AUDIT_TTY_LOG_PASSWD BIT(1) struct filename; #define AUDIT_OFF 0 #define AUDIT_ON 1 #define AUDIT_LOCKED 2 #ifdef CONFIG_AUDIT /* These are defined in audit.c */ /* Public API */ extern 
__printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...); extern struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type); extern __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...); extern void audit_log_end(struct audit_buffer *ab); extern bool audit_string_contains_control(const char *string, size_t len); extern void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len); extern void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n); extern void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n); extern void audit_log_untrustedstring(struct audit_buffer *ab, const char *string); extern void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path); extern void audit_log_key(struct audit_buffer *ab, char *key); extern void audit_log_path_denied(int type, const char *operation); extern void audit_log_lost(const char *message); extern int audit_log_task_context(struct audit_buffer *ab); extern void audit_log_task_info(struct audit_buffer *ab); extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return tsk->loginuid; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return tsk->sessionid; } extern u32 audit_enabled; extern int audit_signal_info(int sig, struct task_struct *t); #else /* CONFIG_AUDIT */ static inline __printf(4, 5) void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type, const char *fmt, ...) { } static inline struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { return NULL; } static inline __printf(2, 3) void audit_log_format(struct audit_buffer *ab, const char *fmt, ...) 
{ } static inline void audit_log_end(struct audit_buffer *ab) { } static inline void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf, size_t len) { } static inline void audit_log_n_string(struct audit_buffer *ab, const char *buf, size_t n) { } static inline void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string, size_t n) { } static inline void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) { } static inline void audit_log_d_path(struct audit_buffer *ab, const char *prefix, const struct path *path) { } static inline void audit_log_key(struct audit_buffer *ab, char *key) { } static inline void audit_log_path_denied(int type, const char *operation) { } static inline int audit_log_task_context(struct audit_buffer *ab) { return 0; } static inline void audit_log_task_info(struct audit_buffer *ab) { } static inline kuid_t audit_get_loginuid(struct task_struct *tsk) { return INVALID_UID; } static inline unsigned int audit_get_sessionid(struct task_struct *tsk) { return AUDIT_SID_UNSET; } #define audit_enabled AUDIT_OFF static inline int audit_signal_info(int sig, struct task_struct *t) { return 0; } #endif /* CONFIG_AUDIT */ #ifdef CONFIG_AUDIT_COMPAT_GENERIC #define audit_is_compat(arch) (!((arch) & __AUDIT_ARCH_64BIT)) #else #define audit_is_compat(arch) false #endif #define AUDIT_INODE_PARENT 1 /* dentry represents the parent */ #define AUDIT_INODE_HIDDEN 2 /* audit record should be hidden */ #define AUDIT_INODE_NOEVAL 4 /* audit record incomplete */ #ifdef CONFIG_AUDITSYSCALL #include <asm/syscall.h> /* for syscall_get_arch() */ /* These are defined in auditsc.c */ /* Public API */ extern int audit_alloc(struct task_struct *task); extern void __audit_free(struct task_struct *task); extern void __audit_uring_entry(u8 op); extern void __audit_uring_exit(int success, long code); extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3); extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); extern void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type); extern void audit_seccomp(unsigned long syscall, long signr, int code); extern void audit_seccomp_actions_logged(const char *names, const char *old_names, int res); extern void __audit_ptrace(struct task_struct *t); static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { task->audit_context = ctx; } static inline struct audit_context *audit_context(void) { return current->audit_context; } static inline bool audit_dummy_context(void) { void *p = audit_context(); return !p || *(int *)p; } static inline void audit_free(struct task_struct *task) { if (unlikely(task->audit_context)) __audit_free(task); } static inline void audit_uring_entry(u8 op) { /* * We intentionally check audit_context() before audit_enabled as most * Linux systems (as of ~2021) rely on systemd which forces audit to * be enabled regardless of the user's audit configuration. 
*/ if (unlikely(audit_context() && audit_enabled)) __audit_uring_entry(op); } static inline void audit_uring_exit(int success, long code) { if (unlikely(audit_context())) __audit_uring_exit(success, code); } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { if (unlikely(audit_context())) __audit_syscall_entry(major, a0, a1, a2, a3); } static inline void audit_syscall_exit(void *pt_regs) { if (unlikely(audit_context())) { int success = is_syscall_success(pt_regs); long return_code = regs_return_value(pt_regs); __audit_syscall_exit(success, return_code); } } static inline struct filename *audit_reusename(const __user char *name) { if (unlikely(!audit_dummy_context())) return __audit_reusename(name); return NULL; } static inline void audit_getname(struct filename *name) { if (unlikely(!audit_dummy_context())) __audit_getname(name); } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, aflags); } static inline void audit_file(struct file *file) { if (unlikely(!audit_dummy_context())) __audit_file(file); } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { if (unlikely(!audit_dummy_context())) __audit_inode(name, dentry, AUDIT_INODE_PARENT | AUDIT_INODE_HIDDEN); } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { if (unlikely(!audit_dummy_context())) __audit_inode_child(parent, dentry, type); } void audit_core_dumps(long signr); static inline void audit_ptrace(struct task_struct *t) { if (unlikely(!audit_dummy_context())) __audit_ptrace(t); } /* Private API (for audit.c only) */ extern void __audit_ipc_obj(struct kern_ipc_perm *ipcp); extern void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode); extern void __audit_bprm(struct linux_binprm *bprm); extern int __audit_socketcall(int nargs, unsigned long *args); extern int __audit_sockaddr(int len, void *addr); extern void __audit_fd_pair(int fd1, int fd2); extern void __audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr); extern void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout); extern void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification); extern void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat); extern int __audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old); extern void __audit_log_capset(const struct cred *new, const struct cred *old); extern void __audit_mmap_fd(int fd, int flags); extern void __audit_openat2_how(struct open_how *how); extern void __audit_log_kern_module(char *name); extern void __audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar); extern void __audit_tk_injoffset(struct timespec64 offset); extern void __audit_ntp_log(const struct audit_ntp_data *ad); extern void __audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp); static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { if (unlikely(!audit_dummy_context())) __audit_ipc_obj(ipcp); } static inline void audit_fd_pair(int fd1, int fd2) { if (unlikely(!audit_dummy_context())) __audit_fd_pair(fd1, fd2); } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { if 
(unlikely(!audit_dummy_context())) __audit_ipc_set_perm(qbytes, uid, gid, mode); } static inline void audit_bprm(struct linux_binprm *bprm) { if (unlikely(!audit_dummy_context())) __audit_bprm(bprm); } static inline int audit_socketcall(int nargs, unsigned long *args) { if (unlikely(!audit_dummy_context())) return __audit_socketcall(nargs, args); return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { unsigned long a[AUDITSC_ARGS]; int i; if (audit_dummy_context()) return 0; for (i = 0; i < nargs; i++) a[i] = (unsigned long)args[i]; return __audit_socketcall(nargs, a); } static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) return __audit_sockaddr(len, addr); return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { if (unlikely(!audit_dummy_context())) __audit_mq_open(oflag, mode, attr); } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { if (unlikely(!audit_dummy_context())) __audit_mq_sendrecv(mqdes, msg_len, msg_prio, abs_timeout); } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { if (unlikely(!audit_dummy_context())) __audit_mq_notify(mqdes, notification); } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { if (unlikely(!audit_dummy_context())) __audit_mq_getsetattr(mqdes, mqstat); } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) return __audit_log_bprm_fcaps(bprm, new, old); return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { if (unlikely(!audit_dummy_context())) __audit_log_capset(new, old); } static inline void audit_mmap_fd(int fd, int flags) { if (unlikely(!audit_dummy_context())) __audit_mmap_fd(fd, flags); } static inline void audit_openat2_how(struct open_how *how) { if (unlikely(!audit_dummy_context())) __audit_openat2_how(how); } static inline void audit_log_kern_module(char *name) { if (!audit_dummy_context()) __audit_log_kern_module(name); } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { if (!audit_dummy_context()) __audit_fanotify(response, friar); } static inline void audit_tk_injoffset(struct timespec64 offset) { /* ignore no-op events */ if (offset.tv_sec == 0 && offset.tv_nsec == 0) return; if (!audit_dummy_context()) __audit_tk_injoffset(offset); } static inline void audit_ntp_init(struct audit_ntp_data *ad) { memset(ad, 0, sizeof(*ad)); } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].oldval = val; } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { ad->vals[type].newval = val; } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { if (!audit_dummy_context()) __audit_ntp_log(ad); } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { if (audit_enabled) __audit_log_nfcfg(name, af, nentries, op, gfp); } extern int audit_n_rules; extern int audit_signals; #else /* CONFIG_AUDITSYSCALL */ static inline int audit_alloc(struct task_struct *task) { return 0; } static inline void audit_free(struct task_struct *task) { } static inline void audit_uring_entry(u8 op) { } static inline void audit_uring_exit(int success, long code) 
{ } static inline void audit_syscall_entry(int major, unsigned long a0, unsigned long a1, unsigned long a2, unsigned long a3) { } static inline void audit_syscall_exit(void *pt_regs) { } static inline bool audit_dummy_context(void) { return true; } static inline void audit_set_context(struct task_struct *task, struct audit_context *ctx) { } static inline struct audit_context *audit_context(void) { return NULL; } static inline struct filename *audit_reusename(const __user char *name) { return NULL; } static inline void audit_getname(struct filename *name) { } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { } static inline void audit_file(struct file *file) { } static inline void audit_inode_parent_hidden(struct filename *name, const struct dentry *dentry) { } static inline void audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { } static inline void audit_core_dumps(long signr) { } static inline void audit_seccomp(unsigned long syscall, long signr, int code) { } static inline void audit_seccomp_actions_logged(const char *names, const char *old_names, int res) { } static inline void audit_ipc_obj(struct kern_ipc_perm *ipcp) { } static inline void audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, umode_t mode) { } static inline void audit_bprm(struct linux_binprm *bprm) { } static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } static inline int audit_socketcall_compat(int nargs, u32 *args) { return 0; } static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) { return 0; } static inline void audit_mq_open(int oflag, umode_t mode, struct mq_attr *attr) { } static inline void audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, const struct timespec64 *abs_timeout) { } static inline void audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) { } static inline void audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) { } static inline int audit_log_bprm_fcaps(struct linux_binprm *bprm, const struct cred *new, const struct cred *old) { return 0; } static inline void audit_log_capset(const struct cred *new, const struct cred *old) { } static inline void audit_mmap_fd(int fd, int flags) { } static inline void audit_openat2_how(struct open_how *how) { } static inline void audit_log_kern_module(char *name) { } static inline void audit_fanotify(u32 response, struct fanotify_response_info_audit_rule *friar) { } static inline void audit_tk_injoffset(struct timespec64 offset) { } static inline void audit_ntp_init(struct audit_ntp_data *ad) { } static inline void audit_ntp_set_old(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_set_new(struct audit_ntp_data *ad, enum audit_ntp_type type, long long val) { } static inline void audit_ntp_log(const struct audit_ntp_data *ad) { } static inline void audit_ptrace(struct task_struct *t) { } static inline void audit_log_nfcfg(const char *name, u8 af, unsigned int nentries, enum audit_nfcfgop op, gfp_t gfp) { } #define audit_n_rules 0 #define audit_signals 0 #endif /* CONFIG_AUDITSYSCALL */ static inline bool audit_loginuid_set(struct task_struct *tsk) { return uid_valid(audit_get_loginuid(tsk)); } #endif
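Most of the header above follows one pattern: a thin static inline wrapper tests a cheap predicate (audit_dummy_context(), audit_enabled) before calling the heavyweight __audit_*() helper, and the whole wrapper collapses to an empty stub when CONFIG_AUDIT or CONFIG_AUDITSYSCALL is off. Here is a minimal sketch of that shape only; all toy_* names and the config macro are invented for illustration.

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_TOY_AUDIT 1

#ifdef CONFIG_TOY_AUDIT

static bool toy_audit_active;	/* stands in for the per-task audit context */

static void __toy_audit_event(int id)
{
	printf("audit event %d recorded\n", id);	/* the "slow" path */
}

/* Callers always use the wrapper; the fast path is a single branch */
static inline void toy_audit_event(int id)
{
	if (toy_audit_active)
		__toy_audit_event(id);
}

#else /* !CONFIG_TOY_AUDIT */

static inline void toy_audit_event(int id) { }	/* compiles away entirely */

#endif

int main(void)
{
	toy_audit_event(1);	/* auditing inactive: no output, minimal cost */
#ifdef CONFIG_TOY_AUDIT
	toy_audit_active = true;
#endif
	toy_audit_event(2);	/* auditing active: the real handler runs */
	return 0;
}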
/* SPDX-License-Identifier: GPL-2.0-only */ /* * Kernel-based Virtual Machine driver for Linux * * This header defines architecture specific interfaces, 
 * x86 version
 */

#ifndef _ASM_X86_KVM_HOST_H
#define _ASM_X86_KVM_HOST_H

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <linux/tracepoint.h>
#include <linux/cpumask.h>
#include <linux/irq_work.h>
#include <linux/irq.h>
#include <linux/workqueue.h>

#include <linux/kvm.h>
#include <linux/kvm_para.h>
#include <linux/kvm_types.h>
#include <linux/perf_event.h>
#include <linux/pvclock_gtod.h>
#include <linux/clocksource.h>
#include <linux/irqbypass.h>
#include <linux/hyperv.h>
#include <linux/kfifo.h>

#include <asm/apic.h>
#include <asm/pvclock-abi.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/msr-index.h>
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/hyperv-tlfs.h>

#define __KVM_HAVE_ARCH_VCPU_DEBUGFS

/*
 * CONFIG_KVM_MAX_NR_VCPUS is defined iff CONFIG_KVM!=n, provide a dummy max if
 * KVM is disabled (arbitrarily use the default from CONFIG_KVM_MAX_NR_VCPUS).
 */
#ifdef CONFIG_KVM_MAX_NR_VCPUS
#define KVM_MAX_VCPUS CONFIG_KVM_MAX_NR_VCPUS
#else
#define KVM_MAX_VCPUS 1024
#endif

/*
 * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs
 * might be larger than the actual number of VCPUs because the
 * APIC ID encodes CPU topology information.
 *
 * In the worst case, we'll need less than one extra bit for the
 * Core ID, and less than one extra bit for the Package (Die) ID,
 * so ratio of 4 should be enough.
 */
#define KVM_VCPU_ID_RATIO 4
#define KVM_MAX_VCPU_IDS (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO)

/* memory slots that are not exposed to userspace */
#define KVM_INTERNAL_MEM_SLOTS 3

#define KVM_HALT_POLL_NS_DEFAULT 200000

#define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS

#define KVM_DIRTY_LOG_MANUAL_CAPS (KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE | \
					KVM_DIRTY_LOG_INITIALLY_SET)

#define KVM_BUS_LOCK_DETECTION_VALID_MODE (KVM_BUS_LOCK_DETECTION_OFF | \
						 KVM_BUS_LOCK_DETECTION_EXIT)

#define KVM_X86_NOTIFY_VMEXIT_VALID_BITS (KVM_X86_NOTIFY_VMEXIT_ENABLED | \
					  KVM_X86_NOTIFY_VMEXIT_USER)

/* x86-specific vcpu->requests bit members */
#define KVM_REQ_MIGRATE_TIMER		KVM_ARCH_REQ(0)
#define KVM_REQ_REPORT_TPR_ACCESS	KVM_ARCH_REQ(1)
#define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
#define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
#define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
#define KVM_REQ_LOAD_MMU_PGD		KVM_ARCH_REQ(5)
#define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
#define KVM_REQ_APF_HALT		KVM_ARCH_REQ(7)
#define KVM_REQ_STEAL_UPDATE		KVM_ARCH_REQ(8)
#define KVM_REQ_NMI			KVM_ARCH_REQ(9)
#define KVM_REQ_PMU			KVM_ARCH_REQ(10)
#define KVM_REQ_PMI			KVM_ARCH_REQ(11)
#ifdef CONFIG_KVM_SMM
#define KVM_REQ_SMI			KVM_ARCH_REQ(12)
#endif
#define KVM_REQ_MASTERCLOCK_UPDATE	KVM_ARCH_REQ(13)
#define KVM_REQ_MCLOCK_INPROGRESS \
	KVM_ARCH_REQ_FLAGS(14, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_SCAN_IOAPIC \
	KVM_ARCH_REQ_FLAGS(15, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_GLOBAL_CLOCK_UPDATE	KVM_ARCH_REQ(16)
#define KVM_REQ_APIC_PAGE_RELOAD \
	KVM_ARCH_REQ_FLAGS(17, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_HV_CRASH		KVM_ARCH_REQ(18)
#define KVM_REQ_IOAPIC_EOI_EXIT		KVM_ARCH_REQ(19)
#define KVM_REQ_HV_RESET		KVM_ARCH_REQ(20)
#define KVM_REQ_HV_EXIT			KVM_ARCH_REQ(21)
#define KVM_REQ_HV_STIMER		KVM_ARCH_REQ(22)
#define KVM_REQ_LOAD_EOI_EXITMAP	KVM_ARCH_REQ(23)
#define KVM_REQ_GET_NESTED_STATE_PAGES	KVM_ARCH_REQ(24)
#define KVM_REQ_APICV_UPDATE \
	KVM_ARCH_REQ_FLAGS(25, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
#define KVM_REQ_TLB_FLUSH_CURRENT	KVM_ARCH_REQ(26)
#define KVM_REQ_TLB_FLUSH_GUEST \
	KVM_ARCH_REQ_FLAGS(27,
KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_APF_READY KVM_ARCH_REQ(28) #define KVM_REQ_MSR_FILTER_CHANGED KVM_ARCH_REQ(29) #define KVM_REQ_UPDATE_CPU_DIRTY_LOGGING \ KVM_ARCH_REQ_FLAGS(30, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_MMU_FREE_OBSOLETE_ROOTS \ KVM_ARCH_REQ_FLAGS(31, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_TLB_FLUSH \ KVM_ARCH_REQ_FLAGS(32, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define CR0_RESERVED_BITS \ (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) #define CR4_RESERVED_BITS \ (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR | X86_CR4_PCIDE \ | X86_CR4_OSXSAVE | X86_CR4_SMEP | X86_CR4_FSGSBASE \ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_VMXE \ | X86_CR4_SMAP | X86_CR4_PKE | X86_CR4_UMIP \ | X86_CR4_LAM_SUP)) #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) /* KVM Hugepage definitions for x86 */ #define KVM_MAX_HUGEPAGE_LEVEL PG_LEVEL_1G #define KVM_NR_PAGE_SIZES (KVM_MAX_HUGEPAGE_LEVEL - PG_LEVEL_4K + 1) #define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 9) #define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x)) #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x)) #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) #define KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO 50 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 #define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) #define KVM_MIN_FREE_MMU_PAGES 5 #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 256 #define KVM_NR_FIXED_MTRR_REGION 88 #define KVM_NR_VAR_MTRR 8 #define ASYNC_PF_PER_VCPU 64 enum kvm_reg { VCPU_REGS_RAX = __VCPU_REGS_RAX, VCPU_REGS_RCX = __VCPU_REGS_RCX, VCPU_REGS_RDX = __VCPU_REGS_RDX, VCPU_REGS_RBX = __VCPU_REGS_RBX, VCPU_REGS_RSP = __VCPU_REGS_RSP, VCPU_REGS_RBP = __VCPU_REGS_RBP, VCPU_REGS_RSI = __VCPU_REGS_RSI, VCPU_REGS_RDI = __VCPU_REGS_RDI, #ifdef CONFIG_X86_64 VCPU_REGS_R8 = __VCPU_REGS_R8, VCPU_REGS_R9 = __VCPU_REGS_R9, VCPU_REGS_R10 = __VCPU_REGS_R10, VCPU_REGS_R11 = __VCPU_REGS_R11, VCPU_REGS_R12 = __VCPU_REGS_R12, VCPU_REGS_R13 = __VCPU_REGS_R13, VCPU_REGS_R14 = __VCPU_REGS_R14, VCPU_REGS_R15 = __VCPU_REGS_R15, #endif VCPU_REGS_RIP, NR_VCPU_REGS, VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR0, VCPU_EXREG_CR3, VCPU_EXREG_CR4, VCPU_EXREG_RFLAGS, VCPU_EXREG_SEGMENTS, VCPU_EXREG_EXIT_INFO_1, VCPU_EXREG_EXIT_INFO_2, }; enum { VCPU_SREG_ES, VCPU_SREG_CS, VCPU_SREG_SS, VCPU_SREG_DS, VCPU_SREG_FS, VCPU_SREG_GS, VCPU_SREG_TR, VCPU_SREG_LDTR, }; enum exit_fastpath_completion { EXIT_FASTPATH_NONE, EXIT_FASTPATH_REENTER_GUEST, EXIT_FASTPATH_EXIT_HANDLED, }; typedef enum exit_fastpath_completion fastpath_t; struct x86_emulate_ctxt; struct x86_exception; union kvm_smram; enum x86_intercept; enum x86_intercept_stage; #define KVM_NR_DB_REGS 4 #define DR6_BUS_LOCK (1 << 11) #define DR6_BD (1 << 13) #define DR6_BS (1 << 14) #define DR6_BT (1 << 15) #define DR6_RTM (1 << 16) /* * DR6_ACTIVE_LOW combines fixed-1 and active-low bits. * We can regard all the bits in DR6_FIXED_1 as active_low bits; * they will never be 0 for now, but when they are defined * in the future it will require no code change. * * DR6_ACTIVE_LOW is also used as the init/reset value for DR6. 
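 *
 * With the values defined below, that works out to:
 *
 *	DR6_FIXED_1 == 0xffff0ff0 & ~0x0001e80f == 0xfffe07f0
 *
 * i.e. the active-low/init value with the volatile status bits
 * (B0-B3, BUS_LOCK, BD, BS, BT and RTM) cleared.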
*/ #define DR6_ACTIVE_LOW 0xffff0ff0 #define DR6_VOLATILE 0x0001e80f #define DR6_FIXED_1 (DR6_ACTIVE_LOW & ~DR6_VOLATILE) #define DR7_BP_EN_MASK 0x000000ff #define DR7_GE (1 << 9) #define DR7_GD (1 << 13) #define DR7_FIXED_1 0x00000400 #define DR7_VOLATILE 0xffff2bff #define KVM_GUESTDBG_VALID_MASK \ (KVM_GUESTDBG_ENABLE | \ KVM_GUESTDBG_SINGLESTEP | \ KVM_GUESTDBG_USE_HW_BP | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_INJECT_BP | \ KVM_GUESTDBG_INJECT_DB | \ KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_BIT 0 #define PFERR_WRITE_BIT 1 #define PFERR_USER_BIT 2 #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 #define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 #define PFERR_IMPLICIT_ACCESS_BIT 48 #define PFERR_PRESENT_MASK BIT(PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK BIT(PFERR_WRITE_BIT) #define PFERR_USER_MASK BIT(PFERR_USER_BIT) #define PFERR_RSVD_MASK BIT(PFERR_RSVD_BIT) #define PFERR_FETCH_MASK BIT(PFERR_FETCH_BIT) #define PFERR_PK_MASK BIT(PFERR_PK_BIT) #define PFERR_SGX_MASK BIT(PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK BIT_ULL(PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK BIT_ULL(PFERR_GUEST_PAGE_BIT) #define PFERR_IMPLICIT_ACCESS BIT_ULL(PFERR_IMPLICIT_ACCESS_BIT) #define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ PFERR_WRITE_MASK | \ PFERR_PRESENT_MASK) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 /* * The following bit is set with PV-EOI, unset on EOI. * We detect PV-EOI changes by guest by comparing * this bit with PV-EOI in guest memory. * See the implementation in apic_update_pv_eoi. */ #define KVM_APIC_PV_EOI_PENDING 1 struct kvm_kernel_irq_routing_entry; /* * kvm_mmu_page_role tracks the properties of a shadow page (where shadow page * also includes TDP pages) to determine whether or not a page can be used in * the given MMU context. This is a subset of the overall kvm_cpu_role to * minimize the size of kvm_memory_slot.arch.gfn_write_track, i.e. allows * allocating 2 bytes per gfn instead of 4 bytes per gfn. * * Upper-level shadow pages having gptes are tracked for write-protection via * gfn_write_track. As above, gfn_write_track is a 16 bit counter, so KVM must * not create more than 2^16-1 upper-level shadow pages at a single gfn, * otherwise gfn_write_track will overflow and explosions will ensue. * * A unique shadow page (SP) for a gfn is created if and only if an existing SP * cannot be reused. The ability to reuse a SP is tracked by its role, which * incorporates various mode bits and properties of the SP. Roughly speaking, * the number of unique SPs that can theoretically be created is 2^n, where n * is the number of bits that are used to compute the role. * * But, even though there are 19 bits in the mask below, not all combinations * of modes and flags are possible: * * - invalid shadow pages are not accounted, so the bits are effectively 18 * * - quadrant will only be used if has_4_byte_gpte=1 (non-PAE paging); * execonly and ad_disabled are only used for nested EPT which has * has_4_byte_gpte=0. Therefore, 2 bits are always unused. * * - the 4 bits of level are effectively limited to the values 2/3/4/5, * as 4k SPs are not tracked (allowed to go unsync). In addition non-PAE * paging has exactly one upper level, making level completely redundant * when has_4_byte_gpte=1. * * - on top of this, smep_andnot_wp and smap_andnot_wp are only set if * cr0_wp=0, therefore these three bits only give rise to 5 possibilities. 
* * Therefore, the maximum number of possible upper-level shadow pages for a * single gfn is a bit less than 2^13. */ union kvm_mmu_page_role { u32 word; struct { unsigned level:4; unsigned has_4_byte_gpte:1; unsigned quadrant:2; unsigned direct:1; unsigned access:3; unsigned invalid:1; unsigned efer_nx:1; unsigned cr0_wp:1; unsigned smep_andnot_wp:1; unsigned smap_andnot_wp:1; unsigned ad_disabled:1; unsigned guest_mode:1; unsigned passthrough:1; unsigned :5; /* * This is left at the top of the word so that * kvm_memslots_for_spte_role can extract it with a * simple shift. While there is room, give it a whole * byte so it is also faster to load it from memory. */ unsigned smm:8; }; }; /* * kvm_mmu_extended_role complements kvm_mmu_page_role, tracking properties * relevant to the current MMU configuration. When loading CR0, CR4, or EFER, * including on nested transitions, if nothing in the full role changes then * MMU re-configuration can be skipped. @valid bit is set on first usage so we * don't treat all-zero structure as valid data. * * The properties that are tracked in the extended role but not the page role * are for things that either (a) do not affect the validity of the shadow page * or (b) are indirectly reflected in the shadow page's role. For example, * CR4.PKE only affects permission checks for software walks of the guest page * tables (because KVM doesn't support Protection Keys with shadow paging), and * CR0.PG, CR4.PAE, and CR4.PSE are indirectly reflected in role.level. * * Note, SMEP and SMAP are not redundant with sm*p_andnot_wp in the page role. * If CR0.WP=1, KVM can reuse shadow pages for the guest regardless of SMEP and * SMAP, but the MMU's permission checks for software walks need to be SMEP and * SMAP aware regardless of CR0.WP. */ union kvm_mmu_extended_role { u32 word; struct { unsigned int valid:1; unsigned int execonly:1; unsigned int cr4_pse:1; unsigned int cr4_pke:1; unsigned int cr4_smap:1; unsigned int cr4_smep:1; unsigned int cr4_la57:1; unsigned int efer_lma:1; }; }; union kvm_cpu_role { u64 as_u64; struct { union kvm_mmu_page_role base; union kvm_mmu_extended_role ext; }; }; struct kvm_rmap_head { unsigned long val; }; struct kvm_pio_request { unsigned long linear_rip; unsigned long count; int in; int port; int size; }; #define PT64_ROOT_MAX_LEVEL 5 struct rsvd_bits_validate { u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL]; u64 bad_mt_xwr; }; struct kvm_mmu_root_info { gpa_t pgd; hpa_t hpa; }; #define KVM_MMU_ROOT_INFO_INVALID \ ((struct kvm_mmu_root_info) { .pgd = INVALID_PAGE, .hpa = INVALID_PAGE }) #define KVM_MMU_NUM_PREV_ROOTS 3 #define KVM_MMU_ROOT_CURRENT BIT(0) #define KVM_MMU_ROOT_PREVIOUS(i) BIT(1+i) #define KVM_MMU_ROOTS_ALL (BIT(1 + KVM_MMU_NUM_PREV_ROOTS) - 1) #define KVM_HAVE_MMU_RWLOCK struct kvm_mmu_page; struct kvm_page_fault; /* * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit, * and 2-level 32-bit). The kvm_mmu structure abstracts the details of the * current mmu mode. 
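 *
 * Roughly speaking, cpu_role below is derived from the guest's own
 * CR0/CR4/EFER paging configuration, while root_role describes the
 * shadow or TDP page tables that KVM actually builds for it.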
*/ struct kvm_mmu { unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu); u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, struct x86_exception *fault); gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t gva_or_gpa, u64 access, struct x86_exception *exception); int (*sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i); struct kvm_mmu_root_info root; union kvm_cpu_role cpu_role; union kvm_mmu_page_role root_role; /* * The pkru_mask indicates if protection key checks are needed. It * consists of 16 domains indexed by page fault error code bits [4:1], * with PFEC.RSVD replaced by ACC_USER_MASK from the page tables. * Each domain has 2 bits which are ANDed with AD and WD from PKRU. */ u32 pkru_mask; struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS]; /* * Bitmap; bit set = permission fault * Byte index: page fault error code [4:1] * Bit index: pte permissions in ACC_* format */ u8 permissions[16]; u64 *pae_root; u64 *pml4_root; u64 *pml5_root; /* * check zero bits on shadow page table entries, these * bits include not only hardware reserved bits but also * the bits spte never used. */ struct rsvd_bits_validate shadow_zero_check; struct rsvd_bits_validate guest_rsvd_check; u64 pdptrs[4]; /* pae */ }; enum pmc_type { KVM_PMC_GP = 0, KVM_PMC_FIXED, }; struct kvm_pmc { enum pmc_type type; u8 idx; bool is_paused; bool intr; /* * Base value of the PMC counter, relative to the *consumed* count in * the associated perf_event. This value includes counter updates from * the perf_event and emulated_count since the last time the counter * was reprogrammed, but it is *not* the current value as seen by the * guest or userspace. * * The count is relative to the associated perf_event so that KVM * doesn't need to reprogram the perf_event every time the guest writes * to the counter. */ u64 counter; /* * PMC events triggered by KVM emulation that haven't been fully * processed, i.e. haven't undergone overflow detection. */ u64 emulated_counter; u64 eventsel; struct perf_event *perf_event; struct kvm_vcpu *vcpu; /* * only for creating or reusing perf_event, * eventsel value for general purpose counters, * ctrl value for fixed counters. */ u64 current_config; }; /* More counters may conflict with other existing Architectural MSRs */ #define KVM_INTEL_PMC_MAX_GENERIC 8 #define MSR_ARCH_PERFMON_PERFCTR_MAX (MSR_ARCH_PERFMON_PERFCTR0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define MSR_ARCH_PERFMON_EVENTSEL_MAX (MSR_ARCH_PERFMON_EVENTSEL0 + KVM_INTEL_PMC_MAX_GENERIC - 1) #define KVM_PMC_MAX_FIXED 3 #define MSR_ARCH_PERFMON_FIXED_CTR_MAX (MSR_ARCH_PERFMON_FIXED_CTR0 + KVM_PMC_MAX_FIXED - 1) #define KVM_AMD_PMC_MAX_GENERIC 6 struct kvm_pmu { u8 version; unsigned nr_arch_gp_counters; unsigned nr_arch_fixed_counters; unsigned available_event_types; u64 fixed_ctr_ctrl; u64 fixed_ctr_ctrl_mask; u64 global_ctrl; u64 global_status; u64 counter_bitmask[2]; u64 global_ctrl_mask; u64 global_status_mask; u64 reserved_bits; u64 raw_event_mask; struct kvm_pmc gp_counters[KVM_INTEL_PMC_MAX_GENERIC]; struct kvm_pmc fixed_counters[KVM_PMC_MAX_FIXED]; /* * Overlay the bitmap with a 64-bit atomic so that all bits can be * set in a single access, e.g. to reprogram all counters when the PMU * filter changes. 
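 *
 * For example, a PMU event filter update can flag every counter for
 * reprogramming in one shot with something like
 *
 *	atomic64_set(&pmu->__reprogram_pmi, -1ull);
 *
 * instead of setting the reprogram_pmi bits one at a time.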
*/ union { DECLARE_BITMAP(reprogram_pmi, X86_PMC_IDX_MAX); atomic64_t __reprogram_pmi; }; DECLARE_BITMAP(all_valid_pmc_idx, X86_PMC_IDX_MAX); DECLARE_BITMAP(pmc_in_use, X86_PMC_IDX_MAX); u64 ds_area; u64 pebs_enable; u64 pebs_enable_mask; u64 pebs_data_cfg; u64 pebs_data_cfg_mask; /* * If a guest counter is cross-mapped to host counter with different * index, its PEBS capability will be temporarily disabled. * * The user should make sure that this mask is updated * after disabling interrupts and before perf_guest_get_msrs(); */ u64 host_cross_mapped_mask; /* * The gate to release perf_events not marked in * pmc_in_use only once in a vcpu time slice. */ bool need_cleanup; /* * The total number of programmed perf_events and it helps to avoid * redundant check before cleanup if guest don't use vPMU at all. */ u8 event_count; }; struct kvm_pmu_ops; enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, }; struct kvm_mtrr_range { u64 base; u64 mask; struct list_head node; }; struct kvm_mtrr { struct kvm_mtrr_range var_ranges[KVM_NR_VAR_MTRR]; mtrr_type fixed_ranges[KVM_NR_FIXED_MTRR_REGION]; u64 deftype; struct list_head head; }; /* Hyper-V SynIC timer */ struct kvm_vcpu_hv_stimer { struct hrtimer timer; int index; union hv_stimer_config config; u64 count; u64 exp_time; struct hv_message msg; bool msg_pending; }; /* Hyper-V synthetic interrupt controller (SynIC)*/ struct kvm_vcpu_hv_synic { u64 version; u64 control; u64 msg_page; u64 evt_page; atomic64_t sint[HV_SYNIC_SINT_COUNT]; atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT]; DECLARE_BITMAP(auto_eoi_bitmap, 256); DECLARE_BITMAP(vec_bitmap, 256); bool active; bool dont_zero_synic_pages; }; /* The maximum number of entries on the TLB flush fifo. */ #define KVM_HV_TLB_FLUSH_FIFO_SIZE (16) /* * Note: the following 'magic' entry is made up by KVM to avoid putting * anything besides GVA on the TLB flush fifo. It is theoretically possible * to observe a request to flush 4095 PFNs starting from 0xfffffffffffff000 * which will look identical. KVM's action to 'flush everything' instead of * flushing these particular addresses is, however, fully legitimate as * flushing more than requested is always OK. 
*/ #define KVM_HV_TLB_FLUSHALL_ENTRY ((u64)-1) enum hv_tlb_flush_fifos { HV_L1_TLB_FLUSH_FIFO, HV_L2_TLB_FLUSH_FIFO, HV_NR_TLB_FLUSH_FIFOS, }; struct kvm_vcpu_hv_tlb_flush_fifo { spinlock_t write_lock; DECLARE_KFIFO(entries, u64, KVM_HV_TLB_FLUSH_FIFO_SIZE); }; /* Hyper-V per vcpu emulation context */ struct kvm_vcpu_hv { struct kvm_vcpu *vcpu; u32 vp_index; u64 hv_vapic; s64 runtime_offset; struct kvm_vcpu_hv_synic synic; struct kvm_hyperv_exit exit; struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT]; DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT); bool enforce_cpuid; struct { u32 features_eax; /* HYPERV_CPUID_FEATURES.EAX */ u32 features_ebx; /* HYPERV_CPUID_FEATURES.EBX */ u32 features_edx; /* HYPERV_CPUID_FEATURES.EDX */ u32 enlightenments_eax; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EAX */ u32 enlightenments_ebx; /* HYPERV_CPUID_ENLIGHTMENT_INFO.EBX */ u32 syndbg_cap_eax; /* HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES.EAX */ u32 nested_eax; /* HYPERV_CPUID_NESTED_FEATURES.EAX */ u32 nested_ebx; /* HYPERV_CPUID_NESTED_FEATURES.EBX */ } cpuid_cache; struct kvm_vcpu_hv_tlb_flush_fifo tlb_flush_fifo[HV_NR_TLB_FLUSH_FIFOS]; /* Preallocated buffer for handling hypercalls passing sparse vCPU set */ u64 sparse_banks[HV_MAX_SPARSE_VCPU_BANKS]; struct hv_vp_assist_page vp_assist_page; struct { u64 pa_page_gpa; u64 vm_id; u32 vp_id; } nested; }; struct kvm_hypervisor_cpuid { u32 base; u32 limit; }; #ifdef CONFIG_KVM_XEN /* Xen HVM per vcpu emulation context */ struct kvm_vcpu_xen { u64 hypercall_rip; u32 current_runstate; u8 upcall_vector; struct gfn_to_pfn_cache vcpu_info_cache; struct gfn_to_pfn_cache vcpu_time_info_cache; struct gfn_to_pfn_cache runstate_cache; struct gfn_to_pfn_cache runstate2_cache; u64 last_steal; u64 runstate_entry_time; u64 runstate_times[4]; unsigned long evtchn_pending_sel; u32 vcpu_id; /* The Xen / ACPI vCPU ID */ u32 timer_virq; u64 timer_expires; /* In guest epoch */ atomic_t timer_pending; struct hrtimer timer; int poll_evtchn; struct timer_list poll_timer; struct kvm_hypervisor_cpuid cpuid; }; #endif struct kvm_queued_exception { bool pending; bool injected; bool has_error_code; u8 vector; u32 error_code; unsigned long payload; bool has_payload; }; struct kvm_vcpu_arch { /* * rip and regs accesses must go through * kvm_{register,rip}_{read,write} functions. */ unsigned long regs[NR_VCPU_REGS]; u32 regs_avail; u32 regs_dirty; unsigned long cr0; unsigned long cr0_guest_owned_bits; unsigned long cr2; unsigned long cr3; unsigned long cr4; unsigned long cr4_guest_owned_bits; unsigned long cr4_guest_rsvd_bits; unsigned long cr8; u32 host_pkru; u32 pkru; u32 hflags; u64 efer; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool load_eoi_exitmap_pending; DECLARE_BITMAP(ioapic_handled_vectors, 256); unsigned long apic_attention; int32_t apic_arb_prio; int mp_state; u64 ia32_misc_enable_msr; u64 smbase; u64 smi_count; bool at_instruction_boundary; bool tpr_access_reporting; bool xfd_no_write_intercept; u64 ia32_xss; u64 microcode_version; u64 arch_capabilities; u64 perf_capabilities; /* * Paging state of the vcpu * * If the vcpu runs in guest mode with two level paging this still saves * the paging mode of the l1 guest. This context is always used to * handle faults. */ struct kvm_mmu *mmu; /* Non-nested MMU for L1 */ struct kvm_mmu root_mmu; /* L1 MMU when running nested */ struct kvm_mmu guest_mmu; /* * Paging state of an L2 guest (used for nested npt) * * This context will save all necessary information to walk page tables * of an L2 guest. 
This context is only initialized for page table * walking and not for faulting since we never handle l2 page faults on * the host. */ struct kvm_mmu nested_mmu; /* * Pointer to the mmu context currently used for * gva_to_gpa translations. */ struct kvm_mmu *walk_mmu; struct kvm_mmu_memory_cache mmu_pte_list_desc_cache; struct kvm_mmu_memory_cache mmu_shadow_page_cache; struct kvm_mmu_memory_cache mmu_shadowed_info_cache; struct kvm_mmu_memory_cache mmu_page_header_cache; /* * QEMU userspace and the guest each have their own FPU state. * In vcpu_run, we switch between the user and guest FPU contexts. * While running a VCPU, the VCPU thread will have the guest FPU * context. * * Note that while the PKRU state lives inside the fpu registers, * it is switched out separately at VMENTER and VMEXIT time. The * "guest_fpstate" state here contains the guest FPU context, with the * host PRKU bits. */ struct fpu_guest guest_fpu; u64 xcr0; u64 guest_supported_xcr0; struct kvm_pio_request pio; void *pio_data; void *sev_pio_data; unsigned sev_pio_count; u8 event_exit_inst_len; bool exception_from_userspace; /* Exceptions to be injected to the guest. */ struct kvm_queued_exception exception; /* Exception VM-Exits to be synthesized to L1. */ struct kvm_queued_exception exception_vmexit; struct kvm_queued_interrupt { bool injected; bool soft; u8 nr; } interrupt; int halt_request; /* real mode on Intel only */ int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; struct kvm_hypervisor_cpuid kvm_cpuid; /* * FIXME: Drop this macro and use KVM_NR_GOVERNED_FEATURES directly * when "struct kvm_vcpu_arch" is no longer defined in an * arch/x86/include/asm header. The max is mostly arbitrary, i.e. * can be increased as necessary. */ #define KVM_MAX_NR_GOVERNED_FEATURES BITS_PER_LONG /* * Track whether or not the guest is allowed to use features that are * governed by KVM, where "governed" means KVM needs to manage state * and/or explicitly enable the feature in hardware. Typically, but * not always, governed features can be used by the guest if and only * if both KVM and userspace want to expose the feature to the guest. */ struct { DECLARE_BITMAP(enabled, KVM_MAX_NR_GOVERNED_FEATURES); } governed_features; u64 reserved_gpa_bits; int maxphyaddr; /* emulate context */ struct x86_emulate_ctxt *emulate_ctxt; bool emulate_regs_need_sync_to_vcpu; bool emulate_regs_need_sync_from_vcpu; int (*complete_userspace_io)(struct kvm_vcpu *vcpu); gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; struct gfn_to_pfn_cache pv_time; /* set guest stopped flag in pvclock flags field */ bool pvclock_set_guest_stopped_request; struct { u8 preempted; u64 msr_val; u64 last_steal; struct gfn_to_hva_cache cache; } st; u64 l1_tsc_offset; u64 tsc_offset; /* current tsc offset */ u64 last_guest_tsc; u64 last_host_tsc; u64 tsc_offset_adjustment; u64 this_tsc_nsec; u64 this_tsc_write; u64 this_tsc_generation; bool tsc_catchup; bool tsc_always_catchup; s8 virtual_tsc_shift; u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; u64 msr_ia32_power_ctl; u64 l1_tsc_scaling_ratio; u64 tsc_scaling_ratio; /* current scaling ratio */ atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ /* Number of NMIs pending injection, not including hardware vNMIs. 
*/ unsigned int nmi_pending; bool nmi_injected; /* Trying to inject an NMI this entry */ bool smi_pending; /* SMI queued after currently running handler */ u8 handling_intr_from_guest; struct kvm_mtrr mtrr_state; u64 pat; unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; unsigned long dr6; unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; unsigned long guest_debug_dr7; u64 msr_platform_info; u64 msr_misc_features_enables; u64 mcg_cap; u64 mcg_status; u64 mcg_ctl; u64 mcg_ext_ctl; u64 *mce_banks; u64 *mci_ctl2_banks; /* Cache MMIO info */ u64 mmio_gva; unsigned mmio_access; gfn_t mmio_gfn; u64 mmio_gen; struct kvm_pmu pmu; /* used for guest single stepping over the given code position */ unsigned long singlestep_rip; #ifdef CONFIG_KVM_HYPERV bool hyperv_enabled; struct kvm_vcpu_hv *hyperv; #endif #ifdef CONFIG_KVM_XEN struct kvm_vcpu_xen xen; #endif cpumask_var_t wbinvd_dirty_mask; unsigned long last_retry_eip; unsigned long last_retry_addr; struct { bool halted; gfn_t gfns[ASYNC_PF_PER_VCPU]; struct gfn_to_hva_cache data; u64 msr_en_val; /* MSR_KVM_ASYNC_PF_EN */ u64 msr_int_val; /* MSR_KVM_ASYNC_PF_INT */ u16 vec; u32 id; bool send_user_only; u32 host_apf_flags; bool delivery_as_pf_vmexit; bool pageready_pending; } apf; /* OSVW MSRs (AMD only) */ struct { u64 length; u64 status; } osvw; struct { u64 msr_val; struct gfn_to_hva_cache data; } pv_eoi; u64 msr_kvm_poll_control; /* set at EPT violation at this point */ unsigned long exit_qualification; /* pv related host specific info */ struct { bool pv_unhalted; } pv; int pending_ioapic_eoi; int pending_external_vector; /* be preempted when it's in kernel-mode(cpl=0) */ bool preempted_in_kernel; /* Flush the L1 Data cache for L1TF mitigation on VMENTER */ bool l1tf_flush_l1d; /* Host CPU on which VM-entry was most recently attempted */ int last_vmentry_cpu; /* AMD MSRC001_0015 Hardware Configuration */ u64 msr_hwcr; /* pv related cpuid info */ struct { /* * value of the eax register in the KVM_CPUID_FEATURES CPUID * leaf. */ u32 features; /* * indicates whether pv emulation should be disabled if features * are not present in the guest's cpuid */ bool enforce; } pv_cpuid; /* Protected Guests */ bool guest_state_protected; /* * Set when PDPTS were loaded directly by the userspace without * reading the guest memory */ bool pdptrs_from_userspace; #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; #endif }; struct kvm_lpage_info { int disallow_lpage; }; struct kvm_arch_memory_slot { struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES]; struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; unsigned short *gfn_write_track; }; /* * Track the mode of the optimized logical map, as the rules for decoding the * destination vary per mode. Enabling the optimized logical map requires all * software-enabled local APIs to be in the same mode, each addressable APIC to * be mapped to only one MDA, and each MDA to map to at most one APIC. */ enum kvm_apic_logical_mode { /* All local APICs are software disabled. */ KVM_APIC_MODE_SW_DISABLED, /* All software enabled local APICs in xAPIC cluster addressing mode. */ KVM_APIC_MODE_XAPIC_CLUSTER, /* All software enabled local APICs in xAPIC flat addressing mode. */ KVM_APIC_MODE_XAPIC_FLAT, /* All software enabled local APICs in x2APIC mode. */ KVM_APIC_MODE_X2APIC, /* * Optimized map disabled, e.g. not all local APICs in the same logical * mode, same logical ID assigned to multiple APICs, etc. 
*/ KVM_APIC_MODE_MAP_DISABLED, }; struct kvm_apic_map { struct rcu_head rcu; enum kvm_apic_logical_mode logical_mode; u32 max_apic_id; union { struct kvm_lapic *xapic_flat_map[8]; struct kvm_lapic *xapic_cluster_map[16][4]; }; struct kvm_lapic *phys_map[]; }; /* Hyper-V synthetic debugger (SynDbg)*/ struct kvm_hv_syndbg { struct { u64 control; u64 status; u64 send_page; u64 recv_page; u64 pending_page; } control; u64 options; }; /* Current state of Hyper-V TSC page clocksource */ enum hv_tsc_page_status { /* TSC page was not set up or disabled */ HV_TSC_PAGE_UNSET = 0, /* TSC page MSR was written by the guest, update pending */ HV_TSC_PAGE_GUEST_CHANGED, /* TSC page update was triggered from the host side */ HV_TSC_PAGE_HOST_CHANGED, /* TSC page was properly set up and is currently active */ HV_TSC_PAGE_SET, /* TSC page was set up with an inaccessible GPA */ HV_TSC_PAGE_BROKEN, }; #ifdef CONFIG_KVM_HYPERV /* Hyper-V emulation context */ struct kvm_hv { struct mutex hv_lock; u64 hv_guest_os_id; u64 hv_hypercall; u64 hv_tsc_page; enum hv_tsc_page_status hv_tsc_page_status; /* Hyper-v based guest crash (NT kernel bugcheck) parameters */ u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS]; u64 hv_crash_ctl; struct ms_hyperv_tsc_page tsc_ref; struct idr conn_to_evt; u64 hv_reenlightenment_control; u64 hv_tsc_emulation_control; u64 hv_tsc_emulation_status; u64 hv_invtsc_control; /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; /* * How many SynICs use 'AutoEOI' feature * (protected by arch.apicv_update_lock) */ unsigned int synic_auto_eoi_used; struct kvm_hv_syndbg hv_syndbg; bool xsaves_xsavec_checked; }; #endif struct msr_bitmap_range { u32 flags; u32 nmsrs; u32 base; unsigned long *bitmap; }; #ifdef CONFIG_KVM_XEN /* Xen emulation context */ struct kvm_xen { struct mutex xen_lock; u32 xen_version; bool long_mode; bool runstate_update_flag; u8 upcall_vector; struct gfn_to_pfn_cache shinfo_cache; struct idr evtchn_ports; unsigned long poll_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; }; #endif enum kvm_irqchip_mode { KVM_IRQCHIP_NONE, KVM_IRQCHIP_KERNEL, /* created with KVM_CREATE_IRQCHIP */ KVM_IRQCHIP_SPLIT, /* created with KVM_CAP_SPLIT_IRQCHIP */ }; struct kvm_x86_msr_filter { u8 count; bool default_allow:1; struct msr_bitmap_range ranges[16]; }; struct kvm_x86_pmu_event_filter { __u32 action; __u32 nevents; __u32 fixed_counter_bitmap; __u32 flags; __u32 nr_includes; __u32 nr_excludes; __u64 *includes; __u64 *excludes; __u64 events[]; }; enum kvm_apicv_inhibit { /********************************************************************/ /* INHIBITs that are relevant to both Intel's APICv and AMD's AVIC. */ /********************************************************************/ /* * APIC acceleration is disabled by a module parameter * and/or not supported in hardware. */ APICV_INHIBIT_REASON_DISABLE, /* * APIC acceleration is inhibited because AutoEOI feature is * being used by a HyperV guest. */ APICV_INHIBIT_REASON_HYPERV, /* * APIC acceleration is inhibited because the userspace didn't yet * enable the kernel/split irqchip. */ APICV_INHIBIT_REASON_ABSENT, /* APIC acceleration is inhibited because KVM_GUESTDBG_BLOCKIRQ * (out of band, debug measure of blocking all interrupts on this vCPU) * was enabled, to avoid AVIC/APICv bypassing it. */ APICV_INHIBIT_REASON_BLOCKIRQ, /* * APICv is disabled because not all vCPUs have a 1:1 mapping between * APIC ID and vCPU, _and_ KVM is not applying its x2APIC hotplug hack. 
*/ APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED, /* * For simplicity, the APIC acceleration is inhibited * first time either APIC ID or APIC base are changed by the guest * from their reset values. */ APICV_INHIBIT_REASON_APIC_ID_MODIFIED, APICV_INHIBIT_REASON_APIC_BASE_MODIFIED, /******************************************************/ /* INHIBITs that are relevant only to the AMD's AVIC. */ /******************************************************/ /* * AVIC is inhibited on a vCPU because it runs a nested guest. * * This is needed because unlike APICv, the peers of this vCPU * cannot use the doorbell mechanism to signal interrupts via AVIC when * a vCPU runs nested. */ APICV_INHIBIT_REASON_NESTED, /* * On SVM, the wait for the IRQ window is implemented with pending vIRQ, * which cannot be injected when the AVIC is enabled, thus AVIC * is inhibited while KVM waits for IRQ window. */ APICV_INHIBIT_REASON_IRQWIN, /* * PIT (i8254) 're-inject' mode, relies on EOI intercept, * which AVIC doesn't support for edge triggered interrupts. */ APICV_INHIBIT_REASON_PIT_REINJ, /* * AVIC is disabled because SEV doesn't support it. */ APICV_INHIBIT_REASON_SEV, /* * AVIC is disabled because not all vCPUs with a valid LDR have a 1:1 * mapping between logical ID and vCPU. */ APICV_INHIBIT_REASON_LOGICAL_ID_ALIASED, }; struct kvm_arch { unsigned long vm_type; unsigned long n_used_mmu_pages; unsigned long n_requested_mmu_pages; unsigned long n_max_mmu_pages; unsigned int indirect_shadow_pages; u8 mmu_valid_gen; struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; struct list_head active_mmu_pages; struct list_head zapped_obsolete_pages; /* * A list of kvm_mmu_page structs that, if zapped, could possibly be * replaced by an NX huge page. A shadow page is on this list if its * existence disallows an NX huge page (nx_huge_page_disallowed is set) * and there are no other conditions that prevent a huge page, e.g. * the backing host page is huge, dirtly logging is not enabled for its * memslot, etc... Note, zapping shadow pages on this list doesn't * guarantee an NX huge page will be created in its stead, e.g. if the * guest attempts to execute from the region then KVM obviously can't * create an NX huge page (without hanging the guest). */ struct list_head possible_nx_huge_pages; #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING struct kvm_page_track_notifier_head track_notifier_head; #endif /* * Protects marking pages unsync during page faults, as TDP MMU page * faults only take mmu_lock for read. For simplicity, the unsync * pages lock is always taken when marking pages unsync regardless of * whether mmu_lock is held for read or write. */ spinlock_t mmu_unsync_pages_lock; struct iommu_domain *iommu_domain; bool iommu_noncoherent; #define __KVM_HAVE_ARCH_NONCOHERENT_DMA atomic_t noncoherent_dma_count; #define __KVM_HAVE_ARCH_ASSIGNED_DEVICE atomic_t assigned_device_count; struct kvm_pic *vpic; struct kvm_ioapic *vioapic; struct kvm_pit *vpit; atomic_t vapics_in_nmi_mode; struct mutex apic_map_lock; struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; bool apic_access_memslot_enabled; bool apic_access_memslot_inhibited; /* Protects apicv_inhibit_reasons */ struct rw_semaphore apicv_update_lock; unsigned long apicv_inhibit_reasons; gpa_t wall_clock; bool mwait_in_guest; bool hlt_in_guest; bool pause_in_guest; bool cstate_in_guest; unsigned long irq_sources_bitmap; s64 kvmclock_offset; /* * This also protects nr_vcpus_matched_tsc which is read from a * preemption-disabled region, so it must be a raw spinlock. 
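 *
 * (A regular spinlock_t becomes a sleeping lock under PREEMPT_RT and
 * therefore cannot be taken with preemption disabled, hence the
 * raw_spinlock_t.)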
*/ raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; u64 last_tsc_write; u32 last_tsc_khz; u64 last_tsc_offset; u64 cur_tsc_nsec; u64 cur_tsc_write; u64 cur_tsc_offset; u64 cur_tsc_generation; int nr_vcpus_matched_tsc; u32 default_tsc_khz; bool user_set_tsc; seqcount_raw_spinlock_t pvclock_sc; bool use_master_clock; u64 master_kernel_ns; u64 master_cycle_now; struct delayed_work kvmclock_update_work; struct delayed_work kvmclock_sync_work; struct kvm_xen_hvm_config xen_hvm_config; /* reads protected by irq_srcu, writes by irq_lock */ struct hlist_head mask_notifier_list; #ifdef CONFIG_KVM_HYPERV struct kvm_hv hyperv; #endif #ifdef CONFIG_KVM_XEN struct kvm_xen xen; #endif bool backwards_tsc_observed; bool boot_vcpu_runs_old_kvmclock; u32 bsp_vcpu_id; u64 disabled_quirks; enum kvm_irqchip_mode irqchip_mode; u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; bool x2apic_format; bool x2apic_broadcast_quirk_disabled; bool guest_can_read_msr_platform_info; bool exception_payload_enabled; bool triple_fault_event; bool bus_lock_detection_enabled; bool enable_pmu; u32 notify_window; u32 notify_vmexit_flags; /* * If exit_on_emulation_error is set, and the in-kernel instruction * emulator fails to emulate an instruction, allow userspace * the opportunity to look at it. */ bool exit_on_emulation_error; /* Deflect RDMSR and WRMSR to user space when they trigger a #GP */ u32 user_space_msr_mask; struct kvm_x86_msr_filter __rcu *msr_filter; u32 hypercall_exit_enabled; /* Guest can access the SGX PROVISIONKEY. */ bool sgx_provisioning_allowed; struct kvm_x86_pmu_event_filter __rcu *pmu_event_filter; struct task_struct *nx_huge_page_recovery_thread; #ifdef CONFIG_X86_64 /* The number of TDP MMU pages across all roots. */ atomic64_t tdp_mmu_pages; /* * List of struct kvm_mmu_pages being used as roots. * All struct kvm_mmu_pages in the list should have * tdp_mmu_page set. * * For reads, this list is protected by: * the MMU lock in read mode + RCU or * the MMU lock in write mode * * For writes, this list is protected by tdp_mmu_pages_lock; see * below for the details. * * Roots will remain in the list until their tdp_mmu_root_count * drops to zero, at which point the thread that decremented the * count to zero should removed the root from the list and clean * it up, freeing the root after an RCU grace period. */ struct list_head tdp_mmu_roots; /* * Protects accesses to the following fields when the MMU lock * is held in read mode: * - tdp_mmu_roots (above) * - the link field of kvm_mmu_page structs used by the TDP MMU * - possible_nx_huge_pages; * - the possible_nx_huge_page_link field of kvm_mmu_page structs used * by the TDP MMU * Because the lock is only taken within the MMU lock, strictly * speaking it is redundant to acquire this lock when the thread * holds the MMU lock in write mode. However it often simplifies * the code to do so. */ spinlock_t tdp_mmu_pages_lock; #endif /* CONFIG_X86_64 */ /* * If set, at least one shadow root has been allocated. This flag * is used as one input when determining whether certain memslot * related allocations are necessary. */ bool shadow_root_allocated; #ifdef CONFIG_KVM_EXTERNAL_WRITE_TRACKING /* * If set, the VM has (or had) an external write tracking user, and * thus all write tracking metadata has been allocated, even if KVM * itself isn't using write tracking. 
*/ bool external_write_tracking_enabled; #endif #if IS_ENABLED(CONFIG_HYPERV) hpa_t hv_root_tdp; spinlock_t hv_root_tdp_lock; struct hv_partition_assist_pg *hv_pa_pg; #endif /* * VM-scope maximum vCPU ID. Used to determine the size of structures * that increase along with the maximum vCPU ID, in which case, using * the global KVM_MAX_VCPU_IDS may lead to significant memory waste. */ u32 max_vcpu_ids; bool disable_nx_huge_pages; /* * Memory caches used to allocate shadow pages when performing eager * page splitting. No need for a shadowed_info_cache since eager page * splitting only allocates direct shadow pages. * * Protected by kvm->slots_lock. */ struct kvm_mmu_memory_cache split_shadow_page_cache; struct kvm_mmu_memory_cache split_page_header_cache; /* * Memory cache used to allocate pte_list_desc structs while splitting * huge pages. In the worst case, to split one huge page, 512 * pte_list_desc structs are needed to add each lower level leaf sptep * to the rmap plus 1 to extend the parent_ptes rmap of the lower level * page table. * * Protected by kvm->slots_lock. */ #define SPLIT_DESC_CACHE_MIN_NR_OBJECTS (SPTE_ENT_PER_PAGE + 1) struct kvm_mmu_memory_cache split_desc_cache; }; struct kvm_vm_stat { struct kvm_vm_stat_generic generic; u64 mmu_shadow_zapped; u64 mmu_pte_write; u64 mmu_pde_zapped; u64 mmu_flooded; u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; union { struct { atomic64_t pages_4k; atomic64_t pages_2m; atomic64_t pages_1g; }; atomic64_t pages[KVM_NR_PAGE_SIZES]; }; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { struct kvm_vcpu_stat_generic generic; u64 pf_taken; u64 pf_fixed; u64 pf_emulate; u64 pf_spurious; u64 pf_fast; u64 pf_mmio_spte_created; u64 pf_guest; u64 tlb_flush; u64 invlpg; u64 exits; u64 io_exits; u64 mmio_exits; u64 signal_exits; u64 irq_window_exits; u64 nmi_window_exits; u64 l1d_flush; u64 halt_exits; u64 request_irq_exits; u64 irq_exits; u64 host_state_reload; u64 fpu_reload; u64 insn_emulation; u64 insn_emulation_fail; u64 hypercalls; u64 irq_injections; u64 nmi_injections; u64 req_event; u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; u64 preemption_reported; u64 preemption_other; u64 guest_mode; u64 notify_window_exits; }; struct x86_instruction_info; struct msr_data { bool host_initiated; u32 index; u64 data; }; struct kvm_lapic_irq { u32 vector; u16 delivery_mode; u16 dest_mode; bool level; u16 trig_mode; u32 shorthand; u32 dest_id; bool msi_redir_hint; }; static inline u16 kvm_lapic_irq_dest_mode(bool dest_mode_logical) { return dest_mode_logical ? 
APIC_DEST_LOGICAL : APIC_DEST_PHYSICAL; } struct kvm_x86_ops { const char *name; int (*check_processor_compatibility)(void); int (*hardware_enable)(void); void (*hardware_disable)(void); void (*hardware_unsetup)(void); bool (*has_emulated_msr)(struct kvm *kvm, u32 index); void (*vcpu_after_set_cpuid)(struct kvm_vcpu *vcpu); unsigned int vm_size; int (*vm_init)(struct kvm *kvm); void (*vm_destroy)(struct kvm *kvm); /* Create, but do not attach this VCPU */ int (*vcpu_precreate)(struct kvm *kvm); int (*vcpu_create)(struct kvm_vcpu *vcpu); void (*vcpu_free)(struct kvm_vcpu *vcpu); void (*vcpu_reset)(struct kvm_vcpu *vcpu, bool init_event); void (*prepare_switch_to_guest)(struct kvm_vcpu *vcpu); void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); void (*update_exception_bitmap)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); void (*get_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int (*get_cpl)(struct kvm_vcpu *vcpu); void (*set_segment)(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); bool (*is_valid_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); void (*post_set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); bool (*is_valid_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4); int (*set_efer)(struct kvm_vcpu *vcpu, u64 efer); void (*get_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_idt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*get_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*set_gdt)(struct kvm_vcpu *vcpu, struct desc_ptr *dt); void (*sync_dirty_debug_regs)(struct kvm_vcpu *vcpu); void (*set_dr7)(struct kvm_vcpu *vcpu, unsigned long value); void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); bool (*get_if_flag)(struct kvm_vcpu *vcpu); void (*flush_tlb_all)(struct kvm_vcpu *vcpu); void (*flush_tlb_current)(struct kvm_vcpu *vcpu); #if IS_ENABLED(CONFIG_HYPERV) int (*flush_remote_tlbs)(struct kvm *kvm); int (*flush_remote_tlbs_range)(struct kvm *kvm, gfn_t gfn, gfn_t nr_pages); #endif /* * Flush any TLB entries associated with the given GVA. * Does not need to flush GPA->HPA mappings. * Can potentially get non-canonical addresses through INVLPGs, which * the implementation may choose to ignore if appropriate. */ void (*flush_tlb_gva)(struct kvm_vcpu *vcpu, gva_t addr); /* * Flush any TLB entries created by the guest. Like tlb_flush_gva(), * does not need to flush GPA->HPA mappings. 
*/ void (*flush_tlb_guest)(struct kvm_vcpu *vcpu); int (*vcpu_pre_run)(struct kvm_vcpu *vcpu); enum exit_fastpath_completion (*vcpu_run)(struct kvm_vcpu *vcpu, bool force_immediate_exit); int (*handle_exit)(struct kvm_vcpu *vcpu, enum exit_fastpath_completion exit_fastpath); int (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); void (*update_emulated_instruction)(struct kvm_vcpu *vcpu); void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu); void (*patch_hypercall)(struct kvm_vcpu *vcpu, unsigned char *hypercall_addr); void (*inject_irq)(struct kvm_vcpu *vcpu, bool reinjected); void (*inject_nmi)(struct kvm_vcpu *vcpu); void (*inject_exception)(struct kvm_vcpu *vcpu); void (*cancel_injection)(struct kvm_vcpu *vcpu); int (*interrupt_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*nmi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked); /* Whether or not a virtual NMI is pending in hardware. */ bool (*is_vnmi_pending)(struct kvm_vcpu *vcpu); /* * Attempt to pend a virtual NMI in hardware. Returns %true on success * to allow using static_call_ret0 as the fallback. */ bool (*set_vnmi_pending)(struct kvm_vcpu *vcpu); void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(enum kvm_apicv_inhibit reason); const unsigned long required_apicv_inhibits; bool allow_apicv_in_x2apic_without_x2apic_virtualization; void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(int isr); bool (*guest_apic_has_interrupt)(struct kvm_vcpu *vcpu); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_apic_mode)(struct kvm_vcpu *vcpu); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu); void (*deliver_interrupt)(struct kvm_lapic *apic, int delivery_mode, int trig_mode, int vector); int (*sync_pir_to_irr)(struct kvm_vcpu *vcpu); int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); int (*set_identity_map_addr)(struct kvm *kvm, u64 ident_addr); u8 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool (*has_wbinvd_exit)(void); u64 (*get_l2_tsc_offset)(struct kvm_vcpu *vcpu); u64 (*get_l2_tsc_multiplier)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_multiplier)(struct kvm_vcpu *vcpu); /* * Retrieve somewhat arbitrary exit information. Intended to * be used only from within tracepoints or error paths. */ void (*get_exit_info)(struct kvm_vcpu *vcpu, u32 *reason, u64 *info1, u64 *info2, u32 *exit_int_info, u32 *exit_int_info_err_code); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage, struct x86_exception *exception); void (*handle_exit_irqoff)(struct kvm_vcpu *vcpu); void (*sched_in)(struct kvm_vcpu *vcpu, int cpu); /* * Size of the CPU's dirty log buffer, i.e. VMX's PML buffer. A zero * value indicates CPU dirty logging is unsupported or disabled. 
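 *
 * For example, VMX reports its 512-entry PML buffer here when PML is
 * enabled, while SVM leaves the size at zero.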
*/ int cpu_dirty_log_size; void (*update_cpu_dirty_logging)(struct kvm_vcpu *vcpu); const struct kvm_x86_nested_ops *nested_ops; void (*vcpu_blocking)(struct kvm_vcpu *vcpu); void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); int (*pi_update_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); void (*pi_start_assignment)(struct kvm *kvm); void (*apicv_pre_state_restore)(struct kvm_vcpu *vcpu); void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); bool (*dy_apicv_has_pending_interrupt)(struct kvm_vcpu *vcpu); int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc, bool *expired); void (*cancel_hv_timer)(struct kvm_vcpu *vcpu); void (*setup_mce)(struct kvm_vcpu *vcpu); #ifdef CONFIG_KVM_SMM int (*smi_allowed)(struct kvm_vcpu *vcpu, bool for_injection); int (*enter_smm)(struct kvm_vcpu *vcpu, union kvm_smram *smram); int (*leave_smm)(struct kvm_vcpu *vcpu, const union kvm_smram *smram); void (*enable_smi_window)(struct kvm_vcpu *vcpu); #endif int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp); int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); void (*guest_memory_reclaimed)(struct kvm *kvm); int (*get_msr_feature)(struct kvm_msr_entry *entry); int (*check_emulate_instruction)(struct kvm_vcpu *vcpu, int emul_type, void *insn, int insn_len); bool (*apic_init_signal_blocked)(struct kvm_vcpu *vcpu); int (*enable_l2_tlb_flush)(struct kvm_vcpu *vcpu); void (*migrate_timers)(struct kvm_vcpu *vcpu); void (*msr_filter_changed)(struct kvm_vcpu *vcpu); int (*complete_emulated_msr)(struct kvm_vcpu *vcpu, int err); void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector); /* * Returns vCPU specific APICv inhibit reasons */ unsigned long (*vcpu_get_apicv_inhibit_reasons)(struct kvm_vcpu *vcpu); gva_t (*get_untagged_addr)(struct kvm_vcpu *vcpu, gva_t gva, unsigned int flags); void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu); }; struct kvm_x86_nested_ops { void (*leave_nested)(struct kvm_vcpu *vcpu); bool (*is_exception_vmexit)(struct kvm_vcpu *vcpu, u8 vector, u32 error_code); int (*check_events)(struct kvm_vcpu *vcpu); bool (*has_events)(struct kvm_vcpu *vcpu); void (*triple_fault)(struct kvm_vcpu *vcpu); int (*get_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, unsigned user_data_size); int (*set_state)(struct kvm_vcpu *vcpu, struct kvm_nested_state __user *user_kvm_nested_state, struct kvm_nested_state *kvm_state); bool (*get_nested_state_pages)(struct kvm_vcpu *vcpu); int (*write_log_dirty)(struct kvm_vcpu *vcpu, gpa_t l2_gpa); int (*enable_evmcs)(struct kvm_vcpu *vcpu, uint16_t *vmcs_version); uint16_t (*get_evmcs_version)(struct kvm_vcpu *vcpu); void (*hv_inject_synthetic_vmexit_post_tlb_flush)(struct kvm_vcpu *vcpu); }; struct kvm_x86_init_ops { int (*hardware_setup)(void); unsigned int (*handle_intel_pt_intr)(void); struct kvm_x86_ops *runtime_ops; struct kvm_pmu_ops *pmu_ops; }; struct kvm_arch_async_pf { u32 token; gfn_t gfn; unsigned long cr3; bool direct_map; }; extern u32 __read_mostly kvm_nr_uret_msrs; extern u64 __read_mostly host_efer; extern bool __read_mostly allow_smaller_maxphyaddr; extern bool __read_mostly enable_apicv; extern struct kvm_x86_ops kvm_x86_ops; #define KVM_X86_OP(func) \ DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops 
*)0)->func)); #define KVM_X86_OP_OPTIONAL KVM_X86_OP #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP #include <asm/kvm-x86-ops.h> int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops); void kvm_x86_vendor_exit(void); #define __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { return __vmalloc(kvm_x86_ops.vm_size, GFP_KERNEL_ACCOUNT | __GFP_ZERO); } #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); #if IS_ENABLED(CONFIG_HYPERV) #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS static inline int kvm_arch_flush_remote_tlbs(struct kvm *kvm) { if (kvm_x86_ops.flush_remote_tlbs && !static_call(kvm_x86_flush_remote_tlbs)(kvm)) return 0; else return -ENOTSUPP; } #define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLBS_RANGE static inline int kvm_arch_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages) { if (!kvm_x86_ops.flush_remote_tlbs_range) return -EOPNOTSUPP; return static_call(kvm_x86_flush_remote_tlbs_range)(kvm, gfn, nr_pages); } #endif /* CONFIG_HYPERV */ enum kvm_intr_type { /* Values are arbitrary, but must be non-zero. */ KVM_HANDLING_IRQ = 1, KVM_HANDLING_NMI, }; /* Enable perf NMI and timer modes to work, and minimise false positives. */ #define kvm_arch_pmi_in_guest(vcpu) \ ((vcpu) && (vcpu)->arch.handling_intr_from_guest && \ (!!in_nmi() == ((vcpu)->arch.handling_intr_from_guest == KVM_HANDLING_NMI))) void __init kvm_mmu_x86_module_init(void); int kvm_mmu_vendor_module_init(void); void kvm_mmu_vendor_module_exit(void); void kvm_mmu_destroy(struct kvm_vcpu *vcpu); int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, struct kvm_memory_slot *slot); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, int target_level); void kvm_mmu_try_split_huge_pages(struct kvm *kvm, const struct kvm_memory_slot *memslot, u64 start, u64 end, int target_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned long kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); struct kvm_irq_mask_notifier { void (*func)(struct kvm_irq_mask_notifier *kimn, bool masked); int irq; struct hlist_node link; }; void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq, struct kvm_irq_mask_notifier *kimn); void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, bool mask); extern bool tdp_enabled; u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* * EMULTYPE_NO_DECODE - Set when re-emulating an instruction (after completing * userspace I/O) to indicate that the emulation context * should be reused as is, i.e. skip initialization of * emulation context, instruction fetch and decode. * * EMULTYPE_TRAP_UD - Set when emulating an intercepted #UD from hardware. 
* Indicates that only select instructions (tagged with * EmulateOnUD) should be emulated (to minimize the emulator * attack surface). See also EMULTYPE_TRAP_UD_FORCED. * * EMULTYPE_SKIP - Set when emulating solely to skip an instruction, i.e. to * decode the instruction length. For use *only* by * kvm_x86_ops.skip_emulated_instruction() implementations if * EMULTYPE_COMPLETE_USER_EXIT is not set. * * EMULTYPE_ALLOW_RETRY_PF - Set when the emulator should resume the guest to * retry native execution under certain conditions, * Can only be set in conjunction with EMULTYPE_PF. * * EMULTYPE_TRAP_UD_FORCED - Set when emulating an intercepted #UD that was * triggered by KVM's magic "force emulation" prefix, * which is opt in via module param (off by default). * Bypasses EmulateOnUD restriction despite emulating * due to an intercepted #UD (see EMULTYPE_TRAP_UD). * Used to test the full emulator from userspace. * * EMULTYPE_VMWARE_GP - Set when emulating an intercepted #GP for VMware * backdoor emulation, which is opt in via module param. * VMware backdoor emulation handles select instructions * and reinjects the #GP for all other cases. * * EMULTYPE_PF - Set when emulating MMIO by way of an intercepted #PF, in which * case the CR2/GPA value pass on the stack is valid. * * EMULTYPE_COMPLETE_USER_EXIT - Set when the emulator should update interruptibility * state and inject single-step #DBs after skipping * an instruction (after completing userspace I/O). * * EMULTYPE_WRITE_PF_TO_SP - Set when emulating an intercepted page fault that * is attempting to write a gfn that contains one or * more of the PTEs used to translate the write itself, * and the owning page table is being shadowed by KVM. * If emulation of the faulting instruction fails and * this flag is set, KVM will exit to userspace instead * of retrying emulation as KVM cannot make forward * progress. * * If emulation fails for a write to guest page tables, * KVM unprotects (zaps) the shadow page for the target * gfn and resumes the guest to retry the non-emulatable * instruction (on hardware). Unprotecting the gfn * doesn't allow forward progress for a self-changing * access because doing so also zaps the translation for * the gfn, i.e. retrying the instruction will hit a * !PRESENT fault, which results in a new shadow page * and sends KVM back to square one. 
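 *
 * As a rough example of how these flags combine, an MMIO-induced #PF is
 * typically emulated with EMULTYPE_PF, plus EMULTYPE_ALLOW_RETRY_PF when
 * re-executing the instruction natively might succeed.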
*/ #define EMULTYPE_NO_DECODE (1 << 0) #define EMULTYPE_TRAP_UD (1 << 1) #define EMULTYPE_SKIP (1 << 2) #define EMULTYPE_ALLOW_RETRY_PF (1 << 3) #define EMULTYPE_TRAP_UD_FORCED (1 << 4) #define EMULTYPE_VMWARE_GP (1 << 5) #define EMULTYPE_PF (1 << 6) #define EMULTYPE_COMPLETE_USER_EXIT (1 << 7) #define EMULTYPE_WRITE_PF_TO_SP (1 << 8) int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type); int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, void *insn, int insn_len); void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, u8 ndata); void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu); void kvm_enable_efer_bits(u64); bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer); int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, bool host_initiated); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data); int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu); int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu); int kvm_emulate_as_nop(struct kvm_vcpu *vcpu); int kvm_emulate_invd(struct kvm_vcpu *vcpu); int kvm_emulate_mwait(struct kvm_vcpu *vcpu); int kvm_handle_invalid_op(struct kvm_vcpu *vcpu); int kvm_emulate_monitor(struct kvm_vcpu *vcpu); int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in); int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu); int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector); int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0); void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr); unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw); int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu); int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr); unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu); void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu); void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr, unsigned long payload); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
*fault); bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr); static inline int __kvm_irq_line_state(unsigned long *irq_state, int irq_source_id, int level) { /* Logical OR for level trig interrupt */ if (level) __set_bit(irq_source_id, irq_state); else __clear_bit(irq_source_id, irq_state); return !!(*irq_state); } int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level); void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id); void kvm_inject_nmi(struct kvm_vcpu *vcpu); int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu); void kvm_update_dr7(struct kvm_vcpu *vcpu); int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn); void kvm_mmu_free_roots(struct kvm *kvm, struct kvm_mmu *mmu, ulong roots_to_free); void kvm_mmu_free_guest_mode_roots(struct kvm *kvm, struct kvm_mmu *mmu); gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception); bool kvm_apicv_activated(struct kvm *kvm); bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu); void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason, bool set); static inline void kvm_set_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason) { kvm_set_or_clear_apicv_inhibit(kvm, reason, true); } static inline void kvm_clear_apicv_inhibit(struct kvm *kvm, enum kvm_apicv_inhibit reason) { kvm_set_or_clear_apicv_inhibit(kvm, reason, false); } int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, u64 addr, unsigned long roots); void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, int tdp_max_root_level, int tdp_huge_page_level); #ifdef CONFIG_KVM_PRIVATE_MEM #define kvm_arch_has_private_mem(kvm) ((kvm)->arch.vm_type != KVM_X86_DEFAULT_VM) #else #define kvm_arch_has_private_mem(kvm) false #endif static inline u16 kvm_read_ldt(void) { u16 ldt; asm("sldt %0" : "=g"(ldt)); return ldt; } static inline void kvm_load_ldt(u16 sel) { asm("lldt %0" : : "rm"(sel)); } #ifdef CONFIG_X86_64 static inline unsigned long read_msr(unsigned long msr) { u64 value; rdmsrl(msr, value); return value; } #endif static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); } #define TSS_IOPB_BASE_OFFSET 0x66 #define TSS_BASE_SIZE 0x68 #define TSS_IOPB_SIZE (65536 / 8) #define TSS_REDIRECTION_SIZE (256 / 8) #define RMODE_TSS_SIZE \ (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1) enum { TASK_SWITCH_CALL = 0, TASK_SWITCH_IRET = 1, TASK_SWITCH_JMP = 2, TASK_SWITCH_GATE = 3, }; #define HF_GUEST_MASK (1 << 0) /* VCPU is in guest-mode */ #ifdef CONFIG_KVM_SMM #define HF_SMM_MASK (1 << 1) #define HF_SMM_INSIDE_NMI_MASK (1 << 2) # define KVM_MAX_NR_ADDRESS_SPACES 2 /* SMM is currently unsupported for guests with private memory. 
*/ # define kvm_arch_nr_memslot_as_ids(kvm) (kvm_arch_has_private_mem(kvm) ? 1 : 2) # define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) #else # define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, 0) #endif int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); int kvm_cpu_has_extint(struct kvm_vcpu *v); int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); int kvm_cpu_get_interrupt(struct kvm_vcpu *v); void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low, unsigned long ipi_bitmap_high, u32 min, unsigned long icr, int op_64_bit); int kvm_add_user_return_msr(u32 msr); int kvm_find_user_return_msr(u32 msr); int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask); static inline bool kvm_is_supported_user_return_msr(u32 msr) { return kvm_find_user_return_msr(msr) >= 0; } u64 kvm_scale_tsc(u64 tsc, u64 ratio); u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier); u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier); unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); void kvm_make_scan_ioapic_request(struct kvm *kvm); void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, unsigned long *vcpu_bitmap); bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work); void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu); bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); void __user *__x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size); bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu); bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu); bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, struct kvm_vcpu **dest_vcpu); void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); static inline bool kvm_irq_is_postable(struct kvm_lapic_irq *irq) { /* We can only post Fixed and LowPrio IRQs */ return (irq->delivery_mode == APIC_DM_FIXED || irq->delivery_mode == APIC_DM_LOWEST); } static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) { static_call_cond(kvm_x86_vcpu_blocking)(vcpu); } static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) { static_call_cond(kvm_x86_vcpu_unblocking)(vcpu); } static inline int kvm_cpu_get_apicid(int mps_cpu) { #ifdef CONFIG_X86_LOCAL_APIC return default_cpu_present_to_apicid(mps_cpu); #else WARN_ON_ONCE(1); return BAD_APICID; #endif } int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); #define KVM_CLOCK_VALID_FLAGS \ (KVM_CLOCK_TSC_STABLE | KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC) #define KVM_X86_VALID_QUIRKS \ (KVM_X86_QUIRK_LINT0_REENABLED | \ KVM_X86_QUIRK_CD_NW_CLEARED | \ KVM_X86_QUIRK_LAPIC_MMIO_HOLE | \ KVM_X86_QUIRK_OUT_7E_INC_RIP | \ KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT | \ KVM_X86_QUIRK_FIX_HYPERCALL_INSN | \ 
	 KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS)

/*
 * KVM previously used a u32 field in kvm_run to indicate the hypercall was
 * initiated from long mode.  KVM now sets bit 0 to indicate long mode, but the
 * remaining 31 lower bits must be 0 to preserve ABI.
 */
#define KVM_EXIT_HYPERCALL_MBZ	GENMASK_ULL(31, 1)

#endif /* _ASM_X86_KVM_HOST_H */
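/*
 * Illustrative sketch (assumption, not part of the header above): one way the
 * EMULTYPE_* flags documented earlier could be combined by a caller that
 * re-enters the emulator for an intercepted #PF.  The function name
 * example_emulate_mmio_fault() is invented for illustration;
 * kvm_emulate_instruction() is declared in the header above.
 */
static int example_emulate_mmio_fault(struct kvm_vcpu *vcpu)
{
	/*
	 * EMULTYPE_PF marks the emulation as #PF-driven (the CR2/GPA value is
	 * valid); EMULTYPE_ALLOW_RETRY_PF lets the emulator bail out and retry
	 * native execution under the conditions described in the comment block.
	 */
	int emulation_type = EMULTYPE_PF | EMULTYPE_ALLOW_RETRY_PF;

	return kvm_emulate_instruction(vcpu, emulation_type);
}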
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/syscalls.h>
#include <linux/fdtable.h>
#include <linux/string.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/cache.h>
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/kcmp.h>
#include <linux/capability.h>
#include <linux/list.h>
#include <linux/eventpoll.h>
#include <linux/file.h>

#include <asm/unistd.h>

/*
 * We don't expose the real in-memory order of objects for security reasons.
 * But the comparison results should still be suitable for sorting, so we
 * obfuscate kernel pointer values and compare the products instead.
 *
 * The obfuscation is done in two steps.  First we xor the kernel pointer with
 * a random value, which puts the pointer into a new position in a reordered
 * space.  Secondly we multiply the xor product by a large odd random number
 * to permute its bits even more (the odd multiplier guarantees that the
 * product is unique even after the high bits are truncated, since any odd
 * number is relatively prime to 2^n).
 *
 * Note also that the obfuscation itself is invisible to userspace and, if
 * needed, it can be changed to an alternate scheme.
 */
static unsigned long cookies[KCMP_TYPES][2] __read_mostly;

static long kptr_obfuscate(long v, int type)
{
	return (v ^ cookies[type][0]) * cookies[type][1];
}

/*
 * 0 - equal, i.e. v1 = v2
 * 1 - less than, i.e. v1 < v2
 * 2 - greater than, i.e. v1 > v2
 * 3 - not equal but ordering unavailable (reserved for future)
 */
static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
{
	long t1, t2;

	t1 = kptr_obfuscate((long)v1, type);
	t2 = kptr_obfuscate((long)v2, type);

	return (t1 < t2) | ((t1 > t2) << 1);
}

/* The caller must have pinned the task */
static struct file *
get_file_raw_ptr(struct task_struct *task, unsigned int idx)
{
	struct file *file;

	rcu_read_lock();
	file = task_lookup_fdget_rcu(task, idx);
	rcu_read_unlock();
	if (file)
		fput(file);

	return file;
}

static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
	if (likely(l2 != l1))
		up_read(l2);
	up_read(l1);
}

static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2)
{
	int err;

	if (l2 > l1)
		swap(l1, l2);

	err = down_read_killable(l1);
	if (!err && likely(l1 != l2)) {
		err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING);
		if (err)
			up_read(l1);
	}

	return err;
}

#ifdef CONFIG_EPOLL
static int kcmp_epoll_target(struct task_struct *task1,
			     struct task_struct *task2,
			     unsigned long idx1,
			     struct kcmp_epoll_slot __user *uslot)
{
	struct file *filp, *filp_epoll, *filp_tgt;
	struct kcmp_epoll_slot slot;

	if (copy_from_user(&slot, uslot, sizeof(slot)))
		return -EFAULT;

	filp = get_file_raw_ptr(task1, idx1);
	if (!filp)
		return -EBADF;

	filp_epoll = fget_task(task2, slot.efd);
	if (!filp_epoll)
		return -EBADF;

	filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
	fput(filp_epoll);

	if (IS_ERR(filp_tgt))
		return PTR_ERR(filp_tgt);

	return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
}
#else
static int kcmp_epoll_target(struct task_struct *task1,
			     struct task_struct *task2,
			     unsigned long idx1,
			     struct kcmp_epoll_slot __user *uslot)
{
	return -EOPNOTSUPP;
}
#endif

SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
		unsigned long, idx1, unsigned long, idx2)
{
	struct task_struct *task1, *task2;
	int ret;

	rcu_read_lock();

	/*
	 * Tasks are looked up in the caller's PID namespace only.
	 */
	task1 = find_task_by_vpid(pid1);
	task2 = find_task_by_vpid(pid2);
	if (!task1 || !task2)
		goto err_no_task;

	get_task_struct(task1);
	get_task_struct(task2);

	rcu_read_unlock();

	/*
	 * One should have enough rights to inspect task details.
	 */
	ret = kcmp_lock(&task1->signal->exec_update_lock,
			&task2->signal->exec_update_lock);
	if (ret)
		goto err;
	if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) ||
	    !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) {
		ret = -EPERM;
		goto err_unlock;
	}

	switch (type) {
	case KCMP_FILE: {
		struct file *filp1, *filp2;

		filp1 = get_file_raw_ptr(task1, idx1);
		filp2 = get_file_raw_ptr(task2, idx2);

		if (filp1 && filp2)
			ret = kcmp_ptr(filp1, filp2, KCMP_FILE);
		else
			ret = -EBADF;
		break;
	}
	case KCMP_VM:
		ret = kcmp_ptr(task1->mm, task2->mm, KCMP_VM);
		break;
	case KCMP_FILES:
		ret = kcmp_ptr(task1->files, task2->files, KCMP_FILES);
		break;
	case KCMP_FS:
		ret = kcmp_ptr(task1->fs, task2->fs, KCMP_FS);
		break;
	case KCMP_SIGHAND:
		ret = kcmp_ptr(task1->sighand, task2->sighand, KCMP_SIGHAND);
		break;
	case KCMP_IO:
		ret = kcmp_ptr(task1->io_context, task2->io_context, KCMP_IO);
		break;
	case KCMP_SYSVSEM:
#ifdef CONFIG_SYSVIPC
		ret = kcmp_ptr(task1->sysvsem.undo_list,
			       task2->sysvsem.undo_list,
			       KCMP_SYSVSEM);
#else
		ret = -EOPNOTSUPP;
#endif
		break;
	case KCMP_EPOLL_TFD:
		ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
		break;
	default:
		ret = -EINVAL;
		break;
	}

err_unlock:
	kcmp_unlock(&task1->signal->exec_update_lock,
		    &task2->signal->exec_update_lock);
err:
	put_task_struct(task1);
	put_task_struct(task2);

	return ret;

err_no_task:
	rcu_read_unlock();
	return -ESRCH;
}

static __init int kcmp_cookies_init(void)
{
	int i;

	get_random_bytes(cookies, sizeof(cookies));

	for (i = 0; i < KCMP_TYPES; i++)
		cookies[i][1] |= (~(~0UL >> 1) | 1);

	return 0;
}
arch_initcall(kcmp_cookies_init);
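/*
 * Usage sketch (userspace, assumption: not part of kcmp.c above): exercising
 * the kcmp() system call and the 0/1/2 result encoding documented at
 * kcmp_ptr().  Assumes a libc without a dedicated wrapper, hence the raw
 * syscall(2); the helper name kcmp_same_files() is invented for illustration.
 */
#include <linux/kcmp.h>
#include <sys/syscall.h>
#include <unistd.h>

static int kcmp_same_files(pid_t pid1, pid_t pid2)
{
	long ret = syscall(SYS_kcmp, pid1, pid2, KCMP_FILES, 0, 0);

	if (ret < 0)
		return -1;		/* ESRCH, EPERM, EINVAL, ... */
	return ret == 0;		/* 0 => same files_struct; 1/2 => ordered */
}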
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_log_format.h"
#include "xfs_bit.h"

/*
 * XFS bit manipulation routines, used in non-realtime code.
 */

/*
 * Return whether bitmap is empty.
 * Size is number of words in the bitmap, which is padded to word boundary
 * Returns 1 for empty, 0 for non-empty.
 */
int
xfs_bitmap_empty(uint *map, uint size)
{
	uint i;

	for (i = 0; i < size; i++) {
		if (map[i] != 0)
			return 0;
	}

	return 1;
}

/*
 * Count the number of contiguous bits set in the bitmap starting with bit
 * start_bit.  Size is the size of the bitmap in words.
 */
int
xfs_contig_bits(uint *map, uint size, uint start_bit)
{
	uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
	uint result = 0;
	uint tmp;

	size <<= BIT_TO_WORD_SHIFT;

	ASSERT(start_bit < size);
	size -= start_bit & ~(NBWORD - 1);
	start_bit &= (NBWORD - 1);
	if (start_bit) {
		tmp = *p++;
		/* set to one first offset bits prior to start */
		tmp |= (~0U >> (NBWORD-start_bit));
		if (tmp != ~0U)
			goto found;
		result += NBWORD;
		size -= NBWORD;
	}
	while (size) {
		if ((tmp = *p++) != ~0U)
			goto found;
		result += NBWORD;
		size -= NBWORD;
	}
	return result - start_bit;
found:
	return result + ffz(tmp) - start_bit;
}

/*
 * This takes the bit number to start looking from and
 * returns the next set bit from there.  It returns -1
 * if there are no more bits set or the start bit is
 * beyond the end of the bitmap.
 *
 * Size is the number of words, not bytes, in the bitmap.
 */
int
xfs_next_bit(uint *map, uint size, uint start_bit)
{
	uint * p = ((unsigned int *) map) + (start_bit >> BIT_TO_WORD_SHIFT);
	uint result = start_bit & ~(NBWORD - 1);
	uint tmp;

	size <<= BIT_TO_WORD_SHIFT;

	if (start_bit >= size)
		return -1;
	size -= result;
	start_bit &= (NBWORD - 1);
	if (start_bit) {
		tmp = *p++;
		/* set to zero first offset bits prior to start */
		tmp &= (~0U << start_bit);
		if (tmp != 0U)
			goto found;
		result += NBWORD;
		size -= NBWORD;
	}
	while (size) {
		if ((tmp = *p++) != 0U)
			goto found;
		result += NBWORD;
		size -= NBWORD;
	}
	return -1;
found:
	return result + ffs(tmp) - 1;
}
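/*
 * Illustrative sketch (assumption, not part of xfs_bit.c above): iterating
 * over every set bit of a word-based bitmap with xfs_next_bit(), whose "size"
 * argument is in words and which returns -1 when no further bits are set.
 * The function name example_walk_set_bits() is invented for illustration.
 */
static void example_walk_set_bits(uint *map, uint nwords)
{
	int bit;

	for (bit = xfs_next_bit(map, nwords, 0);
	     bit >= 0;
	     bit = xfs_next_bit(map, nwords, bit + 1)) {
		/* ... process "bit" (e.g. pair with xfs_contig_bits()) ... */
	}
}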
1 1 1 1 1 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 // SPDX-License-Identifier: GPL-2.0-or-later /* * sch_plug.c Queue traffic until an explicit release command * * There are two ways to use this qdisc: * 1. A simple "instantaneous" plug/unplug operation, by issuing an alternating * sequence of TCQ_PLUG_BUFFER & TCQ_PLUG_RELEASE_INDEFINITE commands. * * 2. For network output buffering (a.k.a output commit) functionality. * Output commit property is commonly used by applications using checkpoint * based fault-tolerance to ensure that the checkpoint from which a system * is being restored is consistent w.r.t outside world. * * Consider for e.g. Remus - a Virtual Machine checkpointing system, * wherein a VM is checkpointed, say every 50ms. The checkpoint is replicated * asynchronously to the backup host, while the VM continues executing the * next epoch speculatively. * * The following is a typical sequence of output buffer operations: * 1.At epoch i, start_buffer(i) * 2. At end of epoch i (i.e. after 50ms): * 2.1 Stop VM and take checkpoint(i). * 2.2 start_buffer(i+1) and Resume VM * 3. While speculatively executing epoch(i+1), asynchronously replicate * checkpoint(i) to backup host. * 4. When checkpoint_ack(i) is received from backup, release_buffer(i) * Thus, this Qdisc would receive the following sequence of commands: * TCQ_PLUG_BUFFER (epoch i) * .. TCQ_PLUG_BUFFER (epoch i+1) * ....TCQ_PLUG_RELEASE_ONE (epoch i) * ......TCQ_PLUG_BUFFER (epoch i+2) * ........ */ #include <linux/module.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/pkt_sched.h> /* * State of the queue, when used for network output buffering: * * plug(i+1) plug(i) head * ------------------+--------------------+----------------> * | | * | | * pkts_current_epoch| pkts_last_epoch |pkts_to_release * ----------------->|<--------+--------->|+---------------> * v v * */ struct plug_sched_data { /* If true, the dequeue function releases all packets * from head to end of the queue. The queue turns into * a pass-through queue for newly arriving packets. */ bool unplug_indefinite; bool throttled; /* Queue Limit in bytes */ u32 limit; /* Number of packets (output) from the current speculatively * executing epoch. */ u32 pkts_current_epoch; /* Number of packets corresponding to the recently finished * epoch. These will be released when we receive a * TCQ_PLUG_RELEASE_ONE command. This command is typically * issued after committing a checkpoint at the target. */ u32 pkts_last_epoch; /* * Number of packets from the head of the queue, that can * be released (committed checkpoint). 
*/ u32 pkts_to_release; }; static int plug_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct plug_sched_data *q = qdisc_priv(sch); if (likely(sch->qstats.backlog + skb->len <= q->limit)) { if (!q->unplug_indefinite) q->pkts_current_epoch++; return qdisc_enqueue_tail(skb, sch); } return qdisc_drop(skb, sch, to_free); } static struct sk_buff *plug_dequeue(struct Qdisc *sch) { struct plug_sched_data *q = qdisc_priv(sch); if (q->throttled) return NULL; if (!q->unplug_indefinite) { if (!q->pkts_to_release) { /* No more packets to dequeue. Block the queue * and wait for the next release command. */ q->throttled = true; return NULL; } q->pkts_to_release--; } return qdisc_dequeue_head(sch); } static int plug_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); q->pkts_current_epoch = 0; q->pkts_last_epoch = 0; q->pkts_to_release = 0; q->unplug_indefinite = false; if (opt == NULL) { q->limit = qdisc_dev(sch)->tx_queue_len * psched_mtu(qdisc_dev(sch)); } else { struct tc_plug_qopt *ctl = nla_data(opt); if (nla_len(opt) < sizeof(*ctl)) return -EINVAL; q->limit = ctl->limit; } q->throttled = true; return 0; } /* Receives 4 types of messages: * TCQ_PLUG_BUFFER: Inset a plug into the queue and * buffer any incoming packets * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head * to beginning of the next plug. * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. * Stop buffering packets until the next TCQ_PLUG_BUFFER * command is received (just act as a pass-thru queue). * TCQ_PLUG_LIMIT: Increase/decrease queue size */ static int plug_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct plug_sched_data *q = qdisc_priv(sch); struct tc_plug_qopt *msg; msg = nla_data(opt); if (nla_len(opt) < sizeof(*msg)) return -EINVAL; switch (msg->action) { case TCQ_PLUG_BUFFER: /* Save size of the current buffer */ q->pkts_last_epoch = q->pkts_current_epoch; q->pkts_current_epoch = 0; if (q->unplug_indefinite) q->throttled = true; q->unplug_indefinite = false; break; case TCQ_PLUG_RELEASE_ONE: /* Add packets from the last complete buffer to the * packets to be released set. */ q->pkts_to_release += q->pkts_last_epoch; q->pkts_last_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_RELEASE_INDEFINITE: q->unplug_indefinite = true; q->pkts_to_release = 0; q->pkts_last_epoch = 0; q->pkts_current_epoch = 0; q->throttled = false; netif_schedule_queue(sch->dev_queue); break; case TCQ_PLUG_LIMIT: /* Limit is supplied in bytes */ q->limit = msg->limit; break; default: return -EINVAL; } return 0; } static struct Qdisc_ops plug_qdisc_ops __read_mostly = { .id = "plug", .priv_size = sizeof(struct plug_sched_data), .enqueue = plug_enqueue, .dequeue = plug_dequeue, .peek = qdisc_peek_dequeued, .init = plug_init, .change = plug_change, .reset = qdisc_reset_queue, .owner = THIS_MODULE, }; MODULE_ALIAS_NET_SCH("plug"); static int __init plug_module_init(void) { return register_qdisc(&plug_qdisc_ops); } static void __exit plug_module_exit(void) { unregister_qdisc(&plug_qdisc_ops); } module_init(plug_module_init) module_exit(plug_module_exit) MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Qdisc to plug and unplug traffic via netlink control");
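/*
 * Illustrative sketch (userspace control side, assumption: the netlink
 * plumbing that wraps the option into a TCA_OPTIONS attribute is omitted):
 * the tc_plug_qopt payload that plug_change() above parses for each step of
 * the Remus-style sequence described in the header comment.  TCQ_PLUG_* and
 * struct tc_plug_qopt come from <linux/pkt_sched.h>; make_plug_opt() is an
 * invented helper name.
 */
#include <linux/pkt_sched.h>

static struct tc_plug_qopt make_plug_opt(int action, __u32 limit_bytes)
{
	struct tc_plug_qopt opt = {
		.action = action,	/* e.g. TCQ_PLUG_BUFFER at epoch start,
					 * TCQ_PLUG_RELEASE_ONE once the
					 * checkpoint is acknowledged */
		.limit	= limit_bytes,	/* only used for TCQ_PLUG_LIMIT */
	};

	return opt;
}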
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM compaction #if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_COMPACTION_H #include <linux/types.h> #include <linux/list.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> DECLARE_EVENT_CLASS(mm_compaction_isolate_template, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken), TP_STRUCT__entry( __field(unsigned long, start_pfn) __field(unsigned long, end_pfn) __field(unsigned long, nr_scanned) __field(unsigned long, nr_taken) ), TP_fast_assign( __entry->start_pfn = start_pfn; __entry->end_pfn = end_pfn; __entry->nr_scanned = nr_scanned; __entry->nr_taken = nr_taken; ), TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu", __entry->start_pfn, __entry->end_pfn, __entry->nr_scanned, __entry->nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_fast_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned int nr_migratepages, unsigned int nr_succeeded), TP_ARGS(nr_migratepages, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) __field(unsigned long, nr_failed) ), TP_fast_assign( __entry->nr_migrated = nr_succeeded; __entry->nr_failed = nr_migratepages - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", __entry->nr_migrated, __entry->nr_failed) ); TRACE_EVENT(mm_compaction_begin, TP_PROTO(struct compact_control *cc, unsigned long 
zone_start, unsigned long zone_end, bool sync), TP_ARGS(cc, zone_start, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = cc->migrate_pfn; __entry->free_pfn = cc->free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async") ); TRACE_EVENT(mm_compaction_end, TP_PROTO(struct compact_control *cc, unsigned long zone_start, unsigned long zone_end, bool sync, int status), TP_ARGS(cc, zone_start, zone_end, sync, status), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) __field(int, status) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = cc->migrate_pfn; __entry->free_pfn = cc->free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; __entry->status = status; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); TRACE_EVENT(mm_compaction_try_to_compact_pages, TP_PROTO( int order, gfp_t gfp_mask, int prio), TP_ARGS(order, gfp_mask, prio), TP_STRUCT__entry( __field(int, order) __field(unsigned long, gfp_mask) __field(int, prio) ), TP_fast_assign( __entry->order = order; __entry->gfp_mask = (__force unsigned long)gfp_mask; __entry->prio = prio; ), TP_printk("order=%d gfp_mask=%s priority=%d", __entry->order, show_gfp_flags(__entry->gfp_mask), __entry->prio) ); DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(int, ret) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->ret = ret; ), TP_printk("node=%d zone=%-8s order=%d ret=%s", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(unsigned int, considered) __field(unsigned int, defer_shift) __field(int, order_failed) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->considered = zone->compact_considered; __entry->defer_shift = zone->compact_defer_shift; __entry->order_failed = zone->compact_order_failed; ), TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __entry->order_failed, __entry->considered, 1UL 
<< __entry->defer_shift) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); TRACE_EVENT(mm_compaction_kcompactd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field(int, nid) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); DECLARE_EVENT_CLASS(kcompactd_wake_template, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->highest_zoneidx = highest_zoneidx; ), /* * classzone_idx is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); #endif #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
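/*
 * Illustrative sketch (assumption, not taken from mm/compaction.c): how the
 * tracepoints declared above are emitted from kernel code that includes this
 * header (with CREATE_TRACE_POINTS defined in exactly one translation unit).
 * Each TRACE_EVENT()/DEFINE_EVENT() expands to a trace_<name>() helper taking
 * the TP_PROTO() arguments; example_trace_kcompactd() is an invented wrapper.
 */
static void example_trace_kcompactd(int nid, int order,
				    enum zone_type highest_zoneidx)
{
	trace_mm_compaction_wakeup_kcompactd(nid, order, highest_zoneidx);
	trace_mm_compaction_kcompactd_wake(nid, order, highest_zoneidx);
	trace_mm_compaction_kcompactd_sleep(nid);
}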
633 1057 1210 7 70 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 /* SPDX-License-Identifier: GPL-2.0 */ /* * This header is used to share core functionality between the * standalone connection tracking module, and the compatibility layer's use * of connection tracking. * * 16 Dec 2003: Yasuyuki Kozakai @USAGI <yasuyuki.kozakai@toshiba.co.jp> * - generalize L3 protocol dependent part. * * Derived from include/linux/netfiter_ipv4/ip_conntrack_core.h */ #ifndef _NF_CONNTRACK_CORE_H #define _NF_CONNTRACK_CORE_H #include <linux/netfilter.h> #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_conntrack_l4proto.h> /* This header is used to share core functionality between the standalone connection tracking module, and the compatibility layer's use of connection tracking. */ unsigned int nf_conntrack_in(struct sk_buff *skb, const struct nf_hook_state *state); int nf_conntrack_init_net(struct net *net); void nf_conntrack_cleanup_net(struct net *net); void nf_conntrack_cleanup_net_list(struct list_head *net_exit_list); void nf_conntrack_proto_pernet_init(struct net *net); int nf_conntrack_proto_init(void); void nf_conntrack_proto_fini(void); int nf_conntrack_init_start(void); void nf_conntrack_cleanup_start(void); void nf_conntrack_init_end(void); void nf_conntrack_cleanup_end(void); bool nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, const struct nf_conntrack_tuple *orig); /* Find a connection corresponding to a tuple. */ struct nf_conntrack_tuple_hash * nf_conntrack_find_get(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple); int __nf_conntrack_confirm(struct sk_buff *skb); /* Confirm a connection: returns NF_DROP if packet must be dropped. */ static inline int nf_conntrack_confirm(struct sk_buff *skb) { struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb); int ret = NF_ACCEPT; if (ct) { if (!nf_ct_is_confirmed(ct)) { ret = __nf_conntrack_confirm(skb); if (ret == NF_ACCEPT) ct = (struct nf_conn *)skb_nfct(skb); } if (ret == NF_ACCEPT && nf_ct_ecache_exist(ct)) nf_ct_deliver_cached_events(ct); } return ret; } unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); #define CONNTRACK_LOCKS 1024 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS]; void nf_conntrack_lock(spinlock_t *lock); extern spinlock_t nf_conntrack_expect_lock; /* ctnetlink code shared by both ctnetlink and nf_conntrack_bpf */ static inline void __nf_ct_set_timeout(struct nf_conn *ct, u64 timeout) { if (timeout > INT_MAX) timeout = INT_MAX; if (nf_ct_is_confirmed(ct)) WRITE_ONCE(ct->timeout, nfct_time_stamp + (u32)timeout); else ct->timeout = (u32)timeout; } int __nf_ct_change_timeout(struct nf_conn *ct, u64 cta_timeout); void __nf_ct_change_status(struct nf_conn *ct, unsigned long on, unsigned long off); int nf_ct_change_status_common(struct nf_conn *ct, unsigned int status); #endif /* _NF_CONNTRACK_CORE_H */
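/*
 * Illustrative sketch (assumption, not part of the header above): using the
 * __nf_ct_set_timeout() helper declared earlier.  The requested value is
 * clamped to INT_MAX, and for an already-confirmed conntrack it is stored
 * relative to nfct_time_stamp, so the argument is taken to be in jiffies;
 * example_extend_timeout() is an invented name.
 */
static void example_extend_timeout(struct nf_conn *ct)
{
	/* One hour, expressed in jiffies (added to nfct_time_stamp). */
	u64 timeout = 3600ULL * HZ;

	__nf_ct_set_timeout(ct, timeout);
}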
1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 // SPDX-License-Identifier: LGPL-2.1 /* * * Copyright (C) International Business Machines Corp., 2007,2008 * Author(s): Steve French (sfrench@us.ibm.com) * * Contains the routines for mapping CIFS/NTFS ACLs * */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <uapi/linux/posix_acl.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <keys/user-type.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsacl.h" #include "cifsproto.h" #include "cifs_debug.h" #include "fs_context.h" #include "cifs_fs_sb.h" #include "cifs_unicode.h" /* security id for everyone/world system group */ static const struct cifs_sid sid_everyone = { 1, 1, {0, 0, 0, 0, 0, 1}, {0} }; /* security id for Authenticated Users system group */ static const struct cifs_sid sid_authusers = { 1, 1, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(11)} }; /* S-1-22-1 Unmapped Unix users */ static const struct cifs_sid sid_unix_users = {1, 1, {0, 0, 0, 0, 0, 22}, {cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* S-1-22-2 Unmapped Unix groups */ static const struct cifs_sid sid_unix_groups = { 1, 1, {0, 0, 0, 0, 0, 22}, {cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* * See https://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ /* S-1-5-88 MS NFS and Apple style UID/GID/mode */ /* S-1-5-88-1 Unix uid */ static const struct cifs_sid sid_unix_NFS_users = { 1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(88), cpu_to_le32(1), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* S-1-5-88-2 Unix gid */ static const struct cifs_sid sid_unix_NFS_groups = { 1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(88), cpu_to_le32(2), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; /* S-1-5-88-3 Unix mode */ static const struct cifs_sid sid_unix_NFS_mode = { 1, 2, {0, 0, 0, 0, 0, 5}, {cpu_to_le32(88), cpu_to_le32(3), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} }; static const struct cred *root_cred; static int cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { char *payload; /* * If the payload is less than or equal to the size of a pointer, then * an allocation here is wasteful. Just copy the data directly to the * payload.value union member instead. * * With this however, you must check the datalen before trying to * dereference payload.data! 
*/ if (prep->datalen <= sizeof(key->payload)) { key->payload.data[0] = NULL; memcpy(&key->payload, prep->data, prep->datalen); } else { payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) return -ENOMEM; key->payload.data[0] = payload; } key->datalen = prep->datalen; return 0; } static inline void cifs_idmap_key_destroy(struct key *key) { if (key->datalen > sizeof(key->payload)) kfree(key->payload.data[0]); } static struct key_type cifs_idmap_key_type = { .name = "cifs.idmap", .instantiate = cifs_idmap_key_instantiate, .destroy = cifs_idmap_key_destroy, .describe = user_describe, }; static char * sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) { int i, len; unsigned int saval; char *sidstr, *strptr; unsigned long long id_auth_val; /* 3 bytes for prefix */ sidstr = kmalloc(3 + SID_STRING_BASE_SIZE + (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth), GFP_KERNEL); if (!sidstr) return sidstr; strptr = sidstr; len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g', sidptr->revision); strptr += len; /* The authority field is a single 48-bit number */ id_auth_val = (unsigned long long)sidptr->authority[5]; id_auth_val |= (unsigned long long)sidptr->authority[4] << 8; id_auth_val |= (unsigned long long)sidptr->authority[3] << 16; id_auth_val |= (unsigned long long)sidptr->authority[2] << 24; id_auth_val |= (unsigned long long)sidptr->authority[1] << 32; id_auth_val |= (unsigned long long)sidptr->authority[0] << 48; /* * MS-DTYP states that if the authority is >= 2^32, then it should be * expressed as a hex value. */ if (id_auth_val <= UINT_MAX) len = sprintf(strptr, "-%llu", id_auth_val); else len = sprintf(strptr, "-0x%llx", id_auth_val); strptr += len; for (i = 0; i < sidptr->num_subauth; ++i) { saval = le32_to_cpu(sidptr->sub_auth[i]); len = sprintf(strptr, "-%u", saval); strptr += len; } return sidstr; } /* * if the two SIDs (roughly equivalent to a UUID for a user or group) are * the same returns zero, if they do not match returns non-zero. */ static int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) { int i; int num_subauth, num_sat, num_saw; if ((!ctsid) || (!cwsid)) return 1; /* compare the revision */ if (ctsid->revision != cwsid->revision) { if (ctsid->revision > cwsid->revision) return 1; else return -1; } /* compare all of the six auth values */ for (i = 0; i < NUM_AUTHS; ++i) { if (ctsid->authority[i] != cwsid->authority[i]) { if (ctsid->authority[i] > cwsid->authority[i]) return 1; else return -1; } } /* compare all of the subauth values if any */ num_sat = ctsid->num_subauth; num_saw = cwsid->num_subauth; num_subauth = num_sat < num_saw ? num_sat : num_saw; if (num_subauth) { for (i = 0; i < num_subauth; ++i) { if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { if (le32_to_cpu(ctsid->sub_auth[i]) > le32_to_cpu(cwsid->sub_auth[i])) return 1; else return -1; } } } return 0; /* sids compare/match */ } static bool is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) { int i; int num_subauth; const struct cifs_sid *pwell_known_sid; if (!psid || (puid == NULL)) return false; num_subauth = psid->num_subauth; /* check if Mac (or Windows NFS) vs. 
Samba format for Unix owner SID */ if (num_subauth == 2) { if (is_group) pwell_known_sid = &sid_unix_groups; else pwell_known_sid = &sid_unix_users; } else if (num_subauth == 3) { if (is_group) pwell_known_sid = &sid_unix_NFS_groups; else pwell_known_sid = &sid_unix_NFS_users; } else return false; /* compare the revision */ if (psid->revision != pwell_known_sid->revision) return false; /* compare all of the six auth values */ for (i = 0; i < NUM_AUTHS; ++i) { if (psid->authority[i] != pwell_known_sid->authority[i]) { cifs_dbg(FYI, "auth %d did not match\n", i); return false; } } if (num_subauth == 2) { if (psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) return false; *puid = le32_to_cpu(psid->sub_auth[1]); } else /* 3 subauths, ie Windows/Mac style */ { *puid = le32_to_cpu(psid->sub_auth[0]); if ((psid->sub_auth[0] != pwell_known_sid->sub_auth[0]) || (psid->sub_auth[1] != pwell_known_sid->sub_auth[1])) return false; *puid = le32_to_cpu(psid->sub_auth[2]); } cifs_dbg(FYI, "Unix UID %d returned from SID\n", *puid); return true; /* well known sid found, uid returned */ } static __u16 cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { int i; __u16 size = 1 + 1 + 6; dst->revision = src->revision; dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); for (i = 0; i < NUM_AUTHS; ++i) dst->authority[i] = src->authority[i]; for (i = 0; i < dst->num_subauth; ++i) dst->sub_auth[i] = src->sub_auth[i]; size += (dst->num_subauth * 4); return size; } static int id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) { int rc; struct key *sidkey; struct cifs_sid *ksid; unsigned int ksid_size; char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ const struct cred *saved_cred; rc = snprintf(desc, sizeof(desc), "%ci:%u", sidtype == SIDOWNER ? 'o' : 'g', cid); if (rc >= sizeof(desc)) return -EINVAL; rc = 0; saved_cred = override_creds(root_cred); sidkey = request_key(&cifs_idmap_key_type, desc, ""); if (IS_ERR(sidkey)) { rc = -EINVAL; cifs_dbg(FYI, "%s: Can't map %cid %u to a SID\n", __func__, sidtype == SIDOWNER ? 'u' : 'g', cid); goto out_revert_creds; } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { rc = -EIO; cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", __func__, sidkey->datalen); goto invalidate_key; } /* * A sid is usually too large to be embedded in payload.value, but if * there are no subauthorities and the host has 8-byte pointers, then * it could be. */ ksid = sidkey->datalen <= sizeof(sidkey->payload) ? (struct cifs_sid *)&sidkey->payload : (struct cifs_sid *)sidkey->payload.data[0]; ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); if (ksid_size > sidkey->datalen) { rc = -EIO; cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu, ksid_size=%u)\n", __func__, sidkey->datalen, ksid_size); goto invalidate_key; } cifs_copy_sid(ssid, ksid); out_key_put: key_put(sidkey); out_revert_creds: revert_creds(saved_cred); return rc; invalidate_key: key_invalidate(sidkey); goto out_key_put; } int sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, struct cifs_fattr *fattr, uint sidtype) { int rc = 0; struct key *sidkey; char *sidstr; const struct cred *saved_cred; kuid_t fuid = cifs_sb->ctx->linux_uid; kgid_t fgid = cifs_sb->ctx->linux_gid; /* * If we have too many subauthorities, then something is really wrong. * Just return an error. 
*/ if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { cifs_dbg(FYI, "%s: %u subauthorities is too many!\n", __func__, psid->num_subauth); return -EIO; } if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) || (cifs_sb_master_tcon(cifs_sb)->posix_extensions)) { uint32_t unix_id; bool is_group; if (sidtype != SIDOWNER) is_group = true; else is_group = false; if (is_well_known_sid(psid, &unix_id, is_group) == false) goto try_upcall_to_get_id; if (is_group) { kgid_t gid; gid_t id; id = (gid_t)unix_id; gid = make_kgid(&init_user_ns, id); if (gid_valid(gid)) { fgid = gid; goto got_valid_id; } } else { kuid_t uid; uid_t id; id = (uid_t)unix_id; uid = make_kuid(&init_user_ns, id); if (uid_valid(uid)) { fuid = uid; goto got_valid_id; } } /* If unable to find uid/gid easily from SID try via upcall */ } try_upcall_to_get_id: sidstr = sid_to_key_str(psid, sidtype); if (!sidstr) return -ENOMEM; saved_cred = override_creds(root_cred); sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); if (IS_ERR(sidkey)) { cifs_dbg(FYI, "%s: Can't map SID %s to a %cid\n", __func__, sidstr, sidtype == SIDOWNER ? 'u' : 'g'); goto out_revert_creds; } /* * FIXME: Here we assume that uid_t and gid_t are same size. It's * probably a safe assumption but might be better to check based on * sidtype. */ BUILD_BUG_ON(sizeof(uid_t) != sizeof(gid_t)); if (sidkey->datalen != sizeof(uid_t)) { cifs_dbg(FYI, "%s: Downcall contained malformed key (datalen=%hu)\n", __func__, sidkey->datalen); key_invalidate(sidkey); goto out_key_put; } if (sidtype == SIDOWNER) { kuid_t uid; uid_t id; memcpy(&id, &sidkey->payload.data[0], sizeof(uid_t)); uid = make_kuid(&init_user_ns, id); if (uid_valid(uid)) fuid = uid; } else { kgid_t gid; gid_t id; memcpy(&id, &sidkey->payload.data[0], sizeof(gid_t)); gid = make_kgid(&init_user_ns, id); if (gid_valid(gid)) fgid = gid; } out_key_put: key_put(sidkey); out_revert_creds: revert_creds(saved_cred); kfree(sidstr); /* * Note that we return 0 here unconditionally. If the mapping * fails then we just fall back to using the ctx->linux_uid/linux_gid. */ got_valid_id: rc = 0; if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else fattr->cf_gid = fgid; return rc; } int init_cifs_idmap(void) { struct cred *cred; struct key *keyring; int ret; cifs_dbg(FYI, "Registering the %s key type\n", cifs_idmap_key_type.name); /* create an override credential set with a special thread keyring in * which requests are cached * * this is used to prevent malicious redirections from being installed * with add_key(). 
*/ cred = prepare_kernel_cred(&init_task); if (!cred) return -ENOMEM; keyring = keyring_alloc(".cifs_idmap", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, cred, (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ, KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL); if (IS_ERR(keyring)) { ret = PTR_ERR(keyring); goto failed_put_cred; } ret = register_key_type(&cifs_idmap_key_type); if (ret < 0) goto failed_put_key; /* instruct request_key() to use this special keyring as a cache for * the results it looks up */ set_bit(KEY_FLAG_ROOT_CAN_CLEAR, &keyring->flags); cred->thread_keyring = keyring; cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; root_cred = cred; cifs_dbg(FYI, "cifs idmap keyring: %d\n", key_serial(keyring)); return 0; failed_put_key: key_put(keyring); failed_put_cred: put_cred(cred); return ret; } void exit_cifs_idmap(void) { key_revoke(root_cred->thread_keyring); unregister_key_type(&cifs_idmap_key_type); put_cred(root_cred); cifs_dbg(FYI, "Unregistered %s key type\n", cifs_idmap_key_type.name); } /* copy ntsd, owner sid, and group sid from a security descriptor to another */ static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 sidsoffset, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid) { struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; /* copy security descriptor control portion */ pnntsd->revision = pntsd->revision; pnntsd->type = pntsd->type; pnntsd->dacloffset = cpu_to_le32(sizeof(struct cifs_ntsd)); pnntsd->sacloffset = 0; pnntsd->osidoffset = cpu_to_le32(sidsoffset); pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); /* copy owner sid */ if (pownersid) owner_sid_ptr = pownersid; else owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ if (pgrpsid) group_sid_ptr = pgrpsid; else group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); return sidsoffset + (2 * sizeof(struct cifs_sid)); } /* change posix mode to reflect permissions pmode is the existing mode (we only want to overwrite part of this bits to set can be: S_IRWXU, S_IRWXG or S_IRWXO ie 00700 or 00070 or 00007 */ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode, umode_t *pdenied, umode_t mask) { __u32 flags = le32_to_cpu(ace_flags); /* * Do not assume "preferred" or "canonical" order. * The first DENY or ALLOW ACE which matches perfectly is * the permission to be used. Once allowed or denied, same * permission in later ACEs do not matter. 
*/ /* If not already allowed, deny these bits */ if (type == ACCESS_DENIED) { if (flags & GENERIC_ALL && !(*pmode & mask & 0777)) *pdenied |= mask & 0777; if (((flags & GENERIC_WRITE) || ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) && !(*pmode & mask & 0222)) *pdenied |= mask & 0222; if (((flags & GENERIC_READ) || ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) && !(*pmode & mask & 0444)) *pdenied |= mask & 0444; if (((flags & GENERIC_EXECUTE) || ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) && !(*pmode & mask & 0111)) *pdenied |= mask & 0111; return; } else if (type != ACCESS_ALLOWED) { cifs_dbg(VFS, "unknown access control type %d\n", type); return; } /* else ACCESS_ALLOWED type */ if ((flags & GENERIC_ALL) && !(*pdenied & mask & 0777)) { *pmode |= mask & 0777; cifs_dbg(NOISY, "all perms\n"); return; } if (((flags & GENERIC_WRITE) || ((flags & FILE_WRITE_RIGHTS) == FILE_WRITE_RIGHTS)) && !(*pdenied & mask & 0222)) *pmode |= mask & 0222; if (((flags & GENERIC_READ) || ((flags & FILE_READ_RIGHTS) == FILE_READ_RIGHTS)) && !(*pdenied & mask & 0444)) *pmode |= mask & 0444; if (((flags & GENERIC_EXECUTE) || ((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS)) && !(*pdenied & mask & 0111)) *pmode |= mask & 0111; /* If DELETE_CHILD is set only on an owner ACE, set sticky bit */ if (flags & FILE_DELETE_CHILD) { if (mask == ACL_OWNER_MASK) { if (!(*pdenied & 01000)) *pmode |= 01000; } else if (!(*pdenied & 01000)) { *pmode &= ~01000; *pdenied |= 01000; } } cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode); return; } /* Generate access flags to reflect permissions mode is the existing mode. This function is called for every ACE in the DACL whose SID matches with either owner or group or everyone. */ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, __u32 *pace_flags) { /* reset access mask */ *pace_flags = 0x0; /* bits to use are either S_IRWXU or S_IRWXG or S_IRWXO */ mode &= bits_to_use; /* check for R/W/X UGO since we do not know whose flags is this but we have cleared all the bits sans RWX for either user or group or other as per bits_to_use */ if (mode & S_IRUGO) *pace_flags |= SET_FILE_READ_RIGHTS; if (mode & S_IWUGO) *pace_flags |= SET_FILE_WRITE_RIGHTS; if (mode & S_IXUGO) *pace_flags |= SET_FILE_EXEC_RIGHTS; cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n", mode, *pace_flags); return; } static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid) { __u16 size = 1 + 1 + 2 + 4; dst->type = src->type; dst->flags = src->flags; dst->access_req = src->access_req; /* Check if there's a replacement sid specified */ if (psid) size += cifs_copy_sid(&dst->sid, psid); else size += cifs_copy_sid(&dst->sid, &src->sid); dst->size = cpu_to_le16(size); return size; } static __u16 fill_ace_for_sid(struct cifs_ace *pntace, const struct cifs_sid *psid, __u64 nmode, umode_t bits, __u8 access_type, bool allow_delete_child) { int i; __u16 size = 0; __u32 access_req = 0; pntace->type = access_type; pntace->flags = 0x0; mode_to_access_flags(nmode, bits, &access_req); if (access_type == ACCESS_ALLOWED && allow_delete_child) access_req |= FILE_DELETE_CHILD; if (access_type == ACCESS_ALLOWED && !access_req) access_req = SET_MINIMUM_RIGHTS; else if (access_type == ACCESS_DENIED) access_req &= ~SET_MINIMUM_RIGHTS; pntace->access_req = cpu_to_le32(access_req); pntace->sid.revision = psid->revision; pntace->sid.num_subauth = psid->num_subauth; for (i = 0; i < NUM_AUTHS; i++) pntace->sid.authority[i] = psid->authority[i]; for (i = 0; i < 
psid->num_subauth; i++) pntace->sid.sub_auth[i] = psid->sub_auth[i]; size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth * 4); pntace->size = cpu_to_le16(size); return size; } #ifdef CONFIG_CIFS_DEBUG2 static void dump_ace(struct cifs_ace *pace, char *end_of_acl) { int num_subauth; /* validate that we do not go past end of acl */ if (le16_to_cpu(pace->size) < 16) { cifs_dbg(VFS, "ACE too small %d\n", le16_to_cpu(pace->size)); return; } if (end_of_acl < (char *)pace + le16_to_cpu(pace->size)) { cifs_dbg(VFS, "ACL too small to parse ACE\n"); return; } num_subauth = pace->sid.num_subauth; if (num_subauth) { int i; cifs_dbg(FYI, "ACE revision %d num_auth %d type %d flags %d size %d\n", pace->sid.revision, pace->sid.num_subauth, pace->type, pace->flags, le16_to_cpu(pace->size)); for (i = 0; i < num_subauth; ++i) { cifs_dbg(FYI, "ACE sub_auth[%d]: 0x%x\n", i, le32_to_cpu(pace->sid.sub_auth[i])); } /* BB add length check to make sure that we do not have huge num auths and therefore go off the end */ } return; } #endif static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, struct cifs_fattr *fattr, bool mode_from_special_sid) { int i; int num_aces = 0; int acl_size; char *acl_base; struct cifs_ace **ppace; /* BB need to add parm so we can store the SID BB */ if (!pdacl) { /* no DACL in the security descriptor, set all the permissions for user/group/other */ fattr->cf_mode |= 0777; return; } /* validate that we do not go past end of acl */ if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { cifs_dbg(VFS, "ACL too small to parse DACL\n"); return; } cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n", le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), le32_to_cpu(pdacl->num_aces)); /* reset rwx permissions for user/group/other. Also, if num_aces is 0 i.e. 
DACL has no ACEs, user/group/other have no permissions */ fattr->cf_mode &= ~(0777); acl_base = (char *)pdacl; acl_size = sizeof(struct cifs_acl); num_aces = le32_to_cpu(pdacl->num_aces); if (num_aces > 0) { umode_t denied_mode = 0; if (num_aces > ULONG_MAX / sizeof(struct cifs_ace *)) return; ppace = kmalloc_array(num_aces, sizeof(struct cifs_ace *), GFP_KERNEL); if (!ppace) return; for (i = 0; i < num_aces; ++i) { ppace[i] = (struct cifs_ace *) (acl_base + acl_size); #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif if (mode_from_special_sid && (compare_sids(&(ppace[i]->sid), &sid_unix_NFS_mode) == 0)) { /* * Full permissions are: * 07777 = S_ISUID | S_ISGID | S_ISVTX | * S_IRWXU | S_IRWXG | S_IRWXO */ fattr->cf_mode &= ~07777; fattr->cf_mode |= le32_to_cpu(ppace[i]->sid.sub_auth[2]); break; } else { if (compare_sids(&(ppace[i]->sid), pownersid) == 0) { access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &denied_mode, ACL_OWNER_MASK); } else if (compare_sids(&(ppace[i]->sid), pgrpsid) == 0) { access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &denied_mode, ACL_GROUP_MASK); } else if ((compare_sids(&(ppace[i]->sid), &sid_everyone) == 0) || (compare_sids(&(ppace[i]->sid), &sid_authusers) == 0)) { access_flags_to_mode(ppace[i]->access_req, ppace[i]->type, &fattr->cf_mode, &denied_mode, ACL_EVERYONE_MASK); } } /* memcpy((void *)(&(cifscred->aces[i])), (void *)ppace[i], sizeof(struct cifs_ace)); */ acl_base = (char *)ppace[i]; acl_size = le16_to_cpu(ppace[i]->size); } kfree(ppace); } return; } unsigned int setup_authusers_ACE(struct cifs_ace *pntace) { int i; unsigned int ace_size = 20; pntace->type = ACCESS_ALLOWED_ACE_TYPE; pntace->flags = 0x0; pntace->access_req = cpu_to_le32(GENERIC_ALL); pntace->sid.num_subauth = 1; pntace->sid.revision = 1; for (i = 0; i < NUM_AUTHS; i++) pntace->sid.authority[i] = sid_authusers.authority[i]; pntace->sid.sub_auth[0] = sid_authusers.sub_auth[0]; /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ pntace->size = cpu_to_le16(ace_size); return ace_size; } /* * Fill in the special SID based on the mode. 
See * https://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx */ unsigned int setup_special_mode_ACE(struct cifs_ace *pntace, __u64 nmode) { int i; unsigned int ace_size = 28; pntace->type = ACCESS_DENIED_ACE_TYPE; pntace->flags = 0x0; pntace->access_req = 0; pntace->sid.num_subauth = 3; pntace->sid.revision = 1; for (i = 0; i < NUM_AUTHS; i++) pntace->sid.authority[i] = sid_unix_NFS_mode.authority[i]; pntace->sid.sub_auth[0] = sid_unix_NFS_mode.sub_auth[0]; pntace->sid.sub_auth[1] = sid_unix_NFS_mode.sub_auth[1]; pntace->sid.sub_auth[2] = cpu_to_le32(nmode & 07777); /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ pntace->size = cpu_to_le16(ace_size); return ace_size; } unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace) { int i; unsigned int ace_size = 28; pntace->type = ACCESS_ALLOWED_ACE_TYPE; pntace->flags = 0x0; pntace->access_req = cpu_to_le32(GENERIC_ALL); pntace->sid.num_subauth = 3; pntace->sid.revision = 1; for (i = 0; i < NUM_AUTHS; i++) pntace->sid.authority[i] = sid_unix_NFS_users.authority[i]; pntace->sid.sub_auth[0] = sid_unix_NFS_users.sub_auth[0]; pntace->sid.sub_auth[1] = sid_unix_NFS_users.sub_auth[1]; pntace->sid.sub_auth[2] = cpu_to_le32(current_fsgid().val); /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ pntace->size = cpu_to_le16(ace_size); return ace_size; } static void populate_new_aces(char *nacl_base, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, bool modefromsid) { __u64 nmode; u32 num_aces = 0; u16 nsize = 0; __u64 user_mode; __u64 group_mode; __u64 other_mode; __u64 deny_user_mode = 0; __u64 deny_group_mode = 0; bool sticky_set = false; struct cifs_ace *pnntace = NULL; nmode = *pnmode; num_aces = *pnum_aces; nsize = *pnsize; if (modefromsid) { pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += setup_authusers_ACE(pnntace); num_aces++; goto set_size; } /* * We'll try to keep the mode as requested by the user. * But in cases where we cannot meaningfully convert that * into ACL, return back the updated mode, so that it is * updated in the inode. */ if (!memcmp(pownersid, pgrpsid, sizeof(struct cifs_sid))) { /* * Case when owner and group SIDs are the same. * Set the more restrictive of the two modes. */ user_mode = nmode & (nmode << 3) & 0700; group_mode = nmode & (nmode >> 3) & 0070; } else { user_mode = nmode & 0700; group_mode = nmode & 0070; } other_mode = nmode & 0007; /* We need DENY ACE when the perm is more restrictive than the next sets. */ deny_user_mode = ~(user_mode) & ((group_mode << 3) | (other_mode << 6)) & 0700; deny_group_mode = ~(group_mode) & (other_mode << 3) & 0070; *pnmode = user_mode | group_mode | other_mode | (nmode & ~0777); /* This tells if we should allow delete child for group and everyone. */ if (nmode & 01000) sticky_set = true; if (deny_user_mode) { pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode, 0700, ACCESS_DENIED, false); num_aces++; } /* Group DENY ACE does not conflict with owner ALLOW ACE. 
Keep in preferred order*/ if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) { pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); num_aces++; } pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pownersid, user_mode, 0700, ACCESS_ALLOWED, true); num_aces++; /* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */ if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) { pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); num_aces++; } pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode, 0070, ACCESS_ALLOWED, !sticky_set); num_aces++; pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode, 0007, ACCESS_ALLOWED, !sticky_set); num_aces++; set_size: *pnum_aces = num_aces; *pnsize = nsize; } static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid) { int i; u16 size = 0; struct cifs_ace *pntace = NULL; char *acl_base = NULL; u32 src_num_aces = 0; u16 nsize = 0; struct cifs_ace *pnntace = NULL; char *nacl_base = NULL; u16 ace_size = 0; acl_base = (char *)pdacl; size = sizeof(struct cifs_acl); src_num_aces = le32_to_cpu(pdacl->num_aces); nacl_base = (char *)pndacl; nsize = sizeof(struct cifs_acl); /* Go through all the ACEs */ for (i = 0; i < src_num_aces; ++i) { pntace = (struct cifs_ace *) (acl_base + size); pnntace = (struct cifs_ace *) (nacl_base + nsize); if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0) ace_size = cifs_copy_ace(pnntace, pntace, pnownersid); else if (pngrpsid && compare_sids(&pntace->sid, pgrpsid) == 0) ace_size = cifs_copy_ace(pnntace, pntace, pngrpsid); else ace_size = cifs_copy_ace(pnntace, pntace, NULL); size += le16_to_cpu(pntace->size); nsize += ace_size; } return nsize; } static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, __u64 *pnmode, bool mode_from_sid) { int i; u16 size = 0; struct cifs_ace *pntace = NULL; char *acl_base = NULL; u32 src_num_aces = 0; u16 nsize = 0; struct cifs_ace *pnntace = NULL; char *nacl_base = NULL; u32 num_aces = 0; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ nacl_base = (char *)pndacl; nsize = sizeof(struct cifs_acl); /* If pdacl is NULL, we don't have a src. Simply populate new ACL. */ if (!pdacl) { populate_new_aces(nacl_base, pownersid, pgrpsid, pnmode, &num_aces, &nsize, mode_from_sid); goto finalize_dacl; } acl_base = (char *)pdacl; size = sizeof(struct cifs_acl); src_num_aces = le32_to_cpu(pdacl->num_aces); /* Retain old ACEs which we can retain */ for (i = 0; i < src_num_aces; ++i) { pntace = (struct cifs_ace *) (acl_base + size); if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { /* Place the new ACEs in between existing explicit and inherited */ populate_new_aces(nacl_base, pownersid, pgrpsid, pnmode, &num_aces, &nsize, mode_from_sid); new_aces_set = true; } /* If it's any one of the ACE we're replacing, skip! 
*/ if (((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || (compare_sids(&pntace->sid, pownersid) == 0) || (compare_sids(&pntace->sid, pgrpsid) == 0) || (compare_sids(&pntace->sid, &sid_everyone) == 0) || (compare_sids(&pntace->sid, &sid_authusers) == 0))) { goto next_ace; } /* update the pointer to the next ACE to populate*/ pnntace = (struct cifs_ace *) (nacl_base + nsize); nsize += cifs_copy_ace(pnntace, pntace, NULL); num_aces++; next_ace: size += le16_to_cpu(pntace->size); } /* If inherited ACEs are not present, place the new ones at the tail */ if (!new_aces_set) { populate_new_aces(nacl_base, pownersid, pgrpsid, pnmode, &num_aces, &nsize, mode_from_sid); new_aces_set = true; } finalize_dacl: pndacl->num_aces = cpu_to_le32(num_aces); pndacl->size = cpu_to_le16(nsize); return 0; } static int parse_sid(struct cifs_sid *psid, char *end_of_acl) { /* BB need to add parm so we can store the SID BB */ /* validate that we do not go past end of ACL - sid must be at least 8 bytes long (assuming no sub-auths - e.g. the null SID */ if (end_of_acl < (char *)psid + 8) { cifs_dbg(VFS, "ACL too small to parse SID %p\n", psid); return -EINVAL; } #ifdef CONFIG_CIFS_DEBUG2 if (psid->num_subauth) { int i; cifs_dbg(FYI, "SID revision %d num_auth %d\n", psid->revision, psid->num_subauth); for (i = 0; i < psid->num_subauth; i++) { cifs_dbg(FYI, "SID sub_auth[%d]: 0x%x\n", i, le32_to_cpu(psid->sub_auth[i])); } /* BB add length check to make sure that we do not have huge num auths and therefore go off the end */ cifs_dbg(FYI, "RID 0x%x\n", le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); } #endif return 0; } /* Convert CIFS ACL to POSIX form */ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, struct cifs_ntsd *pntsd, int acl_len, struct cifs_fattr *fattr, bool get_mode_from_special_sid) { int rc = 0; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_acl *dacl_ptr; /* no need for SACL ptr */ char *end_of_acl = ((char *)pntsd) + acl_len; __u32 dacloffset; if (pntsd == NULL) return -EIO; owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); dacloffset = le32_to_cpu(pntsd->dacloffset); dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); cifs_dbg(NOISY, "revision %d type 0x%x ooffset 0x%x goffset 0x%x sacloffset 0x%x dacloffset 0x%x\n", pntsd->revision, pntsd->type, le32_to_cpu(pntsd->osidoffset), le32_to_cpu(pntsd->gsidoffset), le32_to_cpu(pntsd->sacloffset), dacloffset); /* cifs_dump_mem("owner_sid: ", owner_sid_ptr, 64); */ rc = parse_sid(owner_sid_ptr, end_of_acl); if (rc) { cifs_dbg(FYI, "%s: Error %d parsing Owner SID\n", __func__, rc); return rc; } rc = sid_to_id(cifs_sb, owner_sid_ptr, fattr, SIDOWNER); if (rc) { cifs_dbg(FYI, "%s: Error %d mapping Owner SID to uid\n", __func__, rc); return rc; } rc = parse_sid(group_sid_ptr, end_of_acl); if (rc) { cifs_dbg(FYI, "%s: Error %d mapping Owner SID to gid\n", __func__, rc); return rc; } rc = sid_to_id(cifs_sb, group_sid_ptr, fattr, SIDGROUP); if (rc) { cifs_dbg(FYI, "%s: Error %d mapping Group SID to gid\n", __func__, rc); return rc; } if (dacloffset) parse_dacl(dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr, get_mode_from_special_sid); else cifs_dbg(FYI, "no ACL\n"); /* BB grant all or default perms? 
*/ return rc; } /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, bool mode_from_sid, bool id_from_sid, int *aclflag) { int rc = 0; __u32 dacloffset; __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ char *end_of_acl = ((char *)pntsd) + secdesclen; u16 size = 0; dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) { cifs_dbg(VFS, "Server returned illegal ACL size\n"); return -EINVAL; } } owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ ndacloffset = sizeof(struct cifs_ntsd); ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); ndacl_ptr->revision = dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); ndacl_ptr->size = cpu_to_le16(0); ndacl_ptr->num_aces = cpu_to_le32(0); rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, pnmode, mode_from_sid); sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); /* copy the non-dacl portion of secdesc */ *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, NULL, NULL); *aclflag |= CIFS_ACL_DACL; } else { ndacloffset = sizeof(struct cifs_ntsd); ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); ndacl_ptr->revision = dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); ndacl_ptr->num_aces = dacl_ptr ? 
dacl_ptr->num_aces : 0; if (uid_valid(uid)) { /* chown */ uid_t id; nowner_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!nowner_sid_ptr) { rc = -ENOMEM; goto chown_chgrp_exit; } id = from_kuid(&init_user_ns, uid); if (id_from_sid) { struct owner_sid *osid = (struct owner_sid *)nowner_sid_ptr; /* Populate the user ownership fields S-1-5-88-1 */ osid->Revision = 1; osid->NumAuth = 3; osid->Authority[5] = 5; osid->SubAuthorities[0] = cpu_to_le32(88); osid->SubAuthorities[1] = cpu_to_le32(1); osid->SubAuthorities[2] = cpu_to_le32(id); } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", __func__, rc, id); goto chown_chgrp_exit; } } *aclflag |= CIFS_ACL_OWNER; } if (gid_valid(gid)) { /* chgrp */ gid_t id; ngroup_sid_ptr = kzalloc(sizeof(struct cifs_sid), GFP_KERNEL); if (!ngroup_sid_ptr) { rc = -ENOMEM; goto chown_chgrp_exit; } id = from_kgid(&init_user_ns, gid); if (id_from_sid) { struct owner_sid *gsid = (struct owner_sid *)ngroup_sid_ptr; /* Populate the group ownership fields S-1-5-88-2 */ gsid->Revision = 1; gsid->NumAuth = 3; gsid->Authority[5] = 5; gsid->SubAuthorities[0] = cpu_to_le32(88); gsid->SubAuthorities[1] = cpu_to_le32(2); gsid->SubAuthorities[2] = cpu_to_le32(id); } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", __func__, rc, id); goto chown_chgrp_exit; } } *aclflag |= CIFS_ACL_GROUP; } if (dacloffset) { /* Replace ACEs for old owner with new one */ size = replace_sids_and_copy_aces(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, nowner_sid_ptr, ngroup_sid_ptr); ndacl_ptr->size = cpu_to_le16(size); } sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); /* copy the non-dacl portion of secdesc */ *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, nowner_sid_ptr, ngroup_sid_ptr); chown_chgrp_exit: /* errors could jump here. 
So make sure we return soon after this */ kfree(nowner_sid_ptr); kfree(ngroup_sid_ptr); } return rc; } #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, const struct cifs_fid *cifsfid, u32 *pacllen, u32 __maybe_unused unused) { struct cifs_ntsd *pntsd = NULL; unsigned int xid; int rc; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return ERR_CAST(tlink); xid = get_xid(); rc = CIFSSMBGetCIFSACL(xid, tlink_tcon(tlink), cifsfid->netfid, &pntsd, pacllen); free_xid(xid); cifs_put_tlink(tlink); cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); if (rc) return ERR_PTR(rc); return pntsd; } static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path, u32 *pacllen) { struct cifs_ntsd *pntsd = NULL; int oplock = 0; unsigned int xid; int rc; struct cifs_tcon *tcon; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct cifs_fid fid; struct cifs_open_parms oparms; if (IS_ERR(tlink)) return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = get_xid(); oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, .desired_access = READ_CONTROL, .create_options = cifs_create_options(cifs_sb, 0), .disposition = FILE_OPEN, .path = path, .fid = &fid, }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (!rc) { rc = CIFSSMBGetCIFSACL(xid, tcon, fid.netfid, &pntsd, pacllen); CIFSSMBClose(xid, tcon, fid.netfid); } cifs_put_tlink(tlink); free_xid(xid); cifs_dbg(FYI, "%s: rc = %d ACL len %d\n", __func__, rc, *pacllen); if (rc) return ERR_PTR(rc); return pntsd; } /* Retrieve an ACL from the server */ struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, struct inode *inode, const char *path, u32 *pacllen, u32 info) { struct cifs_ntsd *pntsd = NULL; struct cifsFileInfo *open_file = NULL; if (inode) open_file = find_readable_file(CIFS_I(inode), true); if (!open_file) return get_cifs_acl_by_path(cifs_sb, path, pacllen); pntsd = get_cifs_acl_by_fid(cifs_sb, &open_file->fid, pacllen, info); cifsFileInfo_put(open_file); return pntsd; } /* Set an ACL on the server */ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, struct inode *inode, const char *path, int aclflag) { int oplock = 0; unsigned int xid; int rc, access_flags; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct cifs_fid fid; struct cifs_open_parms oparms; if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); xid = get_xid(); if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP) access_flags = WRITE_OWNER; else access_flags = WRITE_DAC; oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, .desired_access = access_flags, .create_options = cifs_create_options(cifs_sb, 0), .disposition = FILE_OPEN, .path = path, .fid = &fid, }; rc = CIFS_open(xid, &oparms, &oplock, NULL); if (rc) { cifs_dbg(VFS, "Unable to open file to set ACL\n"); goto out; } rc = CIFSSMBSetCIFSACL(xid, tcon, fid.netfid, pnntsd, acllen, aclflag); cifs_dbg(NOISY, "SetCIFSACL rc = %d\n", rc); CIFSSMBClose(xid, tcon, fid.netfid); out: free_xid(xid); cifs_put_tlink(tlink); return rc; } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ /* Translate the CIFS ACL (similar to NTFS ACL) for a file into mode bits */ int cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, struct inode *inode, bool mode_from_special_sid, const char *path, const struct cifs_fid *pfid) { struct cifs_ntsd *pntsd = NULL; u32 acllen = 0; int rc 
= 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; const u32 info = 0; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); if (IS_ERR(tlink)) return PTR_ERR(tlink); ops = tlink_tcon(tlink)->ses->server->ops; if (pfid && (ops->get_acl_by_fid)) pntsd = ops->get_acl_by_fid(cifs_sb, pfid, &acllen, info); else if (ops->get_acl) pntsd = ops->get_acl(cifs_sb, inode, path, &acllen, info); else { cifs_put_tlink(tlink); return -EOPNOTSUPP; } /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */ if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); } else if (mode_from_special_sid) { rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, true); kfree(pntsd); } else { /* get approximated mode from ACL */ rc = parse_sec_desc(cifs_sb, pntsd, acllen, fattr, false); kfree(pntsd); if (rc) cifs_dbg(VFS, "parse sec desc failed rc = %d\n", rc); } cifs_put_tlink(tlink); return rc; } /* Convert mode bits to an ACL so we can update the ACL on the server */ int id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, kuid_t uid, kgid_t gid) { int rc = 0; int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; __u32 nsecdesclen = 0; __u32 dacloffset = 0; struct cifs_acl *dacl_ptr = NULL; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; bool mode_from_sid, id_from_sid; const u32 info = 0; if (IS_ERR(tlink)) return PTR_ERR(tlink); ops = tlink_tcon(tlink)->ses->server->ops; cifs_dbg(NOISY, "set ACL from mode for %s\n", path); /* Get the security descriptor */ if (ops->get_acl == NULL) { cifs_put_tlink(tlink); return -EOPNOTSUPP; } pntsd = ops->get_acl(cifs_sb, inode, path, &secdesclen, info); if (IS_ERR(pntsd)) { rc = PTR_ERR(pntsd); cifs_dbg(VFS, "%s: error %d getting sec desc\n", __func__, rc); cifs_put_tlink(tlink); return rc; } if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) mode_from_sid = true; else mode_from_sid = false; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UID_FROM_ACL) id_from_sid = true; else id_from_sid = false; /* Potentially, five new ACEs can be added to the ACL for U,G,O mapping */ nsecdesclen = secdesclen; if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ if (mode_from_sid) nsecdesclen += 2 * sizeof(struct cifs_ace); else /* cifsacl */ nsecdesclen += 5 * sizeof(struct cifs_ace); } else { /* chown */ /* When ownership changes, changes new owner sid length could be different */ nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2); dacloffset = le32_to_cpu(pntsd->dacloffset); if (dacloffset) { dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); if (mode_from_sid) nsecdesclen += le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace); else /* cifsacl */ nsecdesclen += le16_to_cpu(dacl_ptr->size); } } /* * Add three ACEs for owner, group, everyone getting rid of other ACEs * as chmod disables ACEs and set the security descriptor. 
Allocate * memory for the smb header, set security descriptor request security * descriptor parameters, and security descriptor itself */ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); if (!pnntsd) { kfree(pntsd); cifs_put_tlink(tlink); return -ENOMEM; } rc = build_sec_desc(pntsd, pnntsd, secdesclen, &nsecdesclen, pnmode, uid, gid, mode_from_sid, id_from_sid, &aclflag); cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); if (ops->set_acl == NULL) rc = -EOPNOTSUPP; if (!rc) { /* Set the security descriptor */ rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); kfree(pnntsd); kfree(pntsd); return rc; } struct posix_acl *cifs_get_acl(struct mnt_idmap *idmap, struct dentry *dentry, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) struct posix_acl *acl = NULL; ssize_t rc = -EOPNOTSUPP; unsigned int xid; struct super_block *sb = dentry->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return ERR_CAST(tlink); pTcon = tlink_tcon(tlink); xid = get_xid(); page = alloc_dentry_path(); full_path = build_path_from_dentry(dentry, page); if (IS_ERR(full_path)) { acl = ERR_CAST(full_path); goto out; } /* return alt name if available as pseudo attr */ switch (type) { case ACL_TYPE_ACCESS: if (sb->s_flags & SB_POSIXACL) rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, ACL_TYPE_ACCESS, cifs_sb->local_nls, cifs_remap(cifs_sb)); break; case ACL_TYPE_DEFAULT: if (sb->s_flags & SB_POSIXACL) rc = cifs_do_get_acl(xid, pTcon, full_path, &acl, ACL_TYPE_DEFAULT, cifs_sb->local_nls, cifs_remap(cifs_sb)); break; } if (rc < 0) { if (rc == -EINVAL) acl = ERR_PTR(-EOPNOTSUPP); else acl = ERR_PTR(rc); } out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return acl; #else return ERR_PTR(-EOPNOTSUPP); #endif } int cifs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type) { #if defined(CONFIG_CIFS_ALLOW_INSECURE_LEGACY) && defined(CONFIG_CIFS_POSIX) int rc = -EOPNOTSUPP; unsigned int xid; struct super_block *sb = dentry->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); pTcon = tlink_tcon(tlink); xid = get_xid(); page = alloc_dentry_path(); full_path = build_path_from_dentry(dentry, page); if (IS_ERR(full_path)) { rc = PTR_ERR(full_path); goto out; } if (!acl) goto out; /* return dos attributes as pseudo xattr */ /* return alt name if available as pseudo attr */ /* if proc/fs/cifs/streamstoxattr is set then search server for EAs or streams to returns as xattrs */ if (posix_acl_xattr_size(acl->a_count) > CIFSMaxBufSize) { cifs_dbg(FYI, "size of EA value too large\n"); rc = -EOPNOTSUPP; goto out; } switch (type) { case ACL_TYPE_ACCESS: if (sb->s_flags & SB_POSIXACL) rc = cifs_do_set_acl(xid, pTcon, full_path, acl, ACL_TYPE_ACCESS, cifs_sb->local_nls, cifs_remap(cifs_sb)); break; case ACL_TYPE_DEFAULT: if (sb->s_flags & SB_POSIXACL) rc = cifs_do_set_acl(xid, pTcon, full_path, acl, ACL_TYPE_DEFAULT, cifs_sb->local_nls, cifs_remap(cifs_sb)); break; } out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; #else return -EOPNOTSUPP; #endif }
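The chmod and chown paths above encode POSIX identity and mode directly into well-known NFS-style SIDs: S-1-5-88-1-&lt;uid&gt; for the owner, S-1-5-88-2-&lt;gid&gt; for the group, and a special mode SID whose third sub-authority carries the 07777 permission bits (see setup_special_mode_ACE() and build_sec_desc()). The snippet below is a minimal userspace sketch of that packing, not part of these sources; it assumes the S-1-5-88-3 mode-SID form described in the Microsoft NFS identity-mapping documentation referenced above (the exact sid_unix_NFS_mode definition lives earlier in this file), and the helper name and example values are illustrative only.

/* gcc -o nfs_mode_sid nfs_mode_sid.c && ./nfs_mode_sid */
#include <stdio.h>

/* Keep only the bits setup_special_mode_ACE() stores: setuid/setgid/sticky
 * plus rwx for user, group and other (the 07777 mask used above). */
static unsigned int mode_to_subauth(unsigned int mode)
{
	return mode & 07777;
}

int main(void)
{
	unsigned int mode = 02755;              /* setgid, rwxr-xr-x */
	unsigned int sub = mode_to_subauth(mode);

	/* SID strings render sub-authorities in decimal, so mode 02755
	 * shows up as S-1-5-88-3-1517 in string form. */
	printf("S-1-5-88-3-%u (mode %04o)\n", sub, sub);
	printf("S-1-5-88-1-%u (owner uid, example value)\n", 1000u);
	printf("S-1-5-88-2-%u (group gid, example value)\n", 1000u);
	return 0;
}

parse_dacl() reverses this when mode_from_special_sid is set: it clears the 07777 bits of cf_mode and ORs in the ACE's third sub-authority.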
// SPDX-License-Identifier: GPL-2.0-only /* * truncate.c * * PURPOSE * Truncate handling routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1999-2004 Ben Fennema * (C) 1999 Stelias Computing Inc * * HISTORY * * 02/24/99 blf Created. * */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/mm.h> #include "udf_i.h" #include "udf_sb.h" static void extent_trunc(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, int8_t etype, uint32_t elen, uint32_t nelen) { struct kernel_lb_addr neloc = {}; int last_block = (elen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; int first_block = (nelen + inode->i_sb->s_blocksize - 1) >> inode->i_sb->s_blocksize_bits; if (nelen) { if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { udf_free_blocks(inode->i_sb, inode, eloc, 0, last_block); etype = (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30); } else neloc = *eloc; nelen = (etype << 30) | nelen; } if (elen != nelen) { udf_write_aext(inode, epos, &neloc, nelen, 0); if (last_block > first_block) { if (etype == (EXT_RECORDED_ALLOCATED >> 30)) mark_inode_dirty(inode); if (etype != (EXT_NOT_RECORDED_NOT_ALLOCATED >> 30)) udf_free_blocks(inode->i_sb, inode, eloc, first_block, last_block - first_block); } } } /* * Truncate the last extent to match i_size. This function assumes * that preallocation extent is already truncated. */ void udf_truncate_tail_extent(struct inode *inode) { struct extent_position epos = {}; struct kernel_lb_addr eloc; uint32_t elen, nelen; uint64_t lbcount = 0; int8_t etype = -1, netype; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || inode->i_size == iinfo->i_lenExtents) return; /* Are we going to delete the file anyway?
*/ if (inode->i_nlink == 0) return; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); /* Find the last extent in the file */ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) { etype = netype; lbcount += elen; if (lbcount > inode->i_size) { if (lbcount - inode->i_size >= inode->i_sb->s_blocksize) udf_warn(inode->i_sb, "Too long extent after EOF in inode %u: i_size: %lld lbcount: %lld extent %u+%u\n", (unsigned)inode->i_ino, (long long)inode->i_size, (long long)lbcount, (unsigned)eloc.logicalBlockNum, (unsigned)elen); nelen = elen - (lbcount - inode->i_size); epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, nelen); epos.offset += adsize; if (udf_next_aext(inode, &epos, &eloc, &elen, 1) != -1) udf_err(inode->i_sb, "Extent after EOF in inode %u\n", (unsigned)inode->i_ino); break; } } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } void udf_discard_prealloc(struct inode *inode) { struct extent_position epos = {}; struct extent_position prev_epos = {}; struct kernel_lb_addr eloc; uint32_t elen; uint64_t lbcount = 0; int8_t etype = -1; struct udf_inode_info *iinfo = UDF_I(inode); int bsize = i_blocksize(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB || ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize)) return; epos.block = iinfo->i_location; /* Find the last extent in the file */ while (udf_next_aext(inode, &epos, &eloc, &elen, 0) != -1) { brelse(prev_epos.bh); prev_epos = epos; if (prev_epos.bh) get_bh(prev_epos.bh); etype = udf_next_aext(inode, &epos, &eloc, &elen, 1); lbcount += elen; } if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) { lbcount -= elen; udf_delete_aext(inode, prev_epos); udf_free_blocks(inode->i_sb, inode, &eloc, 0, DIV_ROUND_UP(elen, bsize)); } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ iinfo->i_lenExtents = lbcount; brelse(epos.bh); brelse(prev_epos.bh); } static void udf_update_alloc_ext_desc(struct inode *inode, struct extent_position *epos, u32 lenalloc) { struct super_block *sb = inode->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data); int len = sizeof(struct allocExtDesc); aed->lengthAllocDescs = cpu_to_le32(lenalloc); if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201) len += lenalloc; udf_update_tag(epos->bh->b_data, len); mark_buffer_dirty_inode(epos->bh, inode); } /* * Truncate extents of inode to inode->i_size. This function can be used only * for making file shorter. For making file longer, udf_extend_file() has to * be used. 
*/ int udf_truncate_extents(struct inode *inode) { struct extent_position epos; struct kernel_lb_addr eloc, neloc = {}; uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; int8_t etype; struct super_block *sb = inode->i_sb; sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; loff_t byte_offset; int adsize; struct udf_inode_info *iinfo = UDF_I(inode); if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) adsize = sizeof(struct long_ad); else BUG(); etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset); byte_offset = (offset << sb->s_blocksize_bits) + (inode->i_size & (sb->s_blocksize - 1)); if (etype == -1) { /* We should extend the file? */ WARN_ON(byte_offset); return 0; } epos.offset -= adsize; extent_trunc(inode, &epos, &eloc, etype, elen, byte_offset); epos.offset += adsize; if (byte_offset) lenalloc = epos.offset; else lenalloc = epos.offset - adsize; if (!epos.bh) lenalloc -= udf_file_entry_alloc_offset(inode); else lenalloc -= sizeof(struct allocExtDesc); while ((etype = udf_current_aext(inode, &epos, &eloc, &elen, 0)) != -1) { if (etype == (EXT_NEXT_EXTENT_ALLOCDESCS >> 30)) { udf_write_aext(inode, &epos, &neloc, nelen, 0); if (indirect_ext_len) { /* We managed to free all extents in the * indirect extent - free it too */ BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); brelse(epos.bh); epos.offset = sizeof(struct allocExtDesc); epos.block = eloc; epos.bh = sb_bread(sb, udf_get_lb_pblock(sb, &eloc, 0)); /* Error reading indirect block? */ if (!epos.bh) return -EIO; if (elen) indirect_ext_len = (elen + sb->s_blocksize - 1) >> sb->s_blocksize_bits; else indirect_ext_len = 1; } else { extent_trunc(inode, &epos, &eloc, etype, elen, 0); epos.offset += adsize; } } if (indirect_ext_len) { BUG_ON(!epos.bh); udf_free_blocks(sb, NULL, &epos.block, 0, indirect_ext_len); } else if (!epos.bh) { iinfo->i_lenAlloc = lenalloc; mark_inode_dirty(inode); } else udf_update_alloc_ext_desc(inode, &epos, lenalloc); iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); return 0; }
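A recurring piece of arithmetic in extent_trunc() and udf_truncate_extents() above is converting an extent's byte length into a block count by rounding up to the filesystem block size, i.e. (len + blocksize - 1) >> blocksize_bits. Below is a minimal standalone sketch of that calculation; the 2048-byte block size (blocksize_bits == 11) and the helper name are example assumptions, not values taken from these sources.

#include <stdio.h>

/* Round a byte length up to whole filesystem blocks, mirroring the
 * (elen + s_blocksize - 1) >> s_blocksize_bits expressions above. */
static unsigned int len_to_blocks(unsigned int len, unsigned int blocksize_bits)
{
	unsigned int blocksize = 1u << blocksize_bits;

	return (len + blocksize - 1) >> blocksize_bits;
}

int main(void)
{
	printf("%u\n", len_to_blocks(5000, 11));	/* 3 blocks of 2048 bytes */
	printf("%u\n", len_to_blocks(4096, 11));	/* exactly 2 blocks */
	printf("%u\n", len_to_blocks(0, 11));		/* 0: nothing to free */
	return 0;
}

Note that extent_trunc() frees only the tail of the extent, blocks first_block through last_block - 1, and only when last_block > first_block, so shortening an extent within its final block frees nothing.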
// SPDX-License-Identifier: GPL-2.0 /* * linux/fs/pipe.c * * Copyright (C) 1991,
1992, 1999 Linus Torvalds */ #include <linux/mm.h> #include <linux/file.h> #include <linux/poll.h> #include <linux/slab.h> #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> #include <linux/log2.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/magic.h> #include <linux/pipe_fs_i.h> #include <linux/uio.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/audit.h> #include <linux/syscalls.h> #include <linux/fcntl.h> #include <linux/memcontrol.h> #include <linux/watch_queue.h> #include <linux/sysctl.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include "internal.h" /* * New pipe buffers will be restricted to this size while the user is exceeding * their pipe buffer quota. The general pipe use case needs at least two * buffers: one for data yet to be read, and one for new data. If this is less * than two, then a write to a non-empty pipe may block even if the pipe is not * full. This can occur with GNU make jobserver or similar uses of pipes as * semaphores: multiple processes may be waiting to write tokens back to the * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/. * * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their * own risk, namely: pipe writes to non-full pipes may block until the pipe is * emptied. */ #define PIPE_MIN_DEF_BUFFERS 2 /* * The max size that a non-root user is allowed to grow the pipe. Can * be set by root in /proc/sys/fs/pipe-max-size */ static unsigned int pipe_max_size = 1048576; /* Maximum allocatable pages per user. Hard limit is unset by default, soft * matches default values. */ static unsigned long pipe_user_pages_hard; static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; /* * We use head and tail indices that aren't masked off, except at the point of * dereference, but rather they're allowed to wrap naturally. This means there * isn't a dead spot in the buffer, but the ring has to be a power of two and * <= 2^31. * -- David Howells 2019-09-23. * * Reads with count = 0 should always return 0. * -- Julian Bradfield 1999-06-07. * * FIFOs and Pipes now generate SIGIO for both readers and writers. * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16 * * pipe_read & write cleanup * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ #define cmp_int(l, r) ((l > r) - (l < r)) #ifdef CONFIG_PROVE_LOCKING static int pipe_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { return cmp_int((unsigned long) a, (unsigned long) b); } #endif void pipe_lock(struct pipe_inode_info *pipe) { if (pipe->files) mutex_lock(&pipe->mutex); } EXPORT_SYMBOL(pipe_lock); void pipe_unlock(struct pipe_inode_info *pipe) { if (pipe->files) mutex_unlock(&pipe->mutex); } EXPORT_SYMBOL(pipe_unlock); void pipe_double_lock(struct pipe_inode_info *pipe1, struct pipe_inode_info *pipe2) { BUG_ON(pipe1 == pipe2); if (pipe1 > pipe2) swap(pipe1, pipe2); pipe_lock(pipe1); pipe_lock(pipe2); } static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep * allocation cache. 
(Otherwise just release our reference to it) */ if (page_count(page) == 1 && !pipe->tmp_page) pipe->tmp_page = page; else put_page(page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; if (page_count(page) != 1) return false; memcg_kmem_uncharge_page(page, 0); __SetPageLocked(page); return true; } /** * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal * * Description: * This function attempts to steal the &struct page attached to * @buf. If successful, this function returns 0 and returns with * the page locked. The caller may then reuse the page for whatever * he wishes; the typical use is insertion into a different file * page cache. */ bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; /* * A reference of one is golden, that means that the owner of this * page is the only one holding a reference to it. lock the page * and return OK. */ if (page_count(page) == 1) { lock_page(page); return true; } return false; } EXPORT_SYMBOL(generic_pipe_buf_try_steal); /** * generic_pipe_buf_get - get a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Description: * This function grabs an extra reference to @buf. It's used in * the tee() system call, when we duplicate the buffers in one * pipe into another. */ bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return try_get_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_get); /** * generic_pipe_buf_release - put a reference to a &struct pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to * * Description: * This function releases a reference to @buf. */ void generic_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { put_page(buf->page); } EXPORT_SYMBOL(generic_pipe_buf_release); static const struct pipe_buf_operations anon_pipe_buf_ops = { .release = anon_pipe_buf_release, .try_steal = anon_pipe_buf_try_steal, .get = generic_pipe_buf_get, }; /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { unsigned int head = READ_ONCE(pipe->head); unsigned int tail = READ_ONCE(pipe->tail); unsigned int writers = READ_ONCE(pipe->writers); return !pipe_empty(head, tail) || !writers; } static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, struct pipe_buffer *buf, unsigned int tail) { pipe_buf_release(pipe, buf); /* * If the pipe has a watch_queue, we need additional protection * by the spinlock because notifications get posted with only * this spinlock, no mutex */ if (pipe_has_watch_queue(pipe)) { spin_lock_irq(&pipe->rd_wait.lock); #ifdef CONFIG_WATCH_QUEUE if (buf->flags & PIPE_BUF_FLAG_LOSS) pipe->note_loss = true; #endif pipe->tail = ++tail; spin_unlock_irq(&pipe->rd_wait.lock); return tail; } /* * Without a watch_queue, we can simply increment the tail * without the spinlock - the mutex is enough. */ pipe->tail = ++tail; return tail; } static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; bool was_full, wake_next_reader = false; ssize_t ret; /* Null read succeeds. 
*/ if (unlikely(total_len == 0)) return 0; ret = 0; mutex_lock(&pipe->mutex); /* * We only wake up writers if the pipe was full when we started * reading in order to avoid unnecessary wakeups. * * But when we do wake up writers, we do so using a sync wakeup * (WF_SYNC), because we want them to get going and generate more * data for us. */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { struct watch_notification n; if (total_len < 8) { if (ret == 0) ret = -ENOBUFS; break; } n.type = WATCH_TYPE_META; n.subtype = WATCH_META_LOSS_NOTIFICATION; n.info = watch_sizeof(n); if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) { if (ret == 0) ret = -EFAULT; break; } ret += sizeof(n); total_len -= sizeof(n); pipe->note_loss = false; } #endif if (!pipe_empty(head, tail)) { struct pipe_buffer *buf = &pipe->bufs[tail & mask]; size_t chars = buf->len; size_t written; int error; if (chars > total_len) { if (buf->flags & PIPE_BUF_FLAG_WHOLE) { if (ret == 0) ret = -ENOBUFS; break; } chars = total_len; } error = pipe_buf_confirm(pipe, buf); if (error) { if (!ret) ret = error; break; } written = copy_page_to_iter(buf->page, buf->offset, chars, to); if (unlikely(written < chars)) { if (!ret) ret = -EFAULT; break; } ret += chars; buf->offset += chars; buf->len -= chars; /* Was it a packet buffer? Clean up and exit */ if (buf->flags & PIPE_BUF_FLAG_PACKET) { total_len = chars; buf->len = 0; } if (!buf->len) tail = pipe_update_tail(pipe, buf, tail); total_len -= chars; if (!total_len) break; /* common path: read succeeded */ if (!pipe_empty(head, tail)) /* More to do? */ continue; } if (!pipe->writers) break; if (ret) break; if ((filp->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { ret = -EAGAIN; break; } mutex_unlock(&pipe->mutex); /* * We only get here if we didn't actually read anything. * * However, we could have seen (and removed) a zero-sized * pipe buffer, and might have made space in the buffers * that way. * * You can't make zero-sized pipe buffers by doing an empty * write (not even in packet mode), but they can happen if * the writer gets an EFAULT when trying to fill a buffer * that already got allocated and inserted in the buffer * array. * * So we still need to wake up any pending writers in the * _very_ unlikely case that the pipe was full, but we got * no data. */ if (unlikely(was_full)) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need * to mark anything accessed. And we've dropped the lock. 
*/ if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; mutex_lock(&pipe->mutex); was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); wake_next_reader = true; } if (pipe_empty(pipe->head, pipe->tail)) wake_next_reader = false; mutex_unlock(&pipe->mutex); if (was_full) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); if (ret > 0) file_accessed(filp); return ret; } static inline int is_packetized(struct file *file) { return (file->f_flags & O_DIRECT) != 0; } /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_writable(const struct pipe_inode_info *pipe) { unsigned int head = READ_ONCE(pipe->head); unsigned int tail = READ_ONCE(pipe->tail); unsigned int max_usage = READ_ONCE(pipe->max_usage); return !pipe_full(head, tail, max_usage) || !READ_ONCE(pipe->readers); } static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; unsigned int head; ssize_t ret = 0; size_t total_len = iov_iter_count(from); ssize_t chars; bool was_empty = false; bool wake_next_writer = false; /* * Reject writing to watch queue pipes before the point where we lock * the pipe. * Otherwise, lockdep would be unhappy if the caller already has another * pipe locked. * If we had to support locking a normal pipe and a notification pipe at * the same time, we could set up lockdep annotations for that, but * since we don't actually need that, it's simpler to just bail here. */ if (pipe_has_watch_queue(pipe)) return -EXDEV; /* Null write succeeds. */ if (unlikely(total_len == 0)) return 0; mutex_lock(&pipe->mutex); if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } /* * If it wasn't empty we try to merge new data into * the last buffer. * * That naturally merges small writes, but it also * page-aligns the rest of the writes for large writes * spanning multiple pages. */ head = pipe->head; was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && offset + chars <= PAGE_SIZE) { ret = pipe_buf_confirm(pipe, buf); if (ret) goto out; ret = copy_page_from_iter(buf->page, offset, chars, from); if (unlikely(ret < chars)) { ret = -EFAULT; goto out; } buf->len += ret; if (!iov_iter_count(from)) goto out; } } for (;;) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; struct page *page = pipe->tmp_page; int copied; if (!page) { page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); if (unlikely(!page)) { ret = ret ? : -ENOMEM; break; } pipe->tmp_page = page; } /* Allocate a slot in the ring in advance and attach an * empty buffer. If we fault or otherwise fail to use * it, either the reader will consume it or it'll still * be there for the next write. 
*/ pipe->head = head + 1; /* Insert it into the buffer array */ buf = &pipe->bufs[head & mask]; buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; pipe->tmp_page = NULL; copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { if (!ret) ret = -EFAULT; break; } ret += copied; buf->len = copied; if (!iov_iter_count(from)) break; } if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; /* Wait for buffer space to become available. */ if ((filp->f_flags & O_NONBLOCK) || (iocb->ki_flags & IOCB_NOWAIT)) { if (!ret) ret = -EAGAIN; break; } if (signal_pending(current)) { if (!ret) ret = -ERESTARTSYS; break; } /* * We're going to release the pipe lock and wait for more * space. We wake up any readers if necessary, and then * after waiting we need to re-check whether the pipe * become empty while we dropped the lock. */ mutex_unlock(&pipe->mutex); if (was_empty) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); mutex_lock(&pipe->mutex); was_empty = pipe_empty(pipe->head, pipe->tail); wake_next_writer = true; } out: if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) wake_next_writer = false; mutex_unlock(&pipe->mutex); /* * If we do do a wakeup event, we do a 'sync' wakeup, because we * want the reader to start processing things asap, rather than * leave the data pending. * * This is particularly important for small writes, because of * how (for example) the GNU make jobserver uses small writes to * wake up pending jobs * * Epoll nonsensically wants a wakeup whether the pipe * was already empty or not. */ if (was_empty || pipe->poll_usage) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { int err = file_update_time(filp); if (err) ret = err; sb_end_write(file_inode(filp)->i_sb); } return ret; } static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; unsigned int count, head, tail, mask; switch (cmd) { case FIONREAD: mutex_lock(&pipe->mutex); count = 0; head = pipe->head; tail = pipe->tail; mask = pipe->ring_size - 1; while (tail != head) { count += pipe->bufs[tail & mask].len; tail++; } mutex_unlock(&pipe->mutex); return put_user(count, (int __user *)arg); #ifdef CONFIG_WATCH_QUEUE case IOC_WATCH_QUEUE_SET_SIZE: { int ret; mutex_lock(&pipe->mutex); ret = watch_queue_set_size(pipe, arg); mutex_unlock(&pipe->mutex); return ret; } case IOC_WATCH_QUEUE_SET_FILTER: return watch_queue_set_filter( pipe, (struct watch_notification_filter __user *)arg); #endif default: return -ENOIOCTLCMD; } } /* No kernel lock held - fine */ static __poll_t pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; unsigned int head, tail; /* Epoll has some historical nasty semantics, this enables them */ WRITE_ONCE(pipe->poll_usage, true); /* * Reading pipe state only -- no need for acquiring the semaphore. * * But because this is racy, the code has to add the * entry to the poll table _first_ .. 
*/ if (filp->f_mode & FMODE_READ) poll_wait(filp, &pipe->rd_wait, wait); if (filp->f_mode & FMODE_WRITE) poll_wait(filp, &pipe->wr_wait, wait); /* * .. and only then can you do the racy tests. That way, * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. */ head = READ_ONCE(pipe->head); tail = READ_ONCE(pipe->tail); mask = 0; if (filp->f_mode & FMODE_READ) { if (!pipe_empty(head, tail)) mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { if (!pipe_full(head, tail, pipe->max_usage)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ if (!pipe->readers) mask |= EPOLLERR; } return mask; } static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe) { int kill = 0; spin_lock(&inode->i_lock); if (!--pipe->files) { inode->i_pipe = NULL; kill = 1; } spin_unlock(&inode->i_lock); if (kill) free_pipe_info(pipe); } static int pipe_release(struct inode *inode, struct file *file) { struct pipe_inode_info *pipe = file->private_data; mutex_lock(&pipe->mutex); if (file->f_mode & FMODE_READ) pipe->readers--; if (file->f_mode & FMODE_WRITE) pipe->writers--; /* Was that the last reader or writer, but not the other side? */ if (!pipe->readers != !pipe->writers) { wake_up_interruptible_all(&pipe->rd_wait); wake_up_interruptible_all(&pipe->wr_wait); kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return 0; } static int pipe_fasync(int fd, struct file *filp, int on) { struct pipe_inode_info *pipe = filp->private_data; int retval = 0; mutex_lock(&pipe->mutex); if (filp->f_mode & FMODE_READ) retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if ((filp->f_mode & FMODE_WRITE) && retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); if (retval < 0 && (filp->f_mode & FMODE_READ)) /* this can happen only if on == T */ fasync_helper(-1, filp, 0, &pipe->fasync_readers); } mutex_unlock(&pipe->mutex); return retval; } unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new) { return atomic_long_add_return(new - old, &user->pipe_bufs); } bool too_many_pipe_buffers_soft(unsigned long user_bufs) { unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft); return soft_limit && user_bufs > soft_limit; } bool too_many_pipe_buffers_hard(unsigned long user_bufs) { unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard); return hard_limit && user_bufs > hard_limit; } bool pipe_is_unprivileged_user(void) { return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); } struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; unsigned long pipe_bufs = PIPE_DEF_BUFFERS; struct user_struct *user = get_current_user(); unsigned long user_bufs; unsigned int max_size = READ_ONCE(pipe_max_size); pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT); if (pipe == NULL) goto out_free_uid; if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE)) pipe_bufs = max_size >> PAGE_SHIFT; user_bufs = account_pipe_buffers(user, 0, pipe_bufs); if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) { user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS); pipe_bufs = PIPE_MIN_DEF_BUFFERS; } if (too_many_pipe_buffers_hard(user_bufs) && 
pipe_is_unprivileged_user()) goto out_revert_acct; pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer), GFP_KERNEL_ACCOUNT); if (pipe->bufs) { init_waitqueue_head(&pipe->rd_wait); init_waitqueue_head(&pipe->wr_wait); pipe->r_counter = pipe->w_counter = 1; pipe->max_usage = pipe_bufs; pipe->ring_size = pipe_bufs; pipe->nr_accounted = pipe_bufs; pipe->user = user; mutex_init(&pipe->mutex); lock_set_cmp_fn(&pipe->mutex, pipe_lock_cmp_fn, NULL); return pipe; } out_revert_acct: (void) account_pipe_buffers(user, pipe_bufs, 0); kfree(pipe); out_free_uid: free_uid(user); return NULL; } void free_pipe_info(struct pipe_inode_info *pipe) { unsigned int i; #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); free_uid(pipe->user); for (i = 0; i < pipe->ring_size; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) pipe_buf_release(pipe, buf); } #ifdef CONFIG_WATCH_QUEUE if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); kfree(pipe); } static struct vfsmount *pipe_mnt __ro_after_init; /* * pipefs_dname() is called from d_path(). */ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen) { return dynamic_dname(buffer, buflen, "pipe:[%lu]", d_inode(dentry)->i_ino); } static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); struct pipe_inode_info *pipe; if (!inode) goto fail_inode; inode->i_ino = get_next_ino(); pipe = alloc_pipe_info(); if (!pipe) goto fail_iput; inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; inode->i_fop = &pipefifo_fops; /* * Mark the inode dirty from the very beginning, * that way it will never be moved to the dirty * list because "mark_inode_dirty()" will think * that it already _is_ on the dirty list. 
*/ inode->i_state = I_DIRTY; inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); simple_inode_init_ts(inode); return inode; fail_iput: iput(inode); fail_inode: return NULL; } int create_pipe_files(struct file **res, int flags) { struct inode *inode = get_pipe_inode(); struct file *f; int error; if (!inode) return -ENFILE; if (flags & O_NOTIFICATION_PIPE) { error = watch_queue_init(inode->i_pipe); if (error) { free_pipe_info(inode->i_pipe); iput(inode); return error; } } f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), &pipefifo_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); return PTR_ERR(f); } f->private_data = inode->i_pipe; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), &pipefifo_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); return PTR_ERR(res[0]); } res[0]->private_data = inode->i_pipe; res[1] = f; stream_open(inode, res[0]); stream_open(inode, res[1]); return 0; } static int __do_pipe_flags(int *fd, struct file **files, int flags) { int error; int fdw, fdr; if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE)) return -EINVAL; error = create_pipe_files(files, flags); if (error) return error; error = get_unused_fd_flags(flags); if (error < 0) goto err_read_pipe; fdr = error; error = get_unused_fd_flags(flags); if (error < 0) goto err_fdr; fdw = error; audit_fd_pair(fdr, fdw); fd[0] = fdr; fd[1] = fdw; /* pipe groks IOCB_NOWAIT */ files[0]->f_mode |= FMODE_NOWAIT; files[1]->f_mode |= FMODE_NOWAIT; return 0; err_fdr: put_unused_fd(fdr); err_read_pipe: fput(files[0]); fput(files[1]); return error; } int do_pipe_flags(int *fd, int flags) { struct file *files[2]; int error = __do_pipe_flags(fd, files, flags); if (!error) { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } return error; } /* * sys_pipe() is the normal C calling standard for creating * a pipe. It's not the way Unix traditionally does this, though. */ static int do_pipe2(int __user *fildes, int flags) { struct file *files[2]; int fd[2]; int error; error = __do_pipe_flags(fd, files, flags); if (!error) { if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) { fput(files[0]); fput(files[1]); put_unused_fd(fd[0]); put_unused_fd(fd[1]); error = -EFAULT; } else { fd_install(fd[0], files[0]); fd_install(fd[1], files[1]); } } return error; } SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags) { return do_pipe2(fildes, flags); } SYSCALL_DEFINE1(pipe, int __user *, fildes) { return do_pipe2(fildes, 0); } /* * This is the stupid "wait for pipe to be readable or writable" * model. * * See pipe_read/write() for the proper kind of exclusive wait, * but that requires that we wake up any other readers/writers * if we then do not end up reading everything (ie the whole * "wake_next_reader/writer" logic in pipe_read/write()). */ void pipe_wait_readable(struct pipe_inode_info *pipe) { pipe_unlock(pipe); wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe)); pipe_lock(pipe); } void pipe_wait_writable(struct pipe_inode_info *pipe) { pipe_unlock(pipe); wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe)); pipe_lock(pipe); } /* * This depends on both the wait (here) and the wakeup (wake_up_partner) * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot * race with the count check and waitqueue prep. 
* * Normally in order to avoid races, you'd do the prepare_to_wait() first, * then check the condition you're waiting for, and only then sleep. But * because of the pipe lock, we can check the condition before being on * the wait queue. * * We use the 'rd_wait' waitqueue for pipe partner waiting. */ static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt) { DEFINE_WAIT(rdwait); int cur = *cnt; while (cur == *cnt) { prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE); pipe_unlock(pipe); schedule(); finish_wait(&pipe->rd_wait, &rdwait); pipe_lock(pipe); if (signal_pending(current)) break; } return cur == *cnt ? -ERESTARTSYS : 0; } static void wake_up_partner(struct pipe_inode_info *pipe) { wake_up_interruptible_all(&pipe->rd_wait); } static int fifo_open(struct inode *inode, struct file *filp) { struct pipe_inode_info *pipe; bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_version = 0; spin_lock(&inode->i_lock); if (inode->i_pipe) { pipe = inode->i_pipe; pipe->files++; spin_unlock(&inode->i_lock); } else { spin_unlock(&inode->i_lock); pipe = alloc_pipe_info(); if (!pipe) return -ENOMEM; pipe->files = 1; spin_lock(&inode->i_lock); if (unlikely(inode->i_pipe)) { inode->i_pipe->files++; spin_unlock(&inode->i_lock); free_pipe_info(pipe); pipe = inode->i_pipe; } else { inode->i_pipe = pipe; spin_unlock(&inode->i_lock); } } filp->private_data = pipe; /* OK, we have a pipe and it's pinned down */ mutex_lock(&pipe->mutex); /* We can only do regular read/write on fifos */ stream_open(inode, filp); switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) { case FMODE_READ: /* * O_RDONLY * POSIX.1 says that O_NONBLOCK means return with the FIFO * opened, even when there is no process writing the FIFO. */ pipe->r_counter++; if (pipe->readers++ == 0) wake_up_partner(pipe); if (!is_pipe && !pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress EPOLLHUP until we have * seen a writer */ filp->f_version = pipe->w_counter; } else { if (wait_for_partner(pipe, &pipe->w_counter)) goto err_rd; } } break; case FMODE_WRITE: /* * O_WRONLY * POSIX.1 says that O_NONBLOCK means return -1 with * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; pipe->w_counter++; if (!pipe->writers++) wake_up_partner(pipe); if (!is_pipe && !pipe->readers) { if (wait_for_partner(pipe, &pipe->r_counter)) goto err_wr; } break; case FMODE_READ | FMODE_WRITE: /* * O_RDWR * POSIX.1 leaves this case "undefined" when O_NONBLOCK is set. * This implementation will NEVER block on a O_RDWR open, since * the process can at least talk to itself. */ pipe->readers++; pipe->writers++; pipe->r_counter++; pipe->w_counter++; if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(pipe); break; default: ret = -EINVAL; goto err; } /* Ok! */ mutex_unlock(&pipe->mutex); return 0; err_rd: if (!--pipe->readers) wake_up_interruptible(&pipe->wr_wait); ret = -ERESTARTSYS; goto err; err_wr: if (!--pipe->writers) wake_up_interruptible_all(&pipe->rd_wait); ret = -ERESTARTSYS; goto err; err: mutex_unlock(&pipe->mutex); put_pipe_info(inode, pipe); return ret; } const struct file_operations pipefifo_fops = { .open = fifo_open, .llseek = no_llseek, .read_iter = pipe_read, .write_iter = pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, .fasync = pipe_fasync, .splice_write = iter_file_splice_write, }; /* * Currently we rely on the pipe array holding a power-of-2 number * of pages. 
Returns 0 on error. */ unsigned int round_pipe_size(unsigned int size) { if (size > (1U << 31)) return 0; /* Minimum pipe size, as required by POSIX */ if (size < PAGE_SIZE) return PAGE_SIZE; return roundup_pow_of_two(size); } /* * Resize the pipe ring to a number of slots. * * Note the pipe can be reduced in capacity, but only if the current * occupancy doesn't exceed nr_slots; if it does, EBUSY will be * returned instead. */ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) { struct pipe_buffer *bufs; unsigned int head, tail, mask, n; bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) return -ENOMEM; spin_lock_irq(&pipe->rd_wait.lock); mask = pipe->ring_size - 1; head = pipe->head; tail = pipe->tail; n = pipe_occupancy(head, tail); if (nr_slots < n) { spin_unlock_irq(&pipe->rd_wait.lock); kfree(bufs); return -EBUSY; } /* * The pipe array wraps around, so just start the new one at zero * and adjust the indices. */ if (n > 0) { unsigned int h = head & mask; unsigned int t = tail & mask; if (h > t) { memcpy(bufs, pipe->bufs + t, n * sizeof(struct pipe_buffer)); } else { unsigned int tsize = pipe->ring_size - t; if (h > 0) memcpy(bufs + tsize, pipe->bufs, h * sizeof(struct pipe_buffer)); memcpy(bufs, pipe->bufs + t, tsize * sizeof(struct pipe_buffer)); } } head = n; tail = 0; kfree(pipe->bufs); pipe->bufs = bufs; pipe->ring_size = nr_slots; if (pipe->max_usage > nr_slots) pipe->max_usage = nr_slots; pipe->tail = tail; pipe->head = head; if (!pipe_has_watch_queue(pipe)) { pipe->max_usage = nr_slots; pipe->nr_accounted = nr_slots; } spin_unlock_irq(&pipe->rd_wait.lock); /* This might have made more room for writers */ wake_up_interruptible(&pipe->wr_wait); return 0; } /* * Allocate a new array of pipe buffers and copy the info over. Returns the * pipe size if successful, or return -ERROR on error. */ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg) { unsigned long user_bufs; unsigned int nr_slots, size; long ret = 0; if (pipe_has_watch_queue(pipe)) return -EBUSY; size = round_pipe_size(arg); nr_slots = size >> PAGE_SHIFT; if (!nr_slots) return -EINVAL; /* * If trying to increase the pipe capacity, check that an * unprivileged user is not trying to exceed various limits * (soft limit check here, hard limit check just below). * Decreasing the pipe capacity is always permitted, even * if the user is currently over a limit. */ if (nr_slots > pipe->max_usage && size > pipe_max_size && !capable(CAP_SYS_RESOURCE)) return -EPERM; user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots); if (nr_slots > pipe->max_usage && (too_many_pipe_buffers_hard(user_bufs) || too_many_pipe_buffers_soft(user_bufs)) && pipe_is_unprivileged_user()) { ret = -EPERM; goto out_revert_acct; } ret = pipe_resize_ring(pipe, nr_slots); if (ret < 0) goto out_revert_acct; return pipe->max_usage * PAGE_SIZE; out_revert_acct: (void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted); return ret; } /* * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is * not enough to verify that this is a pipe. 
*/ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; if (file->f_op != &pipefifo_fops || !pipe) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) return NULL; return pipe; } long pipe_fcntl(struct file *file, unsigned int cmd, unsigned int arg) { struct pipe_inode_info *pipe; long ret; pipe = get_pipe_info(file, false); if (!pipe) return -EBADF; mutex_lock(&pipe->mutex); switch (cmd) { case F_SETPIPE_SZ: ret = pipe_set_size(pipe, arg); break; case F_GETPIPE_SZ: ret = pipe->max_usage * PAGE_SIZE; break; default: ret = -EINVAL; break; } mutex_unlock(&pipe->mutex); return ret; } static const struct super_operations pipefs_ops = { .destroy_inode = free_inode_nonrcu, .statfs = simple_statfs, }; /* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial * d_name - pipe: will go nicely and kill the special-casing in procfs. */ static int pipefs_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC); if (!ctx) return -ENOMEM; ctx->ops = &pipefs_ops; ctx->dops = &pipefs_dentry_operations; return 0; } static struct file_system_type pipe_fs_type = { .name = "pipefs", .init_fs_context = pipefs_init_fs_context, .kill_sb = kill_anon_super, }; #ifdef CONFIG_SYSCTL static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { if (write) { unsigned int val; val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; *valp = val; } else { unsigned int val = *valp; *lvalp = (unsigned long) val; } return 0; } static int proc_dopipe_max_size(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); } static struct ctl_table fs_pipe_sysctls[] = { { .procname = "pipe-max-size", .data = &pipe_max_size, .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = proc_dopipe_max_size, }, { .procname = "pipe-user-pages-hard", .data = &pipe_user_pages_hard, .maxlen = sizeof(pipe_user_pages_hard), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, { .procname = "pipe-user-pages-soft", .data = &pipe_user_pages_soft, .maxlen = sizeof(pipe_user_pages_soft), .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, }; #endif static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { err = PTR_ERR(pipe_mnt); unregister_filesystem(&pipe_fs_type); } } #ifdef CONFIG_SYSCTL register_sysctl_init("fs", fs_pipe_sysctls); #endif return err; } fs_initcall(init_pipe_fs);
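/*
 * A minimal userspace sketch (not part of fs/pipe.c itself) exercising the
 * interfaces implemented above: pipe2() with O_NONBLOCK, the FIONREAD ioctl
 * handled by pipe_ioctl(), and the F_GETPIPE_SZ/F_SETPIPE_SZ fcntls handled
 * by pipe_fcntl(). The capacity returned by F_SETPIPE_SZ is whatever
 * round_pipe_size() rounded the request up to.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fds[2], buffered;
	long cap;

	if (pipe2(fds, O_NONBLOCK))
		return 1;

	cap = fcntl(fds[1], F_GETPIPE_SZ);	/* pipe->max_usage * PAGE_SIZE */
	printf("default capacity: %ld bytes\n", cap);

	/* Requests are rounded up to a power-of-two number of pages, with
	 * PAGE_SIZE as the floor, so asking for 100 bytes yields one page. */
	cap = fcntl(fds[1], F_SETPIPE_SZ, 100);
	printf("resized capacity: %ld bytes\n", cap);

	if (write(fds[1], "hello", 5) == 5 &&
	    ioctl(fds[0], FIONREAD, &buffered) == 0)
		printf("FIONREAD reports %d bytes buffered\n", buffered);

	close(fds[0]);
	close(fds[1]);
	return 0;
}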
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * userdlm.c
 *
 * Code which implements the kernel side of a minimal userspace
 * interface to our DLM.
 *
 * Many of the functions here are pared down versions of dlmglue.c
 * functions.
 *
 * Copyright (C) 2003, 2004 Oracle. All rights reserved.
*/ #include <linux/signal.h> #include <linux/sched/signal.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/crc32.h> #include "../ocfs2_lockingver.h" #include "../stackglue.h" #include "userdlm.h" #define MLOG_MASK_PREFIX ML_DLMFS #include "../cluster/masklog.h" static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb) { return container_of(lksb, struct user_lock_res, l_lksb); } static inline int user_check_wait_flag(struct user_lock_res *lockres, int flag) { int ret; spin_lock(&lockres->l_lock); ret = lockres->l_flags & flag; spin_unlock(&lockres->l_lock); return ret; } static inline void user_wait_on_busy_lock(struct user_lock_res *lockres) { wait_event(lockres->l_event, !user_check_wait_flag(lockres, USER_LOCK_BUSY)); } static inline void user_wait_on_blocked_lock(struct user_lock_res *lockres) { wait_event(lockres->l_event, !user_check_wait_flag(lockres, USER_LOCK_BLOCKED)); } /* I heart container_of... */ static inline struct ocfs2_cluster_connection * cluster_connection_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); return ip->ip_conn; } static struct inode * user_dlm_inode_from_user_lockres(struct user_lock_res *lockres) { struct dlmfs_inode_private *ip; ip = container_of(lockres, struct dlmfs_inode_private, ip_lockres); return &ip->ip_vfs_inode; } static inline void user_recover_from_dlm_error(struct user_lock_res *lockres) { spin_lock(&lockres->l_lock); lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); } #define user_log_dlm_error(_func, _stat, _lockres) do { \ mlog(ML_ERROR, "Dlm error %d while calling %s on " \ "resource %.*s\n", _stat, _func, \ _lockres->l_namelen, _lockres->l_name); \ } while (0) /* WARNING: This function lives in a world where the only three lock * levels are EX, PR, and NL. It *will* have to be adjusted when more * lock types are added. */ static inline int user_highest_compat_lock_level(int level) { int new_level = DLM_LOCK_EX; if (level == DLM_LOCK_EX) new_level = DLM_LOCK_NL; else if (level == DLM_LOCK_PR) new_level = DLM_LOCK_PR; return new_level; } static void user_ast(struct ocfs2_dlm_lksb *lksb) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); int status; mlog(ML_BASTS, "AST fired for lockres %.*s, level %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, lockres->l_requested); spin_lock(&lockres->l_lock); status = ocfs2_dlm_lock_status(&lockres->l_lksb); if (status) { mlog(ML_ERROR, "lksb status value of %u on lockres %.*s\n", status, lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); return; } mlog_bug_on_msg(lockres->l_requested == DLM_LOCK_IV, "Lockres %.*s, requested ivmode. flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* we're downconverting. 
*/ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= user_highest_compat_lock_level(lockres->l_blocking)) { lockres->l_blocking = DLM_LOCK_NL; lockres->l_flags &= ~USER_LOCK_BLOCKED; } } lockres->l_level = lockres->l_requested; lockres->l_requested = DLM_LOCK_IV; lockres->l_flags |= USER_LOCK_ATTACHED; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } static inline void user_dlm_grab_inode_ref(struct user_lock_res *lockres) { struct inode *inode; inode = user_dlm_inode_from_user_lockres(lockres); if (!igrab(inode)) BUG(); } static void user_dlm_unblock_lock(struct work_struct *work); static void __user_dlm_queue_lockres(struct user_lock_res *lockres) { if (!(lockres->l_flags & USER_LOCK_QUEUED)) { user_dlm_grab_inode_ref(lockres); INIT_WORK(&lockres->l_work, user_dlm_unblock_lock); queue_work(user_dlm_worker, &lockres->l_work); lockres->l_flags |= USER_LOCK_QUEUED; } } static void __user_dlm_cond_queue_lockres(struct user_lock_res *lockres) { int queue = 0; if (!(lockres->l_flags & USER_LOCK_BLOCKED)) return; switch (lockres->l_blocking) { case DLM_LOCK_EX: if (!lockres->l_ex_holders && !lockres->l_ro_holders) queue = 1; break; case DLM_LOCK_PR: if (!lockres->l_ex_holders) queue = 1; break; default: BUG(); } if (queue) __user_dlm_queue_lockres(lockres); } static void user_bast(struct ocfs2_dlm_lksb *lksb, int level) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(ML_BASTS, "BAST fired for lockres %.*s, blocking %d, level %d\n", lockres->l_namelen, lockres->l_name, level, lockres->l_level); spin_lock(&lockres->l_lock); lockres->l_flags |= USER_LOCK_BLOCKED; if (level > lockres->l_blocking) lockres->l_blocking = level; __user_dlm_queue_lockres(lockres); spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } static void user_unlock_ast(struct ocfs2_dlm_lksb *lksb, int status) { struct user_lock_res *lockres = user_lksb_to_lock_res(lksb); mlog(ML_BASTS, "UNLOCK AST fired for lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); if (status) mlog(ML_ERROR, "dlm returns status %d\n", status); spin_lock(&lockres->l_lock); /* The teardown flag gets set early during the unlock process, * so test the cancel flag to make sure that this ast isn't * for a concurrent cancel. */ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = DLM_LOCK_IV; } else if (status == DLM_CANCELGRANT) { /* We tried to cancel a convert request, but it was * already granted. Don't clear the busy flag - the * ast should've done this already. */ BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); lockres->l_flags &= ~USER_LOCK_IN_CANCEL; goto out_noclear; } else { BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); /* Cancel succeeded, we want to re-queue */ lockres->l_requested = DLM_LOCK_IV; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. */ if (lockres->l_flags & USER_LOCK_BLOCKED) __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); } /* * This is the userdlmfs locking protocol version. * * See fs/ocfs2/dlmglue.c for more details on locking versions. 
*/ static struct ocfs2_locking_protocol user_dlm_lproto = { .lp_max_version = { .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR, .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR, }, .lp_lock_ast = user_ast, .lp_blocking_ast = user_bast, .lp_unlock_ast = user_unlock_ast, }; static inline void user_dlm_drop_inode_ref(struct user_lock_res *lockres) { struct inode *inode; inode = user_dlm_inode_from_user_lockres(lockres); iput(inode); } static void user_dlm_unblock_lock(struct work_struct *work) { int new_level, status; struct user_lock_res *lockres = container_of(work, struct user_lock_res, l_work); struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(0, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), "Lockres %.*s, flags 0x%x\n", lockres->l_namelen, lockres->l_name, lockres->l_flags); /* notice that we don't clear USER_LOCK_BLOCKED here. If it's * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; /* It's valid to get here and no longer be blocked - if we get * several basts in a row, we might be queued by the first * one, the unblock thread might run and clear the queued * flag, and finally we might get another bast which re-queues * us before our ast for the downconvert is called. */ if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_BLOCKED\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_TEARDOWN\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } if (lockres->l_flags & USER_LOCK_BUSY) { if (lockres->l_flags & USER_LOCK_IN_CANCEL) { mlog(ML_BASTS, "lockres %.*s USER_LOCK_IN_CANCEL\n", lockres->l_namelen, lockres->l_name); spin_unlock(&lockres->l_lock); goto drop_ref; } lockres->l_flags |= USER_LOCK_IN_CANCEL; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_CANCEL); if (status) user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto drop_ref; } /* If there are still incompat holders, we can exit safely * without worrying about re-queueing this lock as that will * happen on the last call to user_cluster_unlock. */ if ((lockres->l_blocking == DLM_LOCK_EX) && (lockres->l_ex_holders || lockres->l_ro_holders)) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX/PR Holders %u,%u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders, lockres->l_ro_holders); goto drop_ref; } if ((lockres->l_blocking == DLM_LOCK_PR) && lockres->l_ex_holders) { spin_unlock(&lockres->l_lock); mlog(ML_BASTS, "lockres %.*s, EX Holders %u\n", lockres->l_namelen, lockres->l_name, lockres->l_ex_holders); goto drop_ref; } /* yay, we can downconvert now. */ new_level = user_highest_compat_lock_level(lockres->l_blocking); lockres->l_requested = new_level; lockres->l_flags |= USER_LOCK_BUSY; mlog(ML_BASTS, "lockres %.*s, downconvert %d => %d\n", lockres->l_namelen, lockres->l_name, lockres->l_level, new_level); spin_unlock(&lockres->l_lock); /* need lock downconvert request now... 
*/ status = ocfs2_dlm_lock(conn, new_level, &lockres->l_lksb, DLM_LKF_CONVERT|DLM_LKF_VALBLK, lockres->l_name, lockres->l_namelen); if (status) { user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); } drop_ref: user_dlm_drop_inode_ref(lockres); } static inline void user_dlm_inc_holders(struct user_lock_res *lockres, int level) { switch(level) { case DLM_LOCK_EX: lockres->l_ex_holders++; break; case DLM_LOCK_PR: lockres->l_ro_holders++; break; default: BUG(); } } /* predict what lock level we'll be dropping down to on behalf * of another node, and return true if the currently wanted * level will be compatible with it. */ static inline int user_may_continue_on_blocked_lock(struct user_lock_res *lockres, int wanted) { BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); return wanted <= user_highest_compat_lock_level(lockres->l_blocking); } int user_dlm_cluster_lock(struct user_lock_res *lockres, int level, int lkm_flags) { int status, local_flags; struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); if (level != DLM_LOCK_EX && level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); status = -EINVAL; goto bail; } mlog(ML_BASTS, "lockres %.*s, level %d, flags = 0x%x\n", lockres->l_namelen, lockres->l_name, level, lkm_flags); again: if (signal_pending(current)) { status = -ERESTARTSYS; goto bail; } spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); status = -EAGAIN; goto bail; } /* We only compare against the currently granted level * here. If the lock is blocked waiting on a downconvert, * we'll get caught below. */ if ((lockres->l_flags & USER_LOCK_BUSY) && (level > lockres->l_level)) { /* is someone sitting in dlm_lock? If so, wait on * them. 
*/ spin_unlock(&lockres->l_lock); user_wait_on_busy_lock(lockres); goto again; } if ((lockres->l_flags & USER_LOCK_BLOCKED) && (!user_may_continue_on_blocked_lock(lockres, level))) { /* is the lock is currently blocked on behalf of * another node */ spin_unlock(&lockres->l_lock); user_wait_on_blocked_lock(lockres); goto again; } if (level > lockres->l_level) { local_flags = lkm_flags | DLM_LKF_VALBLK; if (lockres->l_level != DLM_LOCK_IV) local_flags |= DLM_LKF_CONVERT; lockres->l_requested = level; lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); BUG_ON(level == DLM_LOCK_IV); BUG_ON(level == DLM_LOCK_NL); /* call dlm_lock to upgrade lock now */ status = ocfs2_dlm_lock(conn, level, &lockres->l_lksb, local_flags, lockres->l_name, lockres->l_namelen); if (status) { if ((lkm_flags & DLM_LKF_NOQUEUE) && (status != -EAGAIN)) user_log_dlm_error("ocfs2_dlm_lock", status, lockres); user_recover_from_dlm_error(lockres); goto bail; } user_wait_on_busy_lock(lockres); goto again; } user_dlm_inc_holders(lockres, level); spin_unlock(&lockres->l_lock); status = 0; bail: return status; } static inline void user_dlm_dec_holders(struct user_lock_res *lockres, int level) { switch(level) { case DLM_LOCK_EX: BUG_ON(!lockres->l_ex_holders); lockres->l_ex_holders--; break; case DLM_LOCK_PR: BUG_ON(!lockres->l_ro_holders); lockres->l_ro_holders--; break; default: BUG(); } } void user_dlm_cluster_unlock(struct user_lock_res *lockres, int level) { if (level != DLM_LOCK_EX && level != DLM_LOCK_PR) { mlog(ML_ERROR, "lockres %.*s: invalid request!\n", lockres->l_namelen, lockres->l_name); return; } spin_lock(&lockres->l_lock); user_dlm_dec_holders(lockres, level); __user_dlm_cond_queue_lockres(lockres); spin_unlock(&lockres->l_lock); } void user_dlm_write_lvb(struct inode *inode, const char *val, unsigned int len) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; BUG_ON(len > DLM_LVB_LEN); spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_EX); lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(lvb, val, len); spin_unlock(&lockres->l_lock); } bool user_dlm_read_lvb(struct inode *inode, char *val) { struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres; char *lvb; bool ret = true; spin_lock(&lockres->l_lock); BUG_ON(lockres->l_level < DLM_LOCK_PR); if (ocfs2_dlm_lvb_valid(&lockres->l_lksb)) { lvb = ocfs2_dlm_lvb(&lockres->l_lksb); memcpy(val, lvb, DLM_LVB_LEN); } else ret = false; spin_unlock(&lockres->l_lock); return ret; } void user_dlm_lock_res_init(struct user_lock_res *lockres, struct dentry *dentry) { memset(lockres, 0, sizeof(*lockres)); spin_lock_init(&lockres->l_lock); init_waitqueue_head(&lockres->l_event); lockres->l_level = DLM_LOCK_IV; lockres->l_requested = DLM_LOCK_IV; lockres->l_blocking = DLM_LOCK_IV; /* should have been checked before getting here. 
*/ BUG_ON(dentry->d_name.len >= USER_DLM_LOCK_ID_MAX_LEN); memcpy(lockres->l_name, dentry->d_name.name, dentry->d_name.len); lockres->l_namelen = dentry->d_name.len; } int user_dlm_destroy_lock(struct user_lock_res *lockres) { int status = -EBUSY; struct ocfs2_cluster_connection *conn = cluster_connection_from_user_lockres(lockres); mlog(ML_BASTS, "lockres %.*s\n", lockres->l_namelen, lockres->l_name); spin_lock(&lockres->l_lock); if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags |= USER_LOCK_IN_TEARDOWN; while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); user_wait_on_busy_lock(lockres); spin_lock(&lockres->l_lock); } if (lockres->l_ro_holders || lockres->l_ex_holders) { lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); goto bail; } status = 0; if (!(lockres->l_flags & USER_LOCK_ATTACHED)) { /* * lock is never requested, leave USER_LOCK_IN_TEARDOWN set * to avoid new lock request coming in. */ spin_unlock(&lockres->l_lock); goto bail; } lockres->l_flags |= USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK); if (status) { spin_lock(&lockres->l_lock); lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN; lockres->l_flags &= ~USER_LOCK_BUSY; spin_unlock(&lockres->l_lock); user_log_dlm_error("ocfs2_dlm_unlock", status, lockres); goto bail; } user_wait_on_busy_lock(lockres); status = 0; bail: return status; } static void user_dlm_recovery_handler_noop(int node_num, void *recovery_data) { /* We ignore recovery events */ return; } void user_dlm_set_locking_protocol(void) { ocfs2_stack_glue_set_max_proto_version(&user_dlm_lproto.lp_max_version); } struct ocfs2_cluster_connection *user_dlm_register(const struct qstr *name) { int rc; struct ocfs2_cluster_connection *conn; rc = ocfs2_cluster_connect_agnostic(name->name, name->len, &user_dlm_lproto, user_dlm_recovery_handler_noop, NULL, &conn); if (rc) mlog_errno(rc); return rc ? ERR_PTR(rc) : conn; } void user_dlm_unregister(struct ocfs2_cluster_connection *conn) { ocfs2_cluster_disconnect(conn, 0); }
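/*
 * A hedged sketch (not part of userdlm.c) of how a dlmfs-style caller can
 * pair the helpers above: take the cluster lock at DLM_LOCK_EX, publish a
 * value through the LVB, then drop the holder so the unblock worker can
 * downconvert if another node is waiting. The function name is illustrative
 * only; DLMFS_I() and the inode wiring are assumed to exist as in
 * fs/ocfs2/dlmfs, and @len is assumed to be at most DLM_LVB_LEN.
 */
static int example_publish_lvb(struct inode *inode, const char *val,
			       unsigned int len)
{
	struct user_lock_res *lockres = &DLMFS_I(inode)->ip_lockres;
	int status;

	/* Blocks (or restarts) until the EX grant comes back via user_ast(). */
	status = user_dlm_cluster_lock(lockres, DLM_LOCK_EX, 0);
	if (status)
		return status;

	/* l_level is now DLM_LOCK_EX, which satisfies the BUG_ON() in
	 * user_dlm_write_lvb(). */
	user_dlm_write_lvb(inode, val, len);

	/* Dropping the last holder lets __user_dlm_cond_queue_lockres()
	 * queue the downconvert if a BAST arrived while we held the lock. */
	user_dlm_cluster_unlock(lockres, DLM_LOCK_EX);
	return 0;
}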
// SPDX-License-Identifier: GPL-2.0
#include <linux/mutex.h>
#include <linux/netdevice.h>
#include <linux/xarray.h>
#include <net/net_debug.h>
#include <net/page_pool/types.h>
#include <net/page_pool/helpers.h>
#include <net/sock.h>

#include "page_pool_priv.h"
#include "netdev-genl-gen.h"

static DEFINE_XARRAY_FLAGS(page_pools, XA_FLAGS_ALLOC1);
/* Protects: page_pools, netdevice->page_pools, pool->slow.netdev, pool->user.
 * Ordering: inside rtnl_lock
 */
static DEFINE_MUTEX(page_pools_lock);

/* Page pools are only reachable from user space (via netlink) if they are
 * linked to a netdev at creation time.
Following page pool "visibility" * states are possible: * - normal * - user.list: linked to real netdev, netdev: real netdev * - orphaned - real netdev has disappeared * - user.list: linked to lo, netdev: lo * - invisible - either (a) created without netdev linking, (b) unlisted due * to error, or (c) the entire namespace which owned this pool disappeared * - user.list: unhashed, netdev: unknown */ typedef int (*pp_nl_fill_cb)(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info); static int netdev_nl_page_pool_get_do(struct genl_info *info, u32 id, pp_nl_fill_cb fill) { struct page_pool *pool; struct sk_buff *rsp; int err; mutex_lock(&page_pools_lock); pool = xa_load(&page_pools, id); if (!pool || hlist_unhashed(&pool->user.list) || !net_eq(dev_net(pool->slow.netdev), genl_info_net(info))) { err = -ENOENT; goto err_unlock; } rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!rsp) { err = -ENOMEM; goto err_unlock; } err = fill(rsp, pool, info); if (err) goto err_free_msg; mutex_unlock(&page_pools_lock); return genlmsg_reply(rsp, info); err_free_msg: nlmsg_free(rsp); err_unlock: mutex_unlock(&page_pools_lock); return err; } struct page_pool_dump_cb { unsigned long ifindex; u32 pp_id; }; static int netdev_nl_page_pool_get_dump(struct sk_buff *skb, struct netlink_callback *cb, pp_nl_fill_cb fill) { struct page_pool_dump_cb *state = (void *)cb->ctx; const struct genl_info *info = genl_info_dump(cb); struct net *net = sock_net(skb->sk); struct net_device *netdev; struct page_pool *pool; int err = 0; rtnl_lock(); mutex_lock(&page_pools_lock); for_each_netdev_dump(net, netdev, state->ifindex) { hlist_for_each_entry(pool, &netdev->page_pools, user.list) { if (state->pp_id && state->pp_id < pool->user.id) continue; state->pp_id = pool->user.id; err = fill(skb, pool, info); if (err) goto out; } state->pp_id = 0; } out: mutex_unlock(&page_pools_lock); rtnl_unlock(); return err; } static int page_pool_nl_stats_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { #ifdef CONFIG_PAGE_POOL_STATS struct page_pool_stats stats = {}; struct nlattr *nest; void *hdr; if (!page_pool_get_stats(pool, &stats)) return 0; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; nest = nla_nest_start(rsp, NETDEV_A_PAGE_POOL_STATS_INFO); if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id) || (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex))) goto err_cancel_nest; nla_nest_end(rsp, nest); if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST, stats.alloc_stats.fast) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, stats.alloc_stats.slow) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, stats.alloc_stats.slow_high_order) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, stats.alloc_stats.empty) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, stats.alloc_stats.refill) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, stats.alloc_stats.waive) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, stats.recycle_stats.cached) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, stats.recycle_stats.cache_full) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, stats.recycle_stats.ring) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, stats.recycle_stats.ring_full) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, 
stats.recycle_stats.released_refcnt)) goto err_cancel_msg; genlmsg_end(rsp, hdr); return 0; err_cancel_nest: nla_nest_cancel(rsp, nest); err_cancel_msg: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; #else GENL_SET_ERR_MSG(info, "kernel built without CONFIG_PAGE_POOL_STATS"); return -EOPNOTSUPP; #endif } int netdev_nl_page_pool_stats_get_doit(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[ARRAY_SIZE(netdev_page_pool_info_nl_policy)]; struct nlattr *nest; int err; u32 id; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_STATS_INFO)) return -EINVAL; nest = info->attrs[NETDEV_A_PAGE_POOL_STATS_INFO]; err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, netdev_page_pool_info_nl_policy, info->extack); if (err) return err; if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, NETDEV_A_PAGE_POOL_ID)) return -EINVAL; if (tb[NETDEV_A_PAGE_POOL_IFINDEX]) { NL_SET_ERR_MSG_ATTR(info->extack, tb[NETDEV_A_PAGE_POOL_IFINDEX], "selecting by ifindex not supported"); return -EINVAL; } id = nla_get_uint(tb[NETDEV_A_PAGE_POOL_ID]); return netdev_nl_page_pool_get_do(info, id, page_pool_nl_stats_fill); } int netdev_nl_page_pool_stats_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_stats_fill); } static int page_pool_nl_fill(struct sk_buff *rsp, const struct page_pool *pool, const struct genl_info *info) { size_t inflight, refsz; void *hdr; hdr = genlmsg_iput(rsp, info); if (!hdr) return -EMSGSIZE; if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_ID, pool->user.id)) goto err_cancel; if (pool->slow.netdev->ifindex != LOOPBACK_IFINDEX && nla_put_u32(rsp, NETDEV_A_PAGE_POOL_IFINDEX, pool->slow.netdev->ifindex)) goto err_cancel; if (pool->user.napi_id && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_NAPI_ID, pool->user.napi_id)) goto err_cancel; inflight = page_pool_inflight(pool, false); refsz = PAGE_SIZE << pool->p.order; if (nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT, inflight) || nla_put_uint(rsp, NETDEV_A_PAGE_POOL_INFLIGHT_MEM, inflight * refsz)) goto err_cancel; if (pool->user.detach_time && nla_put_uint(rsp, NETDEV_A_PAGE_POOL_DETACH_TIME, pool->user.detach_time)) goto err_cancel; genlmsg_end(rsp, hdr); return 0; err_cancel: genlmsg_cancel(rsp, hdr); return -EMSGSIZE; } static void netdev_nl_page_pool_event(const struct page_pool *pool, u32 cmd) { struct genl_info info; struct sk_buff *ntf; struct net *net; lockdep_assert_held(&page_pools_lock); /* 'invisible' page pools don't matter */ if (hlist_unhashed(&pool->user.list)) return; net = dev_net(pool->slow.netdev); if (!genl_has_listeners(&netdev_nl_family, net, NETDEV_NLGRP_PAGE_POOL)) return; genl_info_init_ntf(&info, &netdev_nl_family, cmd); ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!ntf) return; if (page_pool_nl_fill(ntf, pool, &info)) { nlmsg_free(ntf); return; } genlmsg_multicast_netns(&netdev_nl_family, net, ntf, 0, NETDEV_NLGRP_PAGE_POOL, GFP_KERNEL); } int netdev_nl_page_pool_get_doit(struct sk_buff *skb, struct genl_info *info) { u32 id; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_PAGE_POOL_ID)) return -EINVAL; id = nla_get_uint(info->attrs[NETDEV_A_PAGE_POOL_ID]); return netdev_nl_page_pool_get_do(info, id, page_pool_nl_fill); } int netdev_nl_page_pool_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return netdev_nl_page_pool_get_dump(skb, cb, page_pool_nl_fill); } int page_pool_list(struct page_pool *pool) { static u32 id_alloc_next; int err; mutex_lock(&page_pools_lock); err = xa_alloc_cyclic(&page_pools, &pool->user.id, pool, xa_limit_32b, &id_alloc_next, 
GFP_KERNEL); if (err < 0) goto err_unlock; INIT_HLIST_NODE(&pool->user.list); if (pool->slow.netdev) { hlist_add_head(&pool->user.list, &pool->slow.netdev->page_pools); pool->user.napi_id = pool->p.napi ? pool->p.napi->napi_id : 0; netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_ADD_NTF); } mutex_unlock(&page_pools_lock); return 0; err_unlock: mutex_unlock(&page_pools_lock); return err; } void page_pool_detached(struct page_pool *pool) { mutex_lock(&page_pools_lock); pool->user.detach_time = ktime_get_boottime_seconds(); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); mutex_unlock(&page_pools_lock); } void page_pool_unlist(struct page_pool *pool) { mutex_lock(&page_pools_lock); netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_DEL_NTF); xa_erase(&page_pools, pool->user.id); if (!hlist_unhashed(&pool->user.list)) hlist_del(&pool->user.list); mutex_unlock(&page_pools_lock); } static void page_pool_unreg_netdev_wipe(struct net_device *netdev) { struct page_pool *pool; struct hlist_node *n; mutex_lock(&page_pools_lock); hlist_for_each_entry_safe(pool, n, &netdev->page_pools, user.list) { hlist_del_init(&pool->user.list); pool->slow.netdev = NET_PTR_POISON; } mutex_unlock(&page_pools_lock); } static void page_pool_unreg_netdev(struct net_device *netdev) { struct page_pool *pool, *last; struct net_device *lo; lo = dev_net(netdev)->loopback_dev; mutex_lock(&page_pools_lock); last = NULL; hlist_for_each_entry(pool, &netdev->page_pools, user.list) { pool->slow.netdev = lo; netdev_nl_page_pool_event(pool, NETDEV_CMD_PAGE_POOL_CHANGE_NTF); last = pool; } if (last) hlist_splice_init(&netdev->page_pools, &last->user.list, &lo->page_pools); mutex_unlock(&page_pools_lock); } static int page_pool_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; if (hlist_empty(&netdev->page_pools)) return NOTIFY_OK; if (netdev->ifindex != LOOPBACK_IFINDEX) page_pool_unreg_netdev(netdev); else page_pool_unreg_netdev_wipe(netdev); return NOTIFY_OK; } static struct notifier_block page_pool_netdevice_nb = { .notifier_call = page_pool_netdevice_event, }; static int __init page_pool_user_init(void) { return register_netdevice_notifier(&page_pool_netdevice_nb); } subsys_initcall(page_pool_user_init);
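/*
 * A hedged driver-side sketch (not part of page_pool_user.c): a pool only
 * becomes visible through the netlink code above when it is tied to a netdev
 * at creation time, which is what page_pool_list() keys off; the optional
 * napi pointer is what gets reported as NETDEV_A_PAGE_POOL_NAPI_ID. The
 * function name and parameter values are illustrative only, and the
 * page_pool_params fields shown are assumed from contemporary kernels.
 */
#include <linux/netdevice.h>
#include <net/page_pool/helpers.h>

static struct page_pool *example_create_visible_pool(struct net_device *dev,
						     struct napi_struct *napi)
{
	struct page_pool_params pp = {
		.order		= 0,		/* one page per buffer */
		.pool_size	= 256,		/* ring entries, illustrative */
		.nid		= NUMA_NO_NODE,
		.netdev		= dev,		/* lands on dev->page_pools */
		.napi		= napi,		/* may be NULL */
	};

	/* On success the pool has already been through page_pool_list(),
	 * so it shows up in page-pool-get dumps and notifications. */
	return page_pool_create(&pp);
}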
/*
   RFCOMM implementation for Linux Bluetooth stack (BlueZ)
   Copyright (C) 2002 Maxim Krasnyansky <maxk@qualcomm.com>
   Copyright (C) 2002 Marcel Holtmann <marcel@holtmann.org>

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License version 2 as
   published by the Free Software Foundation;

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE
   IS DISCLAIMED.
*/ #include <linux/refcount.h> #ifndef __RFCOMM_H #define __RFCOMM_H #define RFCOMM_CONN_TIMEOUT (HZ * 30) #define RFCOMM_DISC_TIMEOUT (HZ * 20) #define RFCOMM_AUTH_TIMEOUT (HZ * 25) #define RFCOMM_IDLE_TIMEOUT (HZ * 2) #define RFCOMM_DEFAULT_MTU 127 #define RFCOMM_DEFAULT_CREDITS 7 #define RFCOMM_MAX_CREDITS 40 #define RFCOMM_SKB_HEAD_RESERVE 8 #define RFCOMM_SKB_TAIL_RESERVE 2 #define RFCOMM_SKB_RESERVE (RFCOMM_SKB_HEAD_RESERVE + RFCOMM_SKB_TAIL_RESERVE) #define RFCOMM_SABM 0x2f #define RFCOMM_DISC 0x43 #define RFCOMM_UA 0x63 #define RFCOMM_DM 0x0f #define RFCOMM_UIH 0xef #define RFCOMM_TEST 0x08 #define RFCOMM_FCON 0x28 #define RFCOMM_FCOFF 0x18 #define RFCOMM_MSC 0x38 #define RFCOMM_RPN 0x24 #define RFCOMM_RLS 0x14 #define RFCOMM_PN 0x20 #define RFCOMM_NSC 0x04 #define RFCOMM_V24_FC 0x02 #define RFCOMM_V24_RTC 0x04 #define RFCOMM_V24_RTR 0x08 #define RFCOMM_V24_IC 0x40 #define RFCOMM_V24_DV 0x80 #define RFCOMM_RPN_BR_2400 0x0 #define RFCOMM_RPN_BR_4800 0x1 #define RFCOMM_RPN_BR_7200 0x2 #define RFCOMM_RPN_BR_9600 0x3 #define RFCOMM_RPN_BR_19200 0x4 #define RFCOMM_RPN_BR_38400 0x5 #define RFCOMM_RPN_BR_57600 0x6 #define RFCOMM_RPN_BR_115200 0x7 #define RFCOMM_RPN_BR_230400 0x8 #define RFCOMM_RPN_DATA_5 0x0 #define RFCOMM_RPN_DATA_6 0x1 #define RFCOMM_RPN_DATA_7 0x2 #define RFCOMM_RPN_DATA_8 0x3 #define RFCOMM_RPN_STOP_1 0 #define RFCOMM_RPN_STOP_15 1 #define RFCOMM_RPN_PARITY_NONE 0x0 #define RFCOMM_RPN_PARITY_ODD 0x1 #define RFCOMM_RPN_PARITY_EVEN 0x3 #define RFCOMM_RPN_PARITY_MARK 0x5 #define RFCOMM_RPN_PARITY_SPACE 0x7 #define RFCOMM_RPN_FLOW_NONE 0x00 #define RFCOMM_RPN_XON_CHAR 0x11 #define RFCOMM_RPN_XOFF_CHAR 0x13 #define RFCOMM_RPN_PM_BITRATE 0x0001 #define RFCOMM_RPN_PM_DATA 0x0002 #define RFCOMM_RPN_PM_STOP 0x0004 #define RFCOMM_RPN_PM_PARITY 0x0008 #define RFCOMM_RPN_PM_PARITY_TYPE 0x0010 #define RFCOMM_RPN_PM_XON 0x0020 #define RFCOMM_RPN_PM_XOFF 0x0040 #define RFCOMM_RPN_PM_FLOW 0x3F00 #define RFCOMM_RPN_PM_ALL 0x3F7F struct rfcomm_hdr { u8 addr; u8 ctrl; u8 len; /* Actual size can be 2 bytes */ } __packed; struct rfcomm_cmd { u8 addr; u8 ctrl; u8 len; u8 fcs; } __packed; struct rfcomm_mcc { u8 type; u8 len; } __packed; struct rfcomm_pn { u8 dlci; u8 flow_ctrl; u8 priority; u8 ack_timer; __le16 mtu; u8 max_retrans; u8 credits; } __packed; struct rfcomm_rpn { u8 dlci; u8 bit_rate; u8 line_settings; u8 flow_ctrl; u8 xon_char; u8 xoff_char; __le16 param_mask; } __packed; struct rfcomm_rls { u8 dlci; u8 status; } __packed; struct rfcomm_msc { u8 dlci; u8 v24_sig; } __packed; /* ---- Core structures, flags etc ---- */ struct rfcomm_session { struct list_head list; struct socket *sock; struct timer_list timer; unsigned long state; unsigned long flags; int initiator; /* Default DLC parameters */ int cfc; uint mtu; struct list_head dlcs; }; struct rfcomm_dlc { struct list_head list; struct rfcomm_session *session; struct sk_buff_head tx_queue; struct timer_list timer; struct mutex lock; unsigned long state; unsigned long flags; refcount_t refcnt; u8 dlci; u8 addr; u8 priority; u8 v24_sig; u8 remote_v24_sig; u8 mscex; u8 out; u8 sec_level; u8 role_switch; u32 defer_setup; uint mtu; uint cfc; uint rx_credits; uint tx_credits; void *owner; void (*data_ready)(struct rfcomm_dlc *d, struct sk_buff *skb); void (*state_change)(struct rfcomm_dlc *d, int err); void (*modem_status)(struct rfcomm_dlc *d, u8 v24_sig); }; /* DLC and session flags */ #define RFCOMM_RX_THROTTLED 0 #define RFCOMM_TX_THROTTLED 1 #define RFCOMM_TIMED_OUT 2 #define RFCOMM_MSC_PENDING 3 #define RFCOMM_SEC_PENDING 4 
#define RFCOMM_AUTH_PENDING 5 #define RFCOMM_AUTH_ACCEPT 6 #define RFCOMM_AUTH_REJECT 7 #define RFCOMM_DEFER_SETUP 8 #define RFCOMM_ENC_DROP 9 /* Scheduling flags and events */ #define RFCOMM_SCHED_WAKEUP 31 /* MSC exchange flags */ #define RFCOMM_MSCEX_TX 1 #define RFCOMM_MSCEX_RX 2 #define RFCOMM_MSCEX_OK (RFCOMM_MSCEX_TX + RFCOMM_MSCEX_RX) /* CFC states */ #define RFCOMM_CFC_UNKNOWN -1 #define RFCOMM_CFC_DISABLED 0 #define RFCOMM_CFC_ENABLED RFCOMM_MAX_CREDITS /* ---- RFCOMM SEND RPN ---- */ int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, u8 bit_rate, u8 data_bits, u8 stop_bits, u8 parity, u8 flow_ctrl_settings, u8 xon_char, u8 xoff_char, u16 param_mask); /* ---- RFCOMM DLCs (channels) ---- */ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio); void rfcomm_dlc_free(struct rfcomm_dlc *d); int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel); int rfcomm_dlc_close(struct rfcomm_dlc *d, int reason); int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb); void rfcomm_dlc_send_noerror(struct rfcomm_dlc *d, struct sk_buff *skb); int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig); int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig); void rfcomm_dlc_accept(struct rfcomm_dlc *d); struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel); #define rfcomm_dlc_lock(d) mutex_lock(&d->lock) #define rfcomm_dlc_unlock(d) mutex_unlock(&d->lock) static inline void rfcomm_dlc_hold(struct rfcomm_dlc *d) { refcount_inc(&d->refcnt); } static inline void rfcomm_dlc_put(struct rfcomm_dlc *d) { if (refcount_dec_and_test(&d->refcnt)) rfcomm_dlc_free(d); } void __rfcomm_dlc_throttle(struct rfcomm_dlc *d); void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d); static inline void rfcomm_dlc_throttle(struct rfcomm_dlc *d) { if (!test_and_set_bit(RFCOMM_RX_THROTTLED, &d->flags)) __rfcomm_dlc_throttle(d); } static inline void rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) { if (test_and_clear_bit(RFCOMM_RX_THROTTLED, &d->flags)) __rfcomm_dlc_unthrottle(d); } /* ---- RFCOMM sessions ---- */ void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst); /* ---- RFCOMM sockets ---- */ struct sockaddr_rc { sa_family_t rc_family; bdaddr_t rc_bdaddr; u8 rc_channel; }; #define RFCOMM_CONNINFO 0x02 struct rfcomm_conninfo { __u16 hci_handle; __u8 dev_class[3]; }; #define RFCOMM_LM 0x03 #define RFCOMM_LM_MASTER 0x0001 #define RFCOMM_LM_AUTH 0x0002 #define RFCOMM_LM_ENCRYPT 0x0004 #define RFCOMM_LM_TRUSTED 0x0008 #define RFCOMM_LM_RELIABLE 0x0010 #define RFCOMM_LM_SECURE 0x0020 #define RFCOMM_LM_FIPS 0x0040 #define rfcomm_pi(sk) ((struct rfcomm_pinfo *) sk) struct rfcomm_pinfo { struct bt_sock bt; bdaddr_t src; bdaddr_t dst; struct rfcomm_dlc *dlc; u8 channel; u8 sec_level; u8 role_switch; }; int rfcomm_init_sockets(void); void rfcomm_cleanup_sockets(void); int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d); /* ---- RFCOMM TTY ---- */ #define RFCOMM_MAX_DEV 256 #define RFCOMMCREATEDEV _IOW('R', 200, int) #define RFCOMMRELEASEDEV _IOW('R', 201, int) #define RFCOMMGETDEVLIST _IOR('R', 210, int) #define RFCOMMGETDEVINFO _IOR('R', 211, int) #define RFCOMMSTEALDLC _IOW('R', 220, int) /* rfcomm_dev.flags bit definitions */ #define RFCOMM_REUSE_DLC 0 #define RFCOMM_RELEASE_ONHUP 1 #define RFCOMM_HANGUP_NOW 2 #define RFCOMM_TTY_ATTACHED 3 #define RFCOMM_DEFUNCT_BIT4 4 /* don't reuse this bit - userspace visible */ /* rfcomm_dev.status bit definitions */ #define RFCOMM_DEV_RELEASED 0 #define 
RFCOMM_TTY_OWNED 1 struct rfcomm_dev_req { s16 dev_id; u32 flags; bdaddr_t src; bdaddr_t dst; u8 channel; }; struct rfcomm_dev_info { s16 id; u32 flags; u16 state; bdaddr_t src; bdaddr_t dst; u8 channel; }; struct rfcomm_dev_list_req { u16 dev_num; struct rfcomm_dev_info dev_info[]; }; int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); #ifdef CONFIG_BT_RFCOMM_TTY int rfcomm_init_ttys(void); void rfcomm_cleanup_ttys(void); #else static inline int rfcomm_init_ttys(void) { return 0; } static inline void rfcomm_cleanup_ttys(void) { } #endif #endif /* __RFCOMM_H */
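The sockaddr_rc, RFCOMM_LM and TTY ioctl definitions above are the userspace-visible surface of this protocol. As a rough illustration, a minimal client sketch using the BlueZ userspace headers (build with -lbluetooth) is shown below; the remote address "AA:BB:CC:DD:EE:FF" and channel 1 are placeholders.

/* Minimal userspace sketch: connect an RFCOMM socket to a remote channel. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <bluetooth/bluetooth.h>
#include <bluetooth/rfcomm.h>

int main(void)
{
        struct sockaddr_rc addr = { 0 };
        int s, err;

        s = socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM);
        if (s < 0) {
                perror("socket");
                return 1;
        }

        addr.rc_family = AF_BLUETOOTH;
        addr.rc_channel = (uint8_t)1;                   /* placeholder channel */
        str2ba("AA:BB:CC:DD:EE:FF", &addr.rc_bdaddr);   /* placeholder address */

        err = connect(s, (struct sockaddr *)&addr, sizeof(addr));
        if (err < 0) {
                perror("connect");
                close(s);
                return 1;
        }

        write(s, "hello", 5);                           /* data goes over the DLC */
        close(s);
        return 0;
}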
// SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright 2008 Red Hat, Inc. All rights reserved. * Copyright 2008 Ian Kent <raven@themaw.net> */ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/compat.h> #include <linux/fdtable.h> #include <linux/magic.h> #include <linux/nospec.h> #include "autofs_i.h" /* * This module implements an interface for routing autofs ioctl control * commands via a miscellaneous device file. 
* * The alternate interface is needed because we need to be able open * an ioctl file descriptor on an autofs mount that may be covered by * another mount. This situation arises when starting automount(8) * or other user space daemon which uses direct mounts or offset * mounts (used for autofs lazy mount/umount of nested mount trees), * which have been left busy at service shutdown. */ typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, struct autofs_dev_ioctl *); static int check_name(const char *name) { if (!strchr(name, '/')) return -EINVAL; return 0; } /* * Check a string doesn't overrun the chunk of * memory we copied from user land. */ static int invalid_str(char *str, size_t size) { if (memchr(str, 0, size)) return 0; return -EINVAL; } /* * Check that the user compiled against correct version of autofs * misc device code. * * As well as checking the version compatibility this always copies * the kernel interface version out. */ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) { int err = 0; if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { pr_warn("ioctl control interface version mismatch: " "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", AUTOFS_DEV_IOCTL_VERSION_MAJOR, AUTOFS_DEV_IOCTL_VERSION_MINOR, param->ver_major, param->ver_minor, cmd); err = -EINVAL; } /* Fill in the kernel version. */ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return err; } /* * Copy parameter control struct, including a possible path allocated * at the end of the struct. */ static struct autofs_dev_ioctl * copy_dev_ioctl(struct autofs_dev_ioctl __user *in) { struct autofs_dev_ioctl tmp, *res; if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) return ERR_PTR(-EFAULT); if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) return ERR_PTR(-EINVAL); if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) return ERR_PTR(-ENAMETOOLONG); res = memdup_user(in, tmp.size); if (!IS_ERR(res)) res->size = tmp.size; return res; } static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) { kfree(param); } /* * Check sanity of parameter control fields and if a path is present * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { int err; err = check_dev_ioctl_version(cmd, param); if (err) { pr_warn("invalid device control module version " "supplied for cmd(0x%08x)\n", cmd); goto out; } if (param->size > AUTOFS_DEV_IOCTL_SIZE) { err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); if (err) { pr_warn( "path string terminator missing for cmd(0x%08x)\n", cmd); goto out; } err = check_name(param->path); if (err) { pr_warn("invalid path supplied for cmd(0x%08x)\n", cmd); goto out; } } else { unsigned int inr = _IOC_NR(cmd); if (inr == AUTOFS_DEV_IOCTL_OPENMOUNT_CMD || inr == AUTOFS_DEV_IOCTL_REQUESTER_CMD || inr == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) { err = -EINVAL; goto out; } } err = 0; out: return err; } /* Return autofs dev ioctl version */ static int autofs_dev_ioctl_version(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { /* This should have already been set. 
*/ param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; return 0; } /* Return autofs module protocol version */ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protover.version = sbi->version; return 0; } /* Return autofs module protocol sub version */ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->protosubver.sub_version = sbi->sub_version; return 0; } /* Find the topmost mount satisfying test() */ static int find_autofs_mount(const char *pathname, struct path *res, int test(const struct path *path, void *data), void *data) { struct path path; int err; err = kern_path(pathname, LOOKUP_MOUNTPOINT, &path); if (err) return err; err = -ENOENT; while (path.dentry == path.mnt->mnt_root) { if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { if (test(&path, data)) { path_get(&path); *res = path; err = 0; break; } } if (!follow_up(&path)) break; } path_put(&path); return err; } static int test_by_dev(const struct path *path, void *p) { return path->dentry->d_sb->s_dev == *(dev_t *)p; } static int test_by_type(const struct path *path, void *p) { struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } /* * Open a file descriptor on the autofs mount point corresponding * to the given path and device number (aka. new_encode_dev(sb->s_dev)). */ static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) { int err, fd; fd = get_unused_fd_flags(O_CLOEXEC); if (likely(fd >= 0)) { struct file *filp; struct path path; err = find_autofs_mount(name, &path, test_by_dev, &devid); if (err) goto out; filp = dentry_open(&path, O_RDONLY, current_cred()); path_put(&path); if (IS_ERR(filp)) { err = PTR_ERR(filp); goto out; } fd_install(fd, filp); } return fd; out: put_unused_fd(fd); return err; } /* Open a file descriptor on an autofs mount point */ static int autofs_dev_ioctl_openmount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { const char *path; dev_t devid; int err, fd; /* param->path has been checked in validate_dev_ioctl() */ if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; devid = new_decode_dev(param->openmount.devid); err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); if (unlikely(fd < 0)) { err = fd; goto out; } param->ioctlfd = fd; out: return err; } /* Close file descriptor allocated above (user can also use close(2)). */ static int autofs_dev_ioctl_closemount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { return close_fd(param->ioctlfd); } /* * Send "ready" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_ready(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; return autofs_wait_release(sbi, token, 0); } /* * Send "fail" status for an existing wait (either a mount or an expire * request). */ static int autofs_dev_ioctl_fail(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_wqt_t token; int status; token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? param->fail.status : -ENOENT; return autofs_wait_release(sbi, token, status); } /* * Set the pipe fd for kernel communication to the daemon. 
* * Normally this is set at mount using an option but if we * are reconnecting to a busy mount then we need to use this * to tell the autofs mount about the new kernel pipe fd. In * order to protect mounts against incorrectly setting the * pipefd we also require that the autofs mount be catatonic. * * This also sets the process group id used to identify the * controlling process (eg. the owning automount(8) daemon). */ static int autofs_dev_ioctl_setpipefd(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { int pipefd; int err = 0; struct pid *new_pid = NULL; if (param->setpipefd.pipefd == -1) return -EINVAL; pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!(sbi->flags & AUTOFS_SBI_CATATONIC)) { mutex_unlock(&sbi->wq_mutex); return -EBUSY; } else { struct file *pipe; new_pid = get_task_pid(current, PIDTYPE_PGID); if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { pr_warn("not allowed to change PID namespace\n"); err = -EINVAL; goto out; } pipe = fget(pipefd); if (!pipe) { err = -EBADF; goto out; } if (autofs_prepare_pipe(pipe) < 0) { err = -EPIPE; fput(pipe); goto out; } swap(sbi->oz_pgrp, new_pid); sbi->pipefd = pipefd; sbi->pipe = pipe; sbi->flags &= ~AUTOFS_SBI_CATATONIC; } out: put_pid(new_pid); mutex_unlock(&sbi->wq_mutex); return err; } /* * Make the autofs mount point catatonic, no longer responsive to * mount requests. Also closes the kernel pipe file descriptor. */ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { autofs_catatonic_mode(sbi); return 0; } /* Set the autofs mount timeout */ static int autofs_dev_ioctl_timeout(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { unsigned long timeout; timeout = param->timeout.timeout; param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; return 0; } /* * Return the uid and gid of the last request for the mount * * When reconstructing an autofs mount tree with active mounts * we need to re-connect to mounts that may have used the original * process uid and gid (or string variations of them) for mount * lookups within the map entry. */ static int autofs_dev_ioctl_requester(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct autofs_info *ino; struct path path; dev_t devid; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ devid = sbi->sb->s_dev; param->requester.uid = param->requester.gid = -1; err = find_autofs_mount(param->path, &path, test_by_dev, &devid); if (err) goto out; ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); spin_unlock(&sbi->fs_lock); } path_put(&path); out: return err; } /* * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more that can be done. 
*/ static int autofs_dev_ioctl_expire(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct vfsmount *mnt; int how; how = param->expire.how; mnt = fp->f_path.mnt; return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) param->askumount.may_umount = 1; return 0; } /* * Check if the given path is a mountpoint. * * If we are supplied with the file descriptor of an autofs * mount we're looking for a specific mount. In this case * the path is considered a mountpoint if it is itself a * mountpoint or contains a mount, such as a multi-mount * without a root mount. In this case we return 1 if the * path is a mount point and the super magic of the covering * mount if there is one or 0 if it isn't a mountpoint. * * If we aren't supplied with a file descriptor then we * lookup the path and check if it is the root of a mount. * If a type is given we are looking for a particular autofs * mount and if we don't find a match we return fail. If the * located path is the root of a mount we return 1 along with * the super magic of the mount or 0 otherwise. * * In both cases the device number (as returned by * new_encode_dev()) is also returned. */ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { struct path path; const char *name; unsigned int type; unsigned int devid, magic; int err = -ENOENT; /* param->path has been checked in validate_dev_ioctl() */ name = param->path; type = param->ismountpoint.in.type; param->ismountpoint.out.devid = devid = 0; param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { if (autofs_type_any(type)) err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_MOUNTPOINT, &path); else err = find_autofs_mount(name, &path, test_by_type, &type); if (err) goto out; devid = new_encode_dev(path.dentry->d_sb->s_dev); err = 0; if (path.mnt->mnt_root == path.dentry) { err = 1; magic = path.dentry->d_sb->s_magic; } } else { dev_t dev = sbi->sb->s_dev; err = find_autofs_mount(name, &path, test_by_dev, &dev); if (err) goto out; devid = new_encode_dev(dev); err = path_has_submounts(&path); if (follow_down_one(&path)) magic = path.dentry->d_sb->s_magic; } param->ismountpoint.out.devid = devid; param->ismountpoint.out.magic = magic; path_put(&path); out: return err; } /* * Our range of ioctl numbers isn't 0 based so we need to shift * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table * lookup. 
*/ #define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) static ioctl_fn lookup_dev_ioctl(unsigned int cmd) { static const ioctl_fn _ioctls[] = { autofs_dev_ioctl_version, autofs_dev_ioctl_protover, autofs_dev_ioctl_protosubver, autofs_dev_ioctl_openmount, autofs_dev_ioctl_closemount, autofs_dev_ioctl_ready, autofs_dev_ioctl_fail, autofs_dev_ioctl_setpipefd, autofs_dev_ioctl_catatonic, autofs_dev_ioctl_timeout, autofs_dev_ioctl_requester, autofs_dev_ioctl_expire, autofs_dev_ioctl_askumount, autofs_dev_ioctl_ismountpoint, }; unsigned int idx = cmd_idx(cmd); if (idx >= ARRAY_SIZE(_ioctls)) return NULL; idx = array_index_nospec(idx, ARRAY_SIZE(_ioctls)); return _ioctls[idx]; } /* ioctl dispatcher */ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) { struct autofs_dev_ioctl *param; struct file *fp; struct autofs_sb_info *sbi; unsigned int cmd_first, cmd; ioctl_fn fn = NULL; int err = 0; cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); cmd = _IOC_NR(command); if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { return -ENOTTY; } /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && !capable(CAP_SYS_ADMIN)) return -EPERM; /* Copy the parameters into kernel space. */ param = copy_dev_ioctl(user); if (IS_ERR(param)) return PTR_ERR(param); err = validate_dev_ioctl(command, param); if (err) goto out; fn = lookup_dev_ioctl(cmd); if (!fn) { pr_warn("unknown command 0x%08x\n", command); err = -ENOTTY; goto out; } fp = NULL; sbi = NULL; /* * For obvious reasons the openmount can't have a file * descriptor yet. We don't take a reference to the * file during close to allow for immediate release, * and the same for retrieving ioctl version. */ if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { struct super_block *sb; fp = fget(param->ioctlfd); if (!fp) { if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) goto cont; err = -EBADF; goto out; } sb = file_inode(fp)->i_sb; if (sb->s_type != &autofs_fs_type) { err = -EINVAL; fput(fp); goto out; } sbi = autofs_sbi(sb); /* * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. 
*/ if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); goto out; } } cont: err = fn(fp, sbi, param); if (fp) fput(fp); if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) err = -EFAULT; out: free_dev_ioctl(param); return err; } static long autofs_dev_ioctl(struct file *file, unsigned int command, unsigned long u) { int err; err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); return (long) err; } #ifdef CONFIG_COMPAT static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, unsigned long u) { return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); } #else #define autofs_dev_ioctl_compat NULL #endif static const struct file_operations _dev_ioctl_fops = { .unlocked_ioctl = autofs_dev_ioctl, .compat_ioctl = autofs_dev_ioctl_compat, .owner = THIS_MODULE, .llseek = noop_llseek, }; static struct miscdevice _autofs_dev_ioctl_misc = { .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops, .mode = 0644, }; MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ int __init autofs_dev_ioctl_init(void) { int r; r = misc_register(&_autofs_dev_ioctl_misc); if (r) { pr_err("misc_register failed for control device\n"); return r; } return 0; } void autofs_dev_ioctl_exit(void) { misc_deregister(&_autofs_dev_ioctl_misc); }
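Putting the pieces together, the misc device registered above ("/dev/" AUTOFS_DEVICE_NAME) accepts the autofs_dev_ioctl commands dispatched by _autofs_dev_ioctl(). Below is a minimal userspace sketch, assuming the usual uapi names from <linux/auto_dev-ioctl.h> (struct autofs_dev_ioctl, AUTOFS_DEV_IOCTL_VERSION and the version/size constants); it issues the VERSION command, which needs neither CAP_SYS_ADMIN nor a path argument.

/* Userspace sketch: query the autofs misc-device interface version. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
        struct autofs_dev_ioctl param;
        int fd, ret;

        fd = open("/dev/autofs", O_RDONLY);
        if (fd < 0) {
                perror("open /dev/autofs");
                return 1;
        }

        /* Fill exactly the fields validate_dev_ioctl() checks. */
        memset(&param, 0, sizeof(param));
        param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
        param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
        param.size      = AUTOFS_DEV_IOCTL_SIZE;        /* no path appended */
        param.ioctlfd   = -1;

        ret = ioctl(fd, AUTOFS_DEV_IOCTL_VERSION, &param);
        if (ret < 0)
                perror("AUTOFS_DEV_IOCTL_VERSION");
        else
                printf("autofs dev ioctl version %u.%u\n",
                       param.ver_major, param.ver_minor);

        close(fd);
        return 0;
}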
// SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_bit.h" #include "xfs_sb.h" #include "xfs_mount.h" #include "xfs_ialloc.h" #include "xfs_alloc.h" #include "xfs_error.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_bmap_btree.h" #include "xfs_alloc_btree.h" #include "xfs_log.h" #include "xfs_rmap_btree.h" #include "xfs_refcount_btree.h" #include "xfs_da_format.h" #include "xfs_health.h" #include "xfs_ag.h" #include "xfs_rtbitmap.h" /* * Physical superblock buffer manipulations. Shared with libxfs in userspace. */ /* * Check that all the V4 feature bits that the V5 filesystem format requires are * correctly set. 
*/ static bool xfs_sb_validate_v5_features( struct xfs_sb *sbp) { /* We must not have any unknown V4 feature bits set */ if (sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) return false; /* * The CRC bit is considered an invalid V4 flag, so we have to add it * manually to the OKBITS mask. */ if (sbp->sb_features2 & ~(XFS_SB_VERSION2_OKBITS | XFS_SB_VERSION2_CRCBIT)) return false; /* Now check all the required V4 feature flags are set. */ #define V5_VERS_FLAGS (XFS_SB_VERSION_NLINKBIT | \ XFS_SB_VERSION_ALIGNBIT | \ XFS_SB_VERSION_LOGV2BIT | \ XFS_SB_VERSION_EXTFLGBIT | \ XFS_SB_VERSION_DIRV2BIT | \ XFS_SB_VERSION_MOREBITSBIT) #define V5_FEAT_FLAGS (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ XFS_SB_VERSION2_ATTR2BIT | \ XFS_SB_VERSION2_PROJID32BIT | \ XFS_SB_VERSION2_CRCBIT) if ((sbp->sb_versionnum & V5_VERS_FLAGS) != V5_VERS_FLAGS) return false; if ((sbp->sb_features2 & V5_FEAT_FLAGS) != V5_FEAT_FLAGS) return false; return true; } /* * We current support XFS v5 formats with known features and v4 superblocks with * at least V2 directories. */ bool xfs_sb_good_version( struct xfs_sb *sbp) { /* * All v5 filesystems are supported, but we must check that all the * required v4 feature flags are enabled correctly as the code checks * those flags and not for v5 support. */ if (xfs_sb_is_v5(sbp)) return xfs_sb_validate_v5_features(sbp); /* versions prior to v4 are not supported */ if (XFS_SB_VERSION_NUM(sbp) != XFS_SB_VERSION_4) return false; /* We must not have any unknown v4 feature bits set */ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) || ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && (sbp->sb_features2 & ~XFS_SB_VERSION2_OKBITS))) return false; /* V4 filesystems need v2 directories and unwritten extents */ if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT)) return false; if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT)) return false; /* It's a supported v4 filesystem */ return true; } uint64_t xfs_sb_version_to_features( struct xfs_sb *sbp) { uint64_t features = 0; /* optional V4 features */ if (sbp->sb_rblocks > 0) features |= XFS_FEAT_REALTIME; if (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT) features |= XFS_FEAT_NLINK; if (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT) features |= XFS_FEAT_ATTR; if (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT) features |= XFS_FEAT_QUOTA; if (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT) features |= XFS_FEAT_ALIGN; if (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT) features |= XFS_FEAT_LOGV2; if (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT) features |= XFS_FEAT_DALIGN; if (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT) features |= XFS_FEAT_EXTFLG; if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) features |= XFS_FEAT_SECTOR; if (sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT) features |= XFS_FEAT_ASCIICI; if (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) { if (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT) features |= XFS_FEAT_LAZYSBCOUNT; if (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT) features |= XFS_FEAT_ATTR2; if (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT) features |= XFS_FEAT_PROJID32; if (sbp->sb_features2 & XFS_SB_VERSION2_FTYPE) features |= XFS_FEAT_FTYPE; } if (!xfs_sb_is_v5(sbp)) return features; /* Always on V5 features */ features |= XFS_FEAT_ALIGN | XFS_FEAT_LOGV2 | XFS_FEAT_EXTFLG | XFS_FEAT_LAZYSBCOUNT | XFS_FEAT_ATTR2 | XFS_FEAT_PROJID32 | XFS_FEAT_V3INODES | XFS_FEAT_CRC | XFS_FEAT_PQUOTINO; /* Optional V5 features */ if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT) features |= XFS_FEAT_FINOBT; if 
(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT) features |= XFS_FEAT_RMAPBT; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK) features |= XFS_FEAT_REFLINK; if (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_INOBTCNT) features |= XFS_FEAT_INOBTCNT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_FTYPE) features |= XFS_FEAT_FTYPE; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) features |= XFS_FEAT_SPINODES; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) features |= XFS_FEAT_META_UUID; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_BIGTIME) features |= XFS_FEAT_BIGTIME; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NEEDSREPAIR) features |= XFS_FEAT_NEEDSREPAIR; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_NREXT64) features |= XFS_FEAT_NREXT64; return features; } /* Check all the superblock fields we care about when reading one in. */ STATIC int xfs_validate_sb_read( struct xfs_mount *mp, struct xfs_sb *sbp) { if (!xfs_sb_is_v5(sbp)) return 0; /* * Version 5 superblock feature mask validation. Reject combinations * the kernel cannot support up front before checking anything else. */ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, "Superblock has unknown compatible features (0x%x) enabled.", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); xfs_warn(mp, "Using a more recent kernel is recommended."); } if (xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Superblock has unknown read-only compatible features (0x%x) enabled.", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); if (!xfs_is_readonly(mp)) { xfs_warn(mp, "Attempted to mount read-only compatible filesystem read-write."); xfs_warn(mp, "Filesystem can only be safely mounted read only."); return -EINVAL; } } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, "Superblock has unknown incompatible features (0x%x) enabled.", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); xfs_warn(mp, "Filesystem cannot be safely mounted by this kernel."); return -EINVAL; } return 0; } /* Check all the superblock fields we care about when writing one out. */ STATIC int xfs_validate_sb_write( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_sb *sbp) { /* * Carry out additional sb summary counter sanity checks when we write * the superblock. We skip this in the read validator because there * could be newer superblocks in the log and if the values are garbage * even after replay we'll recalculate them at the end of log mount. * * mkfs has traditionally written zeroed counters to inprogress and * secondary superblocks, so allow this usage to continue because * we never read counters from such superblocks. */ if (xfs_buf_daddr(bp) == XFS_SB_DADDR && !sbp->sb_inprogress && (sbp->sb_fdblocks > sbp->sb_dblocks || !xfs_verify_icount(mp, sbp->sb_icount) || sbp->sb_ifree > sbp->sb_icount)) { xfs_warn(mp, "SB summary counter sanity check failed"); return -EFSCORRUPTED; } if (!xfs_sb_is_v5(sbp)) return 0; /* * Version 5 superblock feature mask validation. Reject combinations * the kernel cannot support since we checked for unsupported bits in * the read verifier, which means that memory is corrupt. 
*/ if (xfs_sb_has_compat_feature(sbp, XFS_SB_FEAT_COMPAT_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock compatible features (0x%x)!", (sbp->sb_features_compat & XFS_SB_FEAT_COMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (!xfs_is_readonly(mp) && xfs_sb_has_ro_compat_feature(sbp, XFS_SB_FEAT_RO_COMPAT_UNKNOWN)) { xfs_alert(mp, "Corruption detected in superblock read-only compatible features (0x%x)!", (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock incompatible features (0x%x)!", (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_UNKNOWN)); return -EFSCORRUPTED; } if (xfs_sb_has_incompat_log_feature(sbp, XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { xfs_warn(mp, "Corruption detected in superblock incompatible log features (0x%x)!", (sbp->sb_features_log_incompat & XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); return -EFSCORRUPTED; } /* * We can't read verify the sb LSN because the read verifier is called * before the log is allocated and processed. We know the log is set up * before write verifier calls, so check it here. */ if (!xfs_log_check_lsn(mp, sbp->sb_lsn)) return -EFSCORRUPTED; return 0; } /* Check the validity of the SB. */ STATIC int xfs_validate_sb_common( struct xfs_mount *mp, struct xfs_buf *bp, struct xfs_sb *sbp) { struct xfs_dsb *dsb = bp->b_addr; uint32_t agcount = 0; uint32_t rem; bool has_dalign; if (!xfs_verify_magic(bp, dsb->sb_magicnum)) { xfs_warn(mp, "Superblock has bad magic number 0x%x. Not an XFS filesystem?", be32_to_cpu(dsb->sb_magicnum)); return -EWRONGFS; } if (!xfs_sb_good_version(sbp)) { xfs_warn(mp, "Superblock has unknown features enabled or corrupted feature masks."); return -EWRONGFS; } /* * Validate feature flags and state */ if (xfs_sb_is_v5(sbp)) { if (sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { xfs_notice(mp, "Block size (%u bytes) too small for Version 5 superblock (minimum %d bytes)", sbp->sb_blocksize, XFS_MIN_CRC_BLOCKSIZE); return -EFSCORRUPTED; } /* V5 has a separate project quota inode */ if (sbp->sb_qflags & (XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD)) { xfs_notice(mp, "Version 5 of Super block has XFS_OQUOTA bits."); return -EFSCORRUPTED; } /* * Full inode chunks must be aligned to inode chunk size when * sparse inodes are enabled to support the sparse chunk * allocation algorithm and prevent overlapping inode records. 
*/ if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_SPINODES) { uint32_t align; align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize >> sbp->sb_blocklog; if (sbp->sb_inoalignmt != align) { xfs_warn(mp, "Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", sbp->sb_inoalignmt, align); return -EINVAL; } } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, "Superblock earlier than Version 5 has XFS_{P|G}QUOTA_{ENFD|CHKD} bits."); return -EFSCORRUPTED; } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, "filesystem is marked as having an external log; " "specify logdev on the mount command line."); return -EINVAL; } if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { xfs_warn(mp, "filesystem is marked as having an internal log; " "do not specify logdev on the mount command line."); return -EINVAL; } /* Compute agcount for this number of dblocks and agblocks */ if (sbp->sb_agblocks) { agcount = div_u64_rem(sbp->sb_dblocks, sbp->sb_agblocks, &rem); if (rem) agcount++; } /* * More sanity checking. Most of these were stolen directly from * xfs_repair. */ if (unlikely( sbp->sb_agcount <= 0 || sbp->sb_sectsize < XFS_MIN_SECTORSIZE || sbp->sb_sectsize > XFS_MAX_SECTORSIZE || sbp->sb_sectlog < XFS_MIN_SECTORSIZE_LOG || sbp->sb_sectlog > XFS_MAX_SECTORSIZE_LOG || sbp->sb_sectsize != (1 << sbp->sb_sectlog) || sbp->sb_blocksize < XFS_MIN_BLOCKSIZE || sbp->sb_blocksize > XFS_MAX_BLOCKSIZE || sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || sbp->sb_dirblklog + sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || XFS_FSB_TO_B(mp, sbp->sb_agblocks) < XFS_MIN_AG_BYTES || XFS_FSB_TO_B(mp, sbp->sb_agblocks) > XFS_MAX_AG_BYTES || sbp->sb_agblklog != xfs_highbit32(sbp->sb_agblocks - 1) + 1 || agcount == 0 || agcount != sbp->sb_agcount || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */) || sbp->sb_dblocks == 0 || sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp) || sbp->sb_shared_vn != 0)) { xfs_notice(mp, "SB sanity check failed"); return -EFSCORRUPTED; } /* * Logs that are too large are not supported at all. Reject them * outright. Logs that are too small are tolerated on v4 filesystems, * but we can only check that when mounting the log. Hence we skip * those checks here. */ if (sbp->sb_logblocks > XFS_MAX_LOG_BLOCKS) { xfs_notice(mp, "Log size 0x%x blocks too large, maximum size is 0x%llx blocks", sbp->sb_logblocks, XFS_MAX_LOG_BLOCKS); return -EFSCORRUPTED; } if (XFS_FSB_TO_B(mp, sbp->sb_logblocks) > XFS_MAX_LOG_BYTES) { xfs_warn(mp, "log size 0x%llx bytes too large, maximum size is 0x%llx bytes", XFS_FSB_TO_B(mp, sbp->sb_logblocks), XFS_MAX_LOG_BYTES); return -EFSCORRUPTED; } /* * Do not allow filesystems with corrupted log sector or stripe units to * be mounted. We cannot safely size the iclogs or write to the log if * the log stripe unit is not valid. 
*/ if (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT) { if (sbp->sb_logsectsize != (1U << sbp->sb_logsectlog)) { xfs_notice(mp, "log sector size in bytes/log2 (0x%x/0x%x) must match", sbp->sb_logsectsize, 1U << sbp->sb_logsectlog); return -EFSCORRUPTED; } } else if (sbp->sb_logsectsize || sbp->sb_logsectlog) { xfs_notice(mp, "log sector size in bytes/log2 (0x%x/0x%x) are not zero", sbp->sb_logsectsize, sbp->sb_logsectlog); return -EFSCORRUPTED; } if (sbp->sb_logsunit > 1) { if (sbp->sb_logsunit % sbp->sb_blocksize) { xfs_notice(mp, "log stripe unit 0x%x bytes must be a multiple of block size", sbp->sb_logsunit); return -EFSCORRUPTED; } if (sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE) { xfs_notice(mp, "log stripe unit 0x%x bytes over maximum size (0x%x bytes)", sbp->sb_logsunit, XLOG_MAX_RECORD_BSIZE); return -EFSCORRUPTED; } } /* Validate the realtime geometry; stolen from xfs_repair */ if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) { xfs_notice(mp, "realtime extent sanity check failed"); return -EFSCORRUPTED; } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || sbp->sb_rextslog != 0 || sbp->sb_frextents != 0) { xfs_notice(mp, "realtime zeroed geometry check failed"); return -EFSCORRUPTED; } } else { uint64_t rexts; uint64_t rbmblocks; rexts = div_u64(sbp->sb_rblocks, sbp->sb_rextsize); rbmblocks = howmany_64(sbp->sb_rextents, NBBY * sbp->sb_blocksize); if (!xfs_validate_rtextents(rexts) || sbp->sb_rextents != rexts || sbp->sb_rextslog != xfs_compute_rextslog(rexts) || sbp->sb_rbmblocks != rbmblocks) { xfs_notice(mp, "realtime geometry sanity check failed"); return -EFSCORRUPTED; } } /* * Either (sb_unit and !hasdalign) or (!sb_unit and hasdalign) * would imply the image is corrupted. */ has_dalign = sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT; if (!!sbp->sb_unit ^ has_dalign) { xfs_notice(mp, "SB stripe alignment sanity check failed"); return -EFSCORRUPTED; } if (!xfs_validate_stripe_geometry(mp, XFS_FSB_TO_B(mp, sbp->sb_unit), XFS_FSB_TO_B(mp, sbp->sb_width), 0, false)) return -EFSCORRUPTED; /* * Currently only very few inode sizes are supported. */ switch (sbp->sb_inodesize) { case 256: case 512: case 1024: case 2048: break; default: xfs_warn(mp, "inode size of %d bytes not supported", sbp->sb_inodesize); return -ENOSYS; } return 0; } void xfs_sb_quota_from_disk(struct xfs_sb *sbp) { /* * older mkfs doesn't initialize quota inodes to NULLFSINO. This * leads to in-core values having two different values for a quota * inode to be invalid: 0 and NULLFSINO. Change it to a single value * NULLFSINO. * * Note that this change affect only the in-core values. These * values are not written back to disk unless any quota information * is written to the disk. Even in that case, sb_pquotino field is * not written to disk unless the superblock supports pquotino. */ if (sbp->sb_uquotino == 0) sbp->sb_uquotino = NULLFSINO; if (sbp->sb_gquotino == 0) sbp->sb_gquotino = NULLFSINO; if (sbp->sb_pquotino == 0) sbp->sb_pquotino = NULLFSINO; /* * We need to do these manipilations only if we are working * with an older version of on-disk superblock. */ if (xfs_sb_is_v5(sbp)) return; if (sbp->sb_qflags & XFS_OQUOTA_ENFD) sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? XFS_PQUOTA_ENFD : XFS_GQUOTA_ENFD; if (sbp->sb_qflags & XFS_OQUOTA_CHKD) sbp->sb_qflags |= (sbp->sb_qflags & XFS_PQUOTA_ACCT) ? 
XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); if (sbp->sb_qflags & XFS_PQUOTA_ACCT && sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, * copy over sb_gquotino to sb_pquotino. The NULLFSINO test * above is to make sure we don't do this twice and wipe them * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; } } static void __xfs_sb_from_disk( struct xfs_sb *to, struct xfs_dsb *from, bool convert_xquota) { to->sb_magicnum = be32_to_cpu(from->sb_magicnum); to->sb_blocksize = be32_to_cpu(from->sb_blocksize); to->sb_dblocks = be64_to_cpu(from->sb_dblocks); to->sb_rblocks = be64_to_cpu(from->sb_rblocks); to->sb_rextents = be64_to_cpu(from->sb_rextents); memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); to->sb_logstart = be64_to_cpu(from->sb_logstart); to->sb_rootino = be64_to_cpu(from->sb_rootino); to->sb_rbmino = be64_to_cpu(from->sb_rbmino); to->sb_rsumino = be64_to_cpu(from->sb_rsumino); to->sb_rextsize = be32_to_cpu(from->sb_rextsize); to->sb_agblocks = be32_to_cpu(from->sb_agblocks); to->sb_agcount = be32_to_cpu(from->sb_agcount); to->sb_rbmblocks = be32_to_cpu(from->sb_rbmblocks); to->sb_logblocks = be32_to_cpu(from->sb_logblocks); to->sb_versionnum = be16_to_cpu(from->sb_versionnum); to->sb_sectsize = be16_to_cpu(from->sb_sectsize); to->sb_inodesize = be16_to_cpu(from->sb_inodesize); to->sb_inopblock = be16_to_cpu(from->sb_inopblock); memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); to->sb_blocklog = from->sb_blocklog; to->sb_sectlog = from->sb_sectlog; to->sb_inodelog = from->sb_inodelog; to->sb_inopblog = from->sb_inopblog; to->sb_agblklog = from->sb_agblklog; to->sb_rextslog = from->sb_rextslog; to->sb_inprogress = from->sb_inprogress; to->sb_imax_pct = from->sb_imax_pct; to->sb_icount = be64_to_cpu(from->sb_icount); to->sb_ifree = be64_to_cpu(from->sb_ifree); to->sb_fdblocks = be64_to_cpu(from->sb_fdblocks); to->sb_frextents = be64_to_cpu(from->sb_frextents); to->sb_uquotino = be64_to_cpu(from->sb_uquotino); to->sb_gquotino = be64_to_cpu(from->sb_gquotino); to->sb_qflags = be16_to_cpu(from->sb_qflags); to->sb_flags = from->sb_flags; to->sb_shared_vn = from->sb_shared_vn; to->sb_inoalignmt = be32_to_cpu(from->sb_inoalignmt); to->sb_unit = be32_to_cpu(from->sb_unit); to->sb_width = be32_to_cpu(from->sb_width); to->sb_dirblklog = from->sb_dirblklog; to->sb_logsectlog = from->sb_logsectlog; to->sb_logsectsize = be16_to_cpu(from->sb_logsectsize); to->sb_logsunit = be32_to_cpu(from->sb_logsunit); to->sb_features2 = be32_to_cpu(from->sb_features2); to->sb_bad_features2 = be32_to_cpu(from->sb_bad_features2); to->sb_features_compat = be32_to_cpu(from->sb_features_compat); to->sb_features_ro_compat = be32_to_cpu(from->sb_features_ro_compat); to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); to->sb_features_log_incompat = be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* * sb_meta_uuid is only on disk if it differs from sb_uuid and the * feature flag is set; if not set we keep it only in memory. 
*/ if (xfs_sb_is_v5(to) && (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID)) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); else uuid_copy(&to->sb_meta_uuid, &from->sb_uuid); /* Convert on-disk flags to in-memory flags? */ if (convert_xquota) xfs_sb_quota_from_disk(to); } void xfs_sb_from_disk( struct xfs_sb *to, struct xfs_dsb *from) { __xfs_sb_from_disk(to, from, true); } static void xfs_sb_quota_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { uint16_t qflags = from->sb_qflags; to->sb_uquotino = cpu_to_be64(from->sb_uquotino); /* * The in-memory superblock quota state matches the v5 on-disk format so * just write them out and return */ if (xfs_sb_is_v5(from)) { to->sb_qflags = cpu_to_be16(from->sb_qflags); to->sb_gquotino = cpu_to_be64(from->sb_gquotino); to->sb_pquotino = cpu_to_be64(from->sb_pquotino); return; } /* * For older superblocks (v4), the in-core version of sb_qflags do not * have XFS_OQUOTA_* flags, whereas the on-disk version does. So, * convert incore XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags. */ qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD); if (from->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD)) qflags |= XFS_OQUOTA_ENFD; if (from->sb_qflags & (XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) qflags |= XFS_OQUOTA_CHKD; to->sb_qflags = cpu_to_be16(qflags); /* * GQUOTINO and PQUOTINO cannot be used together in versions * of superblock that do not have pquotino. from->sb_flags * tells us which quota is active and should be copied to * disk. If neither are active, we should NULL the inode. * * In all cases, the separate pquotino must remain 0 because it * is beyond the "end" of the valid non-pquotino superblock. */ if (from->sb_qflags & XFS_GQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_gquotino); else if (from->sb_qflags & XFS_PQUOTA_ACCT) to->sb_gquotino = cpu_to_be64(from->sb_pquotino); else { /* * We can't rely on just the fields being logged to tell us * that it is safe to write NULLFSINO - we should only do that * if quotas are not actually enabled. Hence only write * NULLFSINO if both in-core quota inodes are NULL. 
*/ if (from->sb_gquotino == NULLFSINO && from->sb_pquotino == NULLFSINO) to->sb_gquotino = cpu_to_be64(NULLFSINO); } to->sb_pquotino = 0; } void xfs_sb_to_disk( struct xfs_dsb *to, struct xfs_sb *from) { xfs_sb_quota_to_disk(to, from); to->sb_magicnum = cpu_to_be32(from->sb_magicnum); to->sb_blocksize = cpu_to_be32(from->sb_blocksize); to->sb_dblocks = cpu_to_be64(from->sb_dblocks); to->sb_rblocks = cpu_to_be64(from->sb_rblocks); to->sb_rextents = cpu_to_be64(from->sb_rextents); memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid)); to->sb_logstart = cpu_to_be64(from->sb_logstart); to->sb_rootino = cpu_to_be64(from->sb_rootino); to->sb_rbmino = cpu_to_be64(from->sb_rbmino); to->sb_rsumino = cpu_to_be64(from->sb_rsumino); to->sb_rextsize = cpu_to_be32(from->sb_rextsize); to->sb_agblocks = cpu_to_be32(from->sb_agblocks); to->sb_agcount = cpu_to_be32(from->sb_agcount); to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks); to->sb_logblocks = cpu_to_be32(from->sb_logblocks); to->sb_versionnum = cpu_to_be16(from->sb_versionnum); to->sb_sectsize = cpu_to_be16(from->sb_sectsize); to->sb_inodesize = cpu_to_be16(from->sb_inodesize); to->sb_inopblock = cpu_to_be16(from->sb_inopblock); memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname)); to->sb_blocklog = from->sb_blocklog; to->sb_sectlog = from->sb_sectlog; to->sb_inodelog = from->sb_inodelog; to->sb_inopblog = from->sb_inopblog; to->sb_agblklog = from->sb_agblklog; to->sb_rextslog = from->sb_rextslog; to->sb_inprogress = from->sb_inprogress; to->sb_imax_pct = from->sb_imax_pct; to->sb_icount = cpu_to_be64(from->sb_icount); to->sb_ifree = cpu_to_be64(from->sb_ifree); to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks); to->sb_frextents = cpu_to_be64(from->sb_frextents); to->sb_flags = from->sb_flags; to->sb_shared_vn = from->sb_shared_vn; to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt); to->sb_unit = cpu_to_be32(from->sb_unit); to->sb_width = cpu_to_be32(from->sb_width); to->sb_dirblklog = from->sb_dirblklog; to->sb_logsectlog = from->sb_logsectlog; to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize); to->sb_logsunit = cpu_to_be32(from->sb_logsunit); /* * We need to ensure that bad_features2 always matches features2. * Hence we enforce that here rather than having to remember to do it * everywhere else that updates features2. */ from->sb_bad_features2 = from->sb_features2; to->sb_features2 = cpu_to_be32(from->sb_features2); to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2); if (!xfs_sb_is_v5(from)) return; to->sb_features_compat = cpu_to_be32(from->sb_features_compat); to->sb_features_ro_compat = cpu_to_be32(from->sb_features_ro_compat); to->sb_features_incompat = cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_META_UUID) uuid_copy(&to->sb_meta_uuid, &from->sb_meta_uuid); } /* * If the superblock has the CRC feature bit set or the CRC field is non-null, * check that the CRC is valid. We check the CRC field is non-null because a * single bit error could clear the feature bit and unused parts of the * superblock are supposed to be zero. Hence a non-null crc field indicates that * we've potentially lost a feature bit and we should check it anyway. * * However, past bugs (i.e. in growfs) left non-zeroed regions beyond the * last field in V4 secondary superblocks. 
So for secondary superblocks, * we are more forgiving, and ignore CRC failures if the primary doesn't * indicate that the fs version is V5. */ static void xfs_sb_read_verify( struct xfs_buf *bp) { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_dsb *dsb = bp->b_addr; int error; /* * open code the version check to avoid needing to convert the entire * superblock from disk order just to check the version number */ if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC) && (((be16_to_cpu(dsb->sb_versionnum) & XFS_SB_VERSION_NUMBITS) == XFS_SB_VERSION_5) || dsb->sb_crc != 0)) { if (!xfs_buf_verify_cksum(bp, XFS_SB_CRC_OFF)) { /* Only fail bad secondaries on a known V5 filesystem */ if (xfs_buf_daddr(bp) == XFS_SB_DADDR || xfs_has_crc(mp)) { error = -EFSBADCRC; goto out_error; } } } /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; error = xfs_validate_sb_read(mp, &sb); out_error: if (error == -EFSCORRUPTED || error == -EFSBADCRC) xfs_verifier_error(bp, error, __this_address); else if (error) xfs_buf_ioerror(bp, error); } /* * We may be probed for a filesystem match, so we may not want to emit * messages when the superblock buffer is not actually an XFS superblock. * If we find an XFS superblock, then run a normal, noisy mount because we are * really going to mount it and want to know about errors. */ static void xfs_sb_quiet_read_verify( struct xfs_buf *bp) { struct xfs_dsb *dsb = bp->b_addr; if (dsb->sb_magicnum == cpu_to_be32(XFS_SB_MAGIC)) { /* XFS filesystem, verify noisily! */ xfs_sb_read_verify(bp); return; } /* quietly fail */ xfs_buf_ioerror(bp, -EWRONGFS); } static void xfs_sb_write_verify( struct xfs_buf *bp) { struct xfs_sb sb; struct xfs_mount *mp = bp->b_mount; struct xfs_buf_log_item *bip = bp->b_log_item; struct xfs_dsb *dsb = bp->b_addr; int error; /* * Check all the superblock fields. Don't byteswap the xquota flags * because _verify_common checks the on-disk values. */ __xfs_sb_from_disk(&sb, dsb, false); error = xfs_validate_sb_common(mp, bp, &sb); if (error) goto out_error; error = xfs_validate_sb_write(mp, bp, &sb); if (error) goto out_error; if (!xfs_sb_is_v5(&sb)) return; if (bip) dsb->sb_lsn = cpu_to_be64(bip->bli_item.li_lsn); xfs_buf_update_cksum(bp, XFS_SB_CRC_OFF); return; out_error: xfs_verifier_error(bp, error, __this_address); } const struct xfs_buf_ops xfs_sb_buf_ops = { .name = "xfs_sb", .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { .name = "xfs_sb_quiet", .magic = { cpu_to_be32(XFS_SB_MAGIC), cpu_to_be32(XFS_SB_MAGIC) }, .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; /* * xfs_mount_common * * Mount initialization code establishing various mount * fields from the superblock associated with the given * mount structure. * * Inode geometry are calculated in xfs_ialloc_setup_geometry. 
*/ void xfs_sb_mount_common( struct xfs_mount *mp, struct xfs_sb *sbp) { mp->m_agfrotor = 0; atomic_set(&mp->m_agirotor, 0); mp->m_maxagi = mp->m_sb.sb_agcount; mp->m_blkbit_log = sbp->sb_blocklog + XFS_NBBYLOG; mp->m_blkbb_log = sbp->sb_blocklog - BBSHIFT; mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT; mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1; mp->m_blockmask = sbp->sb_blocksize - 1; mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG; mp->m_blockwmask = mp->m_blockwsize - 1; mp->m_rtxblklog = log2_if_power2(sbp->sb_rextsize); mp->m_rtxblkmask = mask64_if_power2(sbp->sb_rextsize); mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0); mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2; mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2; mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1); mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0); mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2; mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2; mp->m_rmap_mxr[0] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 1); mp->m_rmap_mxr[1] = xfs_rmapbt_maxrecs(sbp->sb_blocksize, 0); mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2; mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2; mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, true); mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(sbp->sb_blocksize, false); mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2; mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2; mp->m_bsize = XFS_FSB_TO_BB(mp, 1); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); mp->m_ag_max_usable = xfs_alloc_ag_max_usable(mp); } /* * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock * into the superblock buffer to be logged. It does not provide the higher * level of locking that is needed to protect the in-core superblock from * concurrent access. */ void xfs_log_sb( struct xfs_trans *tp) { struct xfs_mount *mp = tp->t_mountp; struct xfs_buf *bp = xfs_trans_getsb(tp); /* * Lazy sb counters don't update the in-core superblock so do that now. * If this is at unmount, the counters will be exactly correct, but at * any other time they will only be ballpark correct because of * reservations that have been taken out percpu counters. If we have an * unclean shutdown, this will be corrected by log recovery rebuilding * the counters from the AGF block counts. * * Do not update sb_frextents here because it is not part of the lazy * sb counters, despite having a percpu counter. It is always kept * consistent with the ondisk rtbitmap by xfs_trans_apply_sb_deltas() * and hence we don't need have to update it here. */ if (xfs_has_lazysbcount(mp)) { mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount); mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum(&mp->m_ifree), mp->m_sb.sb_icount); mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks); } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1); } /* * xfs_sync_sb * * Sync the superblock to disk. * * Note that the caller is responsible for checking the frozen state of the * filesystem. This procedure uses the non-blocking transaction allocator and * thus will allow modifications to a frozen fs. This is required because this * code can be called during the process of freezing where use of the high-level * allocator would deadlock. 
*/ int xfs_sync_sb( struct xfs_mount *mp, bool wait) { struct xfs_trans *tp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, XFS_TRANS_NO_WRITECOUNT, &tp); if (error) return error; xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); return xfs_trans_commit(tp); } /* * Update all the secondary superblocks to match the new state of the primary. * Because we are completely overwriting all the existing fields in the * secondary superblock buffers, there is no need to read them in from disk. * Just get a new buffer, stamp it and write it. * * The sb buffers need to be cached here so that we serialise against other * operations that access the secondary superblocks, but we don't want to keep * them in memory once it is written so we mark it as a one-shot buffer. */ int xfs_update_secondary_sbs( struct xfs_mount *mp) { struct xfs_perag *pag; xfs_agnumber_t agno = 1; int saved_error = 0; int error = 0; LIST_HEAD (buffer_list); /* update secondary superblocks. */ for_each_perag_from(mp, agno, pag) { struct xfs_buf *bp; error = xfs_buf_get(mp->m_ddev_targp, XFS_AG_DADDR(mp, pag->pag_agno, XFS_SB_DADDR), XFS_FSS_TO_BB(mp, 1), &bp); /* * If we get an error reading or writing alternate superblocks, * continue. xfs_repair chooses the "best" superblock based * on most matches; if we break early, we'll leave more * superblocks un-updated than updated, and xfs_repair may * pick them over the properly-updated primary. */ if (error) { xfs_warn(mp, "error allocating secondary superblock for ag %d", pag->pag_agno); if (!saved_error) saved_error = error; continue; } bp->b_ops = &xfs_sb_buf_ops; xfs_buf_oneshot(bp); xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_buf_delwri_queue(bp, &buffer_list); xfs_buf_relse(bp); /* don't hold too many buffers at once */ if (agno % 16) continue; error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", error, pag->pag_agno); if (!saved_error) saved_error = error; continue; } } error = xfs_buf_delwri_submit(&buffer_list); if (error) { xfs_warn(mp, "write error %d updating a secondary superblock near ag %d", error, agno); } return saved_error ? saved_error : error; } /* * Same behavior as xfs_sync_sb, except that it is always synchronous and it * also writes the superblock buffer to disk sector 0 immediately. 
*/ int xfs_sync_sb_buf( struct xfs_mount *mp) { struct xfs_trans *tp; struct xfs_buf *bp; int error; error = xfs_trans_alloc(mp, &M_RES(mp)->tr_sb, 0, 0, 0, &tp); if (error) return error; bp = xfs_trans_getsb(tp); xfs_log_sb(tp); xfs_trans_bhold(tp, bp); xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); if (error) goto out; /* * write out the sb buffer to get the changes to disk */ error = xfs_bwrite(bp); out: xfs_buf_relse(bp); return error; } void xfs_fs_geometry( struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version) { struct xfs_sb *sbp = &mp->m_sb; memset(geo, 0, sizeof(struct xfs_fsop_geom)); geo->blocksize = sbp->sb_blocksize; geo->rtextsize = sbp->sb_rextsize; geo->agblocks = sbp->sb_agblocks; geo->agcount = sbp->sb_agcount; geo->logblocks = sbp->sb_logblocks; geo->sectsize = sbp->sb_sectsize; geo->inodesize = sbp->sb_inodesize; geo->imaxpct = sbp->sb_imax_pct; geo->datablocks = sbp->sb_dblocks; geo->rtblocks = sbp->sb_rblocks; geo->rtextents = sbp->sb_rextents; geo->logstart = sbp->sb_logstart; BUILD_BUG_ON(sizeof(geo->uuid) != sizeof(sbp->sb_uuid)); memcpy(geo->uuid, &sbp->sb_uuid, sizeof(sbp->sb_uuid)); if (struct_version < 2) return; geo->sunit = sbp->sb_unit; geo->swidth = sbp->sb_width; if (struct_version < 3) return; geo->version = XFS_FSOP_GEOM_VERSION; geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK | XFS_FSOP_GEOM_FLAGS_DIRV2 | XFS_FSOP_GEOM_FLAGS_EXTFLG; if (xfs_has_attr(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR; if (xfs_has_quota(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_QUOTA; if (xfs_has_align(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN; if (xfs_has_dalign(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN; if (xfs_has_asciici(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_DIRV2CI; if (xfs_has_lazysbcount(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LAZYSB; if (xfs_has_attr2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR2; if (xfs_has_projid32(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_PROJID32; if (xfs_has_crc(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_V5SB; if (xfs_has_ftype(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FTYPE; if (xfs_has_finobt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_FINOBT; if (xfs_has_sparseinodes(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_SPINODES; if (xfs_has_rmapbt(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_RMAPBT; if (xfs_has_reflink(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_REFLINK; if (xfs_has_bigtime(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_BIGTIME; if (xfs_has_inobtcounts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_INOBTCNT; if (xfs_has_sector(mp)) { geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR; geo->logsectsize = sbp->sb_logsectsize; } else { geo->logsectsize = BBSIZE; } if (xfs_has_large_extent_counts(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_NREXT64; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); if (struct_version < 4) return; if (xfs_has_logv2(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_LOGV2; geo->logsunit = sbp->sb_logsunit; if (struct_version < 5) return; geo->version = XFS_FSOP_GEOM_VERSION_V5; } /* Read a secondary superblock. 
 */
int
xfs_sb_read_secondary(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;

	ASSERT(agno != 0 && agno != NULLAGNUMBER);
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_sb_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_agno_mark_sick(mp, agno, XFS_SICK_AG_SB);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_SSB_REF);
	*bpp = bp;
	return 0;
}

/* Get an uninitialised secondary superblock buffer. */
int
xfs_sb_get_secondary(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_agnumber_t		agno,
	struct xfs_buf		**bpp)
{
	struct xfs_buf		*bp;
	int			error;

	ASSERT(agno != 0 && agno != NULLAGNUMBER);
	error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp);
	if (error)
		return error;
	bp->b_ops = &xfs_sb_buf_ops;
	xfs_buf_oneshot(bp);
	*bpp = bp;
	return 0;
}

/*
 * sunit, swidth and sectorsize (optional; pass 0 to skip the check) should
 * all be in bytes, so users won't be confused by values in error messages.
 */
bool
xfs_validate_stripe_geometry(
	struct xfs_mount	*mp,
	__s64			sunit,
	__s64			swidth,
	int			sectorsize,
	bool			silent)
{
	if (swidth > INT_MAX) {
		if (!silent)
			xfs_notice(mp,
"stripe width (%lld) is too large", swidth);
		return false;
	}

	if (sunit > swidth) {
		if (!silent)
			xfs_notice(mp,
"stripe unit (%lld) is larger than the stripe width (%lld)", sunit, swidth);
		return false;
	}

	if (sectorsize && (int)sunit % sectorsize) {
		if (!silent)
			xfs_notice(mp,
"stripe unit (%lld) must be a multiple of the sector size (%d)",
				   sunit, sectorsize);
		return false;
	}

	if (sunit && !swidth) {
		if (!silent)
			xfs_notice(mp,
"invalid stripe unit (%lld) and stripe width of 0", sunit);
		return false;
	}

	if (!sunit && swidth) {
		if (!silent)
			xfs_notice(mp,
"invalid stripe width (%lld) and stripe unit of 0", swidth);
		return false;
	}

	if (sunit && (int)swidth % (int)sunit) {
		if (!silent)
			xfs_notice(mp,
"stripe width (%lld) must be a multiple of the stripe unit (%lld)",
				   swidth, sunit);
		return false;
	}
	return true;
}

/*
 * Compute the maximum level number of the realtime summary file, as defined by
 * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
 * use of rt volumes with more than 2^32 extents.
 */
uint8_t
xfs_compute_rextslog(
	xfs_rtbxlen_t		rtextents)
{
	if (!rtextents)
		return 0;
	return xfs_highbit64(rtextents);
}
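/*
 * Illustrative sketch (not part of the XFS sources above): a standalone
 * userspace re-implementation of the rextslog calculation performed by
 * xfs_compute_rextslog(), using a compiler builtin in place of the kernel's
 * xfs_highbit64(). The demo_rextslog() name and the main() driver are
 * hypothetical and exist only to demonstrate the "highest set bit" semantics
 * described in the comment above.
 */
#include <stdint.h>
#include <stdio.h>

/* Highest set bit of a 64-bit value, i.e. floor(log2(v)); 0 for v == 0. */
static uint8_t demo_rextslog(uint64_t rtextents)
{
	if (!rtextents)
		return 0;
	return 63 - __builtin_clzll(rtextents);
}

int main(void)
{
	/* More than 2^32 extents would have overflowed a highbit32() version. */
	printf("rextslog(1)        = %u\n", demo_rextslog(1));                /* 0  */
	printf("rextslog(2^20)     = %u\n", demo_rextslog(1ULL << 20));       /* 20 */
	printf("rextslog(2^32 + 1) = %u\n", demo_rextslog((1ULL << 32) + 1)); /* 32 */
	return 0;
}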
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
 */

#include <linux/dcache.h>
#include <linux/fs.h>
#include <linux/gfp.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/srcu.h>

#include <linux/fsnotify_backend.h>
#include "fsnotify.h"

/*
 * Clear all of the marks on an inode when it is being evicted from core
 */
void __fsnotify_inode_delete(struct inode *inode)
{
	fsnotify_clear_marks_by_inode(inode);
}
EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);

void __fsnotify_vfsmount_delete(struct vfsmount *mnt)
{
	fsnotify_clear_marks_by_mount(mnt);
}

/**
 * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
 * @sb: superblock being unmounted.
 *
 * Called during unmount with no locks held, so needs to be safe against
 * concurrent modifiers.
We temporarily drop sb->s_inode_list_lock and CAN block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point * the inode cannot have any associated watches. */ spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { spin_unlock(&inode->i_lock); continue; } /* * If i_count is zero, the inode cannot have any watches and * doing an __iget/iput with SB_ACTIVE clear would actually * evict all inodes with zero i_count from icache which is * unnecessarily violent and may in fact be illegal to do. * However, we should have been called /after/ evict_inodes * removed all zero refcount inodes, in any case. Test to * be sure. */ if (!atomic_read(&inode->i_count)) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); iput_inode = inode; cond_resched(); spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); iput(iput_inode); } void fsnotify_sb_delete(struct super_block *sb) { fsnotify_unmount_inodes(sb); fsnotify_clear_marks_by_sb(sb); /* Wait for outstanding object references from connectors */ wait_var_event(&sb->s_fsnotify_connectors, !atomic_long_read(&sb->s_fsnotify_connectors)); } /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. If we care about any event * on a child we run all of our children and set a dentry flag saying that the * parent cares. Thus when an event happens on a child it can quickly tell * if there is a need to find a parent and send the event to the parent. */ void __fsnotify_update_child_dentry_flags(struct inode *inode) { struct dentry *alias; int watched; if (!S_ISDIR(inode->i_mode)) return; /* determine if the children should tell inode about their events */ watched = fsnotify_inode_watches_children(inode); spin_lock(&inode->i_lock); /* run all of the dentries associated with this inode. Since this is a * directory, there damn well better only be one item on this list */ hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { struct dentry *child; /* run all of the children of the original inode and fix their * d_flags to indicate parental interest (their parent is the * original inode) */ spin_lock(&alias->d_lock); hlist_for_each_entry(child, &alias->d_children, d_sib) { if (!child->d_inode) continue; spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED); if (watched) child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; spin_unlock(&child->d_lock); } spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); } /* Are inode/sb/mount interested in parent and name info with this event? */ static bool fsnotify_event_needs_parent(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = 0; /* We only send parent/name to inode/sb/mount for events on non-dir */ if (mask & FS_ISDIR) return false; /* * All events that are possible on child can also may be reported with * parent/name info to inode/sb/mount. Otherwise, a watching parent * could result in events reported with unexpected name info to sb/mount. 
*/ BUILD_BUG_ON(FS_EVENTS_POSS_ON_CHILD & ~FS_EVENTS_POSS_TO_PARENT); /* Did either inode/sb/mount subscribe for events with parent/name? */ marks_mask |= fsnotify_parent_needed_mask(inode->i_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(inode->i_sb->s_fsnotify_mask); marks_mask |= fsnotify_parent_needed_mask(mnt_mask); /* Did they subscribe for this event with parent/name info? */ return mask & marks_mask; } /* Are there any inode/mount/sb objects that are interested in this event? */ static inline bool fsnotify_object_watched(struct inode *inode, __u32 mnt_mask, __u32 mask) { __u32 marks_mask = inode->i_fsnotify_mask | mnt_mask | inode->i_sb->s_fsnotify_mask; return mask & marks_mask & ALL_FSNOTIFY_EVENTS; } /* * Notify this dentry's parent about a child's events with child name info * if parent is watching or if inode/sb/mount are interested in events with * parent and name info. * * Notify only the child without name info if parent is not watching and * inode/sb/mount are not interested in events with parent and name info. */ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { const struct path *path = fsnotify_data_path(data, data_type); __u32 mnt_mask = path ? real_mount(path->mnt)->mnt_fsnotify_mask : 0; struct inode *inode = d_inode(dentry); struct dentry *parent; bool parent_watched = dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED; bool parent_needed, parent_interested; __u32 p_mask; struct inode *p_inode = NULL; struct name_snapshot name; struct qstr *file_name = NULL; int ret = 0; /* Optimize the likely case of nobody watching this path */ if (likely(!parent_watched && !fsnotify_object_watched(inode, mnt_mask, mask))) return 0; parent = NULL; parent_needed = fsnotify_event_needs_parent(inode, mnt_mask, mask); if (!parent_watched && !parent_needed) goto notify; /* Does parent inode care about events on children? */ parent = dget_parent(dentry); p_inode = parent->d_inode; p_mask = fsnotify_inode_watches_children(p_inode); if (unlikely(parent_watched && !p_mask)) __fsnotify_update_child_dentry_flags(p_inode); /* * Include parent/name in notification either if some notification * groups require parent info or the parent is interested in this event. 
*/ parent_interested = mask & p_mask & ALL_FSNOTIFY_EVENTS; if (parent_needed || parent_interested) { /* When notifying parent, child should be passed as data */ WARN_ON_ONCE(inode != fsnotify_data_inode(data, data_type)); /* Notify both parent and child with child name info */ take_dentry_name_snapshot(&name, dentry); file_name = &name.name; if (parent_interested) mask |= FS_EVENT_ON_CHILD; } notify: ret = fsnotify(mask, data, data_type, p_inode, file_name, inode, 0); if (file_name) release_dentry_name_snapshot(&name); dput(parent); return ret; } EXPORT_SYMBOL_GPL(__fsnotify_parent); static int fsnotify_handle_inode_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct inode *inode = fsnotify_data_inode(data, data_type); const struct fsnotify_ops *ops = group->ops; if (WARN_ON_ONCE(!ops->handle_inode_event)) return 0; if (WARN_ON_ONCE(!inode && !dir)) return 0; if ((inode_mark->flags & FSNOTIFY_MARK_FLAG_EXCL_UNLINK) && path && d_unlinked(path->dentry)) return 0; /* Check interest of this mark in case event was sent with two marks */ if (!(mask & inode_mark->mask & ALL_FSNOTIFY_EVENTS)) return 0; return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); } static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *parent_mark = fsnotify_iter_parent_mark(iter_info); int ret; if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; /* * For FS_RENAME, 'dir' is old dir and 'data' is new dentry. * The only ->handle_inode_event() backend that supports FS_RENAME is * dnotify, where it means file was renamed within same parent. */ if (mask & FS_RENAME) { struct dentry *moved = fsnotify_data_dentry(data, data_type); if (dir != moved->d_parent->d_inode) return 0; } if (parent_mark) { ret = fsnotify_handle_inode_event(group, parent_mark, mask, data, data_type, dir, name, 0); if (ret) return ret; } if (!inode_mark) return 0; if (mask & FS_EVENT_ON_CHILD) { /* * Some events can be sent on both parent dir and child marks * (e.g. FS_ATTRIB). If both parent dir and child are * watching, report the event once to parent dir with name (if * interested) and once to child without name (if interested). * The child watcher is expecting an event without a file name * and without the FS_EVENT_ON_CHILD flag. 
*/ mask &= ~FS_EVENT_ON_CHILD; dir = NULL; name = NULL; } return fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, dir, name, cookie); } static int send_to_group(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 test_mask = (mask & ALL_FSNOTIFY_EVENTS); __u32 marks_mask = 0; __u32 marks_ignore_mask = 0; bool is_dir = mask & FS_ISDIR; struct fsnotify_mark *mark; int type; if (!iter_info->report_mask) return 0; /* clear ignored on inode modification */ if (mask & FS_MODIFY) { fsnotify_foreach_iter_mark_type(iter_info, mark, type) { if (!(mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY)) mark->ignore_mask = 0; } } /* Are any of the group marks interested in this event? */ fsnotify_foreach_iter_mark_type(iter_info, mark, type) { group = mark->group; marks_mask |= mark->mask; marks_ignore_mask |= fsnotify_effective_ignore_mask(mark, is_dir, type); } pr_debug("%s: group=%p mask=%x marks_mask=%x marks_ignore_mask=%x data=%p data_type=%d dir=%p cookie=%d\n", __func__, group, mask, marks_mask, marks_ignore_mask, data, data_type, dir, cookie); if (!(test_mask & marks_mask & ~marks_ignore_mask)) return 0; if (group->ops->handle_event) { return group->ops->handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } return fsnotify_handle_event(group, mask, data, data_type, dir, file_name, cookie, iter_info); } static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; conn = srcu_dereference(*connp, &fsnotify_mark_srcu); if (conn) node = srcu_dereference(conn->list.first, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } static struct fsnotify_mark *fsnotify_next_mark(struct fsnotify_mark *mark) { struct hlist_node *node = NULL; if (mark) node = srcu_dereference(mark->obj_list.next, &fsnotify_mark_srcu); return hlist_entry_safe(node, struct fsnotify_mark, obj_list); } /* * iter_info is a multi head priority queue of marks. * Pick a subset of marks from queue heads, all with the same group * and set the report_mask to a subset of the selected marks. * Returns false if there are no more groups to iterate. */ static bool fsnotify_iter_select_report_types( struct fsnotify_iter_info *iter_info) { struct fsnotify_group *max_prio_group = NULL; struct fsnotify_mark *mark; int type; /* Choose max prio group among groups of all queue heads */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && fsnotify_compare_groups(max_prio_group, mark->group) > 0) max_prio_group = mark->group; } if (!max_prio_group) return false; /* Set the report mask for marks from same group as max prio group */ iter_info->current_group = max_prio_group; iter_info->report_mask = 0; fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) { /* * FSNOTIFY_ITER_TYPE_PARENT indicates that this inode * is watching children and interested in this event, * which is an event possible on child. * But is *this mark* watching children? */ if (type == FSNOTIFY_ITER_TYPE_PARENT && !(mark->mask & FS_EVENT_ON_CHILD) && !(fsnotify_ignore_mask(mark) & FS_EVENT_ON_CHILD)) continue; fsnotify_iter_set_report_type(iter_info, type); } } return true; } /* * Pop from iter_info multi head queue, the marks that belong to the group of * current iteration step. 
*/ static void fsnotify_iter_next(struct fsnotify_iter_info *iter_info) { struct fsnotify_mark *mark; int type; /* * We cannot use fsnotify_foreach_iter_mark_type() here because we * may need to advance a mark of type X that belongs to current_group * but was not selected for reporting. */ fsnotify_foreach_iter_type(type) { mark = iter_info->marks[type]; if (mark && mark->group == iter_info->current_group) iter_info->marks[type] = fsnotify_next_mark(iter_info->marks[type]); } } /* * fsnotify - This is the main call to fsnotify. * * The VFS calls into hook specific functions in linux/fsnotify.h. * Those functions then in turn call here. Here will call out to all of the * registered fsnotify_group. Those groups can then use the notification event * in whatever means they feel necessary. * * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @inode: optional inode associated with event - * If @dir and @inode are both non-NULL, event may be * reported to both. * @cookie: inotify rename cookie */ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, struct inode *inode, u32 cookie) { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; __u32 test_mask, marks_mask; if (path) mnt = real_mount(path->mnt); if (!inode) { /* Dirent event - report on TYPE_INODE to dir */ inode = dir; /* For FS_RENAME, inode is old_dir and inode2 is new_dir */ if (mask & FS_RENAME) { moved = fsnotify_data_dentry(data, data_type); inode2 = moved->d_parent->d_inode; inode2_type = FSNOTIFY_ITER_TYPE_INODE2; } } else if (mask & FS_EVENT_ON_CHILD) { /* * Event on child - report on TYPE_PARENT to dir if it is * watching children and on TYPE_INODE to child. */ inode2 = dir; inode2_type = FSNOTIFY_ITER_TYPE_PARENT; } /* * Optimization: srcu_read_lock() has a memory barrier which can * be expensive. It protects walking the *_fsnotify_marks lists. * However, if we do not walk the lists, we do not have to do * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ if (!sb->s_fsnotify_marks && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && (!inode2 || !inode2->i_fsnotify_marks)) return 0; marks_mask = sb->s_fsnotify_mask; if (mnt) marks_mask |= mnt->mnt_fsnotify_mask; if (inode) marks_mask |= inode->i_fsnotify_mask; if (inode2) marks_mask |= inode2->i_fsnotify_mask; /* * If this is a modify event we may need to clear some ignore masks. * In that case, the object with ignore masks will have the FS_MODIFY * event in its mask. * Otherwise, return if none of the marks care about this type of event. 
	 */
	test_mask = (mask & ALL_FSNOTIFY_EVENTS);
	if (!(test_mask & marks_mask))
		return 0;

	iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);

	iter_info.marks[FSNOTIFY_ITER_TYPE_SB] =
		fsnotify_first_mark(&sb->s_fsnotify_marks);
	if (mnt) {
		iter_info.marks[FSNOTIFY_ITER_TYPE_VFSMOUNT] =
			fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
	}
	if (inode) {
		iter_info.marks[FSNOTIFY_ITER_TYPE_INODE] =
			fsnotify_first_mark(&inode->i_fsnotify_marks);
	}
	if (inode2) {
		iter_info.marks[inode2_type] =
			fsnotify_first_mark(&inode2->i_fsnotify_marks);
	}

	/*
	 * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark
	 * ignore masks are properly reflected for mount/sb mark notifications.
	 * That's why this traversal is so complicated...
	 */
	while (fsnotify_iter_select_report_types(&iter_info)) {
		ret = send_to_group(mask, data, data_type, dir, file_name,
				    cookie, &iter_info);

		if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS))
			goto out;

		fsnotify_iter_next(&iter_info);
	}
	ret = 0;
out:
	srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx);

	return ret;
}
EXPORT_SYMBOL_GPL(fsnotify);

static __init int fsnotify_init(void)
{
	int ret;

	BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 23);

	ret = init_srcu_struct(&fsnotify_mark_srcu);
	if (ret)
		panic("initializing fsnotify_mark_srcu");

	fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector,
						    SLAB_PANIC);

	return 0;
}
core_initcall(fsnotify_init);
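/*
 * Illustrative userspace sketch (not part of fs/notify/fsnotify.c above):
 * watching a directory with inotify shows the parent/name reporting that
 * __fsnotify_parent() implements on the kernel side -- events on children of
 * a watched directory are delivered with the child's name attached, without
 * the children needing watches of their own. The watched path and the
 * bare-bones error handling are hypothetical.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096]
		__attribute__((aligned(__alignof__(struct inotify_event))));
	ssize_t len;
	char *p;
	int fd, wd;

	fd = inotify_init();
	if (fd < 0)
		return 1;

	/* One watch on the parent directory covers events on its children. */
	wd = inotify_add_watch(fd, "/tmp", IN_CREATE | IN_MODIFY | IN_DELETE);
	if (wd < 0)
		return 1;

	len = read(fd, buf, sizeof(buf));
	if (len <= 0)
		return 1;

	for (p = buf; p < buf + len; ) {
		const struct inotify_event *ev =
			(const struct inotify_event *)p;

		/* ev->len > 0 means the event carries the child's name. */
		printf("mask=0x%x name=%s\n", ev->mask,
		       ev->len ? ev->name : "(watched dir itself)");
		p += sizeof(*ev) + ev->len;
	}
	close(fd);
	return 0;
}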
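/*
 * Illustrative userspace sketch (not part of lib/iov_iter.c, which follows
 * below): the ITER_IOVEC flavour handled by the code below corresponds to the
 * struct iovec arrays that userspace passes to readv()/writev(). Gathering
 * two separate buffers into a single write shows the scatter/gather model
 * that struct iov_iter generalises inside the kernel. The buffer contents
 * are hypothetical.
 */
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	char hdr[]  = "header: ";
	char body[] = "payload\n";
	struct iovec iov[2] = {
		{ .iov_base = hdr,  .iov_len = strlen(hdr)  },
		{ .iov_base = body, .iov_len = strlen(body) },
	};

	/* One writev() call consumes both segments, in order. */
	return writev(STDOUT_FILENO, iov, 2) < 0 ? 1 : 0;
}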
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/export.h>
#include <linux/bvec.h>
#include <linux/fault-inject-usercopy.h>
#include <linux/uio.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/splice.h>
#include <linux/compat.h>
#include <linux/scatterlist.h>
#include <linux/instrumented.h>
#include <linux/iov_iter.h>

static __always_inline
size_t copy_to_user_iter(void __user *iter_to, size_t progress,
			 size_t len, void *from, void *priv2)
{
	if (should_fail_usercopy())
		return len;
	if (access_ok(iter_to, len)) {
		from += progress;
		instrument_copy_to_user(iter_to, from, len);
		len = raw_copy_to_user(iter_to, from, len);
	}
	return len;
}

static __always_inline
size_t copy_to_user_iter_nofault(void __user *iter_to, size_t progress,
				 size_t len, void *from, void *priv2)
{
	ssize_t res;

	if (should_fail_usercopy())
		return len;

	from += progress;
	res = copy_to_user_nofault(iter_to, from, len);
	return res < 0 ? len : res;
}

static __always_inline
size_t copy_from_user_iter(void __user *iter_from, size_t progress,
			   size_t len, void *to, void *priv2)
{
	size_t res = len;

	if (should_fail_usercopy())
		return len;
	if (access_ok(iter_from, len)) {
		to += progress;
		instrument_copy_from_user_before(to, iter_from, len);
		res = raw_copy_from_user(to, iter_from, len);
		instrument_copy_from_user_after(to, iter_from, len, res);
	}
	return res;
}

static __always_inline
size_t memcpy_to_iter(void *iter_to, size_t progress,
		      size_t len, void *from, void *priv2)
{
	memcpy(iter_to, from + progress, len);
	return 0;
}

static __always_inline
size_t memcpy_from_iter(void *iter_from, size_t progress,
			size_t len, void *to, void *priv2)
{
	memcpy(to + progress, iter_from, len);
	return 0;
}

/*
 * fault_in_iov_iter_readable - fault in iov iterator for reading
 * @i: iterator
 * @size: maximum length
 *
 * Fault in one or more iovecs of the given iov_iter, to a maximum length of
 * @size. For each iovec, fault in each page that constitutes the iovec.
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 *
 * Always returns 0 for non-userspace iterators.
*/ size_t fault_in_iov_iter_readable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_readable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_readable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_readable); /* * fault_in_iov_iter_writeable - fault in iov iterator for writing * @i: iterator * @size: maximum length * * Faults in the iterator using get_user_pages(), i.e., without triggering * hardware page faults. This is primarily useful when we already know that * some or all of the pages in @i aren't in memory. * * Returns the number of bytes not faulted in, like copy_to_user() and * copy_from_user(). * * Always returns 0 for non-user-space iterators. */ size_t fault_in_iov_iter_writeable(const struct iov_iter *i, size_t size) { if (iter_is_ubuf(i)) { size_t n = min(size, iov_iter_count(i)); n -= fault_in_safe_writeable(i->ubuf + i->iov_offset, n); return size - n; } else if (iter_is_iovec(i)) { size_t count = min(size, iov_iter_count(i)); const struct iovec *p; size_t skip; size -= count; for (p = iter_iov(i), skip = i->iov_offset; count; p++, skip = 0) { size_t len = min(count, p->iov_len - skip); size_t ret; if (unlikely(!len)) continue; ret = fault_in_safe_writeable(p->iov_base + skip, len); count -= len - ret; if (ret) break; } return count + size; } return 0; } EXPORT_SYMBOL(fault_in_iov_iter_writeable); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter) { .iter_type = ITER_IOVEC, .nofault = false, .data_source = direction, .__iov = iov, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_init); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter, memcpy_to_iter); } EXPORT_SYMBOL(_copy_to_iter); #ifdef CONFIG_ARCH_HAS_COPY_MC static __always_inline size_t copy_to_user_iter_mc(void __user *iter_to, size_t progress, size_t len, void *from, void *priv2) { if (access_ok(iter_to, len)) { from += progress; instrument_copy_to_user(iter_to, from, len); len = copy_mc_to_user(iter_to, from, len); } return len; } static __always_inline size_t memcpy_to_iter_mc(void *iter_to, size_t progress, size_t len, void *from, void *priv2) { return copy_mc_to_kernel(iter_to, from + progress, len); } /** * _copy_mc_to_iter - copy to iter with source memory error exception handling * @addr: source kernel address * @bytes: total transfer length * @i: destination iterator * * The pmem driver deploys this for the dax operation * (dax_copy_to_iter()) for dax reads (bypass page-cache and the * block-layer). Upon #MC read(2) aborts and returns EIO or the bytes * successfully copied. * * The main differences between this and typical _copy_to_iter(). * * * Typical tail/residue handling after a fault retries the copy * byte-by-byte until the fault happens again. 
Re-triggering machine * checks is potentially fatal so the implementation uses source * alignment and poison alignment assumptions to avoid re-triggering * hardware exceptions. * * * ITER_KVEC and ITER_BVEC can return short copies. Compare to * copy_to_iter() where only ITER_IOVEC attempts might return a short copy. * * Return: number of bytes copied (may be %0) */ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return iterate_and_advance(i, bytes, (void *)addr, copy_to_user_iter_mc, memcpy_to_iter_mc); } EXPORT_SYMBOL_GPL(_copy_mc_to_iter); #endif /* CONFIG_ARCH_HAS_COPY_MC */ static __always_inline size_t __copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, addr, copy_from_user_iter, memcpy_from_iter); } size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; if (user_backed_iter(i)) might_fault(); return __copy_from_iter(addr, bytes, i); } EXPORT_SYMBOL(_copy_from_iter); static __always_inline size_t copy_from_user_iter_nocache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_inatomic_nocache(to + progress, iter_from, len); } size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_nocache, memcpy_from_iter); } EXPORT_SYMBOL(_copy_from_iter_nocache); #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE static __always_inline size_t copy_from_user_iter_flushcache(void __user *iter_from, size_t progress, size_t len, void *to, void *priv2) { return __copy_from_user_flushcache(to + progress, iter_from, len); } static __always_inline size_t memcpy_from_iter_flushcache(void *iter_from, size_t progress, size_t len, void *to, void *priv2) { memcpy_flushcache(to + progress, iter_from, len); return 0; } /** * _copy_from_iter_flushcache - write destination through cpu cache * @addr: destination kernel address * @bytes: total transfer length * @i: source iterator * * The pmem driver arranges for filesystem-dax to use this facility via * dax_copy_from_iter() for ensuring that writes to persistent memory * are flushed through the CPU cache. It is differentiated from * _copy_from_iter_nocache() in that guarantees all data is flushed for * all iterator types. The _copy_from_iter_nocache() only attempts to * bypass the cache for the ITER_IOVEC case, and on some archs may use * instructions that strand dirty-data in the cache. * * Return: number of bytes copied (may be %0) */ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (WARN_ON_ONCE(!i->data_source)) return 0; return iterate_and_advance(i, bytes, addr, copy_from_user_iter_flushcache, memcpy_from_iter_flushcache); } EXPORT_SYMBOL_GPL(_copy_from_iter_flushcache); #endif static inline bool page_copy_sane(struct page *page, size_t offset, size_t n) { struct page *head; size_t v = n + offset; /* * The general case needs to access the page order in order * to compute the page size. * However, we mostly deal with order-0 pages and thus can * avoid a possible cache line miss for requests that fit all * page orders. 
*/ if (n <= v && v <= PAGE_SIZE) return true; head = compound_head(page); v += (page - head) << PAGE_SHIFT; if (WARN_ON(n > v || v > page_size(head))) return false; return true; } size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_to_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter); size_t copy_page_to_iter_nofault(struct page *page, unsigned offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(i->data_source)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = iterate_and_advance(i, n, kaddr + offset, copy_to_user_iter_nofault, memcpy_to_iter); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_to_iter_nofault); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t res = 0; if (!page_copy_sane(page, offset, bytes)) return 0; page += offset / PAGE_SIZE; // first subpage offset %= PAGE_SIZE; while (1) { void *kaddr = kmap_local_page(page); size_t n = min(bytes, (size_t)PAGE_SIZE - offset); n = _copy_from_iter(kaddr + offset, n, i); kunmap_local(kaddr); res += n; bytes -= n; if (!bytes || !n) break; offset += n; if (offset == PAGE_SIZE) { page++; offset = 0; } } return res; } EXPORT_SYMBOL(copy_page_from_iter); static __always_inline size_t zero_to_user_iter(void __user *iter_to, size_t progress, size_t len, void *priv, void *priv2) { return clear_user(iter_to, len); } static __always_inline size_t zero_to_iter(void *iter_to, size_t progress, size_t len, void *priv, void *priv2) { memset(iter_to, 0, len); return 0; } size_t iov_iter_zero(size_t bytes, struct iov_iter *i) { return iterate_and_advance(i, bytes, NULL, zero_to_user_iter, zero_to_iter); } EXPORT_SYMBOL(iov_iter_zero); size_t copy_page_from_iter_atomic(struct page *page, size_t offset, size_t bytes, struct iov_iter *i) { size_t n, copied = 0; if (!page_copy_sane(page, offset, bytes)) return 0; if (WARN_ON_ONCE(!i->data_source)) return 0; do { char *p; n = bytes - copied; if (PageHighMem(page)) { page += offset / PAGE_SIZE; offset %= PAGE_SIZE; n = min_t(size_t, n, PAGE_SIZE - offset); } p = kmap_atomic(page) + offset; n = __copy_from_iter(p, n, i); kunmap_atomic(p); copied += n; offset += n; } while (PageHighMem(page) && copied != bytes && n > 0); return copied; } EXPORT_SYMBOL(copy_page_from_iter_atomic); static void iov_iter_bvec_advance(struct iov_iter *i, size_t size) { const struct bio_vec *bvec, *end; if (!i->count) return; i->count -= size; size += i->iov_offset; for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) { if (likely(size < bvec->bv_len)) break; size -= bvec->bv_len; } i->iov_offset = size; i->nr_segs -= bvec - i->bvec; i->bvec = bvec; } static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) { const struct iovec *iov, *end; if (!i->count) 
return; i->count -= size; size += i->iov_offset; // from beginning of current segment for (iov = iter_iov(i), end = iov + i->nr_segs; iov < end; iov++) { if (likely(size < iov->iov_len)) break; size -= iov->iov_len; } i->iov_offset = size; i->nr_segs -= iov - iter_iov(i); i->__iov = iov; } void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) size = i->count; if (likely(iter_is_ubuf(i)) || unlikely(iov_iter_is_xarray(i))) { i->iov_offset += size; i->count -= size; } else if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) { /* iovec and kvec have identical layouts */ iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; if (WARN_ON(unroll > MAX_RW_COUNT)) return; i->count += unroll; if (unlikely(iov_iter_is_discard(i))) return; if (unroll <= i->iov_offset) { i->iov_offset -= unroll; return; } unroll -= i->iov_offset; if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) { BUG(); /* We should never go beyond the start of the specified * range since we might then be straying into pages that * aren't pinned. */ } else if (iov_iter_is_bvec(i)) { const struct bio_vec *bvec = i->bvec; while (1) { size_t n = (--bvec)->bv_len; i->nr_segs++; if (unroll <= n) { i->bvec = bvec; i->iov_offset = n - unroll; return; } unroll -= n; } } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { size_t n = (--iov)->iov_len; i->nr_segs++; if (unroll <= n) { i->__iov = iov; i->iov_offset = n - unroll; return; } unroll -= n; } } } EXPORT_SYMBOL(iov_iter_revert); /* * Return the count of just the current iov_iter segment. */ size_t iov_iter_single_seg_count(const struct iov_iter *i) { if (i->nr_segs > 1) { if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return min(i->count, iter_iov(i)->iov_len - i->iov_offset); if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_KVEC, .data_source = direction, .kvec = kvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_kvec); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count) { WARN_ON(direction & ~(READ | WRITE)); *i = (struct iov_iter){ .iter_type = ITER_BVEC, .data_source = direction, .bvec = bvec, .nr_segs = nr_segs, .iov_offset = 0, .count = count }; } EXPORT_SYMBOL(iov_iter_bvec); /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. * @direction: The direction of the transfer. * @xarray: The xarray to access. * @start: The start file position. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator to either draw data out of the pages attached to an * inode or to inject data into those pages. The pages *must* be prevented * from evaporation, either by taking a ref on them or locking them by the * caller. 
*/ void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count) { BUG_ON(direction & ~1); *i = (struct iov_iter) { .iter_type = ITER_XARRAY, .data_source = direction, .xarray = xarray, .xarray_start = start, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_xarray); /** * iov_iter_discard - Initialise an I/O iterator that discards data * @i: The iterator to initialise. * @direction: The direction of the transfer. * @count: The size of the I/O buffer in bytes. * * Set up an I/O iterator that just discards everything that's written to it. * It's only available as a READ iterator. */ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count) { BUG_ON(direction != READ); *i = (struct iov_iter){ .iter_type = ITER_DISCARD, .data_source = false, .count = count, .iov_offset = 0 }; } EXPORT_SYMBOL(iov_iter_discard); static bool iov_iter_aligned_iovec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct iovec *iov = iter_iov(i); size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(iov->iov_base + skip) & addr_mask) return false; iov++; size -= len; skip = 0; } while (size); return true; } static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { const struct bio_vec *bvec = i->bvec; unsigned skip = i->iov_offset; size_t size = i->count; do { size_t len = bvec->bv_len; if (len > size) len = size; if (len & len_mask) return false; if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) return false; bvec++; size -= len; skip = 0; } while (size); return true; } /** * iov_iter_is_aligned() - Check if the addresses and lengths of each segments * are aligned to the parameters. 
* * @i: &struct iov_iter to restore * @addr_mask: bit mask to check against the iov element's addresses * @len_mask: bit mask to check against the iov element's lengths * * Return: false if any addresses or lengths intersect with the provided masks */ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, unsigned len_mask) { if (likely(iter_is_ubuf(i))) { if (i->count & len_mask) return false; if ((unsigned long)(i->ubuf + i->iov_offset) & addr_mask) return false; return true; } if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_aligned_iovec(i, addr_mask, len_mask); if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } return true; } EXPORT_SYMBOL_GPL(iov_iter_is_aligned); static unsigned long iov_iter_alignment_iovec(const struct iov_iter *i) { const struct iovec *iov = iter_iov(i); unsigned long res = 0; size_t size = i->count; size_t skip = i->iov_offset; do { size_t len = iov->iov_len - skip; if (len) { res |= (unsigned long)iov->iov_base + skip; if (len > size) len = size; res |= len; size -= len; } iov++; skip = 0; } while (size); return res; } static unsigned long iov_iter_alignment_bvec(const struct iov_iter *i) { const struct bio_vec *bvec = i->bvec; unsigned res = 0; size_t size = i->count; unsigned skip = i->iov_offset; do { size_t len = bvec->bv_len - skip; res |= (unsigned long)bvec->bv_offset + skip; if (len > size) len = size; res |= len; bvec++; size -= len; skip = 0; } while (size); return res; } unsigned long iov_iter_alignment(const struct iov_iter *i) { if (likely(iter_is_ubuf(i))) { size_t size = i->count; if (size) return ((unsigned long)i->ubuf + i->iov_offset) | size; return 0; } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_iter_alignment_iovec(i); if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; return 0; } EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { unsigned long res = 0; unsigned long v = 0; size_t size = i->count; unsigned k; if (iter_is_ubuf(i)) return 0; if (WARN_ON(!iter_is_iovec(i))) return ~0U; for (k = 0; k < i->nr_segs; k++) { const struct iovec *iov = iter_iov(i) + k; if (iov->iov_len) { unsigned long base = (unsigned long)iov->iov_base; if (v) // if not the first one res |= base | v; // this start | previous end v = base + iov->iov_len; if (size <= iov->iov_len) break; size -= iov->iov_len; } } return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); static int want_pages_array(struct page ***res, size_t size, size_t start, unsigned int maxpages) { unsigned int count = DIV_ROUND_UP(size + start, PAGE_SIZE); if (count > maxpages) count = maxpages; WARN_ON(!count); // caller should've prevented that if (!*res) { *res = kvmalloc_array(count, sizeof(struct page *), GFP_KERNEL); if (!*res) return 0; } return count; } static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { XA_STATE(xas, xa, index); struct page *page; unsigned int ret = 0; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? 
*/ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } pages[ret] = find_subpage(page, xas.xa_index); get_page(pages[ret]); if (++ret == nr_pages) break; } rcu_read_unlock(); return ret; } static ssize_t iter_xarray_get_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned maxpages, size_t *_start_offset) { unsigned nr, offset, count; pgoff_t index; loff_t pos; pos = i->xarray_start + i->iov_offset; index = pos >> PAGE_SHIFT; offset = pos & ~PAGE_MASK; *_start_offset = offset; count = want_pages_array(pages, maxsize, offset, maxpages); if (!count) return -ENOMEM; nr = iter_xarray_populate_pages(*pages, i->xarray, index, count); if (nr == 0) return 0; maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); i->iov_offset += maxsize; i->count -= maxsize; return maxsize; } /* must be done on non-empty ITER_UBUF or ITER_IOVEC one */ static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size) { size_t skip; long k; if (iter_is_ubuf(i)) return (unsigned long)i->ubuf + i->iov_offset; for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) { const struct iovec *iov = iter_iov(i) + k; size_t len = iov->iov_len - skip; if (unlikely(!len)) continue; if (*size > len) *size = len; return (unsigned long)iov->iov_base + skip; } BUG(); // if it had been empty, we wouldn't get called } /* must be done on non-empty ITER_BVEC one */ static struct page *first_bvec_segment(const struct iov_iter *i, size_t *size, size_t *start) { struct page *page; size_t skip = i->iov_offset, len; len = i->bvec->bv_len - skip; if (*size > len) *size = len; skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; *start = skip % PAGE_SIZE; return page; } static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, size_t *start) { unsigned int n, gup_flags = 0; if (maxsize > i->count) maxsize = i->count; if (!maxsize) return 0; if (maxsize > MAX_RW_COUNT) maxsize = MAX_RW_COUNT; if (likely(user_backed_iter(i))) { unsigned long addr; int res; if (iov_iter_rw(i) != WRITE) gup_flags |= FOLL_WRITE; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *start = addr % PAGE_SIZE; addr &= PAGE_MASK; n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; res = get_user_pages_fast(addr, n, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - *start); iov_iter_advance(i, maxsize); return maxsize; } if (iov_iter_is_bvec(i)) { struct page **p; struct page *page; page = first_bvec_segment(i, &maxsize, start); n = want_pages_array(pages, maxsize, *start, maxpages); if (!n) return -ENOMEM; p = *pages; for (int k = 0; k < n; k++) get_page(p[k] = page + k); maxsize = min_t(size_t, maxsize, n * PAGE_SIZE - *start); i->count -= maxsize; i->iov_offset += maxsize; if (i->iov_offset == i->bvec->bv_len) { i->iov_offset = 0; i->bvec++; i->nr_segs--; } return maxsize; } if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; } ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start) { if (!maxpages) return 0; BUG_ON(!pages); return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start); } EXPORT_SYMBOL(iov_iter_get_pages2); ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start) { ssize_t len; *pages = NULL; len = 
__iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start); if (len <= 0) { kvfree(*pages); *pages = NULL; } return len; } EXPORT_SYMBOL(iov_iter_get_pages_alloc2); static int iov_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct iovec *p; int npages = 0; for (p = iter_iov(i); size; skip = 0, p++) { unsigned offs = offset_in_page(p->iov_base + skip); size_t len = min(p->iov_len - skip, size); if (len) { size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } } return npages; } static int bvec_npages(const struct iov_iter *i, int maxpages) { size_t skip = i->iov_offset, size = i->count; const struct bio_vec *p; int npages = 0; for (p = i->bvec; size; skip = 0, p++) { unsigned offs = (p->bv_offset + skip) % PAGE_SIZE; size_t len = min(p->bv_len - skip, size); size -= len; npages += DIV_ROUND_UP(offs + len, PAGE_SIZE); if (unlikely(npages > maxpages)) return maxpages; } return npages; } int iov_iter_npages(const struct iov_iter *i, int maxpages) { if (unlikely(!i->count)) return 0; if (likely(iter_is_ubuf(i))) { unsigned offs = offset_in_page(i->ubuf + i->iov_offset); int npages = DIV_ROUND_UP(offs + i->count, PAGE_SIZE); return min(npages, maxpages); } /* iovec and kvec have identical layouts */ if (likely(iter_is_iovec(i) || iov_iter_is_kvec(i))) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); return min(npages, maxpages); } return 0; } EXPORT_SYMBOL(iov_iter_npages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags) { *new = *old; if (iov_iter_is_bvec(new)) return new->bvec = kmemdup(new->bvec, new->nr_segs * sizeof(struct bio_vec), flags); else if (iov_iter_is_kvec(new) || iter_is_iovec(new)) /* iovec and kvec have identical layout */ return new->__iov = kmemdup(new->__iov, new->nr_segs * sizeof(struct iovec), flags); return NULL; } EXPORT_SYMBOL(dup_iter); static __noclone int copy_compat_iovec_from_user(struct iovec *iov, const struct iovec __user *uvec, u32 nr_segs) { const struct compat_iovec __user *uiov = (const struct compat_iovec __user *)uvec; int ret = -EFAULT; u32 i; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { compat_uptr_t buf; compat_ssize_t len; unsafe_get_user(len, &uiov[i].iov_len, uaccess_end); unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end); /* check for compat_size_t not fitting in compat_ssize_t .. */ if (len < 0) { ret = -EINVAL; goto uaccess_end; } iov[i].iov_base = compat_ptr(buf); iov[i].iov_len = len; } ret = 0; uaccess_end: user_access_end(); return ret; } static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; do { void __user *buf; ssize_t len; unsafe_get_user(len, &uiov->iov_len, uaccess_end); unsafe_get_user(buf, &uiov->iov_base, uaccess_end); /* check for size_t not fitting in ssize_t .. 
*/ if (unlikely(len < 0)) { ret = -EINVAL; goto uaccess_end; } iov->iov_base = buf; iov->iov_len = len; uiov++; iov++; } while (--nr_segs); ret = 0; uaccess_end: user_access_end(); return ret; } struct iovec *iovec_from_user(const struct iovec __user *uvec, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat) { struct iovec *iov = fast_iov; int ret; /* * SuS says "The readv() function *may* fail if the iovcnt argument was * less than or equal to 0, or greater than {IOV_MAX}. Linux has * traditionally returned zero for zero segments, so... */ if (nr_segs == 0) return iov; if (nr_segs > UIO_MAXIOV) return ERR_PTR(-EINVAL); if (nr_segs > fast_segs) { iov = kmalloc_array(nr_segs, sizeof(struct iovec), GFP_KERNEL); if (!iov) return ERR_PTR(-ENOMEM); } if (unlikely(compat)) ret = copy_compat_iovec_from_user(iov, uvec, nr_segs); else ret = copy_iovec_from_user(iov, uvec, nr_segs); if (ret) { if (iov != fast_iov) kfree(iov); return ERR_PTR(ret); } return iov; } /* * Single segment iovec supplied by the user, import it as ITER_UBUF. */ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec, struct iovec **iovp, struct iov_iter *i, bool compat) { struct iovec *iov = *iovp; ssize_t ret; if (compat) ret = copy_compat_iovec_from_user(iov, uvec, 1); else ret = copy_iovec_from_user(iov, uvec, 1); if (unlikely(ret)) return ret; ret = import_ubuf(type, iov->iov_base, iov->iov_len, i); if (unlikely(ret)) return ret; *iovp = NULL; return i->count; } ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat) { ssize_t total_len = 0; unsigned long seg; struct iovec *iov; if (nr_segs == 1) return __import_iovec_ubuf(type, uvec, iovp, i, compat); iov = iovec_from_user(uvec, nr_segs, fast_segs, *iovp, compat); if (IS_ERR(iov)) { *iovp = NULL; return PTR_ERR(iov); } /* * According to the Single Unix Specification we should return EINVAL if * an element length is < 0 when cast to ssize_t or if the total length * would overflow the ssize_t return value of the system call. * * Linux caps all read/write calls to MAX_RW_COUNT, and avoids the * overflow case. */ for (seg = 0; seg < nr_segs; seg++) { ssize_t len = (ssize_t)iov[seg].iov_len; if (!access_ok(iov[seg].iov_base, len)) { if (iov != *iovp) kfree(iov); *iovp = NULL; return -EFAULT; } if (len > MAX_RW_COUNT - total_len) { len = MAX_RW_COUNT - total_len; iov[seg].iov_len = len; } total_len += len; } iov_iter_init(i, type, iov, nr_segs, total_len); if (iov == *iovp) *iovp = NULL; else *iovp = iov; return total_len; } /** * import_iovec() - Copy an array of &struct iovec from userspace * into the kernel, check that it is valid, and initialize a new * &struct iov_iter iterator to access it. * * @type: One of %READ or %WRITE. * @uvec: Pointer to the userspace array. * @nr_segs: Number of elements in userspace array. * @fast_segs: Number of elements in @iov. * @iovp: (input and output parameter) Pointer to pointer to (usually small * on-stack) kernel array. * @i: Pointer to iterator that will be initialized on success. * * If the array pointed to by *@iov is large enough to hold all @nr_segs, * then this function places %NULL in *@iov on return. Otherwise, a new * array will be allocated and the result placed in *@iov. This means that * the caller may call kfree() on *@iov regardless of whether the small * on-stack array was used or not (and regardless of whether this function * returns an error or not). 
* * Return: Negative error code on error, bytes imported on success */ ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i) { return __import_iovec(type, uvec, nr_segs, fast_segs, iovp, i, in_compat_syscall()); } EXPORT_SYMBOL(import_iovec); int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i) { if (len > MAX_RW_COUNT) len = MAX_RW_COUNT; if (unlikely(!access_ok(buf, len))) return -EFAULT; iov_iter_ubuf(i, rw, buf, len); return 0; } EXPORT_SYMBOL_GPL(import_ubuf); /** * iov_iter_restore() - Restore a &struct iov_iter to the same state as when * iov_iter_save_state() was called. * * @i: &struct iov_iter to restore * @state: state to restore from * * Used after iov_iter_save_state() to bring restore @i, if operations may * have advanced it. * * Note: only works on ITER_IOVEC, ITER_BVEC, and ITER_KVEC */ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) { if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) && !iter_is_ubuf(i)) && !iov_iter_is_kvec(i)) return; i->iov_offset = state->iov_offset; i->count = state->count; if (iter_is_ubuf(i)) return; /* * For the *vec iters, nr_segs + iov is constant - if we increment * the vec, then we also decrement the nr_segs count. Hence we don't * need to track both of these, just one is enough and we can deduct * the other from that. ITER_KVEC and ITER_IOVEC are the same struct * size, so we can just increment the iov pointer as they are unionzed. * ITER_BVEC _may_ be the same size on some archs, but on others it is * not. Be safe and handle it separately. */ BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); if (iov_iter_is_bvec(i)) i->bvec -= state->nr_segs - i->nr_segs; else i->__iov -= state->nr_segs - i->nr_segs; i->nr_segs = state->nr_segs; } /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_xarray_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page *page, **p; unsigned int nr = 0, offset; loff_t pos = i->xarray_start + i->iov_offset; pgoff_t index = pos >> PAGE_SHIFT; XA_STATE(xas, i->xarray, index); offset = pos & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; rcu_read_lock(); for (page = xas_load(&xas); page; page = xas_next(&xas)) { if (xas_retry(&xas, page)) continue; /* Has the page moved or been split? */ if (unlikely(page != xas_reload(&xas))) { xas_reset(&xas); continue; } p[nr++] = find_subpage(page, xas.xa_index); if (nr == maxpages) break; } rcu_read_unlock(); maxsize = min_t(size_t, nr * PAGE_SIZE - offset, maxsize); iov_iter_advance(i, maxsize); return maxsize; } /* * Extract a list of contiguous pages from an ITER_BVEC iterator. This does * not get references on the pages, nor does it get a pin on them. 
*/ static ssize_t iov_iter_extract_bvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; size_t skip = i->iov_offset, offset, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->bvec->bv_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->bvec++; skip = 0; } skip += i->bvec->bv_offset; page = i->bvec->bv_page + skip / PAGE_SIZE; offset = skip % PAGE_SIZE; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; for (k = 0; k < maxpages; k++) p[k] = page + k; size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of virtually contiguous pages from an ITER_KVEC iterator. * This does not get references on the pages, nor does it get a pin on them. */ static ssize_t iov_iter_extract_kvec_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { struct page **p, *page; const void *kaddr; size_t skip = i->iov_offset, offset, len, size; int k; for (;;) { if (i->nr_segs == 0) return 0; size = min(maxsize, i->kvec->iov_len - skip); if (size) break; i->iov_offset = 0; i->nr_segs--; i->kvec++; skip = 0; } kaddr = i->kvec->iov_base + skip; offset = (unsigned long)kaddr & ~PAGE_MASK; *offset0 = offset; maxpages = want_pages_array(pages, size, offset, maxpages); if (!maxpages) return -ENOMEM; p = *pages; kaddr -= offset; len = offset + size; for (k = 0; k < maxpages; k++) { size_t seg = min_t(size_t, len, PAGE_SIZE); if (is_vmalloc_or_module_addr(kaddr)) page = vmalloc_to_page(kaddr); else page = virt_to_page(kaddr); p[k] = page; len -= seg; kaddr += PAGE_SIZE; } size = min_t(size_t, size, maxpages * PAGE_SIZE - offset); iov_iter_advance(i, size); return size; } /* * Extract a list of contiguous pages from a user iterator and get a pin on * each of them. This should only be used if the iterator is user-backed * (IOBUF/UBUF). * * It does not get refs on the pages, but the pages must be unpinned by the * caller once the transfer is complete. * * This is safe to be used where background IO/DMA *is* going to be modifying * the buffer; using a pin rather than a ref makes forces fork() to give the * child a copy of the page. 
*/ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { unsigned long addr; unsigned int gup_flags = 0; size_t offset; int res; if (i->data_source == ITER_DEST) gup_flags |= FOLL_WRITE; if (extraction_flags & ITER_ALLOW_P2PDMA) gup_flags |= FOLL_PCI_P2PDMA; if (i->nofault) gup_flags |= FOLL_NOFAULT; addr = first_iovec_segment(i, &maxsize); *offset0 = offset = addr % PAGE_SIZE; addr &= PAGE_MASK; maxpages = want_pages_array(pages, maxsize, offset, maxpages); if (!maxpages) return -ENOMEM; res = pin_user_pages_fast(addr, maxpages, gup_flags, *pages); if (unlikely(res <= 0)) return res; maxsize = min_t(size_t, maxsize, res * PAGE_SIZE - offset); iov_iter_advance(i, maxsize); return maxsize; } /** * iov_iter_extract_pages - Extract a list of contiguous pages from an iterator * @i: The iterator to extract from * @pages: Where to return the list of pages * @maxsize: The maximum amount of iterator to extract * @maxpages: The maximum size of the list of pages * @extraction_flags: Flags to qualify request * @offset0: Where to return the starting offset into (*@pages)[0] * * Extract a list of contiguous pages from the current point of the iterator, * advancing the iterator. The maximum number of pages and the maximum amount * of page contents can be set. * * If *@pages is NULL, a page list will be allocated to the required size and * *@pages will be set to its base. If *@pages is not NULL, it will be assumed * that the caller allocated a page list at least @maxpages in size and this * will be filled in. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * The iov_iter_extract_will_pin() function can be used to query how cleanup * should be performed. * * Extra refs or pins on the pages may be obtained as follows: * * (*) If the iterator is user-backed (ITER_IOVEC/ITER_UBUF), pins will be * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are * merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: * * (*) Use with ITER_DISCARD is not supported as that has no content. * * On success, the function sets *@pages to the new pagelist, if allocated, and * sets *offset0 to the offset into the first page. * * It may also return -ENOMEM and -EFAULT. */ ssize_t iov_iter_extract_pages(struct iov_iter *i, struct page ***pages, size_t maxsize, unsigned int maxpages, iov_iter_extraction_t extraction_flags, size_t *offset0) { maxsize = min_t(size_t, min_t(size_t, maxsize, i->count), MAX_RW_COUNT); if (!maxsize) return 0; if (likely(user_backed_iter(i))) return iov_iter_extract_user_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_kvec(i)) return iov_iter_extract_kvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_bvec(i)) return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); return -EFAULT; } EXPORT_SYMBOL_GPL(iov_iter_extract_pages);
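/*
 * Illustrative sketch: a minimal example of how a caller might use the
 * copy helpers and iterator constructors above. It wraps a kernel buffer
 * in an ITER_DEST kvec iterator and copies one page of data into it with
 * copy_page_to_iter(). The example_* name is hypothetical and not part of
 * this file.
 */
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/minmax.h>

static size_t example_copy_page_to_kvec(struct page *src, void *dst, size_t dst_len)
{
	struct kvec kv = { .iov_base = dst, .iov_len = dst_len };
	struct iov_iter to;

	/* ITER_DEST: data flows *into* the iterator's segments */
	iov_iter_kvec(&to, ITER_DEST, &kv, 1, dst_len);

	/* returns the number of bytes actually copied; may be short */
	return copy_page_to_iter(src, 0, min_t(size_t, PAGE_SIZE, dst_len), &to);
}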
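/*
 * Illustrative sketch: the usual calling pattern for import_iovec() as
 * described in its kerneldoc above - a small on-stack array serves the
 * common case and kfree() of the returned pointer is always safe. The
 * example_import() name is hypothetical and not part of this file.
 */
#include <linux/uio.h>
#include <linux/slab.h>

static ssize_t example_import(const struct iovec __user *uvec, unsigned int nr_segs)
{
	struct iovec iovstack[UIO_FASTIOV];
	struct iovec *iov = iovstack;
	struct iov_iter iter;
	ssize_t ret;

	ret = import_iovec(ITER_SOURCE, uvec, nr_segs, UIO_FASTIOV, &iov, &iter);
	if (ret < 0)
		return ret;

	/* ... a real caller would now hand &iter to e.g. a ->write_iter() ... */

	/* safe whether or not the on-stack array was used */
	kfree(iov);
	return ret;
}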
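/*
 * Illustrative sketch: the cleanup contract of iov_iter_extract_pages()
 * described above - pages extracted from a user-backed iterator are pinned
 * and must be unpinned by the caller once the transfer completes. The
 * example_* names and the page cap are hypothetical; no real I/O is issued.
 */
#include <linux/uio.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/kernel.h>

#define EXAMPLE_MAX_PAGES 32

static ssize_t example_extract_and_release(struct iov_iter *iter, size_t len)
{
	struct page **pages = NULL;	/* NULL: let the helper allocate the array */
	size_t offset;
	ssize_t got;

	got = iov_iter_extract_pages(iter, &pages, len, EXAMPLE_MAX_PAGES, 0, &offset);
	if (got <= 0)
		return got;

	/* ... a real caller would issue DMA/IO against pages[] here ... */

	if (iov_iter_extract_will_pin(iter))
		unpin_user_pages(pages, DIV_ROUND_UP(offset + got, PAGE_SIZE));
	kvfree(pages);	/* the array itself came from kvmalloc_array() */
	return got;
}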
// SPDX-License-Identifier: GPL-2.0-or-later /* * Internet Control Message Protocol (ICMPv6) * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on net/ipv4/icmp.c * * RFC 1885 */ /* * Changes: * * Andi Kleen : exception handling * Andi Kleen add rate limits. never reply to a icmp. * add more length checks and other fixes. * yoshfuji : ensure to sent parameter problem for * fragments. * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit.
* Randy Dunlap and * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/init.h> #include <linux/netfilter.h> #include <linux/slab.h> #ifdef CONFIG_SYSCTL #include <linux/sysctl.h> #endif #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/icmpv6.h> #include <net/ip.h> #include <net/sock.h> #include <net/ipv6.h> #include <net/ip6_checksum.h> #include <net/ping.h> #include <net/protocol.h> #include <net/raw.h> #include <net/rawv6.h> #include <net/seg6.h> #include <net/transp_v6.h> #include <net/ip6_route.h> #include <net/addrconf.h> #include <net/icmp.h> #include <net/xfrm.h> #include <net/inet_common.h> #include <net/dsfield.h> #include <net/l3mdev.h> #include <linux/uaccess.h> static DEFINE_PER_CPU(struct sock *, ipv6_icmp_sk); static int icmpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { /* icmpv6_notify checks 8 bytes can be pulled, icmp6hdr is 8 bytes */ struct icmp6hdr *icmp6 = (struct icmp6hdr *) (skb->data + offset); struct net *net = dev_net(skb->dev); if (type == ICMPV6_PKT_TOOBIG) ip6_update_pmtu(skb, net, info, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); else if (type == NDISC_REDIRECT) ip6_redirect(skb, net, skb->dev->ifindex, 0, sock_net_uid(net, NULL)); if (!(type & ICMPV6_INFOMSG_MASK)) if (icmp6->icmp6_type == ICMPV6_ECHO_REQUEST) ping_err(skb, offset, ntohl(info)); return 0; } static int icmpv6_rcv(struct sk_buff *skb); static const struct inet6_protocol icmpv6_protocol = { .handler = icmpv6_rcv, .err_handler = icmpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; /* Called with BH disabled */ static struct sock *icmpv6_xmit_lock(struct net *net) { struct sock *sk; sk = this_cpu_read(ipv6_icmp_sk); if (unlikely(!spin_trylock(&sk->sk_lock.slock))) { /* This can happen if the output path (f.e. SIT or * ip6ip6 tunnel) signals dst_link_failure() for an * outgoing ICMP6 packet. */ return NULL; } sock_net_set(sk, net); return sk; } static void icmpv6_xmit_unlock(struct sock *sk) { sock_net_set(sk, &init_net); spin_unlock(&sk->sk_lock.slock); } /* * Figure out, may we reply to this packet with icmp error. * * We do not reply, if: * - it was icmp error message. * - it is truncated, so that it is known, that protocol is ICMPV6 * (i.e. in the middle of some exthdr) * * --ANK (980726) */ static bool is_ineligible(const struct sk_buff *skb) { int ptr = (u8 *)(ipv6_hdr(skb) + 1) - skb->data; int len = skb->len - ptr; __u8 nexthdr = ipv6_hdr(skb)->nexthdr; __be16 frag_off; if (len < 0) return true; ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, &frag_off); if (ptr < 0) return false; if (nexthdr == IPPROTO_ICMPV6) { u8 _type, *tp; tp = skb_header_pointer(skb, ptr+offsetof(struct icmp6hdr, icmp6_type), sizeof(_type), &_type); /* Based on RFC 8200, Section 4.5 Fragment Header, return * false if this is a fragment packet with no icmp header info. */ if (!tp && frag_off != 0) return false; else if (!tp || !(*tp & ICMPV6_INFOMSG_MASK)) return true; } return false; } static bool icmpv6_mask_allow(struct net *net, int type) { if (type > ICMPV6_MSG_MAX) return true; /* Limit if icmp type is set in ratemask. 
*/ if (!test_bit(type, net->ipv6.sysctl.icmpv6_ratemask)) return true; return false; } static bool icmpv6_global_allow(struct net *net, int type) { if (icmpv6_mask_allow(net, type)) return true; if (icmp_global_allow()) return true; __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } /* * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, struct flowi6 *fl6) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; if (icmpv6_mask_allow(net, type)) return true; /* * Look up the output route. * XXX: perhaps the expire for routing entries cloned by * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { res = true; } else { struct rt6_info *rt = (struct rt6_info *)dst; int tmo = net->ipv6.sysctl.icmpv6_time; struct inet_peer *peer; /* Give more bandwidth to wider prefixes. */ if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr, 1); res = inet_peer_xrlim_allow(peer, tmo); if (peer) inet_putpeer(peer); } if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); dst_release(dst); return res; } static bool icmpv6_rt_has_prefsrc(struct sock *sk, u8 type, struct flowi6 *fl6) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; dst = ip6_route_output(net, sk, fl6); if (!dst->error) { struct rt6_info *rt = (struct rt6_info *)dst; struct in6_addr prefsrc; rt6_get_prefsrc(rt, &prefsrc); res = !ipv6_addr_any(&prefsrc); } dst_release(dst); return res; } /* * an inline helper for the "simple" if statement below * checks if parameter problem report is caused by an * unrecognized IPv6 option that has the Option Type * highest-order two bits set to 10 */ static bool opt_unrec(struct sk_buff *skb, __u32 offset) { u8 _optval, *op; offset += skb_network_offset(skb); op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); if (!op) return true; return (*op & 0xC0) == 0x80; } void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len) { struct sk_buff *skb; struct icmp6hdr *icmp6h; skb = skb_peek(&sk->sk_write_queue); if (!skb) return; icmp6h = icmp6_hdr(skb); memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); icmp6h->icmp6_cksum = 0; if (skb_queue_len(&sk->sk_write_queue) == 1) { skb->csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), skb->csum); icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, skb->csum); } else { __wsum tmp_csum = 0; skb_queue_walk(&sk->sk_write_queue, skb) { tmp_csum = csum_add(tmp_csum, skb->csum); } tmp_csum = csum_partial(icmp6h, sizeof(struct icmp6hdr), tmp_csum); icmp6h->icmp6_cksum = csum_ipv6_magic(&fl6->saddr, &fl6->daddr, len, fl6->flowi6_proto, tmp_csum); } ip6_push_pending_frames(sk); } struct icmpv6_msg { struct sk_buff *skb; int offset; uint8_t type; }; static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { struct icmpv6_msg *msg = (struct icmpv6_msg *) from; struct sk_buff *org_skb = msg->skb; __wsum csum; csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, to, len); skb->csum = csum_block_add(skb->csum, csum, odd); if (!(msg->type & ICMPV6_INFOMSG_MASK)) nf_ct_attach(skb, org_skb); return 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) static void mip6_addr_swap(struct sk_buff 
*skb, const struct inet6_skb_parm *opt) { struct ipv6hdr *iph = ipv6_hdr(skb); struct ipv6_destopt_hao *hao; int off; if (opt->dsthao) { off = ipv6_find_tlv(skb, opt->dsthao, IPV6_TLV_HAO); if (likely(off >= 0)) { hao = (struct ipv6_destopt_hao *) (skb_network_header(skb) + off); swap(iph->saddr, hao->addr); } } } #else static inline void mip6_addr_swap(struct sk_buff *skb, const struct inet6_skb_parm *opt) {} #endif static struct dst_entry *icmpv6_route_lookup(struct net *net, struct sk_buff *skb, struct sock *sk, struct flowi6 *fl6) { struct dst_entry *dst, *dst2; struct flowi6 fl2; int err; err = ip6_dst_lookup(net, sk, &dst, fl6); if (err) return ERR_PTR(err); /* * We won't send icmp if the destination is known * anycast unless we need to treat anycast as unicast. */ if (!READ_ONCE(net->ipv6.sysctl.icmpv6_error_anycast_as_unicast) && ipv6_anycast_destination(dst, &fl6->daddr)) { net_dbg_ratelimited("icmp6_send: acast source\n"); dst_release(dst); return ERR_PTR(-EINVAL); } /* No need to clone since we're just using its address. */ dst2 = dst; dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), sk, 0); if (!IS_ERR(dst)) { if (dst != dst2) return dst; } else { if (PTR_ERR(dst) == -EPERM) dst = NULL; else return dst; } err = xfrm_decode_session_reverse(net, skb, flowi6_to_flowi(&fl2), AF_INET6); if (err) goto relookup_failed; err = ip6_dst_lookup(net, sk, &dst2, &fl2); if (err) goto relookup_failed; dst2 = xfrm_lookup(net, dst2, flowi6_to_flowi(&fl2), sk, XFRM_LOOKUP_ICMP); if (!IS_ERR(dst2)) { dst_release(dst); dst = dst2; } else { err = PTR_ERR(dst2); if (err == -EPERM) { dst_release(dst); return dst2; } else goto relookup_failed; } relookup_failed: if (dst) return dst; return ERR_PTR(err); } static struct net_device *icmp6_dev(const struct sk_buff *skb) { struct net_device *dev = skb->dev; /* for local traffic to local address, skb dev is the loopback * device. Check if there is a dst attached to the skb and if so * get the real device index. Same is needed for replies to a link * local address on a device enslaved to an L3 master device */ if (unlikely(dev->ifindex == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { const struct rt6_info *rt6 = skb_rt6_info(skb); /* The destination could be an external IP in Ext Hdr (SRv6, RPL, etc.), * and ip6_null_entry could be set to skb if no route is found. */ if (rt6 && rt6->rt6i_idev) dev = rt6->rt6i_idev->dev; } return dev; } static int icmp6_iif(const struct sk_buff *skb) { return icmp6_dev(skb)->ifindex; } /* * Send an ICMP message in response to a packet in error */ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm) { struct inet6_dev *idev = NULL; struct ipv6hdr *hdr = ipv6_hdr(skb); struct sock *sk; struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct dst_entry *dst; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct ipcm6_cookie ipc6; int iif = 0; int addr_type = 0; int len; u32 mark; if ((u8 *)hdr < skb->head || (skb_network_header(skb) + sizeof(*hdr)) > skb_tail_pointer(skb)) return; if (!skb->dev) return; net = dev_net(skb->dev); mark = IP6_REPLY_MARK(net, skb->mark); /* * Make sure we respect the rules * i.e. RFC 1885 2.4(e) * Rule (e.1) is enforced by not using icmp6_send * in any code that processes icmp errors. 
*/ addr_type = ipv6_addr_type(&hdr->daddr); if (ipv6_chk_addr(net, &hdr->daddr, skb->dev, 0) || ipv6_chk_acast_addr_src(net, skb->dev, &hdr->daddr)) saddr = &hdr->daddr; /* * Dest addr check */ if (addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST) { if (type != ICMPV6_PKT_TOOBIG && !(type == ICMPV6_PARAMPROB && code == ICMPV6_UNK_OPTION && (opt_unrec(skb, info)))) return; saddr = NULL; } addr_type = ipv6_addr_type(&hdr->saddr); /* * Source addr check */ if (__ipv6_addr_needs_scope_id(addr_type)) { iif = icmp6_iif(skb); } else { /* * The source device is used for looking up which routing table * to use for sending an ICMP error. */ iif = l3mdev_master_ifindex(skb->dev); } /* * Must not send error if the source does not uniquely * identify a single node (RFC2463 Section 2.4). * We check unspecified / multicast addresses here, * and anycast addresses will be checked later. */ if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { net_dbg_ratelimited("icmp6_send: addr_any/mcast source [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); return; } /* * Never answer to a ICMP packet. */ if (is_ineligible(skb)) { net_dbg_ratelimited("icmp6_send: no reply to icmp error [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); return; } /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) goto out_bh_enable; mip6_addr_swap(skb, parm); sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = hdr->saddr; if (force_saddr) saddr = force_saddr; if (saddr) { fl6.saddr = *saddr; } else if (!icmpv6_rt_has_prefsrc(sk, type, &fl6)) { /* select a more meaningful saddr from input if */ struct net_device *in_netdev; in_netdev = dev_get_by_index(net, parm->iif); if (in_netdev) { ipv6_dev_get_saddr(net, in_netdev, &fl6.daddr, inet6_sk(sk)->srcprefs, &fl6.saddr); dev_put(in_netdev); } } fl6.flowi6_mark = mark; fl6.flowi6_oif = iif; fl6.fl6_icmp_type = type; fl6.fl6_icmp_code = code; fl6.flowi6_uid = sock_net_uid(net, NULL); fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); np = inet6_sk(sk); if (!icmpv6_xrlim_allow(sk, type, &fl6)) goto out; tmp_hdr.icmp6_type = type; tmp_hdr.icmp6_code = code; tmp_hdr.icmp6_cksum = 0; tmp_hdr.icmp6_pointer = htonl(info); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); ipcm6_init_sk(&ipc6, sk); ipc6.sockc.mark = mark; fl6.flowlabel = ip6_make_flowinfo(ipc6.tclass, fl6.flowlabel); dst = icmpv6_route_lookup(net, skb, sk, &fl6); if (IS_ERR(dst)) goto out; ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); msg.skb = skb; msg.offset = skb_network_offset(skb); msg.type = type; len = skb->len - msg.offset; len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) - sizeof(struct icmp6hdr)); if (len < 0) { net_dbg_ratelimited("icmp: len problem [%pI6c > %pI6c]\n", &hdr->saddr, &hdr->daddr); goto out_dst_release; } rcu_read_lock(); idev = __in6_dev_get(skb->dev); if (ip6_append_data(sk, icmpv6_getfrag, &msg, len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT)) { ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, 
len + sizeof(struct icmp6hdr)); } rcu_read_unlock(); out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); } EXPORT_SYMBOL(icmp6_send); /* Slightly more convenient version of icmp6_send with drop reasons. */ void icmpv6_param_prob_reason(struct sk_buff *skb, u8 code, int pos, enum skb_drop_reason reason) { icmp6_send(skb, ICMPV6_PARAMPROB, code, pos, NULL, IP6CB(skb)); kfree_skb_reason(skb, reason); } /* Generate icmpv6 with type/code ICMPV6_DEST_UNREACH/ICMPV6_ADDR_UNREACH * if sufficient data bytes are available * @nhs is the size of the tunnel header(s) : * Either an IPv4 header for SIT encap * an IPv4 header + GRE header for GRE encap */ int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len) { struct in6_addr temp_saddr; struct rt6_info *rt; struct sk_buff *skb2; u32 info = 0; if (!pskb_may_pull(skb, nhs + sizeof(struct ipv6hdr) + 8)) return 1; /* RFC 4884 (partial) support for ICMP extensions */ if (data_len < 128 || (data_len & 7) || skb->len < data_len) data_len = 0; skb2 = data_len ? skb_copy(skb, GFP_ATOMIC) : skb_clone(skb, GFP_ATOMIC); if (!skb2) return 1; skb_dst_drop(skb2); skb_pull(skb2, nhs); skb_reset_network_header(skb2); rt = rt6_lookup(dev_net(skb->dev), &ipv6_hdr(skb2)->saddr, NULL, 0, skb, 0); if (rt && rt->dst.dev) skb2->dev = rt->dst.dev; ipv6_addr_set_v4mapped(ip_hdr(skb)->saddr, &temp_saddr); if (data_len) { /* RFC 4884 (partial) support : * insert 0 padding at the end, before the extensions */ __skb_push(skb2, nhs); skb_reset_network_header(skb2); memmove(skb2->data, skb2->data + nhs, data_len - nhs); memset(skb2->data + data_len - nhs, 0, nhs); /* RFC 4884 4.5 : Length is measured in 64-bit words, * and stored in reserved[0] */ info = (data_len/8) << 24; } if (type == ICMP_TIME_EXCEEDED) icmp6_send(skb2, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, info, &temp_saddr, IP6CB(skb2)); else icmp6_send(skb2, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, info, &temp_saddr, IP6CB(skb2)); if (rt) ip6_rt_put(rt); kfree_skb(skb2); return 0; } EXPORT_SYMBOL(ip6_err_gen_icmpv6_unreach); static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct sock *sk; struct inet6_dev *idev; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; struct dst_entry *dst; struct ipcm6_cookie ipc6; u32 mark = IP6_REPLY_MARK(net, skb->mark); SKB_DR(reason); bool acast; u8 type; if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr) && net->ipv6.sysctl.icmpv6_echo_ignore_multicast) return reason; saddr = &ipv6_hdr(skb)->daddr; acast = ipv6_anycast_destination(skb_dst(skb), saddr); if (acast && net->ipv6.sysctl.icmpv6_echo_ignore_anycast) return reason; if (!ipv6_unicast_destination(skb) && !(net->ipv6.sysctl.anycast_src_echo_reply && acast)) saddr = NULL; if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) type = ICMPV6_EXT_ECHO_REPLY; else type = ICMPV6_ECHO_REPLY; memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); fl6.flowi6_proto = IPPROTO_ICMPV6; fl6.daddr = ipv6_hdr(skb)->saddr; if (saddr) fl6.saddr = *saddr; fl6.flowi6_oif = icmp6_iif(skb); fl6.fl6_icmp_type = type; fl6.flowi6_mark = mark; fl6.flowi6_uid = sock_net_uid(net, NULL); security_skb_classify_flow(skb, 
flowi6_to_flowi_common(&fl6)); local_bh_disable(); sk = icmpv6_xmit_lock(net); if (!sk) goto out_bh_enable; np = inet6_sk(sk); if (!fl6.flowi6_oif && ipv6_addr_is_multicast(&fl6.daddr)) fl6.flowi6_oif = READ_ONCE(np->mcast_oif); else if (!fl6.flowi6_oif) fl6.flowi6_oif = READ_ONCE(np->ucast_oif); if (ip6_dst_lookup(net, sk, &dst, &fl6)) goto out; dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) goto out; /* Check the ratelimit */ if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) goto out_dst_release; idev = __in6_dev_get(skb->dev); msg.skb = skb; msg.offset = 0; msg.type = type; ipcm6_init_sk(&ipc6, sk); ipc6.hlimit = ip6_sk_dst_hoplimit(np, &fl6, dst); ipc6.tclass = ipv6_get_dsfield(ipv6_hdr(skb)); ipc6.sockc.mark = mark; if (icmph->icmp6_type == ICMPV6_EXT_ECHO_REQUEST) if (!icmp_build_probe(skb, (struct icmphdr *)&tmp_hdr)) goto out_dst_release; if (ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), sizeof(struct icmp6hdr), &ipc6, &fl6, (struct rt6_info *)dst, MSG_DONTWAIT)) { __ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTERRORS); ip6_flush_pending_frames(sk); } else { icmpv6_push_pending_frames(sk, &fl6, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); reason = SKB_CONSUMED; } out_dst_release: dst_release(dst); out: icmpv6_xmit_unlock(sk); out_bh_enable: local_bh_enable(); return reason; } enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info) { struct inet6_skb_parm *opt = IP6CB(skb); struct net *net = dev_net(skb->dev); const struct inet6_protocol *ipprot; enum skb_drop_reason reason; int inner_offset; __be16 frag_off; u8 nexthdr; reason = pskb_may_pull_reason(skb, sizeof(struct ipv6hdr)); if (reason != SKB_NOT_DROPPED_YET) goto out; seg6_icmp_srh(skb, opt); nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; if (ipv6_ext_hdr(nexthdr)) { /* now skip over extension headers */ inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (inner_offset < 0) { SKB_DR_SET(reason, IPV6_BAD_EXTHDR); goto out; } } else { inner_offset = sizeof(struct ipv6hdr); } /* Checkin header including 8 bytes of inner protocol header. */ reason = pskb_may_pull_reason(skb, inner_offset + 8); if (reason != SKB_NOT_DROPPED_YET) goto out; /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed pmtu discovery. Corresponding argument (opt) to notifiers is already added. 
	   --ANK (980726)
	 */

	ipprot = rcu_dereference(inet6_protos[nexthdr]);
	if (ipprot && ipprot->err_handler)
		ipprot->err_handler(skb, opt, type, code, inner_offset, info);

	raw6_icmp_error(skb, nexthdr, type, code, inner_offset, info);
	return SKB_CONSUMED;

out:
	__ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS);
	return reason;
}

/*
 * Handle icmp messages
 */

static int icmpv6_rcv(struct sk_buff *skb)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct net *net = dev_net(skb->dev);
	struct net_device *dev = icmp6_dev(skb);
	struct inet6_dev *idev = __in6_dev_get(dev);
	const struct in6_addr *saddr, *daddr;
	struct icmp6hdr *hdr;
	u8 type;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) {
		struct sec_path *sp = skb_sec_path(skb);
		int nh;

		if (!(sp && sp->xvec[sp->len - 1]->props.flags &
				 XFRM_STATE_ICMP)) {
			reason = SKB_DROP_REASON_XFRM_POLICY;
			goto drop_no_count;
		}

		if (!pskb_may_pull(skb, sizeof(*hdr) + sizeof(struct ipv6hdr)))
			goto drop_no_count;

		nh = skb_network_offset(skb);
		skb_set_network_header(skb, sizeof(*hdr));

		if (!xfrm6_policy_check_reverse(NULL, XFRM_POLICY_IN, skb)) {
			reason = SKB_DROP_REASON_XFRM_POLICY;
			goto drop_no_count;
		}

		skb_set_network_header(skb, nh);
	}

	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INMSGS);

	saddr = &ipv6_hdr(skb)->saddr;
	daddr = &ipv6_hdr(skb)->daddr;

	if (skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo)) {
		net_dbg_ratelimited("ICMPv6 checksum failed [%pI6c > %pI6c]\n",
				    saddr, daddr);
		goto csum_error;
	}

	if (!pskb_pull(skb, sizeof(*hdr)))
		goto discard_it;

	hdr = icmp6_hdr(skb);

	type = hdr->icmp6_type;

	ICMP6MSGIN_INC_STATS(dev_net(dev), idev, type);

	switch (type) {
	case ICMPV6_ECHO_REQUEST:
		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all)
			reason = icmpv6_echo_reply(skb);
		break;
	case ICMPV6_EXT_ECHO_REQUEST:
		if (!net->ipv6.sysctl.icmpv6_echo_ignore_all &&
		    READ_ONCE(net->ipv4.sysctl_icmp_echo_enable_probe))
			reason = icmpv6_echo_reply(skb);
		break;

	case ICMPV6_ECHO_REPLY:
		reason = ping_rcv(skb);
		break;

	case ICMPV6_EXT_ECHO_REPLY:
		reason = ping_rcv(skb);
		break;

	case ICMPV6_PKT_TOOBIG:
		/* BUGGG_FUTURE: if packet contains rthdr, we cannot update
		   standard destination cache. Seems, only "advanced"
		   destination cache will allow to solve this problem
		   --ANK (980726)
		 */
		if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
			goto discard_it;
		hdr = icmp6_hdr(skb);

		/* to notify */
		fallthrough;
	case ICMPV6_DEST_UNREACH:
	case ICMPV6_TIME_EXCEED:
	case ICMPV6_PARAMPROB:
		reason = icmpv6_notify(skb, type, hdr->icmp6_code,
				       hdr->icmp6_mtu);
		break;

	case NDISC_ROUTER_SOLICITATION:
	case NDISC_ROUTER_ADVERTISEMENT:
	case NDISC_NEIGHBOUR_SOLICITATION:
	case NDISC_NEIGHBOUR_ADVERTISEMENT:
	case NDISC_REDIRECT:
		reason = ndisc_rcv(skb);
		break;

	case ICMPV6_MGM_QUERY:
		igmp6_event_query(skb);
		return 0;

	case ICMPV6_MGM_REPORT:
		igmp6_event_report(skb);
		return 0;

	case ICMPV6_MGM_REDUCTION:
	case ICMPV6_NI_QUERY:
	case ICMPV6_NI_REPLY:
	case ICMPV6_MLD2_REPORT:
	case ICMPV6_DHAAD_REQUEST:
	case ICMPV6_DHAAD_REPLY:
	case ICMPV6_MOBILE_PREFIX_SOL:
	case ICMPV6_MOBILE_PREFIX_ADV:
		break;

	default:
		/* informational */
		if (type & ICMPV6_INFOMSG_MASK)
			break;

		net_dbg_ratelimited("icmpv6: msg of unknown type [%pI6c > %pI6c]\n",
				    saddr, daddr);

		/*
		 * error of unknown type.
		 * must pass to upper level
		 */
		reason = icmpv6_notify(skb, type, hdr->icmp6_code,
				       hdr->icmp6_mtu);
	}

	/* until the v6 path can be better sorted assume failure and
	 * preserve the status quo behaviour for the rest of the paths to here
	 */
	if (reason)
		kfree_skb_reason(skb, reason);
	else
		consume_skb(skb);

	return 0;

csum_error:
	reason = SKB_DROP_REASON_ICMP_CSUM;
	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_CSUMERRORS);
discard_it:
	__ICMP6_INC_STATS(dev_net(dev), idev, ICMP6_MIB_INERRORS);
drop_no_count:
	kfree_skb_reason(skb, reason);
	return 0;
}

void icmpv6_flow_init(const struct sock *sk, struct flowi6 *fl6, u8 type,
		      const struct in6_addr *saddr,
		      const struct in6_addr *daddr, int oif)
{
	memset(fl6, 0, sizeof(*fl6));
	fl6->saddr = *saddr;
	fl6->daddr = *daddr;
	fl6->flowi6_proto	= IPPROTO_ICMPV6;
	fl6->fl6_icmp_type	= type;
	fl6->fl6_icmp_code	= 0;
	fl6->flowi6_oif		= oif;
	security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
}

int __init icmpv6_init(void)
{
	struct sock *sk;
	int err, i;

	for_each_possible_cpu(i) {
		err = inet_ctl_sock_create(&sk, PF_INET6,
					   SOCK_RAW, IPPROTO_ICMPV6, &init_net);
		if (err < 0) {
			pr_err("Failed to initialize the ICMP6 control socket (err %d)\n",
			       err);
			return err;
		}

		per_cpu(ipv6_icmp_sk, i) = sk;

		/* Enough space for 2 64K ICMP packets, including
		 * sk_buff struct overhead.
		 */
		sk->sk_sndbuf = 2 * SKB_TRUESIZE(64 * 1024);
	}

	err = -EAGAIN;
	if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0)
		goto fail;

	err = inet6_register_icmp_sender(icmp6_send);
	if (err)
		goto sender_reg_err;
	return 0;

sender_reg_err:
	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
fail:
	pr_err("Failed to register ICMP6 protocol\n");
	return err;
}

void icmpv6_cleanup(void)
{
	inet6_unregister_icmp_sender(icmp6_send);
	inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}

static const struct icmp6_err {
	int err;
	int fatal;
} tab_unreach[] = {
	{	/* NOROUTE */
		.err	= ENETUNREACH,
		.fatal	= 0,
	},
	{	/* ADM_PROHIBITED */
		.err	= EACCES,
		.fatal	= 1,
	},
	{	/* Was NOT_NEIGHBOUR, now reserved */
		.err	= EHOSTUNREACH,
		.fatal	= 0,
	},
	{	/* ADDR_UNREACH	*/
		.err	= EHOSTUNREACH,
		.fatal	= 0,
	},
	{	/* PORT_UNREACH	*/
		.err	= ECONNREFUSED,
		.fatal	= 1,
	},
	{	/* POLICY_FAIL */
		.err	= EACCES,
		.fatal	= 1,
	},
	{	/* REJECT_ROUTE	*/
		.err	= EACCES,
		.fatal	= 1,
	},
};

int icmpv6_err_convert(u8 type, u8 code, int *err)
{
	int fatal = 0;

	*err = EPROTO;

	switch (type) {
	case ICMPV6_DEST_UNREACH:
		fatal = 1;
		if (code < ARRAY_SIZE(tab_unreach)) {
			*err = tab_unreach[code].err;
			fatal = tab_unreach[code].fatal;
		}
		break;

	case ICMPV6_PKT_TOOBIG:
		*err = EMSGSIZE;
		break;

	case ICMPV6_PARAMPROB:
		*err = EPROTO;
		fatal = 1;
		break;

	case ICMPV6_TIME_EXCEED:
		*err = EHOSTUNREACH;
		break;
	}

	return fatal;
}
EXPORT_SYMBOL(icmpv6_err_convert);

#ifdef CONFIG_SYSCTL
static struct ctl_table ipv6_icmp_table_template[] = {
	{
		.procname	= "ratelimit",
		.data		= &init_net.ipv6.sysctl.icmpv6_time,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "echo_ignore_all",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_all,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
	},
	{
		.procname	= "echo_ignore_multicast",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
	},
	{
		.procname	= "echo_ignore_anycast",
		.data		= &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
	},
	{
		.procname	= "ratemask",
		.data		= &init_net.ipv6.sysctl.icmpv6_ratemask_ptr,
		.maxlen		= ICMPV6_MSG_MAX + 1,
		.mode		= 0644,
		.proc_handler	= proc_do_large_bitmap,
	},
	{
		.procname	= "error_anycast_as_unicast",
		.data		= &init_net.ipv6.sysctl.icmpv6_error_anycast_as_unicast,
		.maxlen		= sizeof(u8),
		.mode		= 0644,
		.proc_handler	= proc_dou8vec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
	{ },
};

struct ctl_table * __net_init ipv6_icmp_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_icmp_table_template,
			sizeof(ipv6_icmp_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.icmpv6_time;
		table[1].data = &net->ipv6.sysctl.icmpv6_echo_ignore_all;
		table[2].data = &net->ipv6.sysctl.icmpv6_echo_ignore_multicast;
		table[3].data = &net->ipv6.sysctl.icmpv6_echo_ignore_anycast;
		table[4].data = &net->ipv6.sysctl.icmpv6_ratemask_ptr;
		table[5].data = &net->ipv6.sysctl.icmpv6_error_anycast_as_unicast;
	}
	return table;
}

size_t ipv6_icmp_sysctl_table_size(void)
{
	return ARRAY_SIZE(ipv6_icmp_table_template);
}
#endif
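/*
 * Illustrative sketch, not part of icmp.c: how a transport protocol's
 * err_handler might consume icmpv6_err_convert() above.  The handler name
 * (example_v6_err_handler) and the way the socket was looked up are
 * hypothetical; only the icmpv6_err_convert() call and the
 * sk_err/sk_err_soft reporting pattern follow conventions used by in-tree
 * protocols.  Assumes <net/sock.h> and <linux/icmpv6.h>.
 */
static void example_v6_err_handler(struct sock *sk, u8 type, u8 code)
{
	int err;
	int fatal = icmpv6_err_convert(type, code, &err);

	if (fatal) {
		/* Hard error: surface it to the socket owner. */
		sk->sk_err = err;
		sk_error_report(sk);
	} else {
		/* Soft error: record it without aborting the connection. */
		sk->sk_err_soft = err;
	}
}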
// SPDX-License-Identifier: GPL-2.0-only
/* (C) 1999-2001 Paul `Rusty' Russell
 * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
 */

#include <linux/types.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/skbuff.h>
#include <net/netns/generic.h>
#include <net/route.h>
#include <net/ip.h>

#include <linux/netfilter_bridge.h>
#include <linux/netfilter_ipv4.h>
#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
#endif
#include <net/netfilter/nf_conntrack_zones.h>

static DEFINE_MUTEX(defrag4_mutex);

static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
				   u_int32_t user)
{
	int err;

	local_bh_disable();
	err = ip_defrag(net, skb, user);
	local_bh_enable();

	if (!err)
		skb->ignore_df = 1;

	return err;
}

static enum ip_defrag_users nf_ct_defrag_user(unsigned int hooknum,
					      struct sk_buff *skb)
{
	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
	if (skb_nfct(skb)) {
		enum ip_conntrack_info ctinfo;
		const struct nf_conn *ct = nf_ct_get(skb, &ctinfo);

		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
	}
#endif
	if (nf_bridge_in_prerouting(skb))
		return IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id;

	if (hooknum == NF_INET_PRE_ROUTING)
		return IP_DEFRAG_CONNTRACK_IN + zone_id;
	else
		return IP_DEFRAG_CONNTRACK_OUT + zone_id;
}

static unsigned int ipv4_conntrack_defrag(void *priv,
					  struct sk_buff *skb,
					  const struct nf_hook_state *state)
{
	struct sock *sk = skb->sk;

	if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
	    inet_test_bit(NODEFRAG, sk))
		return NF_ACCEPT;

#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#if !IS_ENABLED(CONFIG_NF_NAT)
	/* Previously seen (loopback)?  Ignore.  Do this before
	 * fragment check.
	 */
	if (skb_nfct(skb) && !nf_ct_is_template((struct nf_conn *)skb_nfct(skb)))
		return NF_ACCEPT;
#endif
	if (skb->_nfct == IP_CT_UNTRACKED)
		return NF_ACCEPT;
#endif
	/* Gather fragments. */
	if (ip_is_fragment(ip_hdr(skb))) {
		enum ip_defrag_users user =
			nf_ct_defrag_user(state->hook, skb);

		if (nf_ct_ipv4_gather_frags(state->net, skb, user))
			return NF_STOLEN;
	}
	return NF_ACCEPT;
}

static const struct nf_hook_ops ipv4_defrag_ops[] = {
	{
		.hook		= ipv4_conntrack_defrag,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
	},
	{
		.hook		= ipv4_conntrack_defrag,
		.pf		= NFPROTO_IPV4,
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
	},
};

static void __net_exit defrag4_net_exit(struct net *net)
{
	if (net->nf.defrag_ipv4_users) {
		nf_unregister_net_hooks(net, ipv4_defrag_ops,
					ARRAY_SIZE(ipv4_defrag_ops));
		net->nf.defrag_ipv4_users = 0;
	}
}

static const struct nf_defrag_hook defrag_hook = {
	.owner = THIS_MODULE,
	.enable = nf_defrag_ipv4_enable,
	.disable = nf_defrag_ipv4_disable,
};

static struct pernet_operations defrag4_net_ops = {
	.exit = defrag4_net_exit,
};

static int __init nf_defrag_init(void)
{
	int err;

	err = register_pernet_subsys(&defrag4_net_ops);
	if (err)
		return err;

	rcu_assign_pointer(nf_defrag_v4_hook, &defrag_hook);
	return err;
}

static void __exit nf_defrag_fini(void)
{
	rcu_assign_pointer(nf_defrag_v4_hook, NULL);
	unregister_pernet_subsys(&defrag4_net_ops);
}

int nf_defrag_ipv4_enable(struct net *net)
{
	int err = 0;

	mutex_lock(&defrag4_mutex);
	if (net->nf.defrag_ipv4_users == UINT_MAX) {
		err = -EOVERFLOW;
		goto out_unlock;
	}

	if (net->nf.defrag_ipv4_users) {
		net->nf.defrag_ipv4_users++;
		goto out_unlock;
	}

	err = nf_register_net_hooks(net, ipv4_defrag_ops,
				    ARRAY_SIZE(ipv4_defrag_ops));
	if (err == 0)
		net->nf.defrag_ipv4_users = 1;

out_unlock:
	mutex_unlock(&defrag4_mutex);
	return err;
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_enable);

void nf_defrag_ipv4_disable(struct net *net)
{
	mutex_lock(&defrag4_mutex);
	if (net->nf.defrag_ipv4_users) {
		net->nf.defrag_ipv4_users--;
		if (net->nf.defrag_ipv4_users == 0)
			nf_unregister_net_hooks(net, ipv4_defrag_ops,
						ARRAY_SIZE(ipv4_defrag_ops));
	}

	mutex_unlock(&defrag4_mutex);
}
EXPORT_SYMBOL_GPL(nf_defrag_ipv4_disable);

module_init(nf_defrag_init);
module_exit(nf_defrag_fini);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("IPv4 defragmentation support");
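/*
 * Illustrative sketch, not part of this file: how a dependent module would
 * pin the defrag hooks for a network namespace.  The example_* names and
 * the pernet_operations wiring are hypothetical; the enable/disable pairing
 * is the refcounted contract implemented by nf_defrag_ipv4_enable() and
 * nf_defrag_ipv4_disable() above.  Assumes <net/net_namespace.h> and
 * <net/netfilter/ipv4/nf_defrag_ipv4.h>.
 */
static int __net_init example_net_init(struct net *net)
{
	/* Registers the PRE_ROUTING/LOCAL_OUT defrag hooks on first use,
	 * otherwise just bumps defrag_ipv4_users.
	 */
	return nf_defrag_ipv4_enable(net);
}

static void __net_exit example_net_exit(struct net *net)
{
	/* Drops the reference; the hooks are unregistered once it hits zero. */
	nf_defrag_ipv4_disable(net);
}

static struct pernet_operations example_net_ops = {
	.init = example_net_init,
	.exit = example_net_exit,
};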