2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 | // SPDX-License-Identifier: GPL-2.0-or-later /****************************************************************************** * usbtouchscreen.c * Driver for USB Touchscreens, supporting those devices: * - eGalax Touchkit * includes eTurboTouch CT-410/510/700 * - 3M/Microtouch EX II series * - ITM * - PanJit TouchSet * - eTurboTouch * - Gunze AHL61 * - DMC TSC-10/25 * - IRTOUCHSYSTEMS/UNITOP * - IdealTEK URTC1000 * - General Touch * - GoTop Super_Q2/GogoPen/PenPower tablets * - JASTEC USB touch controller/DigiTech DTR-02U * - Zytronic capacitive touchscreen * - NEXIO/iNexio * - Elo TouchSystems 2700 IntelliTouch * - EasyTouch USB Dual/Multi touch controller from Data Modul * * Copyright (C) 2004-2007 by Daniel Ritz <daniel.ritz@gmx.ch> * Copyright (C) by Todd E. Johnson (mtouchusb.c) * * Driver is based on touchkitusb.c * - ITM parts are from itmtouch.c * - 3M parts are from mtouchusb.c * - PanJit parts are from an unmerged driver by Lanslott Gish * - DMC TSC 10/25 are from Holger Schurig, with ideas from an unmerged * driver from Marius Vollmer * *****************************************************************************/ //#define DEBUG #include <linux/kernel.h> #include <linux/slab.h> #include <linux/input.h> #include <linux/module.h> #include <linux/usb.h> #include <linux/usb/input.h> #include <linux/hid.h> #include <linux/mutex.h> static bool swap_xy; module_param(swap_xy, bool, 0644); MODULE_PARM_DESC(swap_xy, "If set X and Y axes are swapped."); static bool hwcalib_xy; module_param(hwcalib_xy, bool, 0644); MODULE_PARM_DESC(hwcalib_xy, "If set hw-calibrated X/Y are used if available"); /* device specifc data/functions */ struct usbtouch_usb; struct usbtouch_device_info { int min_xc, max_xc; int min_yc, max_yc; int min_press, max_press; int rept_size; /* * Always service the USB devices irq not just when the input device is * open. This is useful when devices have a watchdog which prevents us * from periodically polling the device. Leave this unset unless your * touchscreen device requires it, as it does consume more of the USB * bandwidth. */ bool irq_always; void (*process_pkt) (struct usbtouch_usb *usbtouch, unsigned char *pkt, int len); /* * used to get the packet len. possible return values: * > 0: packet len * = 0: skip one byte * < 0: -return value more bytes needed */ int (*get_pkt_len) (unsigned char *pkt, int len); int (*read_data) (struct usbtouch_usb *usbtouch, unsigned char *pkt); int (*alloc) (struct usbtouch_usb *usbtouch); int (*init) (struct usbtouch_usb *usbtouch); void (*exit) (struct usbtouch_usb *usbtouch); }; /* a usbtouch device */ struct usbtouch_usb { unsigned char *data; dma_addr_t data_dma; int data_size; unsigned char *buffer; int buf_len; struct urb *irq; struct usb_interface *interface; struct input_dev *input; struct usbtouch_device_info *type; struct mutex pm_mutex; /* serialize access to open/suspend */ bool is_open; char name[128]; char phys[64]; void *priv; int x, y; int touch, press; }; /* device types */ enum { DEVTYPE_IGNORE = -1, DEVTYPE_EGALAX, DEVTYPE_PANJIT, DEVTYPE_3M, DEVTYPE_ITM, DEVTYPE_ETURBO, DEVTYPE_GUNZE, DEVTYPE_DMC_TSC10, DEVTYPE_IRTOUCH, DEVTYPE_IRTOUCH_HIRES, DEVTYPE_IDEALTEK, DEVTYPE_GENERAL_TOUCH, DEVTYPE_GOTOP, DEVTYPE_JASTEC, DEVTYPE_E2I, DEVTYPE_ZYTRONIC, DEVTYPE_TC45USB, DEVTYPE_NEXIO, DEVTYPE_ELO, DEVTYPE_ETOUCH, }; #define USB_DEVICE_HID_CLASS(vend, prod) \ .match_flags = USB_DEVICE_ID_MATCH_INT_CLASS \ | USB_DEVICE_ID_MATCH_DEVICE, \ .idVendor = (vend), \ .idProduct = (prod), \ .bInterfaceClass = USB_INTERFACE_CLASS_HID static const struct usb_device_id usbtouch_devices[] = { #ifdef CONFIG_TOUCHSCREEN_USB_EGALAX /* ignore the HID capable devices, handled by usbhid */ {USB_DEVICE_HID_CLASS(0x0eef, 0x0001), .driver_info = DEVTYPE_IGNORE}, {USB_DEVICE_HID_CLASS(0x0eef, 0x0002), .driver_info = DEVTYPE_IGNORE}, /* normal device IDs */ {USB_DEVICE(0x3823, 0x0001), .driver_info = DEVTYPE_EGALAX}, {USB_DEVICE(0x3823, 0x0002), .driver_info = DEVTYPE_EGALAX}, {USB_DEVICE(0x0123, 0x0001), .driver_info = DEVTYPE_EGALAX}, {USB_DEVICE(0x0eef, 0x0001), .driver_info = DEVTYPE_EGALAX}, {USB_DEVICE(0x0eef, 0x0002), .driver_info = DEVTYPE_EGALAX}, {USB_DEVICE(0x1234, 0x0001), .driver_info = DEVTYPE_EGALAX}, {USB_DEVICE(0x1234, 0x0002), .driver_info = DEVTYPE_EGALAX}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_PANJIT {USB_DEVICE(0x134c, 0x0001), .driver_info = DEVTYPE_PANJIT}, {USB_DEVICE(0x134c, 0x0002), .driver_info = DEVTYPE_PANJIT}, {USB_DEVICE(0x134c, 0x0003), .driver_info = DEVTYPE_PANJIT}, {USB_DEVICE(0x134c, 0x0004), .driver_info = DEVTYPE_PANJIT}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_3M {USB_DEVICE(0x0596, 0x0001), .driver_info = DEVTYPE_3M}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ITM {USB_DEVICE(0x0403, 0xf9e9), .driver_info = DEVTYPE_ITM}, {USB_DEVICE(0x16e3, 0xf9e9), .driver_info = DEVTYPE_ITM}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ETURBO {USB_DEVICE(0x1234, 0x5678), .driver_info = DEVTYPE_ETURBO}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_GUNZE {USB_DEVICE(0x0637, 0x0001), .driver_info = DEVTYPE_GUNZE}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_DMC_TSC10 {USB_DEVICE(0x0afa, 0x03e8), .driver_info = DEVTYPE_DMC_TSC10}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_IRTOUCH {USB_DEVICE(0x255e, 0x0001), .driver_info = DEVTYPE_IRTOUCH}, {USB_DEVICE(0x595a, 0x0001), .driver_info = DEVTYPE_IRTOUCH}, {USB_DEVICE(0x6615, 0x0001), .driver_info = DEVTYPE_IRTOUCH}, {USB_DEVICE(0x6615, 0x0012), .driver_info = DEVTYPE_IRTOUCH_HIRES}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_IDEALTEK {USB_DEVICE(0x1391, 0x1000), .driver_info = DEVTYPE_IDEALTEK}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH {USB_DEVICE(0x0dfc, 0x0001), .driver_info = DEVTYPE_GENERAL_TOUCH}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_GOTOP {USB_DEVICE(0x08f2, 0x007f), .driver_info = DEVTYPE_GOTOP}, {USB_DEVICE(0x08f2, 0x00ce), .driver_info = DEVTYPE_GOTOP}, {USB_DEVICE(0x08f2, 0x00f4), .driver_info = DEVTYPE_GOTOP}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_JASTEC {USB_DEVICE(0x0f92, 0x0001), .driver_info = DEVTYPE_JASTEC}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_E2I {USB_DEVICE(0x1ac7, 0x0001), .driver_info = DEVTYPE_E2I}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ZYTRONIC {USB_DEVICE(0x14c8, 0x0003), .driver_info = DEVTYPE_ZYTRONIC}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ETT_TC45USB /* TC5UH */ {USB_DEVICE(0x0664, 0x0309), .driver_info = DEVTYPE_TC45USB}, /* TC4UM */ {USB_DEVICE(0x0664, 0x0306), .driver_info = DEVTYPE_TC45USB}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_NEXIO /* data interface only */ {USB_DEVICE_AND_INTERFACE_INFO(0x10f0, 0x2002, 0x0a, 0x00, 0x00), .driver_info = DEVTYPE_NEXIO}, {USB_DEVICE_AND_INTERFACE_INFO(0x1870, 0x0001, 0x0a, 0x00, 0x00), .driver_info = DEVTYPE_NEXIO}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ELO {USB_DEVICE(0x04e7, 0x0020), .driver_info = DEVTYPE_ELO}, #endif #ifdef CONFIG_TOUCHSCREEN_USB_EASYTOUCH {USB_DEVICE(0x7374, 0x0001), .driver_info = DEVTYPE_ETOUCH}, #endif {} }; /***************************************************************************** * e2i Part */ #ifdef CONFIG_TOUCHSCREEN_USB_E2I static int e2i_init(struct usbtouch_usb *usbtouch) { int ret; struct usb_device *udev = interface_to_usbdev(usbtouch->interface); ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0x01, 0x02, 0x0000, 0x0081, NULL, 0, USB_CTRL_SET_TIMEOUT); dev_dbg(&usbtouch->interface->dev, "%s - usb_control_msg - E2I_RESET - bytes|err: %d\n", __func__, ret); return ret; } static int e2i_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { int tmp = (pkt[0] << 8) | pkt[1]; dev->x = (pkt[2] << 8) | pkt[3]; dev->y = (pkt[4] << 8) | pkt[5]; tmp = tmp - 0xA000; dev->touch = (tmp > 0); dev->press = (tmp > 0 ? tmp : 0); return 1; } #endif /***************************************************************************** * eGalax part */ #ifdef CONFIG_TOUCHSCREEN_USB_EGALAX #ifndef MULTI_PACKET #define MULTI_PACKET #endif #define EGALAX_PKT_TYPE_MASK 0xFE #define EGALAX_PKT_TYPE_REPT 0x80 #define EGALAX_PKT_TYPE_DIAG 0x0A static int egalax_init(struct usbtouch_usb *usbtouch) { int ret, i; unsigned char *buf; struct usb_device *udev = interface_to_usbdev(usbtouch->interface); /* * An eGalax diagnostic packet kicks the device into using the right * protocol. We send a "check active" packet. The response will be * read later and ignored. */ buf = kmalloc(3, GFP_KERNEL); if (!buf) return -ENOMEM; buf[0] = EGALAX_PKT_TYPE_DIAG; buf[1] = 1; /* length */ buf[2] = 'A'; /* command - check active */ for (i = 0; i < 3; i++) { ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), 0, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, buf, 3, USB_CTRL_SET_TIMEOUT); if (ret >= 0) { ret = 0; break; } if (ret != -EPIPE) break; } kfree(buf); return ret; } static int egalax_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { if ((pkt[0] & EGALAX_PKT_TYPE_MASK) != EGALAX_PKT_TYPE_REPT) return 0; dev->x = ((pkt[3] & 0x0F) << 7) | (pkt[4] & 0x7F); dev->y = ((pkt[1] & 0x0F) << 7) | (pkt[2] & 0x7F); dev->touch = pkt[0] & 0x01; return 1; } static int egalax_get_pkt_len(unsigned char *buf, int len) { switch (buf[0] & EGALAX_PKT_TYPE_MASK) { case EGALAX_PKT_TYPE_REPT: return 5; case EGALAX_PKT_TYPE_DIAG: if (len < 2) return -1; return buf[1] + 2; } return 0; } #endif /***************************************************************************** * EasyTouch part */ #ifdef CONFIG_TOUCHSCREEN_USB_EASYTOUCH #ifndef MULTI_PACKET #define MULTI_PACKET #endif #define ETOUCH_PKT_TYPE_MASK 0xFE #define ETOUCH_PKT_TYPE_REPT 0x80 #define ETOUCH_PKT_TYPE_REPT2 0xB0 #define ETOUCH_PKT_TYPE_DIAG 0x0A static int etouch_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { if ((pkt[0] & ETOUCH_PKT_TYPE_MASK) != ETOUCH_PKT_TYPE_REPT && (pkt[0] & ETOUCH_PKT_TYPE_MASK) != ETOUCH_PKT_TYPE_REPT2) return 0; dev->x = ((pkt[1] & 0x1F) << 7) | (pkt[2] & 0x7F); dev->y = ((pkt[3] & 0x1F) << 7) | (pkt[4] & 0x7F); dev->touch = pkt[0] & 0x01; return 1; } static int etouch_get_pkt_len(unsigned char *buf, int len) { switch (buf[0] & ETOUCH_PKT_TYPE_MASK) { case ETOUCH_PKT_TYPE_REPT: case ETOUCH_PKT_TYPE_REPT2: return 5; case ETOUCH_PKT_TYPE_DIAG: if (len < 2) return -1; return buf[1] + 2; } return 0; } #endif /***************************************************************************** * PanJit Part */ #ifdef CONFIG_TOUCHSCREEN_USB_PANJIT static int panjit_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = ((pkt[2] & 0x0F) << 8) | pkt[1]; dev->y = ((pkt[4] & 0x0F) << 8) | pkt[3]; dev->touch = pkt[0] & 0x01; return 1; } #endif /***************************************************************************** * 3M/Microtouch Part */ #ifdef CONFIG_TOUCHSCREEN_USB_3M #define MTOUCHUSB_ASYNC_REPORT 1 #define MTOUCHUSB_RESET 7 #define MTOUCHUSB_REQ_CTRLLR_ID 10 #define MTOUCHUSB_REQ_CTRLLR_ID_LEN 16 static int mtouch_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { if (hwcalib_xy) { dev->x = (pkt[4] << 8) | pkt[3]; dev->y = 0xffff - ((pkt[6] << 8) | pkt[5]); } else { dev->x = (pkt[8] << 8) | pkt[7]; dev->y = (pkt[10] << 8) | pkt[9]; } dev->touch = (pkt[2] & 0x40) ? 1 : 0; return 1; } struct mtouch_priv { u8 fw_rev_major; u8 fw_rev_minor; }; static ssize_t mtouch_firmware_rev_show(struct device *dev, struct device_attribute *attr, char *output) { struct usb_interface *intf = to_usb_interface(dev); struct usbtouch_usb *usbtouch = usb_get_intfdata(intf); struct mtouch_priv *priv = usbtouch->priv; return scnprintf(output, PAGE_SIZE, "%1x.%1x\n", priv->fw_rev_major, priv->fw_rev_minor); } static DEVICE_ATTR(firmware_rev, 0444, mtouch_firmware_rev_show, NULL); static struct attribute *mtouch_attrs[] = { &dev_attr_firmware_rev.attr, NULL }; static const struct attribute_group mtouch_attr_group = { .attrs = mtouch_attrs, }; static int mtouch_get_fw_revision(struct usbtouch_usb *usbtouch) { struct usb_device *udev = interface_to_usbdev(usbtouch->interface); struct mtouch_priv *priv = usbtouch->priv; u8 *buf; int ret; buf = kzalloc(MTOUCHUSB_REQ_CTRLLR_ID_LEN, GFP_NOIO); if (!buf) return -ENOMEM; ret = usb_control_msg(udev, usb_rcvctrlpipe(udev, 0), MTOUCHUSB_REQ_CTRLLR_ID, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, buf, MTOUCHUSB_REQ_CTRLLR_ID_LEN, USB_CTRL_SET_TIMEOUT); if (ret != MTOUCHUSB_REQ_CTRLLR_ID_LEN) { dev_warn(&usbtouch->interface->dev, "Failed to read FW rev: %d\n", ret); ret = ret < 0 ? ret : -EIO; goto free; } priv->fw_rev_major = buf[3]; priv->fw_rev_minor = buf[4]; ret = 0; free: kfree(buf); return ret; } static int mtouch_alloc(struct usbtouch_usb *usbtouch) { int ret; usbtouch->priv = kmalloc(sizeof(struct mtouch_priv), GFP_KERNEL); if (!usbtouch->priv) return -ENOMEM; ret = sysfs_create_group(&usbtouch->interface->dev.kobj, &mtouch_attr_group); if (ret) { kfree(usbtouch->priv); usbtouch->priv = NULL; return ret; } return 0; } static int mtouch_init(struct usbtouch_usb *usbtouch) { int ret, i; struct usb_device *udev = interface_to_usbdev(usbtouch->interface); ret = mtouch_get_fw_revision(usbtouch); if (ret) return ret; ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), MTOUCHUSB_RESET, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 1, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); dev_dbg(&usbtouch->interface->dev, "%s - usb_control_msg - MTOUCHUSB_RESET - bytes|err: %d\n", __func__, ret); if (ret < 0) return ret; msleep(150); for (i = 0; i < 3; i++) { ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), MTOUCHUSB_ASYNC_REPORT, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 1, 1, NULL, 0, USB_CTRL_SET_TIMEOUT); dev_dbg(&usbtouch->interface->dev, "%s - usb_control_msg - MTOUCHUSB_ASYNC_REPORT - bytes|err: %d\n", __func__, ret); if (ret >= 0) break; if (ret != -EPIPE) return ret; } /* Default min/max xy are the raw values, override if using hw-calib */ if (hwcalib_xy) { input_set_abs_params(usbtouch->input, ABS_X, 0, 0xffff, 0, 0); input_set_abs_params(usbtouch->input, ABS_Y, 0, 0xffff, 0, 0); } return 0; } static void mtouch_exit(struct usbtouch_usb *usbtouch) { struct mtouch_priv *priv = usbtouch->priv; sysfs_remove_group(&usbtouch->interface->dev.kobj, &mtouch_attr_group); kfree(priv); } #endif /***************************************************************************** * ITM Part */ #ifdef CONFIG_TOUCHSCREEN_USB_ITM static int itm_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { int touch; /* * ITM devices report invalid x/y data if not touched. * if the screen was touched before but is not touched any more * report touch as 0 with the last valid x/y data once. then stop * reporting data until touched again. */ dev->press = ((pkt[2] & 0x01) << 7) | (pkt[5] & 0x7F); touch = ~pkt[7] & 0x20; if (!touch) { if (dev->touch) { dev->touch = 0; return 1; } return 0; } dev->x = ((pkt[0] & 0x1F) << 7) | (pkt[3] & 0x7F); dev->y = ((pkt[1] & 0x1F) << 7) | (pkt[4] & 0x7F); dev->touch = touch; return 1; } #endif /***************************************************************************** * eTurboTouch part */ #ifdef CONFIG_TOUCHSCREEN_USB_ETURBO #ifndef MULTI_PACKET #define MULTI_PACKET #endif static int eturbo_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { unsigned int shift; /* packets should start with sync */ if (!(pkt[0] & 0x80)) return 0; shift = (6 - (pkt[0] & 0x03)); dev->x = ((pkt[3] << 7) | pkt[4]) >> shift; dev->y = ((pkt[1] << 7) | pkt[2]) >> shift; dev->touch = (pkt[0] & 0x10) ? 1 : 0; return 1; } static int eturbo_get_pkt_len(unsigned char *buf, int len) { if (buf[0] & 0x80) return 5; if (buf[0] == 0x01) return 3; return 0; } #endif /***************************************************************************** * Gunze part */ #ifdef CONFIG_TOUCHSCREEN_USB_GUNZE static int gunze_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { if (!(pkt[0] & 0x80) || ((pkt[1] | pkt[2] | pkt[3]) & 0x80)) return 0; dev->x = ((pkt[0] & 0x1F) << 7) | (pkt[2] & 0x7F); dev->y = ((pkt[1] & 0x1F) << 7) | (pkt[3] & 0x7F); dev->touch = pkt[0] & 0x20; return 1; } #endif /***************************************************************************** * DMC TSC-10/25 Part * * Documentation about the controller and it's protocol can be found at * http://www.dmccoltd.com/files/controler/tsc10usb_pi_e.pdf * http://www.dmccoltd.com/files/controler/tsc25_usb_e.pdf */ #ifdef CONFIG_TOUCHSCREEN_USB_DMC_TSC10 /* supported data rates. currently using 130 */ #define TSC10_RATE_POINT 0x50 #define TSC10_RATE_30 0x40 #define TSC10_RATE_50 0x41 #define TSC10_RATE_80 0x42 #define TSC10_RATE_100 0x43 #define TSC10_RATE_130 0x44 #define TSC10_RATE_150 0x45 /* commands */ #define TSC10_CMD_RESET 0x55 #define TSC10_CMD_RATE 0x05 #define TSC10_CMD_DATA1 0x01 static int dmc_tsc10_init(struct usbtouch_usb *usbtouch) { struct usb_device *dev = interface_to_usbdev(usbtouch->interface); int ret = -ENOMEM; unsigned char *buf; buf = kmalloc(2, GFP_NOIO); if (!buf) goto err_nobuf; /* reset */ buf[0] = buf[1] = 0xFF; ret = usb_control_msg(dev, usb_rcvctrlpipe (dev, 0), TSC10_CMD_RESET, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, buf, 2, USB_CTRL_SET_TIMEOUT); if (ret < 0) goto err_out; if (buf[0] != 0x06) { ret = -ENODEV; goto err_out; } /* TSC-25 data sheet specifies a delay after the RESET command */ msleep(150); /* set coordinate output rate */ buf[0] = buf[1] = 0xFF; ret = usb_control_msg(dev, usb_rcvctrlpipe (dev, 0), TSC10_CMD_RATE, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_DEVICE, TSC10_RATE_150, 0, buf, 2, USB_CTRL_SET_TIMEOUT); if (ret < 0) goto err_out; if ((buf[0] != 0x06) && (buf[0] != 0x15 || buf[1] != 0x01)) { ret = -ENODEV; goto err_out; } /* start sending data */ ret = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), TSC10_CMD_DATA1, USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_DEVICE, 0, 0, NULL, 0, USB_CTRL_SET_TIMEOUT); err_out: kfree(buf); err_nobuf: return ret; } static int dmc_tsc10_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = ((pkt[2] & 0x03) << 8) | pkt[1]; dev->y = ((pkt[4] & 0x03) << 8) | pkt[3]; dev->touch = pkt[0] & 0x01; return 1; } #endif /***************************************************************************** * IRTOUCH Part */ #ifdef CONFIG_TOUCHSCREEN_USB_IRTOUCH static int irtouch_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = (pkt[3] << 8) | pkt[2]; dev->y = (pkt[5] << 8) | pkt[4]; dev->touch = (pkt[1] & 0x03) ? 1 : 0; return 1; } #endif /***************************************************************************** * ET&T TC5UH/TC4UM part */ #ifdef CONFIG_TOUCHSCREEN_USB_ETT_TC45USB static int tc45usb_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = ((pkt[2] & 0x0F) << 8) | pkt[1]; dev->y = ((pkt[4] & 0x0F) << 8) | pkt[3]; dev->touch = pkt[0] & 0x01; return 1; } #endif /***************************************************************************** * IdealTEK URTC1000 Part */ #ifdef CONFIG_TOUCHSCREEN_USB_IDEALTEK #ifndef MULTI_PACKET #define MULTI_PACKET #endif static int idealtek_get_pkt_len(unsigned char *buf, int len) { if (buf[0] & 0x80) return 5; if (buf[0] == 0x01) return len; return 0; } static int idealtek_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { switch (pkt[0] & 0x98) { case 0x88: /* touch data in IdealTEK mode */ dev->x = (pkt[1] << 5) | (pkt[2] >> 2); dev->y = (pkt[3] << 5) | (pkt[4] >> 2); dev->touch = (pkt[0] & 0x40) ? 1 : 0; return 1; case 0x98: /* touch data in MT emulation mode */ dev->x = (pkt[2] << 5) | (pkt[1] >> 2); dev->y = (pkt[4] << 5) | (pkt[3] >> 2); dev->touch = (pkt[0] & 0x40) ? 1 : 0; return 1; default: return 0; } } #endif /***************************************************************************** * General Touch Part */ #ifdef CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH static int general_touch_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = (pkt[2] << 8) | pkt[1]; dev->y = (pkt[4] << 8) | pkt[3]; dev->press = pkt[5] & 0xff; dev->touch = pkt[0] & 0x01; return 1; } #endif /***************************************************************************** * GoTop Part */ #ifdef CONFIG_TOUCHSCREEN_USB_GOTOP static int gotop_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = ((pkt[1] & 0x38) << 4) | pkt[2]; dev->y = ((pkt[1] & 0x07) << 7) | pkt[3]; dev->touch = pkt[0] & 0x01; return 1; } #endif /***************************************************************************** * JASTEC Part */ #ifdef CONFIG_TOUCHSCREEN_USB_JASTEC static int jastec_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = ((pkt[0] & 0x3f) << 6) | (pkt[2] & 0x3f); dev->y = ((pkt[1] & 0x3f) << 6) | (pkt[3] & 0x3f); dev->touch = (pkt[0] & 0x40) >> 6; return 1; } #endif /***************************************************************************** * Zytronic Part */ #ifdef CONFIG_TOUCHSCREEN_USB_ZYTRONIC static int zytronic_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { struct usb_interface *intf = dev->interface; switch (pkt[0]) { case 0x3A: /* command response */ dev_dbg(&intf->dev, "%s: Command response %d\n", __func__, pkt[1]); break; case 0xC0: /* down */ dev->x = (pkt[1] & 0x7f) | ((pkt[2] & 0x07) << 7); dev->y = (pkt[3] & 0x7f) | ((pkt[4] & 0x07) << 7); dev->touch = 1; dev_dbg(&intf->dev, "%s: down %d,%d\n", __func__, dev->x, dev->y); return 1; case 0x80: /* up */ dev->x = (pkt[1] & 0x7f) | ((pkt[2] & 0x07) << 7); dev->y = (pkt[3] & 0x7f) | ((pkt[4] & 0x07) << 7); dev->touch = 0; dev_dbg(&intf->dev, "%s: up %d,%d\n", __func__, dev->x, dev->y); return 1; default: dev_dbg(&intf->dev, "%s: Unknown return %d\n", __func__, pkt[0]); break; } return 0; } #endif /***************************************************************************** * NEXIO Part */ #ifdef CONFIG_TOUCHSCREEN_USB_NEXIO #define NEXIO_TIMEOUT 5000 #define NEXIO_BUFSIZE 1024 #define NEXIO_THRESHOLD 50 struct nexio_priv { struct urb *ack; unsigned char *ack_buf; }; struct nexio_touch_packet { u8 flags; /* 0xe1 = touch, 0xe1 = release */ __be16 data_len; /* total bytes of touch data */ __be16 x_len; /* bytes for X axis */ __be16 y_len; /* bytes for Y axis */ u8 data[]; } __attribute__ ((packed)); static unsigned char nexio_ack_pkt[2] = { 0xaa, 0x02 }; static unsigned char nexio_init_pkt[4] = { 0x82, 0x04, 0x0a, 0x0f }; static void nexio_ack_complete(struct urb *urb) { } static int nexio_alloc(struct usbtouch_usb *usbtouch) { struct nexio_priv *priv; int ret = -ENOMEM; usbtouch->priv = kmalloc(sizeof(struct nexio_priv), GFP_KERNEL); if (!usbtouch->priv) goto out_buf; priv = usbtouch->priv; priv->ack_buf = kmemdup(nexio_ack_pkt, sizeof(nexio_ack_pkt), GFP_KERNEL); if (!priv->ack_buf) goto err_priv; priv->ack = usb_alloc_urb(0, GFP_KERNEL); if (!priv->ack) { dev_dbg(&usbtouch->interface->dev, "%s - usb_alloc_urb failed: usbtouch->ack\n", __func__); goto err_ack_buf; } return 0; err_ack_buf: kfree(priv->ack_buf); err_priv: kfree(priv); out_buf: return ret; } static int nexio_init(struct usbtouch_usb *usbtouch) { struct usb_device *dev = interface_to_usbdev(usbtouch->interface); struct usb_host_interface *interface = usbtouch->interface->cur_altsetting; struct nexio_priv *priv = usbtouch->priv; int ret = -ENOMEM; int actual_len, i; unsigned char *buf; char *firmware_ver = NULL, *device_name = NULL; int input_ep = 0, output_ep = 0; /* find first input and output endpoint */ for (i = 0; i < interface->desc.bNumEndpoints; i++) { if (!input_ep && usb_endpoint_dir_in(&interface->endpoint[i].desc)) input_ep = interface->endpoint[i].desc.bEndpointAddress; if (!output_ep && usb_endpoint_dir_out(&interface->endpoint[i].desc)) output_ep = interface->endpoint[i].desc.bEndpointAddress; } if (!input_ep || !output_ep) return -ENXIO; buf = kmalloc(NEXIO_BUFSIZE, GFP_NOIO); if (!buf) goto out_buf; /* two empty reads */ for (i = 0; i < 2; i++) { ret = usb_bulk_msg(dev, usb_rcvbulkpipe(dev, input_ep), buf, NEXIO_BUFSIZE, &actual_len, NEXIO_TIMEOUT); if (ret < 0) goto out_buf; } /* send init command */ memcpy(buf, nexio_init_pkt, sizeof(nexio_init_pkt)); ret = usb_bulk_msg(dev, usb_sndbulkpipe(dev, output_ep), buf, sizeof(nexio_init_pkt), &actual_len, NEXIO_TIMEOUT); if (ret < 0) goto out_buf; /* read replies */ for (i = 0; i < 3; i++) { memset(buf, 0, NEXIO_BUFSIZE); ret = usb_bulk_msg(dev, usb_rcvbulkpipe(dev, input_ep), buf, NEXIO_BUFSIZE, &actual_len, NEXIO_TIMEOUT); if (ret < 0 || actual_len < 1 || buf[1] != actual_len) continue; switch (buf[0]) { case 0x83: /* firmware version */ if (!firmware_ver) firmware_ver = kstrdup(&buf[2], GFP_NOIO); break; case 0x84: /* device name */ if (!device_name) device_name = kstrdup(&buf[2], GFP_NOIO); break; } } printk(KERN_INFO "Nexio device: %s, firmware version: %s\n", device_name, firmware_ver); kfree(firmware_ver); kfree(device_name); usb_fill_bulk_urb(priv->ack, dev, usb_sndbulkpipe(dev, output_ep), priv->ack_buf, sizeof(nexio_ack_pkt), nexio_ack_complete, usbtouch); ret = 0; out_buf: kfree(buf); return ret; } static void nexio_exit(struct usbtouch_usb *usbtouch) { struct nexio_priv *priv = usbtouch->priv; usb_kill_urb(priv->ack); usb_free_urb(priv->ack); kfree(priv->ack_buf); kfree(priv); } static int nexio_read_data(struct usbtouch_usb *usbtouch, unsigned char *pkt) { struct device *dev = &usbtouch->interface->dev; struct nexio_touch_packet *packet = (void *) pkt; struct nexio_priv *priv = usbtouch->priv; unsigned int data_len = be16_to_cpu(packet->data_len); unsigned int x_len = be16_to_cpu(packet->x_len); unsigned int y_len = be16_to_cpu(packet->y_len); int x, y, begin_x, begin_y, end_x, end_y, w, h, ret; /* got touch data? */ if ((pkt[0] & 0xe0) != 0xe0) return 0; if (data_len > 0xff) data_len -= 0x100; if (x_len > 0xff) x_len -= 0x80; /* send ACK */ ret = usb_submit_urb(priv->ack, GFP_ATOMIC); if (ret) dev_warn(dev, "Failed to submit ACK URB: %d\n", ret); if (!usbtouch->type->max_xc) { usbtouch->type->max_xc = 2 * x_len; input_set_abs_params(usbtouch->input, ABS_X, 0, usbtouch->type->max_xc, 0, 0); usbtouch->type->max_yc = 2 * y_len; input_set_abs_params(usbtouch->input, ABS_Y, 0, usbtouch->type->max_yc, 0, 0); } /* * The device reports state of IR sensors on X and Y axes. * Each byte represents "darkness" percentage (0-100) of one element. * 17" touchscreen reports only 64 x 52 bytes so the resolution is low. * This also means that there's a limited multi-touch capability but * it's disabled (and untested) here as there's no X driver for that. */ begin_x = end_x = begin_y = end_y = -1; for (x = 0; x < x_len; x++) { if (begin_x == -1 && packet->data[x] > NEXIO_THRESHOLD) { begin_x = x; continue; } if (end_x == -1 && begin_x != -1 && packet->data[x] < NEXIO_THRESHOLD) { end_x = x - 1; for (y = x_len; y < data_len; y++) { if (begin_y == -1 && packet->data[y] > NEXIO_THRESHOLD) { begin_y = y - x_len; continue; } if (end_y == -1 && begin_y != -1 && packet->data[y] < NEXIO_THRESHOLD) { end_y = y - 1 - x_len; w = end_x - begin_x; h = end_y - begin_y; #if 0 /* multi-touch */ input_report_abs(usbtouch->input, ABS_MT_TOUCH_MAJOR, max(w,h)); input_report_abs(usbtouch->input, ABS_MT_TOUCH_MINOR, min(x,h)); input_report_abs(usbtouch->input, ABS_MT_POSITION_X, 2*begin_x+w); input_report_abs(usbtouch->input, ABS_MT_POSITION_Y, 2*begin_y+h); input_report_abs(usbtouch->input, ABS_MT_ORIENTATION, w > h); input_mt_sync(usbtouch->input); #endif /* single touch */ usbtouch->x = 2 * begin_x + w; usbtouch->y = 2 * begin_y + h; usbtouch->touch = packet->flags & 0x01; begin_y = end_y = -1; return 1; } } begin_x = end_x = -1; } } return 0; } #endif /***************************************************************************** * ELO part */ #ifdef CONFIG_TOUCHSCREEN_USB_ELO static int elo_read_data(struct usbtouch_usb *dev, unsigned char *pkt) { dev->x = (pkt[3] << 8) | pkt[2]; dev->y = (pkt[5] << 8) | pkt[4]; dev->touch = pkt[6] > 0; dev->press = pkt[6]; return 1; } #endif /***************************************************************************** * the different device descriptors */ #ifdef MULTI_PACKET static void usbtouch_process_multi(struct usbtouch_usb *usbtouch, unsigned char *pkt, int len); #endif static struct usbtouch_device_info usbtouch_dev_info[] = { #ifdef CONFIG_TOUCHSCREEN_USB_ELO [DEVTYPE_ELO] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .max_press = 0xff, .rept_size = 8, .read_data = elo_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_EGALAX [DEVTYPE_EGALAX] = { .min_xc = 0x0, .max_xc = 0x07ff, .min_yc = 0x0, .max_yc = 0x07ff, .rept_size = 16, .process_pkt = usbtouch_process_multi, .get_pkt_len = egalax_get_pkt_len, .read_data = egalax_read_data, .init = egalax_init, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_PANJIT [DEVTYPE_PANJIT] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .rept_size = 8, .read_data = panjit_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_3M [DEVTYPE_3M] = { .min_xc = 0x0, .max_xc = 0x4000, .min_yc = 0x0, .max_yc = 0x4000, .rept_size = 11, .read_data = mtouch_read_data, .alloc = mtouch_alloc, .init = mtouch_init, .exit = mtouch_exit, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ITM [DEVTYPE_ITM] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .max_press = 0xff, .rept_size = 8, .read_data = itm_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ETURBO [DEVTYPE_ETURBO] = { .min_xc = 0x0, .max_xc = 0x07ff, .min_yc = 0x0, .max_yc = 0x07ff, .rept_size = 8, .process_pkt = usbtouch_process_multi, .get_pkt_len = eturbo_get_pkt_len, .read_data = eturbo_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_GUNZE [DEVTYPE_GUNZE] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .rept_size = 4, .read_data = gunze_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_DMC_TSC10 [DEVTYPE_DMC_TSC10] = { .min_xc = 0x0, .max_xc = 0x03ff, .min_yc = 0x0, .max_yc = 0x03ff, .rept_size = 5, .init = dmc_tsc10_init, .read_data = dmc_tsc10_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_IRTOUCH [DEVTYPE_IRTOUCH] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .rept_size = 8, .read_data = irtouch_read_data, }, [DEVTYPE_IRTOUCH_HIRES] = { .min_xc = 0x0, .max_xc = 0x7fff, .min_yc = 0x0, .max_yc = 0x7fff, .rept_size = 8, .read_data = irtouch_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_IDEALTEK [DEVTYPE_IDEALTEK] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .rept_size = 8, .process_pkt = usbtouch_process_multi, .get_pkt_len = idealtek_get_pkt_len, .read_data = idealtek_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH [DEVTYPE_GENERAL_TOUCH] = { .min_xc = 0x0, .max_xc = 0x7fff, .min_yc = 0x0, .max_yc = 0x7fff, .rept_size = 7, .read_data = general_touch_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_GOTOP [DEVTYPE_GOTOP] = { .min_xc = 0x0, .max_xc = 0x03ff, .min_yc = 0x0, .max_yc = 0x03ff, .rept_size = 4, .read_data = gotop_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_JASTEC [DEVTYPE_JASTEC] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .rept_size = 4, .read_data = jastec_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_E2I [DEVTYPE_E2I] = { .min_xc = 0x0, .max_xc = 0x7fff, .min_yc = 0x0, .max_yc = 0x7fff, .rept_size = 6, .init = e2i_init, .read_data = e2i_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ZYTRONIC [DEVTYPE_ZYTRONIC] = { .min_xc = 0x0, .max_xc = 0x03ff, .min_yc = 0x0, .max_yc = 0x03ff, .rept_size = 5, .read_data = zytronic_read_data, .irq_always = true, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_ETT_TC45USB [DEVTYPE_TC45USB] = { .min_xc = 0x0, .max_xc = 0x0fff, .min_yc = 0x0, .max_yc = 0x0fff, .rept_size = 5, .read_data = tc45usb_read_data, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_NEXIO [DEVTYPE_NEXIO] = { .rept_size = 1024, .irq_always = true, .read_data = nexio_read_data, .alloc = nexio_alloc, .init = nexio_init, .exit = nexio_exit, }, #endif #ifdef CONFIG_TOUCHSCREEN_USB_EASYTOUCH [DEVTYPE_ETOUCH] = { .min_xc = 0x0, .max_xc = 0x07ff, .min_yc = 0x0, .max_yc = 0x07ff, .rept_size = 16, .process_pkt = usbtouch_process_multi, .get_pkt_len = etouch_get_pkt_len, .read_data = etouch_read_data, }, #endif }; /***************************************************************************** * Generic Part */ static void usbtouch_process_pkt(struct usbtouch_usb *usbtouch, unsigned char *pkt, int len) { struct usbtouch_device_info *type = usbtouch->type; if (!type->read_data(usbtouch, pkt)) return; input_report_key(usbtouch->input, BTN_TOUCH, usbtouch->touch); if (swap_xy) { input_report_abs(usbtouch->input, ABS_X, usbtouch->y); input_report_abs(usbtouch->input, ABS_Y, usbtouch->x); } else { input_report_abs(usbtouch->input, ABS_X, usbtouch->x); input_report_abs(usbtouch->input, ABS_Y, usbtouch->y); } if (type->max_press) input_report_abs(usbtouch->input, ABS_PRESSURE, usbtouch->press); input_sync(usbtouch->input); } #ifdef MULTI_PACKET static void usbtouch_process_multi(struct usbtouch_usb *usbtouch, unsigned char *pkt, int len) { unsigned char *buffer; int pkt_len, pos, buf_len, tmp; /* process buffer */ if (unlikely(usbtouch->buf_len)) { /* try to get size */ pkt_len = usbtouch->type->get_pkt_len( usbtouch->buffer, usbtouch->buf_len); /* drop? */ if (unlikely(!pkt_len)) goto out_flush_buf; /* need to append -pkt_len bytes before able to get size */ if (unlikely(pkt_len < 0)) { int append = -pkt_len; if (unlikely(append > len)) append = len; if (usbtouch->buf_len + append >= usbtouch->type->rept_size) goto out_flush_buf; memcpy(usbtouch->buffer + usbtouch->buf_len, pkt, append); usbtouch->buf_len += append; pkt_len = usbtouch->type->get_pkt_len( usbtouch->buffer, usbtouch->buf_len); if (pkt_len < 0) return; } /* append */ tmp = pkt_len - usbtouch->buf_len; if (usbtouch->buf_len + tmp >= usbtouch->type->rept_size) goto out_flush_buf; memcpy(usbtouch->buffer + usbtouch->buf_len, pkt, tmp); usbtouch_process_pkt(usbtouch, usbtouch->buffer, pkt_len); buffer = pkt + tmp; buf_len = len - tmp; } else { buffer = pkt; buf_len = len; } /* loop over the received packet, process */ pos = 0; while (pos < buf_len) { /* get packet len */ pkt_len = usbtouch->type->get_pkt_len(buffer + pos, buf_len - pos); /* unknown packet: skip one byte */ if (unlikely(!pkt_len)) { pos++; continue; } /* full packet: process */ if (likely((pkt_len > 0) && (pkt_len <= buf_len - pos))) { usbtouch_process_pkt(usbtouch, buffer + pos, pkt_len); } else { /* incomplete packet: save in buffer */ memcpy(usbtouch->buffer, buffer + pos, buf_len - pos); usbtouch->buf_len = buf_len - pos; return; } pos += pkt_len; } out_flush_buf: usbtouch->buf_len = 0; return; } #endif static void usbtouch_irq(struct urb *urb) { struct usbtouch_usb *usbtouch = urb->context; struct device *dev = &usbtouch->interface->dev; int retval; switch (urb->status) { case 0: /* success */ break; case -ETIME: /* this urb is timing out */ dev_dbg(dev, "%s - urb timed out - was the device unplugged?\n", __func__); return; case -ECONNRESET: case -ENOENT: case -ESHUTDOWN: case -EPIPE: /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, urb->status); return; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, urb->status); goto exit; } usbtouch->type->process_pkt(usbtouch, usbtouch->data, urb->actual_length); exit: usb_mark_last_busy(interface_to_usbdev(usbtouch->interface)); retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval) dev_err(dev, "%s - usb_submit_urb failed with result: %d\n", __func__, retval); } static int usbtouch_open(struct input_dev *input) { struct usbtouch_usb *usbtouch = input_get_drvdata(input); int r; usbtouch->irq->dev = interface_to_usbdev(usbtouch->interface); r = usb_autopm_get_interface(usbtouch->interface) ? -EIO : 0; if (r < 0) goto out; mutex_lock(&usbtouch->pm_mutex); if (!usbtouch->type->irq_always) { if (usb_submit_urb(usbtouch->irq, GFP_KERNEL)) { r = -EIO; goto out_put; } } usbtouch->interface->needs_remote_wakeup = 1; usbtouch->is_open = true; out_put: mutex_unlock(&usbtouch->pm_mutex); usb_autopm_put_interface(usbtouch->interface); out: return r; } static void usbtouch_close(struct input_dev *input) { struct usbtouch_usb *usbtouch = input_get_drvdata(input); int r; mutex_lock(&usbtouch->pm_mutex); if (!usbtouch->type->irq_always) usb_kill_urb(usbtouch->irq); usbtouch->is_open = false; mutex_unlock(&usbtouch->pm_mutex); r = usb_autopm_get_interface(usbtouch->interface); usbtouch->interface->needs_remote_wakeup = 0; if (!r) usb_autopm_put_interface(usbtouch->interface); } static int usbtouch_suspend (struct usb_interface *intf, pm_message_t message) { struct usbtouch_usb *usbtouch = usb_get_intfdata(intf); usb_kill_urb(usbtouch->irq); return 0; } static int usbtouch_resume(struct usb_interface *intf) { struct usbtouch_usb *usbtouch = usb_get_intfdata(intf); int result = 0; mutex_lock(&usbtouch->pm_mutex); if (usbtouch->is_open || usbtouch->type->irq_always) result = usb_submit_urb(usbtouch->irq, GFP_NOIO); mutex_unlock(&usbtouch->pm_mutex); return result; } static int usbtouch_reset_resume(struct usb_interface *intf) { struct usbtouch_usb *usbtouch = usb_get_intfdata(intf); int err = 0; /* reinit the device */ if (usbtouch->type->init) { err = usbtouch->type->init(usbtouch); if (err) { dev_dbg(&intf->dev, "%s - type->init() failed, err: %d\n", __func__, err); return err; } } /* restart IO if needed */ mutex_lock(&usbtouch->pm_mutex); if (usbtouch->is_open) err = usb_submit_urb(usbtouch->irq, GFP_NOIO); mutex_unlock(&usbtouch->pm_mutex); return err; } static void usbtouch_free_buffers(struct usb_device *udev, struct usbtouch_usb *usbtouch) { usb_free_coherent(udev, usbtouch->data_size, usbtouch->data, usbtouch->data_dma); kfree(usbtouch->buffer); } static struct usb_endpoint_descriptor * usbtouch_get_input_endpoint(struct usb_host_interface *interface) { int i; for (i = 0; i < interface->desc.bNumEndpoints; i++) if (usb_endpoint_dir_in(&interface->endpoint[i].desc)) return &interface->endpoint[i].desc; return NULL; } static int usbtouch_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usbtouch_usb *usbtouch; struct input_dev *input_dev; struct usb_endpoint_descriptor *endpoint; struct usb_device *udev = interface_to_usbdev(intf); struct usbtouch_device_info *type; int err = -ENOMEM; /* some devices are ignored */ if (id->driver_info == DEVTYPE_IGNORE) return -ENODEV; if (id->driver_info >= ARRAY_SIZE(usbtouch_dev_info)) return -ENODEV; endpoint = usbtouch_get_input_endpoint(intf->cur_altsetting); if (!endpoint) return -ENXIO; usbtouch = kzalloc(sizeof(struct usbtouch_usb), GFP_KERNEL); input_dev = input_allocate_device(); if (!usbtouch || !input_dev) goto out_free; mutex_init(&usbtouch->pm_mutex); type = &usbtouch_dev_info[id->driver_info]; usbtouch->type = type; if (!type->process_pkt) type->process_pkt = usbtouch_process_pkt; usbtouch->data_size = type->rept_size; if (type->get_pkt_len) { /* * When dealing with variable-length packets we should * not request more than wMaxPacketSize bytes at once * as we do not know if there is more data coming or * we filled exactly wMaxPacketSize bytes and there is * nothing else. */ usbtouch->data_size = min(usbtouch->data_size, usb_endpoint_maxp(endpoint)); } usbtouch->data = usb_alloc_coherent(udev, usbtouch->data_size, GFP_KERNEL, &usbtouch->data_dma); if (!usbtouch->data) goto out_free; if (type->get_pkt_len) { usbtouch->buffer = kmalloc(type->rept_size, GFP_KERNEL); if (!usbtouch->buffer) goto out_free_buffers; } usbtouch->irq = usb_alloc_urb(0, GFP_KERNEL); if (!usbtouch->irq) { dev_dbg(&intf->dev, "%s - usb_alloc_urb failed: usbtouch->irq\n", __func__); goto out_free_buffers; } usbtouch->interface = intf; usbtouch->input = input_dev; if (udev->manufacturer) strscpy(usbtouch->name, udev->manufacturer, sizeof(usbtouch->name)); if (udev->product) { if (udev->manufacturer) strlcat(usbtouch->name, " ", sizeof(usbtouch->name)); strlcat(usbtouch->name, udev->product, sizeof(usbtouch->name)); } if (!strlen(usbtouch->name)) snprintf(usbtouch->name, sizeof(usbtouch->name), "USB Touchscreen %04x:%04x", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); usb_make_path(udev, usbtouch->phys, sizeof(usbtouch->phys)); strlcat(usbtouch->phys, "/input0", sizeof(usbtouch->phys)); input_dev->name = usbtouch->name; input_dev->phys = usbtouch->phys; usb_to_input_id(udev, &input_dev->id); input_dev->dev.parent = &intf->dev; input_set_drvdata(input_dev, usbtouch); input_dev->open = usbtouch_open; input_dev->close = usbtouch_close; input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS); input_dev->keybit[BIT_WORD(BTN_TOUCH)] = BIT_MASK(BTN_TOUCH); input_set_abs_params(input_dev, ABS_X, type->min_xc, type->max_xc, 0, 0); input_set_abs_params(input_dev, ABS_Y, type->min_yc, type->max_yc, 0, 0); if (type->max_press) input_set_abs_params(input_dev, ABS_PRESSURE, type->min_press, type->max_press, 0, 0); if (usb_endpoint_type(endpoint) == USB_ENDPOINT_XFER_INT) usb_fill_int_urb(usbtouch->irq, udev, usb_rcvintpipe(udev, endpoint->bEndpointAddress), usbtouch->data, usbtouch->data_size, usbtouch_irq, usbtouch, endpoint->bInterval); else usb_fill_bulk_urb(usbtouch->irq, udev, usb_rcvbulkpipe(udev, endpoint->bEndpointAddress), usbtouch->data, usbtouch->data_size, usbtouch_irq, usbtouch); usbtouch->irq->dev = udev; usbtouch->irq->transfer_dma = usbtouch->data_dma; usbtouch->irq->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; /* device specific allocations */ if (type->alloc) { err = type->alloc(usbtouch); if (err) { dev_dbg(&intf->dev, "%s - type->alloc() failed, err: %d\n", __func__, err); goto out_free_urb; } } /* device specific initialisation*/ if (type->init) { err = type->init(usbtouch); if (err) { dev_dbg(&intf->dev, "%s - type->init() failed, err: %d\n", __func__, err); goto out_do_exit; } } err = input_register_device(usbtouch->input); if (err) { dev_dbg(&intf->dev, "%s - input_register_device failed, err: %d\n", __func__, err); goto out_do_exit; } usb_set_intfdata(intf, usbtouch); if (usbtouch->type->irq_always) { /* this can't fail */ usb_autopm_get_interface(intf); err = usb_submit_urb(usbtouch->irq, GFP_KERNEL); if (err) { usb_autopm_put_interface(intf); dev_err(&intf->dev, "%s - usb_submit_urb failed with result: %d\n", __func__, err); goto out_unregister_input; } } return 0; out_unregister_input: input_unregister_device(input_dev); input_dev = NULL; out_do_exit: if (type->exit) type->exit(usbtouch); out_free_urb: usb_free_urb(usbtouch->irq); out_free_buffers: usbtouch_free_buffers(udev, usbtouch); out_free: input_free_device(input_dev); kfree(usbtouch); return err; } static void usbtouch_disconnect(struct usb_interface *intf) { struct usbtouch_usb *usbtouch = usb_get_intfdata(intf); if (!usbtouch) return; dev_dbg(&intf->dev, "%s - usbtouch is initialized, cleaning up\n", __func__); usb_set_intfdata(intf, NULL); /* this will stop IO via close */ input_unregister_device(usbtouch->input); usb_free_urb(usbtouch->irq); if (usbtouch->type->exit) usbtouch->type->exit(usbtouch); usbtouch_free_buffers(interface_to_usbdev(intf), usbtouch); kfree(usbtouch); } MODULE_DEVICE_TABLE(usb, usbtouch_devices); static struct usb_driver usbtouch_driver = { .name = "usbtouchscreen", .probe = usbtouch_probe, .disconnect = usbtouch_disconnect, .suspend = usbtouch_suspend, .resume = usbtouch_resume, .reset_resume = usbtouch_reset_resume, .id_table = usbtouch_devices, .supports_autosuspend = 1, }; module_usb_driver(usbtouch_driver); MODULE_AUTHOR("Daniel Ritz <daniel.ritz@gmx.ch>"); MODULE_DESCRIPTION("USB Touchscreen Driver"); MODULE_LICENSE("GPL"); MODULE_ALIAS("touchkitusb"); MODULE_ALIAS("itmtouch"); MODULE_ALIAS("mtouchusb"); |
1918 1902 4 3 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Credential hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2018-2020 ANSSI */ #include <linux/cred.h> #include <linux/lsm_hooks.h> #include "common.h" #include "cred.h" #include "ruleset.h" #include "setup.h" static int hook_cred_prepare(struct cred *const new, const struct cred *const old, const gfp_t gfp) { struct landlock_ruleset *const old_dom = landlock_cred(old)->domain; if (old_dom) { landlock_get_ruleset(old_dom); landlock_cred(new)->domain = old_dom; } return 0; } static void hook_cred_free(struct cred *const cred) { struct landlock_ruleset *const dom = landlock_cred(cred)->domain; if (dom) landlock_put_ruleset_deferred(dom); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(cred_prepare, hook_cred_prepare), LSM_HOOK_INIT(cred_free, hook_cred_free), }; __init void landlock_add_cred_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), LANDLOCK_NAME); } |
2 26 27 2 27 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/amiga.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #define pr_fmt(fmt) fmt #include <linux/types.h> #include <linux/mm_types.h> #include <linux/overflow.h> #include <linux/affs_hardblocks.h> #include "check.h" /* magic offsets in partition DosEnvVec */ #define NR_HD 3 #define NR_SECT 5 #define LO_CYL 9 #define HI_CYL 10 static __inline__ u32 checksum_block(__be32 *m, int size) { u32 sum = 0; while (size--) sum += be32_to_cpu(*m++); return sum; } int amiga_partition(struct parsed_partitions *state) { Sector sect; unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; u64 start_sect, nr_sects; sector_t blk, end_sect; u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ u32 nr_hd, nr_sect, lo_cyl, hi_cyl; int part, res = 0; unsigned int blksize = 1; /* Multiplier for disk block size */ int slot = 1; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read RDB block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; } if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) continue; rdb = (struct RigidDiskBlock *)data; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) break; /* Try again with 0xdc..0xdf zeroed, Windows might have * trashed it. */ *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", blk); break; } pr_err("Dev %s: RDB in block %llu has bad checksum\n", state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; { char tmp[7 + 10 + 1 + 1]; /* Be more informative */ snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); strlcat(state->pp_buf, tmp, PAGE_SIZE); } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { /* Read in terms partition table understands */ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", state->disk->disk_name, blk, part); break; } data = read_part_sector(state, blk, §); if (!data) { pr_err("Dev %s: unable to read partition block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; } pb = (struct PartitionBlock *)data; blk = be32_to_cpu(pb->pb_Next); if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) continue; if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) continue; /* RDB gives us more than enough rope to hang ourselves with, * many times over (2^128 bytes if all fields max out). * Some careful checks are in order, so check for potential * overflows. * We are multiplying four 32 bit numbers to one sector_t! */ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); /* CylBlocks is total number of blocks per cylinder */ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", state->disk->disk_name, cylblk); continue; } /* check for consistency with RDB defined CylBlocks */ if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", state->disk->disk_name, cylblk, be32_to_cpu(rdb->rdb_CylBlocks)); } /* RDB allows for variable logical block size - * normalize to 512 byte blocks and check result. */ if (check_mul_overflow(cylblk, blksize, &cylblk)) { pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", state->disk->disk_name, part); continue; } /* Calculate partition start and end. Limit of 32 bit on cylblk * guarantees no overflow occurs if LBD support is enabled. */ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); start_sect = ((u64) lo_cyl * cylblk); hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); if (!nr_sects) continue; /* Warn user if partition end overflows u32 (AmigaDOS limit) */ if ((start_sect + nr_sects) > UINT_MAX) { pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", state->disk->disk_name, part, start_sect, start_sect + nr_sects); } if (check_add_overflow(start_sect, nr_sects, &end_sect)) { pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", state->disk->disk_name, part, start_sect, end_sect); continue; } /* Tell Kernel about it */ put_partition(state,slot++,start_sect,nr_sects); { /* Be even more informative to aid mounting */ char dostype[4]; char tmp[42]; __be32 *dt = (__be32 *)dostype; *dt = pb->pb_Environment[16]; if (dostype[3] < ' ') snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", dostype[0], dostype[1], dostype[2], dostype[3] + '@' ); else snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", dostype[0], dostype[1], dostype[2], dostype[3]); strlcat(state->pp_buf, tmp, PAGE_SIZE); snprintf(tmp, sizeof(tmp), "(res %d spb %d)", be32_to_cpu(pb->pb_Environment[6]), be32_to_cpu(pb->pb_Environment[4])); strlcat(state->pp_buf, tmp, PAGE_SIZE); } res = 1; } strlcat(state->pp_buf, "\n", PAGE_SIZE); rdb_done: return res; } |
1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 | // SPDX-License-Identifier: GPL-2.0 /* Sysctl interface for parport devices. * * Authors: David Campbell * Tim Waugh <tim@cyberelk.demon.co.uk> * Philip Blundell <philb@gnu.org> * Andrea Arcangeli * Riccardo Facchetti <fizban@tin.it> * * based on work by Grant Guenther <grant@torque.net> * and Philip Blundell * * Cleaned up include files - Russell King <linux@arm.uk.linux.org> */ #include <linux/string.h> #include <linux/init.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/slab.h> #include <linux/parport.h> #include <linux/ctype.h> #include <linux/sysctl.h> #include <linux/device.h> #include <linux/uaccess.h> #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) #define PARPORT_MIN_TIMESLICE_VALUE 1ul #define PARPORT_MAX_TIMESLICE_VALUE ((unsigned long) HZ) #define PARPORT_MIN_SPINTIME_VALUE 1 #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; struct pardevice *dev; int len = 0; if (write) /* can't happen anyway */ return -EACCES; if (*ppos) { *lenp = 0; return 0; } for (dev = port->devices; dev ; dev = dev->next) { if(dev == port->cad) { len += sprintf(buffer, "%s\n", dev->name); } } if(!len) { len += sprintf(buffer, "%s\n", "none"); } if (len > *lenp) len = *lenp; else *lenp = len; *ppos += len; memcpy(result, buffer, len); return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; char buffer[256]; int len = 0; if (write) /* permissions stop this */ return -EACCES; if (*ppos) { *lenp = 0; return 0; } if ((str = info->class_name) != NULL) len += sprintf (buffer + len, "CLASS:%s;\n", str); if ((str = info->model) != NULL) len += sprintf (buffer + len, "MODEL:%s;\n", str); if ((str = info->mfr) != NULL) len += sprintf (buffer + len, "MANUFACTURER:%s;\n", str); if ((str = info->description) != NULL) len += sprintf (buffer + len, "DESCRIPTION:%s;\n", str); if ((str = info->cmdset) != NULL) len += sprintf (buffer + len, "COMMAND SET:%s;\n", str); if (len > *lenp) len = *lenp; else *lenp = len; *ppos += len; memcpy(result, buffer, len); return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; int len = 0; if (*ppos) { *lenp = 0; return 0; } if (write) /* permissions prevent this anyway */ return -EACCES; len += sprintf (buffer, "%lu\t%lu\n", port->base, port->base_hi); if (len > *lenp) len = *lenp; else *lenp = len; *ppos += len; memcpy(result, buffer, len); return 0; } static int do_hardware_irq(struct ctl_table *table, int write, void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; int len = 0; if (*ppos) { *lenp = 0; return 0; } if (write) /* permissions prevent this anyway */ return -EACCES; len += sprintf (buffer, "%d\n", port->irq); if (len > *lenp) len = *lenp; else *lenp = len; *ppos += len; memcpy(result, buffer, len); return 0; } static int do_hardware_dma(struct ctl_table *table, int write, void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; int len = 0; if (*ppos) { *lenp = 0; return 0; } if (write) /* permissions prevent this anyway */ return -EACCES; len += sprintf (buffer, "%d\n", port->dma); if (len > *lenp) len = *lenp; else *lenp = len; *ppos += len; memcpy(result, buffer, len); return 0; } static int do_hardware_modes(struct ctl_table *table, int write, void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; int len = 0; if (*ppos) { *lenp = 0; return 0; } if (write) /* permissions prevent this anyway */ return -EACCES; { #define printmode(x) \ do { \ if (port->modes & PARPORT_MODE_##x) \ len += sprintf(buffer + len, "%s%s", f++ ? "," : "", #x); \ } while (0) int f = 0; printmode(PCSPP); printmode(TRISTATE); printmode(COMPAT); printmode(EPP); printmode(ECP); printmode(DMA); #undef printmode } buffer[len++] = '\n'; if (len > *lenp) len = *lenp; else *lenp = len; *ppos += len; memcpy(result, buffer, len); return 0; } static const unsigned long parport_min_timeslice_value = PARPORT_MIN_TIMESLICE_VALUE; static const unsigned long parport_max_timeslice_value = PARPORT_MAX_TIMESLICE_VALUE; static const int parport_min_spintime_value = PARPORT_MIN_SPINTIME_VALUE; static const int parport_max_spintime_value = PARPORT_MAX_SPINTIME_VALUE; struct parport_sysctl_table { struct ctl_table_header *port_header; struct ctl_table_header *devices_header; #ifdef CONFIG_PARPORT_1284 struct ctl_table vars[10]; #else struct ctl_table vars[5]; #endif /* IEEE 1284 support */ struct ctl_table device_dir[1]; }; static const struct parport_sysctl_table parport_sysctl_template = { .port_header = NULL, .devices_header = NULL, { { .procname = "spintime", .data = NULL, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void*) &parport_min_spintime_value, .extra2 = (void*) &parport_max_spintime_value }, { .procname = "base-addr", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_hardware_base_addr }, { .procname = "irq", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_hardware_irq }, { .procname = "dma", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_hardware_dma }, { .procname = "modes", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_hardware_modes }, #ifdef CONFIG_PARPORT_1284 { .procname = "autoprobe", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_autoprobe }, { .procname = "autoprobe0", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_autoprobe }, { .procname = "autoprobe1", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_autoprobe }, { .procname = "autoprobe2", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_autoprobe }, { .procname = "autoprobe3", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_autoprobe }, #endif /* IEEE 1284 support */ }, { { .procname = "active", .data = NULL, .maxlen = 0, .mode = 0444, .proc_handler = do_active_device }, }, }; struct parport_device_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table vars[1]; struct ctl_table device_dir[1]; }; static const struct parport_device_sysctl_table parport_device_sysctl_template = { .sysctl_header = NULL, { { .procname = "timeslice", .data = NULL, .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_doulongvec_ms_jiffies_minmax, .extra1 = (void*) &parport_min_timeslice_value, .extra2 = (void*) &parport_max_timeslice_value }, }, { { .procname = NULL, .data = NULL, .maxlen = 0, .mode = 0555, }, } }; struct parport_default_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table vars[2]; }; static struct parport_default_sysctl_table parport_default_sysctl_table = { .sysctl_header = NULL, { { .procname = "timeslice", .data = &parport_default_timeslice, .maxlen = sizeof(parport_default_timeslice), .mode = 0644, .proc_handler = proc_doulongvec_ms_jiffies_minmax, .extra1 = (void*) &parport_min_timeslice_value, .extra2 = (void*) &parport_max_timeslice_value }, { .procname = "spintime", .data = &parport_default_spintime, .maxlen = sizeof(parport_default_spintime), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = (void*) &parport_min_spintime_value, .extra2 = (void*) &parport_max_spintime_value }, } }; int parport_proc_register(struct parport *port) { struct parport_sysctl_table *t; char *tmp_dir_path; int i, err = 0; t = kmemdup(&parport_sysctl_template, sizeof(*t), GFP_KERNEL); if (t == NULL) return -ENOMEM; t->device_dir[0].extra1 = port; t->vars[0].data = &port->spintime; for (i = 0; i < 5; i++) { t->vars[i].extra1 = port; #ifdef CONFIG_PARPORT_1284 t->vars[5 + i].extra2 = &port->probe_info[i]; #endif /* IEEE 1284 support */ } tmp_dir_path = kasprintf(GFP_KERNEL, "dev/parport/%s/devices", port->name); if (!tmp_dir_path) { err = -ENOMEM; goto exit_free_t; } t->devices_header = register_sysctl(tmp_dir_path, t->device_dir); if (t->devices_header == NULL) { err = -ENOENT; goto exit_free_tmp_dir_path; } kfree(tmp_dir_path); tmp_dir_path = kasprintf(GFP_KERNEL, "dev/parport/%s", port->name); if (!tmp_dir_path) { err = -ENOMEM; goto unregister_devices_h; } t->port_header = register_sysctl(tmp_dir_path, t->vars); if (t->port_header == NULL) { err = -ENOENT; goto unregister_devices_h; } port->sysctl_table = t; kfree(tmp_dir_path); return 0; unregister_devices_h: unregister_sysctl_table(t->devices_header); exit_free_tmp_dir_path: kfree(tmp_dir_path); exit_free_t: kfree(t); return err; } int parport_proc_unregister(struct parport *port) { if (port->sysctl_table) { struct parport_sysctl_table *t = port->sysctl_table; port->sysctl_table = NULL; unregister_sysctl_table(t->devices_header); unregister_sysctl_table(t->port_header); kfree(t); } return 0; } int parport_device_proc_register(struct pardevice *device) { struct parport_device_sysctl_table *t; struct parport * port = device->port; char *tmp_dir_path; int err = 0; t = kmemdup(&parport_device_sysctl_template, sizeof(*t), GFP_KERNEL); if (t == NULL) return -ENOMEM; /* Allocate a buffer for two paths: dev/parport/PORT/devices/DEVICE. */ tmp_dir_path = kasprintf(GFP_KERNEL, "dev/parport/%s/devices/%s", port->name, device->name); if (!tmp_dir_path) { err = -ENOMEM; goto exit_free_t; } t->vars[0].data = &device->timeslice; t->sysctl_header = register_sysctl(tmp_dir_path, t->vars); if (t->sysctl_header == NULL) { kfree(t); t = NULL; } device->sysctl_table = t; kfree(tmp_dir_path); return 0; exit_free_t: kfree(t); return err; } int parport_device_proc_unregister(struct pardevice *device) { if (device->sysctl_table) { struct parport_device_sysctl_table *t = device->sysctl_table; device->sysctl_table = NULL; unregister_sysctl_table(t->sysctl_header); kfree(t); } return 0; } static int __init parport_default_proc_register(void) { int ret; parport_default_sysctl_table.sysctl_header = register_sysctl("dev/parport/default", parport_default_sysctl_table.vars); if (!parport_default_sysctl_table.sysctl_header) return -ENOMEM; ret = parport_bus_init(); if (ret) { unregister_sysctl_table(parport_default_sysctl_table. sysctl_header); return ret; } return 0; } static void __exit parport_default_proc_unregister(void) { if (parport_default_sysctl_table.sysctl_header) { unregister_sysctl_table(parport_default_sysctl_table. sysctl_header); parport_default_sysctl_table.sysctl_header = NULL; } parport_bus_exit(); } #else /* no sysctl or no procfs*/ int parport_proc_register(struct parport *pp) { return 0; } int parport_proc_unregister(struct parport *pp) { return 0; } int parport_device_proc_register(struct pardevice *device) { return 0; } int parport_device_proc_unregister(struct pardevice *device) { return 0; } static int __init parport_default_proc_register (void) { return parport_bus_init(); } static void __exit parport_default_proc_unregister (void) { parport_bus_exit(); } #endif subsys_initcall(parport_default_proc_register) module_exit(parport_default_proc_unregister) |
95 109 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_MSR_H #define _ASM_X86_MSR_H #include "msr-index.h" #ifndef __ASSEMBLY__ #include <asm/asm.h> #include <asm/errno.h> #include <asm/cpumask.h> #include <uapi/asm/msr.h> #include <asm/shared/msr.h> struct msr_info { u32 msr_no; struct msr reg; struct msr *msrs; int err; }; struct msr_regs_info { u32 *regs; int err; }; struct saved_msr { bool valid; struct msr_info info; }; struct saved_msrs { unsigned int num; struct saved_msr *array; }; /* * both i386 and x86_64 returns 64-bit value in edx:eax, but gcc's "A" * constraint has different meanings. For i386, "A" means exactly * edx:eax, while for x86_64 it doesn't mean rdx:rax or edx:eax. Instead, * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 /* Using 64-bit values saves one instruction clearing the high half of low */ #define DECLARE_ARGS(val, low, high) unsigned long low, high #define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) #else #define DECLARE_ARGS(val, low, high) unsigned long long val #define EAX_EDX_VAL(val, low, high) (val) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif /* * Be very careful with includes. This header is prone to include loops. */ #include <asm/atomic.h> #include <linux/tracepoint-defs.h> #ifdef CONFIG_TRACEPOINTS DECLARE_TRACEPOINT(read_msr); DECLARE_TRACEPOINT(write_msr); DECLARE_TRACEPOINT(rdpmc); extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif /* * __rdmsr() and __wrmsr() are the two primitives which are the bare minimum MSR * accessors and should not have any tracing or other functionality piggybacking * on them - those are *purely* for accessing MSRs and nothing more. So don't even * think of extending them - you will be slapped with a stinking trout or a frozen * shark will reach you, wherever you are! You've been warned. */ static __always_inline unsigned long long __rdmsr(unsigned int msr) { DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_RDMSR) : EAX_EDX_RET(val, low, high) : "c" (msr)); return EAX_EDX_VAL(val, low, high); } static __always_inline void __wrmsr(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR) : : "c" (msr), "a"(low), "d" (high) : "memory"); } #define native_rdmsr(msr, val1, val2) \ do { \ u64 __val = __rdmsr((msr)); \ (void)((val1) = (u32)__val); \ (void)((val2) = (u32)(__val >> 32)); \ } while (0) #define native_wrmsr(msr, low, high) \ __wrmsr(msr, low, high) #define native_wrmsrl(msr, val) \ __wrmsr((msr), (u32)((u64)(val)), \ (u32)((u64)(val) >> 32)) static inline unsigned long long native_read_msr(unsigned int msr) { unsigned long long val; val = __rdmsr(msr); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, val, 0); return val; } static inline unsigned long long native_read_msr_safe(unsigned int msr, int *err) { DECLARE_ARGS(val, low, high); asm volatile("1: rdmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_RDMSR_SAFE, %[err]) : [err] "=r" (*err), EAX_EDX_RET(val, low, high) : "c" (msr)); if (tracepoint_enabled(read_msr)) do_trace_read_msr(msr, EAX_EDX_VAL(val, low, high), *err); return EAX_EDX_VAL(val, low, high); } /* Can be uninlined because referenced by paravirt */ static inline void notrace native_write_msr(unsigned int msr, u32 low, u32 high) { __wrmsr(msr, low, high); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } /* Can be uninlined because referenced by paravirt */ static inline int notrace native_write_msr_safe(unsigned int msr, u32 low, u32 high) { int err; asm volatile("1: wrmsr ; xor %[err],%[err]\n" "2:\n\t" _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_WRMSR_SAFE, %[err]) : [err] "=a" (err) : "c" (msr), "0" (low), "d" (high) : "memory"); if (tracepoint_enabled(write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); /** * rdtsc() - returns the current TSC without ordering constraints * * rdtsc() returns the result of RDTSC as a 64-bit integer. The * only ordering constraint it supplies is the ordering implied by * "asm volatile": it will put the RDTSC in the place you expect. The * CPU can and will speculatively execute that RDTSC, though, so the * results can be non-monotonic if compared on different CPUs. */ static __always_inline unsigned long long rdtsc(void) { DECLARE_ARGS(val, low, high); asm volatile("rdtsc" : EAX_EDX_RET(val, low, high)); return EAX_EDX_VAL(val, low, high); } /** * rdtsc_ordered() - read the current TSC in program order * * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. * It is ordered like a load to a global in-memory counter. It should * be impossible to observe non-monotonic rdtsc_unordered() behavior * across multiple CPUs as long as the TSC is synced. */ static __always_inline unsigned long long rdtsc_ordered(void) { DECLARE_ARGS(val, low, high); /* * The RDTSC instruction is not ordered relative to memory * access. The Intel SDM and the AMD APM are both vague on this * point, but empirically an RDTSC instruction can be * speculatively executed before prior loads. An RDTSC * immediately after an appropriate barrier appears to be * ordered as a normal load, that is, it provides the same * ordering guarantees as reading from a global memory location * that some other imaginary CPU is updating continuously with a * time stamp. * * Thus, use the preferred barrier on the respective CPU, aiming for * RDTSCP as the default. */ asm volatile(ALTERNATIVE_2("rdtsc", "lfence; rdtsc", X86_FEATURE_LFENCE_RDTSC, "rdtscp", X86_FEATURE_RDTSCP) : EAX_EDX_RET(val, low, high) /* RDTSCP clobbers ECX with MSR_TSC_AUX. */ :: "ecx"); return EAX_EDX_VAL(val, low, high); } static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter)); if (tracepoint_enabled(rdpmc)) do_trace_rdpmc(counter, EAX_EDX_VAL(val, low, high), 0); return EAX_EDX_VAL(val, low, high); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #include <linux/errno.h> /* * Access to machine-specific registers (available on 586 and better only) * Note: the rd* operations modify the parameters directly (without using * pointer indirection), this allows gcc to optimize better */ #define rdmsr(msr, low, high) \ do { \ u64 __val = native_read_msr((msr)); \ (void)((low) = (u32)__val); \ (void)((high) = (u32)(__val >> 32)); \ } while (0) static inline void wrmsr(unsigned int msr, u32 low, u32 high) { native_write_msr(msr, low, high); } #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) static inline void wrmsrl(unsigned int msr, u64 val) { native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) { return native_write_msr_safe(msr, low, high); } /* rdmsr with exception handling */ #define rdmsr_safe(msr, low, high) \ ({ \ int __err; \ u64 __val = native_read_msr_safe((msr), &__err); \ (*low) = (u32)__val; \ (*high) = (u32)(__val >> 32); \ __err; \ }) static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) { int err; *p = native_read_msr_safe(msr, &err); return err; } #define rdpmc(counter, low, high) \ do { \ u64 _l = native_read_pmc((counter)); \ (low) = (u32)_l; \ (high) = (u32)(_l >> 32); \ } while (0) #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) #endif /* !CONFIG_PARAVIRT_XXL */ /* * 64-bit version of wrmsr_safe(): */ static inline int wrmsrl_safe(u32 msr, u64 val) { return wrmsr_safe(msr, (u32)val, (u32)(val >> 32)); } struct msr *msrs_alloc(void); void msrs_free(struct msr *msrs); int msr_set_bit(u32 msr, u8 bit); int msr_clear_bit(u32 msr, u8 bit); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { rdmsr(msr_no, *l, *h); return 0; } static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { wrmsr(msr_no, l, h); return 0; } static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { rdmsrl(msr_no, *q); return 0; } static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { wrmsrl(msr_no, q); return 0; } static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { return rdmsr_safe(msr_no, l, h); } static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) { return rdmsrl_safe(msr_no, q); } static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) { return wrmsrl_safe(msr_no, q); } static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return rdmsr_safe_regs(regs); } static inline int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return wrmsr_safe_regs(regs); } #endif /* CONFIG_SMP */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_MSR_H */ |
13 6 6 13 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 | // SPDX-License-Identifier: GPL-2.0 #include <linux/ceph/ceph_debug.h> #include <linux/fs.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <linux/writeback.h> #include <linux/iversion.h> #include <linux/filelock.h> #include "super.h" #include "mds_client.h" #include "cache.h" #include "crypto.h" #include <linux/ceph/decode.h> #include <linux/ceph/messenger.h> /* * Capability management * * The Ceph metadata servers control client access to inode metadata * and file data by issuing capabilities, granting clients permission * to read and/or write both inode field and file data to OSDs * (storage nodes). Each capability consists of a set of bits * indicating which operations are allowed. * * If the client holds a *_SHARED cap, the client has a coherent value * that can be safely read from the cached inode. * * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the * client is allowed to change inode attributes (e.g., file size, * mtime), note its dirty state in the ceph_cap, and asynchronously * flush that metadata change to the MDS. * * In the event of a conflicting operation (perhaps by another * client), the MDS will revoke the conflicting client capabilities. * * In order for a client to cache an inode, it must hold a capability * with at least one MDS server. When inodes are released, release * notifications are batched and periodically sent en masse to the MDS * cluster to release server state. */ static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc); static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid); /* * Generate readable cap strings for debugging output. */ #define MAX_CAP_STR 20 static char cap_str[MAX_CAP_STR][40]; static DEFINE_SPINLOCK(cap_str_lock); static int last_cap_str; static char *gcap_string(char *s, int c) { if (c & CEPH_CAP_GSHARED) *s++ = 's'; if (c & CEPH_CAP_GEXCL) *s++ = 'x'; if (c & CEPH_CAP_GCACHE) *s++ = 'c'; if (c & CEPH_CAP_GRD) *s++ = 'r'; if (c & CEPH_CAP_GWR) *s++ = 'w'; if (c & CEPH_CAP_GBUFFER) *s++ = 'b'; if (c & CEPH_CAP_GWREXTEND) *s++ = 'a'; if (c & CEPH_CAP_GLAZYIO) *s++ = 'l'; return s; } const char *ceph_cap_string(int caps) { int i; char *s; int c; spin_lock(&cap_str_lock); i = last_cap_str++; if (last_cap_str == MAX_CAP_STR) last_cap_str = 0; spin_unlock(&cap_str_lock); s = cap_str[i]; if (caps & CEPH_CAP_PIN) *s++ = 'p'; c = (caps >> CEPH_CAP_SAUTH) & 3; if (c) { *s++ = 'A'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SLINK) & 3; if (c) { *s++ = 'L'; s = gcap_string(s, c); } c = (caps >> CEPH_CAP_SXATTR) & 3; if (c) { *s++ = 'X'; s = gcap_string(s, c); } c = caps >> CEPH_CAP_SFILE; if (c) { *s++ = 'F'; s = gcap_string(s, c); } if (s == cap_str[i]) *s++ = '-'; *s = 0; return cap_str[i]; } void ceph_caps_init(struct ceph_mds_client *mdsc) { INIT_LIST_HEAD(&mdsc->caps_list); spin_lock_init(&mdsc->caps_list_lock); } void ceph_caps_finalize(struct ceph_mds_client *mdsc) { struct ceph_cap *cap; spin_lock(&mdsc->caps_list_lock); while (!list_empty(&mdsc->caps_list)) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } mdsc->caps_total_count = 0; mdsc->caps_avail_count = 0; mdsc->caps_use_count = 0; mdsc->caps_reserve_count = 0; mdsc->caps_min_count = 0; spin_unlock(&mdsc->caps_list_lock); } void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc, struct ceph_mount_options *fsopt) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_min_count = fsopt->max_readdir; if (mdsc->caps_min_count < 1024) mdsc->caps_min_count = 1024; mdsc->caps_use_max = fsopt->caps_max; if (mdsc->caps_use_max > 0 && mdsc->caps_use_max < mdsc->caps_min_count) mdsc->caps_use_max = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) { struct ceph_cap *cap; int i; if (nr_caps) { BUG_ON(mdsc->caps_reserve_count < nr_caps); mdsc->caps_reserve_count -= nr_caps; if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count -= nr_caps; for (i = 0; i < nr_caps; i++) { cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); kmem_cache_free(ceph_cap_cachep, cap); } } else { mdsc->caps_avail_count += nr_caps; } doutc(mdsc->fsc->client, "caps %d = %d used + %d resv + %d avail\n", mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } } /* * Called under mdsc->mutex. */ int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need) { struct ceph_client *cl = mdsc->fsc->client; int i, j; struct ceph_cap *cap; int have; int alloc = 0; int max_caps; int err = 0; bool trimmed = false; struct ceph_mds_session *s; LIST_HEAD(newcaps); doutc(cl, "ctx=%p need=%d\n", ctx, need); /* first reserve any caps that are already allocated */ spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count >= need) have = need; else have = mdsc->caps_avail_count; mdsc->caps_avail_count -= have; mdsc->caps_reserve_count += have; BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); for (i = have; i < need; ) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { list_add(&cap->caps_item, &newcaps); alloc++; i++; continue; } if (!trimmed) { for (j = 0; j < mdsc->max_sessions; j++) { s = __ceph_lookup_mds_session(mdsc, j); if (!s) continue; mutex_unlock(&mdsc->mutex); mutex_lock(&s->s_mutex); max_caps = s->s_nr_caps - (need - i); ceph_trim_caps(mdsc, s, max_caps); mutex_unlock(&s->s_mutex); ceph_put_mds_session(s); mutex_lock(&mdsc->mutex); } trimmed = true; spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { int more_have; if (mdsc->caps_avail_count >= need - i) more_have = need - i; else more_have = mdsc->caps_avail_count; i += more_have; have += more_have; mdsc->caps_avail_count -= more_have; mdsc->caps_reserve_count += more_have; } spin_unlock(&mdsc->caps_list_lock); continue; } pr_warn_client(cl, "ctx=%p ENOMEM need=%d got=%d\n", ctx, need, have + alloc); err = -ENOMEM; break; } if (!err) { BUG_ON(have + alloc != need); ctx->count = need; ctx->used = 0; } spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; mdsc->caps_reserve_count += alloc; list_splice(&newcaps, &mdsc->caps_list); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); if (err) __ceph_unreserve_caps(mdsc, have + alloc); spin_unlock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); return err; } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; bool reclaim = false; if (!ctx->count) return; doutc(cl, "ctx=%p count=%d\n", ctx, ctx->count); spin_lock(&mdsc->caps_list_lock); __ceph_unreserve_caps(mdsc, ctx->count); ctx->count = 0; if (mdsc->caps_use_max > 0 && mdsc->caps_use_count > mdsc->caps_use_max) reclaim = true; spin_unlock(&mdsc->caps_list_lock); if (reclaim) ceph_reclaim_caps_nr(mdsc, ctx->used); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap = NULL; /* temporary, until we do something about cap import/export */ if (!ctx) { cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS); if (cap) { spin_lock(&mdsc->caps_list_lock); mdsc->caps_use_count++; mdsc->caps_total_count++; spin_unlock(&mdsc->caps_list_lock); } else { spin_lock(&mdsc->caps_list_lock); if (mdsc->caps_avail_count) { BUG_ON(list_empty(&mdsc->caps_list)); mdsc->caps_avail_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); } spin_unlock(&mdsc->caps_list_lock); } return cap; } spin_lock(&mdsc->caps_list_lock); doutc(cl, "ctx=%p (%d) %d = %d used + %d resv + %d avail\n", ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); BUG_ON(!ctx->count); BUG_ON(ctx->count > mdsc->caps_reserve_count); BUG_ON(list_empty(&mdsc->caps_list)); ctx->count--; ctx->used++; mdsc->caps_reserve_count--; mdsc->caps_use_count++; cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item); list_del(&cap->caps_item); BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); return cap; } void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap) { struct ceph_client *cl = mdsc->fsc->client; spin_lock(&mdsc->caps_list_lock); doutc(cl, "%p %d = %d used + %d resv + %d avail\n", cap, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); mdsc->caps_use_count--; /* * Keep some preallocated caps around (ceph_min_count), to * avoid lots of free/alloc churn. */ if (mdsc->caps_avail_count >= mdsc->caps_reserve_count + mdsc->caps_min_count) { mdsc->caps_total_count--; kmem_cache_free(ceph_cap_cachep, cap); } else { mdsc->caps_avail_count++; list_add(&cap->caps_item, &mdsc->caps_list); } BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); spin_unlock(&mdsc->caps_list_lock); } void ceph_reservation_status(struct ceph_fs_client *fsc, int *total, int *avail, int *used, int *reserved, int *min) { struct ceph_mds_client *mdsc = fsc->mdsc; spin_lock(&mdsc->caps_list_lock); if (total) *total = mdsc->caps_total_count; if (avail) *avail = mdsc->caps_avail_count; if (used) *used = mdsc->caps_use_count; if (reserved) *reserved = mdsc->caps_reserve_count; if (min) *min = mdsc->caps_min_count; spin_unlock(&mdsc->caps_list_lock); } /* * Find ceph_cap for given mds, if any. * * Called with i_ceph_lock held. */ struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; struct rb_node *n = ci->i_caps.rb_node; while (n) { cap = rb_entry(n, struct ceph_cap, ci_node); if (mds < cap->mds) n = n->rb_left; else if (mds > cap->mds) n = n->rb_right; else return cap; } return NULL; } struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds) { struct ceph_cap *cap; spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); spin_unlock(&ci->i_ceph_lock); return cap; } /* * Called under i_ceph_lock. */ static void __insert_cap_node(struct ceph_inode_info *ci, struct ceph_cap *new) { struct rb_node **p = &ci->i_caps.rb_node; struct rb_node *parent = NULL; struct ceph_cap *cap = NULL; while (*p) { parent = *p; cap = rb_entry(parent, struct ceph_cap, ci_node); if (new->mds < cap->mds) p = &(*p)->rb_left; else if (new->mds > cap->mds) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->ci_node, parent, p); rb_insert_color(&new->ci_node, &ci->i_caps); } /* * (re)set cap hold timeouts, which control the delayed release * of unused caps back to the MDS. Should be called on cap use. */ static void __cap_set_timeouts(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mount_options *opt = mdsc->fsc->mount_options; ci->i_hold_caps_max = round_jiffies(jiffies + opt->caps_wanted_delay_max * HZ); doutc(mdsc->fsc->client, "%p %llx.%llx %lu\n", inode, ceph_vinop(inode), ci->i_hold_caps_max - jiffies); } /* * (Re)queue cap at the end of the delayed cap release list. * * If I_FLUSH is set, leave the inode at the front of the list. * * Caller holds i_ceph_lock * -> we take mdsc->cap_delay_lock */ static void __cap_delay_requeue(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx flags 0x%lx at %lu\n", inode, ceph_vinop(inode), ci->i_ceph_flags, ci->i_hold_caps_max); if (!mdsc->stopping) { spin_lock(&mdsc->cap_delay_lock); if (!list_empty(&ci->i_cap_delay_list)) { if (ci->i_ceph_flags & CEPH_I_FLUSH) goto no_change; list_del_init(&ci->i_cap_delay_list); } __cap_set_timeouts(mdsc, ci); list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list); no_change: spin_unlock(&mdsc->cap_delay_lock); } } /* * Queue an inode for immediate writeback. Mark inode with I_FLUSH, * indicating we should send a cap message to flush dirty metadata * asap, and move to the front of the delayed cap list. */ static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_lock(&mdsc->cap_delay_lock); ci->i_ceph_flags |= CEPH_I_FLUSH; if (!list_empty(&ci->i_cap_delay_list)) list_del_init(&ci->i_cap_delay_list); list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* * Cancel delayed work on cap. * * Caller must hold i_ceph_lock. */ static void __cap_delay_cancel(struct ceph_mds_client *mdsc, struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (list_empty(&ci->i_cap_delay_list)) return; spin_lock(&mdsc->cap_delay_lock); list_del_init(&ci->i_cap_delay_list); spin_unlock(&mdsc->cap_delay_lock); } /* Common issue checks for add_cap, handle_cap_grant. */ static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap, unsigned issued) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); unsigned had = __ceph_caps_issued(ci, NULL); lockdep_assert_held(&ci->i_ceph_lock); /* * Each time we receive FILE_CACHE anew, we increment * i_rdcache_gen. */ if (S_ISREG(ci->netfs.inode.i_mode) && (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) { ci->i_rdcache_gen++; } /* * If FILE_SHARED is newly issued, mark dir not complete. We don't * know what happened to this directory while we didn't have the cap. * If FILE_SHARED is being revoked, also mark dir not complete. It * stops on-going cached readdir. */ if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) { if (issued & CEPH_CAP_FILE_SHARED) atomic_inc(&ci->i_shared_gen); if (S_ISDIR(ci->netfs.inode.i_mode)) { doutc(cl, " marking %p NOT complete\n", inode); __ceph_dir_clear_complete(ci); } } /* Wipe saved layout if we're losing DIR_CREATE caps */ if (S_ISDIR(ci->netfs.inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) && !(issued & CEPH_CAP_DIR_CREATE)) { ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns)); memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout)); } } /** * change_auth_cap_ses - move inode to appropriate lists when auth caps change * @ci: inode to be moved * @session: new auth caps session */ void change_auth_cap_ses(struct ceph_inode_info *ci, struct ceph_mds_session *session) { lockdep_assert_held(&ci->i_ceph_lock); if (list_empty(&ci->i_dirty_item) && list_empty(&ci->i_flushing_item)) return; spin_lock(&session->s_mdsc->cap_dirty_lock); if (!list_empty(&ci->i_dirty_item)) list_move(&ci->i_dirty_item, &session->s_cap_dirty); if (!list_empty(&ci->i_flushing_item)) list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); spin_unlock(&session->s_mdsc->cap_dirty_lock); } /* * Add a capability under the given MDS session. * * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock * * @fmode is the open file mode, if we are opening a file, otherwise * it is < 0. (This is so we can atomically add the cap and add an * open file reference to it.) */ void ceph_add_cap(struct inode *inode, struct ceph_mds_session *session, u64 cap_id, unsigned issued, unsigned wanted, unsigned seq, unsigned mseq, u64 realmino, int flags, struct ceph_cap **new_cap) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; int mds = session->s_mds; int actual_wanted; u32 gen; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx mds%d cap %llx %s seq %d\n", inode, ceph_vinop(inode), session->s_mds, cap_id, ceph_cap_string(issued), seq); gen = atomic_read(&session->s_cap_gen); cap = __get_cap_for_mds(ci, mds); if (!cap) { cap = *new_cap; *new_cap = NULL; cap->issued = 0; cap->implemented = 0; cap->mds = mds; cap->mds_wanted = 0; cap->mseq = 0; cap->ci = ci; __insert_cap_node(ci, cap); /* add to session cap list */ cap->session = session; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_caps); session->s_nr_caps++; atomic64_inc(&mdsc->metric.total_caps); spin_unlock(&session->s_cap_lock); } else { spin_lock(&session->s_cap_lock); list_move_tail(&cap->session_caps, &session->s_caps); spin_unlock(&session->s_cap_lock); if (cap->cap_gen < gen) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export * message, but still haven't received the cap import message. * handle_cap_export() updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing * a message that was send before the cap import message. So * don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != cap_id); seq = cap->seq; mseq = cap->mseq; issued |= cap->issued; flags |= CEPH_CAP_FLAG_AUTH; } } if (!ci->i_snap_realm || ((flags & CEPH_CAP_FLAG_AUTH) && realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) { /* * add this inode to the appropriate snap realm */ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, realmino); if (realm) ceph_change_snap_realm(inode, realm); else WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n", __func__, realmino, ci->i_vino.ino, ci->i_snap_realm ? ci->i_snap_realm->ino : 0); } __check_cap_issue(ci, cap, issued); /* * If we are issued caps we don't want, or the mds' wanted * value appears to be off, queue a check so we'll release * later and/or update the mds wanted value. */ actual_wanted = __ceph_caps_wanted(ci); if ((wanted & ~actual_wanted) || (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) { doutc(cl, "issued %s, mds wanted %s, actual %s, queueing\n", ceph_cap_string(issued), ceph_cap_string(wanted), ceph_cap_string(actual_wanted)); __cap_delay_requeue(mdsc, ci); } if (flags & CEPH_CAP_FLAG_AUTH) { if (!ci->i_auth_cap || ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) { if (ci->i_auth_cap && ci->i_auth_cap->session != cap->session) change_auth_cap_ses(ci, cap->session); ci->i_auth_cap = cap; cap->mds_wanted = wanted; } } else { WARN_ON(ci->i_auth_cap == cap); } doutc(cl, "inode %p %llx.%llx cap %p %s now %s seq %d mds%d\n", inode, ceph_vinop(inode), cap, ceph_cap_string(issued), ceph_cap_string(issued|cap->issued), seq, mds); cap->cap_id = cap_id; cap->issued = issued; cap->implemented |= issued; if (ceph_seq_cmp(mseq, cap->mseq) > 0) cap->mds_wanted = wanted; else cap->mds_wanted |= wanted; cap->seq = seq; cap->issue_seq = seq; cap->mseq = mseq; cap->cap_gen = gen; wake_up_all(&ci->i_cap_wq); } /* * Return true if cap has not timed out and belongs to the current * generation of the MDS session (i.e. has not gone 'stale' due to * us losing touch with the mds). */ static int __cap_is_valid(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_client *cl = cap->session->s_mdsc->fsc->client; unsigned long ttl; u32 gen; gen = atomic_read(&cap->session->s_cap_gen); ttl = cap->session->s_cap_ttl; if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) { doutc(cl, "%p %llx.%llx cap %p issued %s but STALE (gen %u vs %u)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), cap->cap_gen, gen); return 0; } return 1; } /* * Return set of valid cap bits issued to us. Note that caps time * out, and may be invalidated in bulk if the client session times out * and session->s_cap_gen is bumped. */ int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; if (implemented) *implemented = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; doutc(cl, "%p %llx.%llx cap %p issued %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); have |= cap->issued; if (implemented) *implemented |= cap->implemented; } /* * exclude caps issued by non-auth MDS, but are been revoking * by the auth MDS. The non-auth MDS should be revoking/exporting * these caps, but the message is delayed. */ if (ci->i_auth_cap) { cap = ci->i_auth_cap; have &= ~cap->implemented | cap->issued; } return have; } /* * Get cap bits issued by caps other than @ocap */ int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap) { int have = ci->i_snap_caps; struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap == ocap) continue; if (!__cap_is_valid(cap)) continue; have |= cap->issued; } return have; } /* * Move a cap to the end of the LRU (oldest caps at list head, newest * at list tail). */ static void __touch_cap(struct ceph_cap *cap) { struct inode *inode = &cap->ci->netfs.inode; struct ceph_mds_session *s = cap->session; struct ceph_client *cl = s->s_mdsc->fsc->client; spin_lock(&s->s_cap_lock); if (!s->s_cap_iterator) { doutc(cl, "%p %llx.%llx cap %p mds%d\n", inode, ceph_vinop(inode), cap, s->s_mds); list_move_tail(&cap->session_caps, &s->s_caps); } else { doutc(cl, "%p %llx.%llx cap %p mds%d NOP, iterating over caps\n", inode, ceph_vinop(inode), cap, s->s_mds); } spin_unlock(&s->s_cap_lock); } /* * Check if we hold the given mask. If so, move the cap(s) to the * front of their respective LRUs. (This is the preferred way for * callers to check for caps they want.) */ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct rb_node *p; int have = ci->i_snap_caps; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx snap issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(mask)); return 1; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if ((cap->issued & mask) == mask) { doutc(cl, "mask %p %llx.%llx cap %p issued %s (mask %s)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) __touch_cap(cap); return 1; } /* does a combination of caps satisfy mask? */ have |= cap->issued; if ((have & mask) == mask) { doutc(cl, "mask %p %llx.%llx combo issued %s (mask %s)\n", inode, ceph_vinop(inode), ceph_cap_string(cap->issued), ceph_cap_string(mask)); if (touch) { struct rb_node *q; /* touch this + preceding caps */ __touch_cap(cap); for (q = rb_first(&ci->i_caps); q != p; q = rb_next(q)) { cap = rb_entry(q, struct ceph_cap, ci_node); if (!__cap_is_valid(cap)) continue; if (cap->issued & mask) __touch_cap(cap); } } return 1; } } return 0; } int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask, int touch) { struct ceph_fs_client *fsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb); int r; r = __ceph_caps_issued_mask(ci, mask, touch); if (r) ceph_update_cap_hit(&fsc->mdsc->metric); else ceph_update_cap_mis(&fsc->mdsc->metric); return r; } /* * Return true if mask caps are currently being revoked by an MDS. */ int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask) { struct ceph_cap *cap; struct rb_node *p; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (cap != ocap && (cap->implemented & ~cap->issued & mask)) return 1; } return 0; } int ceph_caps_revoking(struct ceph_inode_info *ci, int mask) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_caps_revoking_other(ci, NULL, mask); spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(mask), ret); return ret; } int __ceph_caps_used(struct ceph_inode_info *ci) { int used = 0; if (ci->i_pin_ref) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; if (ci->i_rdcache_ref || (S_ISREG(ci->netfs.inode.i_mode) && ci->netfs.inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; if (ci->i_wb_ref || ci->i_wrbuffer_ref) used |= CEPH_CAP_FILE_BUFFER; if (ci->i_fx_ref) used |= CEPH_CAP_FILE_EXCL; return used; } #define FMODE_WAIT_BIAS 1000 /* * wanted, by virtue of open file modes */ int __ceph_caps_file_wanted(struct ceph_inode_info *ci) { const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN); const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD); const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR); const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY); struct ceph_mount_options *opt = ceph_inode_to_fs_client(&ci->netfs.inode)->mount_options; unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ; unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ; if (S_ISDIR(ci->netfs.inode.i_mode)) { int want = 0; /* use used_cutoff here, to keep dir's wanted caps longer */ if (ci->i_nr_by_mode[RD_SHIFT] > 0 || time_after(ci->i_last_rd, used_cutoff)) want |= CEPH_CAP_ANY_SHARED; if (ci->i_nr_by_mode[WR_SHIFT] > 0 || time_after(ci->i_last_wr, used_cutoff)) { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) want |= CEPH_CAP_ANY_DIR_OPS; } if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0) want |= CEPH_CAP_PIN; return want; } else { int bits = 0; if (ci->i_nr_by_mode[RD_SHIFT] > 0) { if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_rd, used_cutoff)) bits |= 1 << RD_SHIFT; } else if (time_after(ci->i_last_rd, idle_cutoff)) { bits |= 1 << RD_SHIFT; } if (ci->i_nr_by_mode[WR_SHIFT] > 0) { if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS || time_after(ci->i_last_wr, used_cutoff)) bits |= 1 << WR_SHIFT; } else if (time_after(ci->i_last_wr, idle_cutoff)) { bits |= 1 << WR_SHIFT; } /* check lazyio only when read/write is wanted */ if ((bits & (CEPH_FILE_MODE_RDWR << 1)) && ci->i_nr_by_mode[LAZY_SHIFT] > 0) bits |= 1 << LAZY_SHIFT; return bits ? ceph_caps_for_mode(bits >> 1) : 0; } } /* * wanted, by virtue of open file modes AND cap refs (buffered/cached data) */ int __ceph_caps_wanted(struct ceph_inode_info *ci) { int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci); if (S_ISDIR(ci->netfs.inode.i_mode)) { /* we want EXCL if holding caps of dir ops */ if (w & CEPH_CAP_ANY_DIR_OPS) w |= CEPH_CAP_FILE_EXCL; } else { /* we want EXCL if dirty data */ if (w & CEPH_CAP_FILE_BUFFER) w |= CEPH_CAP_FILE_EXCL; } return w; } /* * Return caps we have registered with the MDS(s) as 'wanted'. */ int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check) { struct ceph_cap *cap; struct rb_node *p; int mds_wanted = 0; for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { cap = rb_entry(p, struct ceph_cap, ci_node); if (check && !__cap_is_valid(cap)) continue; if (cap == ci->i_auth_cap) mds_wanted |= cap->mds_wanted; else mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR); } return mds_wanted; } int ceph_is_any_caps(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int ret; spin_lock(&ci->i_ceph_lock); ret = __ceph_is_any_real_caps(ci); spin_unlock(&ci->i_ceph_lock); return ret; } /* * Remove a cap. Take steps to deal with a racing iterate_session_caps. * * caller should hold i_ceph_lock. * caller will not hold session s_mutex if called from destroy_inode. */ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) { struct ceph_mds_session *session = cap->session; struct ceph_client *cl = session->s_mdsc->fsc->client; struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc; int removed = 0; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(cl, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "%p from %p %llx.%llx\n", cap, inode, ceph_vinop(inode)); mdsc = ceph_inode_to_fs_client(&ci->netfs.inode)->mdsc; /* remove from inode's cap rbtree, and clear auth cap */ rb_erase(&cap->ci_node, &ci->i_caps); if (ci->i_auth_cap == cap) ci->i_auth_cap = NULL; /* remove from session list */ spin_lock(&session->s_cap_lock); if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ doutc(cl, "delaying %p removal from session %p\n", cap, cap->session); } else { list_del_init(&cap->session_caps); session->s_nr_caps--; atomic64_dec(&mdsc->metric.total_caps); cap->session = NULL; removed = 1; } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; /* * s_cap_reconnect is protected by s_cap_lock. no one changes * s_cap_gen while session is in the reconnect state. */ if (queue_release && (!session->s_cap_reconnect || cap->cap_gen == atomic_read(&session->s_cap_gen))) { cap->queue_release = 1; if (removed) { __ceph_queue_cap_release(session, cap); removed = 0; } } else { cap->queue_release = 0; } cap->cap_ino = ci->i_vino.ino; spin_unlock(&session->s_cap_lock); if (removed) ceph_put_cap(mdsc, cap); if (!__ceph_is_any_real_caps(ci)) { /* when reconnect denied, we remove session caps forcibly, * i_wr_ref can be non-zero. If there are ongoing write, * keep i_snap_realm. */ if (ci->i_wr_ref == 0 && ci->i_snap_realm) ceph_change_snap_realm(&ci->netfs.inode, NULL); __cap_delay_cancel(mdsc, ci); } } void ceph_remove_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, bool queue_release) { struct ceph_inode_info *ci = cap->ci; struct ceph_fs_client *fsc; /* 'ci' being NULL means the remove have already occurred */ if (!ci) { doutc(mdsc->fsc->client, "inode is NULL\n"); return; } lockdep_assert_held(&ci->i_ceph_lock); fsc = ceph_inode_to_fs_client(&ci->netfs.inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && !ceph_inode_is_shutdown(&ci->netfs.inode)); __ceph_remove_cap(cap, queue_release); } struct cap_msg_args { struct ceph_mds_session *session; u64 ino, cid, follows; u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; u64 change_attr; struct ceph_buffer *xattr_buf; struct ceph_buffer *old_xattr_buf; struct timespec64 atime, mtime, ctime, btime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; kuid_t uid; kgid_t gid; umode_t mode; bool inline_data; bool wake; bool encrypted; u32 fscrypt_auth_len; u8 fscrypt_auth[sizeof(struct ceph_fscrypt_auth)]; // for context }; /* Marshal up the cap msg to the MDS */ static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; void *p; struct ceph_mds_client *mdsc = arg->session->s_mdsc; struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; doutc(mdsc->fsc->client, "%s %llx %llx caps %s wanted %s dirty %s seq %u/%u" " tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), arg->cid, arg->ino, ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); msg->hdr.version = cpu_to_le16(12); msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; memset(fc, 0, sizeof(*fc)); fc->cap_id = cpu_to_le64(arg->cid); fc->op = cpu_to_le32(arg->op); fc->seq = cpu_to_le32(arg->seq); fc->issue_seq = cpu_to_le32(arg->issue_seq); fc->migrate_seq = cpu_to_le32(arg->mseq); fc->caps = cpu_to_le32(arg->caps); fc->wanted = cpu_to_le32(arg->wanted); fc->dirty = cpu_to_le32(arg->dirty); fc->ino = cpu_to_le64(arg->ino); fc->snap_follows = cpu_to_le64(arg->follows); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (arg->encrypted) fc->size = cpu_to_le64(round_up(arg->size, CEPH_FSCRYPT_BLOCK_SIZE)); else #endif fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); ceph_encode_timespec64(&fc->mtime, &arg->mtime); ceph_encode_timespec64(&fc->atime, &arg->atime); ceph_encode_timespec64(&fc->ctime, &arg->ctime); fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); fc->mode = cpu_to_le32(arg->mode); fc->xattr_version = cpu_to_le64(arg->xattr_version); if (arg->xattr_buf) { msg->middle = ceph_buffer_get(arg->xattr_buf); fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); } p = fc + 1; /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); /* inline version (version 4) */ ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); /* * osd_epoch_barrier (version 5) * The epoch_barrier is protected osdc->lock, so READ_ONCE here in * case it was recently changed */ ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); /* * caller_uid/caller_gid (version 7) * * Currently, we don't properly track which caller dirtied the caps * last, and force a flush of them when there is a conflict. For now, * just set this to 0:0, to emulate how the MDS has worked up to now. */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); /* pool namespace (version 8) (mds always ignores this) */ ceph_encode_32(&p, 0); /* btime and change_attr (version 9) */ ceph_encode_timespec64(p, &arg->btime); p += sizeof(struct ceph_timespec); ceph_encode_64(&p, arg->change_attr); /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); /* dirstats (version 11) - these are r/o on the client */ ceph_encode_64(&p, 0); ceph_encode_64(&p, 0); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) /* * fscrypt_auth and fscrypt_file (version 12) * * fscrypt_auth holds the crypto context (if any). fscrypt_file * tracks the real i_size as an __le64 field (and we use a rounded-up * i_size in the traditional size field). */ ceph_encode_32(&p, arg->fscrypt_auth_len); ceph_encode_copy(&p, arg->fscrypt_auth, arg->fscrypt_auth_len); ceph_encode_32(&p, sizeof(__le64)); ceph_encode_64(&p, arg->size); #else /* CONFIG_FS_ENCRYPTION */ ceph_encode_32(&p, 0); ceph_encode_32(&p, 0); #endif /* CONFIG_FS_ENCRYPTION */ } /* * Queue cap releases when an inode is dropped from our cache. */ void __ceph_remove_caps(struct ceph_inode_info *ci) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct rb_node *p; /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU) * may call __ceph_caps_issued_mask() on a freeing inode. */ spin_lock(&ci->i_ceph_lock); p = rb_first(&ci->i_caps); while (p) { struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); p = rb_next(p); ceph_remove_cap(mdsc, cap, true); } spin_unlock(&ci->i_ceph_lock); } /* * Prepare to send a cap message to an MDS. Update the cap state, and populate * the arg struct with the parameters that will need to be sent. This should * be done under the i_ceph_lock to guard against changes to cap state. * * Make note of max_size reported/requested from mds, revoked caps * that have now been implemented. */ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, int op, int flags, int used, int want, int retain, int flushing, u64 flush_tid, u64 oldest_flush_tid) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int held, revoking; lockdep_assert_held(&ci->i_ceph_lock); held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; doutc(cl, "%p %llx.%llx cap %p session %p %s -> %s (revoking %s)\n", inode, ceph_vinop(inode), cap, cap->session, ceph_cap_string(held), ceph_cap_string(held & retain), ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); ci->i_ceph_flags &= ~CEPH_I_FLUSH; cap->issued &= retain; /* drop bits we don't want */ /* * Wake up any waiters on wanted -> needed transition. This is due to * the weird transition from buffered to sync IO... we need to flush * dirty pages _before_ allowing sync writes to avoid reordering. */ arg->wake = cap->implemented & ~cap->issued; cap->implemented &= cap->issued | used; cap->mds_wanted = want; arg->session = cap->session; arg->ino = ceph_vino(inode).ino; arg->cid = cap->cap_id; arg->follows = flushing ? ci->i_head_snapc->seq : 0; arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { if (want & CEPH_CAP_ANY_FILE_WR) ci->i_requested_max_size = arg->max_size; else ci->i_requested_max_size = 0; } if (flushing & CEPH_CAP_XATTR_EXCL) { arg->old_xattr_buf = __ceph_build_xattrs_blob(ci); arg->xattr_version = ci->i_xattrs.version; arg->xattr_buf = ci->i_xattrs.blob; } else { arg->xattr_buf = NULL; arg->old_xattr_buf = NULL; } arg->mtime = inode_get_mtime(inode); arg->atime = inode_get_atime(inode); arg->ctime = inode_get_ctime(inode); arg->btime = ci->i_btime; arg->change_attr = inode_peek_iversion_raw(inode); arg->op = op; arg->caps = cap->implemented; arg->wanted = want; arg->dirty = flushing; arg->seq = cap->seq; arg->issue_seq = cap->issue_seq; arg->mseq = cap->mseq; arg->time_warp_seq = ci->i_time_warp_seq; arg->uid = inode->i_uid; arg->gid = inode->i_gid; arg->mode = inode->i_mode; arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) && !list_empty(&ci->i_cap_snaps)) { struct ceph_cap_snap *capsnap; list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->cap_flush.tid) break; if (capsnap->need_flush) { flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP; break; } } } arg->flags = flags; arg->encrypted = IS_ENCRYPTED(inode); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len && WARN_ON_ONCE(ci->fscrypt_auth_len > sizeof(struct ceph_fscrypt_auth))) { /* Don't set this if it's too big */ arg->fscrypt_auth_len = 0; } else { arg->fscrypt_auth_len = ci->fscrypt_auth_len; memcpy(arg->fscrypt_auth, ci->fscrypt_auth, min_t(size_t, ci->fscrypt_auth_len, sizeof(arg->fscrypt_auth))); } #endif /* CONFIG_FS_ENCRYPTION */ } #if IS_ENABLED(CONFIG_FS_ENCRYPTION) #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4 + 8) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS + arg->fscrypt_auth_len; } #else #define CAP_MSG_FIXED_FIELDS (sizeof(struct ceph_mds_caps) + \ 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4 + 8 + 8 + 4 + 4) static inline int cap_msg_size(struct cap_msg_args *arg) { return CAP_MSG_FIXED_FIELDS; } #endif /* CONFIG_FS_ENCRYPTION */ /* * Send a cap msg on the given inode. * * Caller should hold snap_rwsem (read), s_mutex. */ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct ceph_msg *msg; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(arg), GFP_NOFS, false); if (!msg) { pr_err_client(cl, "error allocating cap msg: ino (%llx.%llx)" " flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); return; } encode_cap_msg(msg, arg); ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); if (arg->wake) wake_up_all(&ci->i_cap_wq); } static inline int __send_flush_snap(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap_snap *capsnap, u32 mseq, u64 oldest_flush_tid) { struct cap_msg_args arg; struct ceph_msg *msg; arg.session = session; arg.ino = ceph_vino(inode).ino; arg.cid = 0; arg.follows = capsnap->follows; arg.flush_tid = capsnap->cap_flush.tid; arg.oldest_flush_tid = oldest_flush_tid; arg.size = capsnap->size; arg.max_size = 0; arg.xattr_version = capsnap->xattr_version; arg.xattr_buf = capsnap->xattr_blob; arg.old_xattr_buf = NULL; arg.atime = capsnap->atime; arg.mtime = capsnap->mtime; arg.ctime = capsnap->ctime; arg.btime = capsnap->btime; arg.change_attr = capsnap->change_attr; arg.op = CEPH_CAP_OP_FLUSHSNAP; arg.caps = capsnap->issued; arg.wanted = 0; arg.dirty = capsnap->dirty; arg.seq = 0; arg.issue_seq = 0; arg.mseq = mseq; arg.time_warp_seq = capsnap->time_warp_seq; arg.uid = capsnap->uid; arg.gid = capsnap->gid; arg.mode = capsnap->mode; arg.inline_data = capsnap->inline_data; arg.flags = 0; arg.wake = false; arg.encrypted = IS_ENCRYPTED(inode); /* No fscrypt_auth changes from a capsnap.*/ arg.fscrypt_auth_len = 0; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, cap_msg_size(&arg), GFP_NOFS, false); if (!msg) return -ENOMEM; encode_cap_msg(msg, &arg); ceph_con_send(&arg.session->s_con, msg); return 0; } /* * When a snapshot is taken, clients accumulate dirty metadata on * inodes with capabilities in ceph_cap_snaps to describe the file * state at the time the snapshot was taken. This must be flushed * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * * Called under i_ceph_lock. */ static void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session *session) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; u64 oldest_flush_tid = 0; u64 first_tid = 1, last_tid = 0; doutc(cl, "%p %llx.%llx session %p\n", inode, ceph_vinop(inode), session); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { /* * we need to wait for sync writes to complete and for dirty * pages to be written out. */ if (capsnap->dirty_pages || capsnap->writing) break; /* should be removed by ceph_try_drop_cap_snap() */ BUG_ON(!capsnap->need_flush); /* only flush each capsnap once */ if (capsnap->cap_flush.tid > 0) { doutc(cl, "already flushed %p, skipping\n", capsnap); continue; } spin_lock(&mdsc->cap_dirty_lock); capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid; list_add_tail(&capsnap->cap_flush.g_list, &mdsc->cap_flush_list); if (oldest_flush_tid == 0) oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&capsnap->cap_flush.i_list, &ci->i_cap_flush_list); if (first_tid == 1) first_tid = capsnap->cap_flush.tid; last_tid = capsnap->cap_flush.tid; } ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS; while (first_tid <= last_tid) { struct ceph_cap *cap = ci->i_auth_cap; struct ceph_cap_flush *cf = NULL, *iter; int ret; if (!(cap && cap->session == session)) { doutc(cl, "%p %llx.%llx auth cap %p not mds%d, stop\n", inode, ceph_vinop(inode), cap, session->s_mds); break; } ret = -ENOENT; list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) { if (iter->tid >= first_tid) { cf = iter; ret = 0; break; } } if (ret < 0) break; first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap, " "ino (%llx.%llx) tid %llu follows %llu\n", ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); spin_lock(&ci->i_ceph_lock); } } void ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_mds_session *session = NULL; bool need_put = false; int mds; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); if (psession) session = *psession; retry: spin_lock(&ci->i_ceph_lock); if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) { doutc(cl, " no capsnap needs flush, doing nothing\n"); goto out; } if (!ci->i_auth_cap) { doutc(cl, " no auth cap (migrating?), doing nothing\n"); goto out; } mds = ci->i_auth_cap->session->s_mds; if (session && session->s_mds != mds) { doutc(cl, " oops, wrong session %p mutex\n", session); ceph_put_mds_session(session); session = NULL; } if (!session) { spin_unlock(&ci->i_ceph_lock); mutex_lock(&mdsc->mutex); session = __ceph_lookup_mds_session(mdsc, mds); mutex_unlock(&mdsc->mutex); goto retry; } // make sure flushsnap messages are sent in proper order. if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); __ceph_flush_snaps(ci, session); out: spin_unlock(&ci->i_ceph_lock); if (psession) *psession = session; else ceph_put_mds_session(session); /* we flushed them all; remove this inode from the queue */ spin_lock(&mdsc->snap_flush_lock); if (!list_empty(&ci->i_snap_flush_item)) need_put = true; list_del_init(&ci->i_snap_flush_item); spin_unlock(&mdsc->snap_flush_lock); if (need_put) iput(inode); } /* * Mark caps dirty. If inode is newly dirty, return the dirty flags. * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(ci->netfs.inode.i_sb)->mdsc; struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int was = ci->i_dirty_caps; int dirty = 0; lockdep_assert_held(&ci->i_ceph_lock); if (!ci->i_auth_cap) { pr_warn_client(cl, "%p %llx.%llx mask %s, " "but no auth cap (session was closed?)\n", inode, ceph_vinop(inode), ceph_cap_string(mask)); return 0; } doutc(cl, "%p %llx.%llx %s dirty %s -> %s\n", inode, ceph_vinop(inode), ceph_cap_string(mask), ceph_cap_string(was), ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { struct ceph_mds_session *session = ci->i_auth_cap->session; WARN_ON_ONCE(ci->i_prealloc_cap_flush); swap(ci->i_prealloc_cap_flush, *pcf); if (!ci->i_head_snapc) { WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } doutc(cl, "%p %llx.%llx now dirty snapc %p auth cap %p\n", inode, ceph_vinop(inode), ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); spin_lock(&mdsc->cap_dirty_lock); list_add(&ci->i_dirty_item, &session->s_cap_dirty); spin_unlock(&mdsc->cap_dirty_lock); if (ci->i_flushing_caps == 0) { ihold(inode); dirty |= I_DIRTY_SYNC; } } else { WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && (mask & CEPH_CAP_FILE_BUFFER)) dirty |= I_DIRTY_DATASYNC; __cap_delay_requeue(mdsc, ci); return dirty; } struct ceph_cap_flush *ceph_alloc_cap_flush(void) { struct ceph_cap_flush *cf; cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); if (!cf) return NULL; cf->is_capsnap = false; return cf; } void ceph_free_cap_flush(struct ceph_cap_flush *cf) { if (cf) kmem_cache_free(ceph_cap_flush_cachep, cf); } static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) { if (!list_empty(&mdsc->cap_flush_list)) { struct ceph_cap_flush *cf = list_first_entry(&mdsc->cap_flush_list, struct ceph_cap_flush, g_list); return cf->tid; } return 0; } /* * Remove cap_flush from the mdsc's or inode's flushing cap list. * Return true if caller needs to wake up flush waiters. */ static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->g_list.prev != &mdsc->cap_flush_list) { prev = list_prev_entry(cf, g_list); prev->wake = true; wake = false; } list_del_init(&cf->g_list); return wake; } static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci, struct ceph_cap_flush *cf) { struct ceph_cap_flush *prev; bool wake = cf->wake; if (wake && cf->i_list.prev != &ci->i_cap_flush_list) { prev = list_prev_entry(cf, i_list); prev->wake = true; wake = false; } list_del_init(&cf->i_list); return wake; } /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. * * Called under i_ceph_lock. Returns the flush tid. */ static u64 __mark_caps_flushing(struct inode *inode, struct ceph_mds_session *session, bool wake, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap_flush *cf = NULL; int flushing; lockdep_assert_held(&ci->i_ceph_lock); BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; doutc(cl, "flushing %s, flushing_caps %s -> %s\n", ceph_cap_string(flushing), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps | flushing)); ci->i_flushing_caps |= flushing; ci->i_dirty_caps = 0; doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode)); swap(cf, ci->i_prealloc_cap_flush); cf->caps = flushing; cf->wake = wake; spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); cf->tid = ++mdsc->last_cap_flush_tid; list_add_tail(&cf->g_list, &mdsc->cap_flush_list); *oldest_flush_tid = __get_oldest_flush_tid(mdsc); if (list_empty(&ci->i_flushing_item)) { list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; } spin_unlock(&mdsc->cap_dirty_lock); list_add_tail(&cf->i_list, &ci->i_cap_flush_list); return cf->tid; } /* * try to invalidate mapping pages without blocking. */ static int try_nonblocking_invalidate(struct inode *inode) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); ceph_fscache_invalidate(inode, false); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); if (inode->i_data.nrpages == 0 && invalidating_gen == ci->i_rdcache_gen) { /* success. */ doutc(cl, "%p %llx.%llx success\n", inode, ceph_vinop(inode)); /* save any racing async invalidate some trouble */ ci->i_rdcache_revoking = ci->i_rdcache_gen - 1; return 0; } doutc(cl, "%p %llx.%llx failed\n", inode, ceph_vinop(inode)); return -1; } bool __ceph_should_report_size(struct ceph_inode_info *ci) { loff_t size = i_size_read(&ci->netfs.inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; if (size >= ci->i_max_size) return true; /* half of previous max_size increment has been used */ if (ci->i_max_size > ci->i_reported_size && (size << 1) >= ci->i_max_size + ci->i_reported_size) return true; return false; } /* * Swiss army knife function to examine currently used and wanted * versus held caps. Release, flush, ack revoked caps to mds as * appropriate. * * CHECK_CAPS_AUTHONLY - we should only check the auth cap * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without * further delay. */ void ceph_check_caps(struct ceph_inode_info *ci, int flags) { struct inode *inode = &ci->netfs.inode; struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int issued, implemented, want, retain, revoking, flushing = 0; int mds = -1; /* keep track of how far we've gone through i_caps list to avoid an infinite loop on retry */ struct rb_node *p; bool queue_invalidate = false; bool tried_invalidate = false; bool queue_writeback = false; struct ceph_mds_session *session = NULL; spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { ci->i_ceph_flags |= CEPH_I_ASYNC_CHECK_CAPS; /* Don't send messages until we get async create reply */ spin_unlock(&ci->i_ceph_lock); return; } if (ci->i_ceph_flags & CEPH_I_FLUSH) flags |= CHECK_CAPS_FLUSH; retry: /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); /* Caps which have active references against them */ used = __ceph_caps_used(ci); /* * "issued" represents the current caps that the MDS wants us to have. * "implemented" is the set that we have been granted, and includes the * ones that have not yet been returned to the MDS (the "revoking" set, * usually because they have outstanding references). */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; want = file_wanted; /* The ones we currently want to retain (may be adjusted below) */ retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && __ceph_dir_is_complete(ci)) { /* * If a directory is complete, we want to keep * the exclusive cap. So that MDS does not end up * revoking the shared cap on every create/unlink * operation. */ if (IS_RDONLY(inode)) { want = CEPH_CAP_ANY_SHARED; } else { want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL; } retain |= want; } else { retain |= CEPH_CAP_ANY_SHARED; /* * keep RD only if we didn't have the file open RW, * because then the mds would revoke it anyway to * journal max_size=0. */ if (ci->i_max_size == 0) retain |= CEPH_CAP_ANY_RD; } } doutc(cl, "%p %llx.%llx file_want %s used %s dirty %s " "flushing %s issued %s revoking %s retain %s %s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(issued), ceph_cap_string(revoking), ceph_cap_string(retain), (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "", (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "", (flags & CHECK_CAPS_NOINVAL) ? " NOINVAL" : ""); /* * If we no longer need to hold onto old our caps, and we may * have cached pages, but don't want them, then try to invalidate. * If we fail, it's because pages are locked.... try again later. */ if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) && S_ISREG(inode->i_mode) && !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */ inode->i_data.nrpages && /* have cached pages */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { doutc(cl, "trying to invalidate on %p %llx.%llx\n", inode, ceph_vinop(inode)); if (try_nonblocking_invalidate(inode) < 0) { doutc(cl, "queuing invalidate\n"); queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } tried_invalidate = true; goto retry; } for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) { int mflags = 0; struct cap_msg_args arg; cap = rb_entry(p, struct ceph_cap, ci_node); /* avoid looping forever */ if (mds >= cap->mds || ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap)) continue; /* * If we have an auth cap, we don't need to consider any * overlapping caps as used. */ cap_used = used; if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued; revoking = cap->implemented & ~cap->issued; doutc(cl, " mds%d cap %p used %s issued %s implemented %s revoking %s\n", cap->mds, cap, ceph_cap_string(cap_used), ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented), ceph_cap_string(revoking)); if (cap == ci->i_auth_cap && (cap->issued & CEPH_CAP_FILE_WR)) { /* request larger max_size from MDS? */ if (ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) { doutc(cl, "requesting new max_size\n"); goto ack; } /* approaching file_max? */ if (__ceph_should_report_size(ci)) { doutc(cl, "i_size approaching max_size\n"); goto ack; } } /* flush anything dirty? */ if (cap == ci->i_auth_cap) { if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) { doutc(cl, "flushing dirty caps\n"); goto ack; } if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) { doutc(cl, "flushing snap caps\n"); goto ack; } } /* completed revocation? going down and there are no caps? */ if (revoking) { if ((revoking & cap_used) == 0) { doutc(cl, "completed revocation of %s\n", ceph_cap_string(cap->implemented & ~cap->issued)); goto ack; } /* * If the "i_wrbuffer_ref" was increased by mmap or generic * cache write just before the ceph_check_caps() is called, * the Fb capability revoking will fail this time. Then we * must wait for the BDI's delayed work to flush the dirty * pages and to release the "i_wrbuffer_ref", which will cost * at most 5 seconds. That means the MDS needs to wait at * most 5 seconds to finished the Fb capability's revocation. * * Let's queue a writeback for it. */ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref && (revoking & CEPH_CAP_FILE_BUFFER)) queue_writeback = true; } /* want more caps from mds? */ if (want & ~cap->mds_wanted) { if (want & ~(cap->mds_wanted | cap->issued)) goto ack; if (!__cap_is_valid(cap)) goto ack; } /* things we might delay */ if ((cap->issued & ~retain) == 0) continue; /* nope, all good */ ack: ceph_put_mds_session(session); session = ceph_get_mds_session(cap->session); /* kick flushing and flush snaps before sending normal * cap message */ if (cap == ci->i_auth_cap && (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry; } if (cap == ci->i_auth_cap && ci->i_dirty_caps) { flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, false, &oldest_flush_tid); if (flags & CHECK_CAPS_FLUSH && list_empty(&session->s_cap_dirty)) mflags |= CEPH_CLIENT_CAPS_SYNC; } else { flushing = 0; flush_tid = 0; spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); } mds = cap->mds; /* remember mds, so we don't repeat */ __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, mflags, cap_used, want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); spin_lock(&ci->i_ceph_lock); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } /* periodically re-calculate caps wanted by open files */ if (__ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list) && (file_wanted & ~CEPH_CAP_PIN) && !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { __cap_delay_requeue(mdsc, ci); } spin_unlock(&ci->i_ceph_lock); ceph_put_mds_session(session); if (queue_writeback) ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); } /* * Try to flush dirty caps back to the auth mds. */ static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); int flushing = 0; u64 flush_tid = 0, oldest_flush_tid = 0; spin_lock(&ci->i_ceph_lock); retry_locked: if (ci->i_dirty_caps && ci->i_auth_cap) { struct ceph_cap *cap = ci->i_auth_cap; struct cap_msg_args arg; struct ceph_mds_session *session = cap->session; if (session->s_state < CEPH_MDS_SESSION_OPEN) { spin_unlock(&ci->i_ceph_lock); goto out; } if (ci->i_ceph_flags & (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) { if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) __kick_flushing_caps(mdsc, session, ci, 0); if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) __ceph_flush_snaps(ci, session); goto retry_locked; } flushing = ci->i_dirty_caps; flush_tid = __mark_caps_flushing(inode, session, true, &oldest_flush_tid); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC, __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = list_last_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); cf->wake = true; flush_tid = cf->tid; } flushing = ci->i_flushing_caps; spin_unlock(&ci->i_ceph_lock); } out: *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); int ret = 1; spin_lock(&ci->i_ceph_lock); if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush * cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); if (cf->tid <= flush_tid) ret = 0; } spin_unlock(&ci->i_ceph_lock); return ret; } /* * flush the mdlog and wait for any unsafe requests to complete. */ static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode) { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req1 = NULL, *req2 = NULL; int ret, err = 0; spin_lock(&ci->i_unsafe_lock); if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) { req1 = list_last_entry(&ci->i_unsafe_dirops, struct ceph_mds_request, r_unsafe_dir_item); ceph_mdsc_get_request(req1); } if (!list_empty(&ci->i_unsafe_iops)) { req2 = list_last_entry(&ci->i_unsafe_iops, struct ceph_mds_request, r_unsafe_target_item); ceph_mdsc_get_request(req2); } spin_unlock(&ci->i_unsafe_lock); /* * Trigger to flush the journal logs in all the relevant MDSes * manually, or in the worst case we must wait at most 5 seconds * to wait the journal logs to be flushed by the MDSes periodically. */ if (req1 || req2) { struct ceph_mds_request *req; struct ceph_mds_session **sessions; struct ceph_mds_session *s; unsigned int max_sessions; int i; mutex_lock(&mdsc->mutex); max_sessions = mdsc->max_sessions; sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL); if (!sessions) { mutex_unlock(&mdsc->mutex); err = -ENOMEM; goto out; } spin_lock(&ci->i_unsafe_lock); if (req1) { list_for_each_entry(req, &ci->i_unsafe_dirops, r_unsafe_dir_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } if (req2) { list_for_each_entry(req, &ci->i_unsafe_iops, r_unsafe_target_item) { s = req->r_session; if (!s) continue; if (!sessions[s->s_mds]) { s = ceph_get_mds_session(s); sessions[s->s_mds] = s; } } } spin_unlock(&ci->i_unsafe_lock); /* the auth MDS */ spin_lock(&ci->i_ceph_lock); if (ci->i_auth_cap) { s = ci->i_auth_cap->session; if (!sessions[s->s_mds]) sessions[s->s_mds] = ceph_get_mds_session(s); } spin_unlock(&ci->i_ceph_lock); mutex_unlock(&mdsc->mutex); /* send flush mdlog request to MDSes */ for (i = 0; i < max_sessions; i++) { s = sessions[i]; if (s) { send_flush_mdlog(s); ceph_put_mds_session(s); } } kfree(sessions); } doutc(cl, "%p %llx.%llx wait on tid %llu %llu\n", inode, ceph_vinop(inode), req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, ceph_timeout_jiffies(req1->r_timeout)); if (ret) err = -EIO; } if (req2) { ret = !wait_for_completion_timeout(&req2->r_safe_completion, ceph_timeout_jiffies(req2->r_timeout)); if (ret) err = -EIO; } out: if (req1) ceph_mdsc_put_request(req1); if (req2) ceph_mdsc_put_request(req2); return err; } int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int ret, err; int dirty; doutc(cl, "%p %llx.%llx%s\n", inode, ceph_vinop(inode), datasync ? " datasync" : ""); ret = file_write_and_wait_range(file, start, end); if (datasync) goto out; ret = ceph_wait_on_async_create(inode); if (ret) goto out; dirty = try_flush_caps(inode, &flush_tid); doutc(cl, "dirty caps are %s\n", ceph_cap_string(dirty)); err = flush_mdlog_and_wait_inode_unsafe_requests(inode); /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } if (err < 0) ret = err; err = file_check_and_advance_wb_err(file); if (err < 0) ret = err; out: doutc(cl, "%p %llx.%llx%s result=%d\n", inode, ceph_vinop(inode), datasync ? " datasync" : "", ret); return ret; } /* * Flush any dirty caps back to the mds. If we aren't asked to wait, * queue inode for flush but don't do so immediately, because we can * get by with fewer MDS messages if we wait for data writeback to * complete first. */ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); u64 flush_tid; int err = 0; int dirty; int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync); doutc(cl, "%p %llx.%llx wait=%d\n", inode, ceph_vinop(inode), wait); ceph_fscache_unpin_writeback(inode, wbc); if (wait) { err = ceph_wait_on_async_create(inode); if (err) return err; dirty = try_flush_caps(inode, &flush_tid); if (dirty) err = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } else { struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; spin_lock(&ci->i_ceph_lock); if (__ceph_caps_dirty(ci)) __cap_delay_requeue_front(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } return err; } static void __kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_inode_info *ci, u64 oldest_flush_tid) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap; struct ceph_cap_flush *cf; int ret; u64 first_tid = 0; u64 last_snap_flush = 0; /* Don't do anything until create reply comes in */ if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) return; ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH; list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) { if (cf->is_capsnap) { last_snap_flush = cf->tid; break; } } list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) { if (cf->tid < first_tid) continue; cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); break; } first_tid = cf->tid + 1; if (!cf->is_capsnap) { struct cap_msg_args arg; doutc(cl, "%p %llx.%llx cap %p tid %llu %s\n", inode, ceph_vinop(inode), cap, cf->tid, ceph_cap_string(cf->caps)); __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, (cf->tid < last_snap_flush ? CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0), __ceph_caps_used(ci), __ceph_caps_wanted(ci), (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); __send_cap(&arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); doutc(cl, "%p %llx.%llx capsnap %p tid %llu %s\n", inode, ceph_vinop(inode), capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, oldest_flush_tid); if (ret < 0) { pr_err_client(cl, "error sending cap flushsnap," " %p %llx.%llx tid %llu follows %llu\n", inode, ceph_vinop(inode), cf->tid, capsnap->follows); } ceph_put_cap_snap(capsnap); } spin_lock(&ci->i_ceph_lock); } } void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } /* * if flushing caps were revoked, we re-send the cap flush * in client reconnect stage. This guarantees MDS * processes * the cap flush message before issuing the flushing caps to * other client. */ if ((cap->issued & ci->i_flushing_caps) != ci->i_flushing_caps) { /* encode_caps_cb() also will reset these sequence * numbers. make sure sequence numbers in cap flush * message match later reconnect message */ cap->seq = 0; cap->issue_seq = 0; cap->mseq = 0; __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } else { ci->i_ceph_flags |= CEPH_I_KICK_FLUSH; } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct ceph_cap *cap; u64 oldest_flush_tid; lockdep_assert_held(&session->s_mutex); doutc(cl, "mds%d\n", session->s_mds); spin_lock(&mdsc->cap_dirty_lock); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { struct inode *inode = &ci->netfs.inode; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; if (!(cap && cap->session == session)) { pr_err_client(cl, "%p %llx.%llx auth cap %p not mds%d ???\n", inode, ceph_vinop(inode), cap, session->s_mds); spin_unlock(&ci->i_ceph_lock); continue; } if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) { __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } spin_unlock(&ci->i_ceph_lock); } } void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session, struct ceph_inode_info *ci) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_cap *cap = ci->i_auth_cap; struct inode *inode = &ci->netfs.inode; lockdep_assert_held(&ci->i_ceph_lock); doutc(mdsc->fsc->client, "%p %llx.%llx flushing %s\n", inode, ceph_vinop(inode), ceph_cap_string(ci->i_flushing_caps)); if (!list_empty(&ci->i_cap_flush_list)) { u64 oldest_flush_tid; spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); oldest_flush_tid = __get_oldest_flush_tid(mdsc); spin_unlock(&mdsc->cap_dirty_lock); __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid); } } /* * Take references to capabilities we hold, so that we don't release * them to the MDS prematurely. */ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, bool snap_rwsem_locked) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); lockdep_assert_held(&ci->i_ceph_lock); if (got & CEPH_CAP_PIN) ci->i_pin_ref++; if (got & CEPH_CAP_FILE_RD) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; if (got & CEPH_CAP_FILE_EXCL) ci->i_fx_ref++; if (got & CEPH_CAP_FILE_WR) { if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { BUG_ON(!snap_rwsem_locked); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); } ci->i_wr_ref++; } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(inode); ci->i_wb_ref++; doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref-1, ci->i_wb_ref); } } /* * Try to grab cap references. Specify those refs we @want, and the * minimal set we @need. Also include the larger offset we are writing * to (when applicable), and check against max_size here as well. * Note that caller is responsible for ensuring max_size increases are * requested from the MDS. * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, * or a negative error code. There are 3 speical error codes: * -EAGAIN: need to sleep but non-blocking is specified * -EFBIG: ask caller to call check_max_size() and try again. * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. */ enum { /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ NON_BLOCKING = (1 << 8), CHECK_FILELOCK = (1 << 9), }; static int try_get_cap_refs(struct inode *inode, int need, int want, loff_t endoff, int flags, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = ceph_inode_to_client(inode); int ret = 0; int have, implemented; bool snap_rwsem_locked = false; doutc(cl, "%p %llx.%llx need %s want %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(want)); again: spin_lock(&ci->i_ceph_lock); if ((flags & CHECK_FILELOCK) && (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) { doutc(cl, "%p %llx.%llx error filelock\n", inode, ceph_vinop(inode)); ret = -EIO; goto out_unlock; } /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) { up_read(&mdsc->snap_rwsem); snap_rwsem_locked = false; } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } have = __ceph_caps_issued(ci, &implemented); if (have & need & CEPH_CAP_FILE_WR) { if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) { doutc(cl, "%p %llx.%llx endoff %llu > maxsize %llu\n", inode, ceph_vinop(inode), endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; goto out_unlock; } /* * If a sync write is in progress, we must wait, so that we * can get a final snapshot value for size+mtime. */ if (__ceph_have_pending_cap_snap(ci)) { doutc(cl, "%p %llx.%llx cap_snap_pending\n", inode, ceph_vinop(inode)); goto out_unlock; } } if ((have & need) == need) { /* * Look at (implemented & ~have & not) so that we keep waiting * on transition from wanted -> needed caps. This is needed * for WRBUFFER|WR -> WR to avoid a new WR sync write from * going before a prior buffered writeback happens. * * For RDCACHE|RD -> RD, there is not need to wait and we can * just exclude the revoking caps and force to sync read. */ int not = want & ~(have & need); int revoking = implemented & ~have; int exclude = revoking & not; doutc(cl, "%p %llx.%llx have %s but not %s (revoking %s)\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if (!exclude || !(exclude & CEPH_CAP_FILE_BUFFER)) { if (!snap_rwsem_locked && !ci->i_head_snapc && (need & CEPH_CAP_FILE_WR)) { if (!down_read_trylock(&mdsc->snap_rwsem)) { /* * we can not call down_read() when * task isn't in TASK_RUNNING state */ if (flags & NON_BLOCKING) { ret = -EAGAIN; goto out_unlock; } spin_unlock(&ci->i_ceph_lock); down_read(&mdsc->snap_rwsem); snap_rwsem_locked = true; goto again; } snap_rwsem_locked = true; } if ((have & want) == want) *got = need | (want & ~exclude); else *got = need; ceph_take_cap_refs(ci, *got, true); ret = 1; } } else { int session_readonly = false; int mds_wanted; if (ci->i_auth_cap && (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) { struct ceph_mds_session *s = ci->i_auth_cap->session; spin_lock(&s->s_cap_lock); session_readonly = s->s_readonly; spin_unlock(&s->s_cap_lock); } if (session_readonly) { doutc(cl, "%p %llx.%llx need %s but mds%d readonly\n", inode, ceph_vinop(inode), ceph_cap_string(need), ci->i_auth_cap->mds); ret = -EROFS; goto out_unlock; } if (ceph_inode_is_shutdown(inode)) { doutc(cl, "%p %llx.%llx inode is shutdown\n", inode, ceph_vinop(inode)); ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); if (need & ~mds_wanted) { doutc(cl, "%p %llx.%llx need %s > mds_wanted %s\n", inode, ceph_vinop(inode), ceph_cap_string(need), ceph_cap_string(mds_wanted)); ret = -EUCLEAN; goto out_unlock; } doutc(cl, "%p %llx.%llx have %s need %s\n", inode, ceph_vinop(inode), ceph_cap_string(have), ceph_cap_string(need)); } out_unlock: __ceph_touch_fmode(ci, mdsc, flags); spin_unlock(&ci->i_ceph_lock); if (snap_rwsem_locked) up_read(&mdsc->snap_rwsem); if (!ret) ceph_update_cap_mis(&mdsc->metric); else if (ret == 1) ceph_update_cap_hit(&mdsc->metric); doutc(cl, "%p %llx.%llx ret %d got %s\n", inode, ceph_vinop(inode), ret, ceph_cap_string(*got)); return ret; } /* * Check the offset we are writing up to against our current * max_size. If necessary, tell the MDS we want to write to * a larger offset. */ static void check_max_size(struct inode *inode, loff_t endoff) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int check = 0; /* do we need to explicitly request a larger max_size? */ spin_lock(&ci->i_ceph_lock); if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) { doutc(cl, "write %p %llx.%llx at large endoff %llu, req max_size\n", inode, ceph_vinop(inode), endoff); ci->i_wanted_max_size = endoff; } /* duplicate ceph_check_caps()'s logic */ if (ci->i_auth_cap && (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) && ci->i_wanted_max_size > ci->i_max_size && ci->i_wanted_max_size > ci->i_requested_max_size) check = 1; spin_unlock(&ci->i_ceph_lock); if (check) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY); } static inline int get_used_fmode(int caps) { int fmode = 0; if (caps & CEPH_CAP_FILE_RD) fmode |= CEPH_FILE_MODE_RD; if (caps & CEPH_CAP_FILE_WR) fmode |= CEPH_FILE_MODE_WR; return fmode; } int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got) { int ret, flags; BUG_ON(need & ~CEPH_CAP_FILE_RD); BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO | CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_ANY_DIR_OPS)); if (need) { ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; } flags = get_used_fmode(need | want); if (nonblock) flags |= NON_BLOCKING; ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) ret = 0; return ret; } /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ int __ceph_get_caps(struct inode *inode, struct ceph_file_info *fi, int need, int want, loff_t endoff, int *got) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); int ret, _got, flags; ret = ceph_pool_perm_check(inode, need); if (ret < 0) return ret; if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) return -EBADF; flags = get_used_fmode(need | want); while (true) { flags &= CEPH_FILE_MODE_MASK; if (vfs_inode_has_locks(inode)) flags |= CHECK_FILELOCK; _got = 0; ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got); WARN_ON_ONCE(ret == -EAGAIN); if (!ret) { struct ceph_mds_client *mdsc = fsc->mdsc; struct cap_wait cw; DEFINE_WAIT_FUNC(wait, woken_wake_function); cw.ino = ceph_ino(inode); cw.tgid = current->tgid; cw.need = need; cw.want = want; spin_lock(&mdsc->caps_list_lock); list_add(&cw.list, &mdsc->cap_wait_list); spin_unlock(&mdsc->caps_list_lock); /* make sure used fmode not timeout */ ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS); add_wait_queue(&ci->i_cap_wq, &wait); flags |= NON_BLOCKING; while (!(ret = try_get_cap_refs(inode, need, want, endoff, flags, &_got))) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; } wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } remove_wait_queue(&ci->i_cap_wq, &wait); ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS); spin_lock(&mdsc->caps_list_lock); list_del(&cw.list); spin_unlock(&mdsc->caps_list_lock); if (ret == -EAGAIN) continue; } if (fi && (fi->fmode & CEPH_FILE_MODE_WR) && fi->filp_gen != READ_ONCE(fsc->filp_gen)) { if (ret >= 0 && _got) ceph_put_cap_refs(ci, _got); return -EBADF; } if (ret < 0) { if (ret == -EFBIG || ret == -EUCLEAN) { int ret2 = ceph_wait_on_async_create(inode); if (ret2 < 0) return ret2; } if (ret == -EFBIG) { check_max_size(inode, endoff); continue; } if (ret == -EUCLEAN) { /* session was killed, try renew caps */ ret = ceph_renew_caps(inode, flags); if (ret == 0) continue; } return ret; } if (S_ISREG(ci->netfs.inode.i_mode) && ceph_has_inline_data(ci) && (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && i_size_read(inode) > 0) { struct page *page = find_get_page(inode->i_mapping, 0); if (page) { bool uptodate = PageUptodate(page); put_page(page); if (uptodate) break; } /* * drop cap refs first because getattr while * holding * caps refs can cause deadlock. */ ceph_put_cap_refs(ci, _got); _got = 0; /* * getattr request will bring inline data into * page cache */ ret = __ceph_do_getattr(inode, NULL, CEPH_STAT_CAP_INLINE_DATA, true); if (ret < 0) return ret; continue; } break; } *got = _got; return 0; } int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); return __ceph_get_caps(inode, fi, need, want, endoff, got); } /* * Take cap refs. Caller must already know we hold at least one ref * on the caps in question or we don't know this is safe. */ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); ceph_take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } /* * drop cap_snap that is not associated with any snapshot. * we don't need to send FLUSHSNAP message for it. */ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); if (!capsnap->need_flush && !capsnap->writing && !capsnap->dirty_pages) { doutc(cl, "%p follows %llu\n", capsnap, capsnap->follows); BUG_ON(capsnap->cap_flush.tid > 0); ceph_put_snap_context(capsnap->context); if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps)) ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; list_del(&capsnap->ci_item); ceph_put_cap_snap(capsnap); return 1; } return 0; } enum put_cap_refs_mode { PUT_CAP_REFS_SYNC = 0, PUT_CAP_REFS_NO_CHECK, PUT_CAP_REFS_ASYNC, }; /* * Release cap refs. * * If we released the last ref on any given cap, call ceph_check_caps * to release (or schedule a release). * * If we are releasing a WR cap (from a sync write), finalize any affected * cap_snap, and wake up any waiters. */ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had, enum put_cap_refs_mode mode) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); int last = 0, put = 0, flushsnaps = 0, wake = 0; bool check_flushsnaps = false; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) --ci->i_pin_ref; if (had & CEPH_CAP_FILE_RD) if (--ci->i_rd_ref == 0) last++; if (had & CEPH_CAP_FILE_CACHE) if (--ci->i_rdcache_ref == 0) last++; if (had & CEPH_CAP_FILE_EXCL) if (--ci->i_fx_ref == 0) last++; if (had & CEPH_CAP_FILE_BUFFER) { if (--ci->i_wb_ref == 0) { last++; /* put the ref held by ceph_take_cap_refs() */ put++; check_flushsnaps = true; } doutc(cl, "%p %llx.%llx wb %d -> %d (?)\n", inode, ceph_vinop(inode), ci->i_wb_ref+1, ci->i_wb_ref); } if (had & CEPH_CAP_FILE_WR) { if (--ci->i_wr_ref == 0) { /* * The Fb caps will always be took and released * together with the Fw caps. */ WARN_ON_ONCE(ci->i_wb_ref); last++; check_flushsnaps = true; if (ci->i_wrbuffer_ref_head == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm) ceph_change_snap_realm(inode, NULL); } } if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) { struct ceph_cap_snap *capsnap = list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); capsnap->writing = 0; if (ceph_try_drop_cap_snap(ci, capsnap)) /* put the ref held by ceph_queue_cap_snap() */ put++; else if (__ceph_finish_cap_snap(ci, capsnap)) flushsnaps = 1; wake = 1; } spin_unlock(&ci->i_ceph_lock); doutc(cl, "%p %llx.%llx had %s%s%s\n", inode, ceph_vinop(inode), ceph_cap_string(had), last ? " last" : "", put ? " put" : ""); switch (mode) { case PUT_CAP_REFS_SYNC: if (last) ceph_check_caps(ci, 0); else if (flushsnaps) ceph_flush_snaps(ci, NULL); break; case PUT_CAP_REFS_ASYNC: if (last) ceph_queue_check_caps(inode); else if (flushsnaps) ceph_queue_flush_snaps(inode); break; default: break; } if (wake) wake_up_all(&ci->i_cap_wq); while (put-- > 0) iput(inode); } void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_SYNC); } void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC); } void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had) { __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK); } /* * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap * context. Adjust per-snap dirty page accounting as appropriate. * Once all dirty data for a cap_snap is flushed, flush snapped file * metadata back to the MDS. If we dropped the last ref, call * ceph_check_caps. */ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, struct ceph_snap_context *snapc) { struct inode *inode = &ci->netfs.inode; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap_snap *capsnap = NULL, *iter; int put = 0; bool last = false; bool flush_snaps = false; bool complete_capsnap = false; spin_lock(&ci->i_ceph_lock); ci->i_wrbuffer_ref -= nr; if (ci->i_wrbuffer_ref == 0) { last = true; put++; } if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } doutc(cl, "on %p %llx.%llx head %d/%d -> %d/%d %s\n", inode, ceph_vinop(inode), ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr, ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head, last ? " LAST" : ""); } else { list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->context == snapc) { capsnap = iter; break; } } if (!capsnap) { /* * The capsnap should already be removed when removing * auth cap in the case of a forced unmount. */ WARN_ON_ONCE(ci->i_auth_cap); goto unlock; } capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = true; if (!capsnap->writing) { if (ceph_try_drop_cap_snap(ci, capsnap)) { put++; } else { ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS; flush_snaps = true; } } } doutc(cl, "%p %llx.%llx cap_snap %p snap %lld %d/%d -> %d/%d %s%s\n", inode, ceph_vinop(inode), capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", complete_capsnap ? " (complete capsnap)" : ""); } unlock: spin_unlock(&ci->i_ceph_lock); if (last) { ceph_check_caps(ci, 0); } else if (flush_snaps) { ceph_flush_snaps(ci, NULL); } if (complete_capsnap) wake_up_all(&ci->i_cap_wq); while (put-- > 0) { iput(inode); } } /* * Invalidate unlinked inode's aliases, so we can drop the inode ASAP. */ static void invalidate_aliases(struct inode *inode) { struct ceph_client *cl = ceph_inode_to_client(inode); struct dentry *dn, *prev = NULL; doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); d_prune_aliases(inode); /* * For non-directory inode, d_find_alias() only returns * hashed dentry. After calling d_invalidate(), the * dentry becomes unhashed. * * For directory inode, d_find_alias() can return * unhashed dentry. But directory inode should have * one alias at most. */ while ((dn = d_find_alias(inode))) { if (dn == prev) { dput(dn); break; } d_invalidate(dn); if (prev) dput(prev); prev = dn; } if (prev) dput(prev); } struct cap_extra_info { struct ceph_string *pool_ns; /* inline data */ u64 inline_version; void *inline_data; u32 inline_len; /* dirstat */ bool dirstat_valid; u64 nfiles; u64 nsubdirs; u64 change_attr; /* currently issued */ int issued; struct timespec64 btime; u8 *fscrypt_auth; u32 fscrypt_auth_len; u64 fscrypt_file_size; }; /* * Handle a cap GRANT message from the MDS. (Note that a GRANT may * actually be a revocation if it specifies a smaller cap set.) * * caller holds s_mutex and i_ceph_lock, we drop both. */ static void handle_cap_grant(struct inode *inode, struct ceph_mds_session *session, struct ceph_cap *cap, struct ceph_mds_caps *grant, struct ceph_buffer *xattr_buf, struct cap_extra_info *extra_info) __releases(ci->i_ceph_lock) __releases(session->s_mdsc->snap_rwsem) { struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); int seq = le32_to_cpu(grant->seq); int newcaps = le32_to_cpu(grant->caps); int used, wanted, dirty; u64 size = le64_to_cpu(grant->size); u64 max_size = le64_to_cpu(grant->max_size); unsigned char check_caps = 0; bool was_stale = cap->cap_gen < atomic_read(&session->s_cap_gen); bool wake = false; bool writeback = false; bool queue_trunc = false; bool queue_invalidate = false; bool deleted_inode = false; bool fill_inline = false; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx cap %p mds%d seq %d %s\n", inode, ceph_vinop(inode), cap, session->s_mds, seq, ceph_cap_string(newcaps)); doutc(cl, " size %llu max_size %llu, i_size %llu\n", size, max_size, i_size_read(inode)); /* * If CACHE is being revoked, and we have no dirty buffers, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */ ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !(ci->i_wrbuffer_ref || ci->i_wb_ref)) { if (try_nonblocking_invalidate(inode)) { /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } } } if (was_stale) cap->issued = cap->implemented = CEPH_CAP_PIN; /* * auth mds of the inode changed. we received the cap export message, * but still haven't received the cap import message. handle_cap_export * updated the new auth MDS' cap. * * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message * that was sent before the cap import message. So don't remove caps. */ if (ceph_seq_cmp(seq, cap->seq) <= 0) { WARN_ON(cap != ci->i_auth_cap); WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id)); seq = cap->seq; newcaps |= cap->issued; } /* side effects now are allowed */ cap->cap_gen = atomic_read(&session->s_cap_gen); cap->seq = seq; __check_cap_issue(ci, cap, newcaps); inode_set_max_iversion_raw(inode, extra_info->change_attr); if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { umode_t mode = le32_to_cpu(grant->mode); if (inode_wrong_type(inode, mode)) pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", ceph_vinop(inode), inode->i_mode, mode); else inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; doutc(cl, "%p %llx.%llx mode 0%o uid.gid %d.%d\n", inode, ceph_vinop(inode), inode->i_mode, from_kuid(&init_user_ns, inode->i_uid), from_kgid(&init_user_ns, inode->i_gid)); #if IS_ENABLED(CONFIG_FS_ENCRYPTION) if (ci->fscrypt_auth_len != extra_info->fscrypt_auth_len || memcmp(ci->fscrypt_auth, extra_info->fscrypt_auth, ci->fscrypt_auth_len)) pr_warn_ratelimited_client(cl, "cap grant attempt to change fscrypt_auth on non-I_NEW inode (old len %d new len %d)\n", ci->fscrypt_auth_len, extra_info->fscrypt_auth_len); #endif } if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0) deleted_inode = true; } if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { int len = le32_to_cpu(grant->xattr_len); u64 version = le64_to_cpu(grant->xattr_version); if (version > ci->i_xattrs.version) { doutc(cl, " got new xattrs v%llu on %p %llx.%llx len %d\n", version, inode, ceph_vinop(inode), len); if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); ci->i_xattrs.blob = ceph_buffer_get(xattr_buf); ci->i_xattrs.version = version; ceph_forget_all_cached_acls(inode); ceph_security_invalidate_secctx(inode); } } if (newcaps & CEPH_CAP_ANY_RD) { struct timespec64 mtime, atime, ctime; /* ctime/mtime/atime? */ ceph_decode_timespec64(&mtime, &grant->mtime); ceph_decode_timespec64(&atime, &grant->atime); ceph_decode_timespec64(&ctime, &grant->ctime); ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); } if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) { ci->i_files = extra_info->nfiles; ci->i_subdirs = extra_info->nsubdirs; } if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ s64 old_pool = ci->i_layout.pool_id; struct ceph_string *old_ns; ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout); old_ns = rcu_dereference_protected(ci->i_layout.pool_ns, lockdep_is_held(&ci->i_ceph_lock)); rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns); if (ci->i_layout.pool_id != old_pool || extra_info->pool_ns != old_ns) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; extra_info->pool_ns = old_ns; /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, extra_info->issued, le32_to_cpu(grant->truncate_seq), le64_to_cpu(grant->truncate_size), size); } if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) { if (max_size != ci->i_max_size) { doutc(cl, "max_size %lld -> %llu\n", ci->i_max_size, max_size); ci->i_max_size = max_size; if (max_size >= ci->i_wanted_max_size) { ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; } wake = true; } } /* check cap bits */ wanted = __ceph_caps_wanted(ci); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, " my wanted = %s, used = %s, dirty %s\n", ceph_cap_string(wanted), ceph_cap_string(used), ceph_cap_string(dirty)); if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) && (wanted & ~(cap->mds_wanted | newcaps))) { /* * If mds is importing cap, prior cap messages that update * 'wanted' may get dropped by mds (migrate seq mismatch). * * We don't send cap message to update 'wanted' if what we * want are already issued. If mds revokes caps, cap message * that releases caps also tells mds what we want. But if * caps got revoked by mds forcedly (session stale). We may * haven't told mds what we want. */ check_caps = 1; } /* revocation, grant, or no-op? */ if (cap->issued & ~newcaps) { int revoking = cap->issued & ~newcaps; doutc(cl, "revocation: %s -> %s (revoking %s)\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (S_ISREG(inode->i_mode) && (revoking & used & CEPH_CAP_FILE_BUFFER)) writeback = true; /* initiate writeback; will delay ack */ else if (queue_invalidate && revoking == CEPH_CAP_FILE_CACHE && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0) ; /* do nothing yet, invalidation will be queued */ else if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ /* If there is new caps, try to wake up the waiters */ if (~cap->issued & newcaps) wake = true; cap->issued = newcaps; cap->implemented |= newcaps; } else if (cap->issued == newcaps) { doutc(cl, "caps unchanged: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); } else { doutc(cl, "grant: %s -> %s\n", ceph_cap_string(cap->issued), ceph_cap_string(newcaps)); /* non-auth MDS is revoking the newly grant caps ? */ if (cap == ci->i_auth_cap && __ceph_caps_revoking_other(ci, cap, newcaps)) check_caps = 2; cap->issued = newcaps; cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a * pending revocation */ wake = true; } BUG_ON(cap->issued & ~cap->implemented); /* don't let check_caps skip sending a response to MDS for revoke msgs */ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) { cap->mds_wanted = 0; if (cap == ci->i_auth_cap) check_caps = 1; /* check auth cap only */ else check_caps = 2; /* check all caps */ } if (extra_info->inline_version > 0 && extra_info->inline_version >= ci->i_inline_version) { ci->i_inline_version = extra_info->inline_version; if (ci->i_inline_version != CEPH_INLINE_NONE && (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO))) fill_inline = true; } if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { if (ci->i_auth_cap == cap) { if (newcaps & ~extra_info->issued) wake = true; if (ci->i_requested_max_size > max_size || !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) { /* re-request max_size if necessary */ ci->i_requested_max_size = 0; wake = true; } ceph_kick_flushing_inode_caps(session, ci); } up_read(&session->s_mdsc->snap_rwsem); } spin_unlock(&ci->i_ceph_lock); if (fill_inline) ceph_fill_inline_data(inode, NULL, extra_info->inline_data, extra_info->inline_len); if (queue_trunc) ceph_queue_vmtruncate(inode); if (writeback) /* * queue inode for writeback: we can't actually call * filemap_write_and_wait, etc. from message handler * context. */ ceph_queue_writeback(inode); if (queue_invalidate) ceph_queue_invalidate(inode); if (deleted_inode) invalidate_aliases(inode); if (wake) wake_up_all(&ci->i_cap_wq); mutex_unlock(&session->s_mutex); if (check_caps == 1) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL); else if (check_caps == 2) ceph_check_caps(ci, CHECK_CAPS_NOINVAL); } /* * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the * MDS has been safely committed. */ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session, struct ceph_cap *cap) __releases(ci->i_ceph_lock) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_flush *cf, *tmp_cf; LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; bool wake_ci = false; bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { /* Is this the one that was flushed? */ if (cf->tid == flush_tid) cleaned = cf->caps; /* Is this a capsnap? */ if (cf->is_capsnap) continue; if (cf->tid <= flush_tid) { /* * An earlier or current tid. The FLUSH_ACK should * represent a superset of this flush's caps. */ wake_ci |= __detach_cap_flush_from_ci(ci, cf); list_add_tail(&cf->i_list, &to_remove); } else { /* * This is a later one. Any caps in it are still dirty * so don't count them as cleaned. */ cleaned &= ~cf->caps; if (!cleaned) break; } } doutc(cl, "%p %llx.%llx mds%d seq %d on %s cleaned %s, flushing %s -> %s\n", inode, ceph_vinop(inode), session->s_mds, seq, ceph_cap_string(dirty), ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); list_for_each_entry(cf, &to_remove, i_list) wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf); if (ci->i_flushing_caps == 0) { if (list_empty(&ci->i_cap_flush_list)) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) { struct inode *inode = &list_first_entry(&session->s_cap_flushing, struct ceph_inode_info, i_flushing_item)->netfs.inode; doutc(cl, " mds%d still flushing cap on %p %llx.%llx\n", session->s_mds, inode, ceph_vinop(inode)); } } mdsc->num_cap_flushing--; doutc(cl, " %p %llx.%llx now !flushing\n", inode, ceph_vinop(inode)); if (ci->i_dirty_caps == 0) { doutc(cl, " %p %llx.%llx now clean\n", inode, ceph_vinop(inode)); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = true; if (ci->i_wr_ref == 0 && ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } else { BUG_ON(list_empty(&ci->i_dirty_item)); } } spin_unlock(&mdsc->cap_dirty_lock); out: spin_unlock(&ci->i_ceph_lock); while (!list_empty(&to_remove)) { cf = list_first_entry(&to_remove, struct ceph_cap_flush, i_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); if (drop) iput(inode); } void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; bool ret; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnap %p, %p %llx.%llx ci %p\n", capsnap, inode, ceph_vinop(inode), ci); list_del_init(&capsnap->ci_item); ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush); if (wake_ci) *wake_ci = ret; spin_lock(&mdsc->cap_dirty_lock); if (list_empty(&ci->i_cap_flush_list)) list_del_init(&ci->i_flushing_item); ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush); if (wake_mdsc) *wake_mdsc = ret; spin_unlock(&mdsc->cap_dirty_lock); } void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap, bool *wake_ci, bool *wake_mdsc) { struct ceph_inode_info *ci = ceph_inode(inode); lockdep_assert_held(&ci->i_ceph_lock); WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing); __ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc); } /* * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can * throw away our cap_snap. * * Caller hold s_mutex. */ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_caps *m, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_fs_client(inode->i_sb)->mdsc; struct ceph_client *cl = mdsc->fsc->client; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap = NULL, *iter; bool wake_ci = false; bool wake_mdsc = false; doutc(cl, "%p %llx.%llx ci %p mds%d follows %lld\n", inode, ceph_vinop(inode), ci, session->s_mds, follows); spin_lock(&ci->i_ceph_lock); list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) { if (iter->follows == follows) { if (iter->cap_flush.tid != flush_tid) { doutc(cl, " cap_snap %p follows %lld " "tid %lld != %lld\n", iter, follows, flush_tid, iter->cap_flush.tid); break; } capsnap = iter; break; } else { doutc(cl, " skipping cap_snap %p follows %lld\n", iter, iter->follows); } } if (capsnap) ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc); spin_unlock(&ci->i_ceph_lock); if (capsnap) { ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); if (wake_ci) wake_up_all(&ci->i_cap_wq); if (wake_mdsc) wake_up_all(&mdsc->cap_flushing_wq); iput(inode); } } /* * Handle TRUNC from MDS, indicating file truncation. * * caller hold s_mutex. */ static bool handle_cap_trunc(struct inode *inode, struct ceph_mds_caps *trunc, struct ceph_mds_session *session, struct cap_extra_info *extra_info) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); int mds = session->s_mds; int seq = le32_to_cpu(trunc->seq); u32 truncate_seq = le32_to_cpu(trunc->truncate_seq); u64 truncate_size = le64_to_cpu(trunc->truncate_size); u64 size = le64_to_cpu(trunc->size); int implemented = 0; int dirty = __ceph_caps_dirty(ci); int issued = __ceph_caps_issued(ceph_inode(inode), &implemented); bool queue_trunc = false; lockdep_assert_held(&ci->i_ceph_lock); issued |= implemented | dirty; /* * If there is at least one crypto block then we'll trust * fscrypt_file_size. If the real length of the file is 0, then * ignore it (it has probably been truncated down to 0 by the MDS). */ if (IS_ENCRYPTED(inode) && size) size = extra_info->fscrypt_file_size; doutc(cl, "%p %llx.%llx mds%d seq %d to %lld truncate seq %d\n", inode, ceph_vinop(inode), mds, seq, truncate_size, truncate_seq); queue_trunc = ceph_fill_file_size(inode, issued, truncate_seq, truncate_size, size); return queue_trunc; } /* * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a * different one. If we are the most recent migration we've seen (as * indicated by mseq), make note of the migrating cap bits for the * duration (until we see the corresponding IMPORT). * * caller holds s_mutex */ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_session *tsession = NULL; struct ceph_cap *cap, *tcap, *new_cap = NULL; struct ceph_inode_info *ci = ceph_inode(inode); u64 t_cap_id; unsigned mseq = le32_to_cpu(ex->migrate_seq); unsigned t_seq, t_mseq; int target, issued; int mds = session->s_mds; if (ph) { t_cap_id = le64_to_cpu(ph->cap_id); t_seq = le32_to_cpu(ph->seq); t_mseq = le32_to_cpu(ph->mseq); target = le32_to_cpu(ph->mds); } else { t_cap_id = t_seq = t_mseq = 0; target = -1; } doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d target %d\n", inode, ceph_vinop(inode), ci, mds, mseq, target); retry: down_read(&mdsc->snap_rwsem); spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ci, mds); if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id)) goto out_unlock; if (target < 0) { ceph_remove_cap(mdsc, cap, false); goto out_unlock; } /* * now we know we haven't received the cap import message yet * because the exported cap still exist. */ issued = cap->issued; if (issued != cap->implemented) pr_err_ratelimited_client(cl, "issued != implemented: " "%p %llx.%llx mds%d seq %d mseq %d" " issued %s implemented %s\n", inode, ceph_vinop(inode), mds, cap->seq, cap->mseq, ceph_cap_string(issued), ceph_cap_string(cap->implemented)); tcap = __get_cap_for_mds(ci, target); if (tcap) { /* already have caps from the target */ if (tcap->cap_id == t_cap_id && ceph_seq_cmp(tcap->seq, t_seq) < 0) { doutc(cl, " updating import cap %p mds%d\n", tcap, target); tcap->cap_id = t_cap_id; tcap->seq = t_seq - 1; tcap->issue_seq = t_seq - 1; tcap->issued |= issued; tcap->implemented |= issued; if (cap == ci->i_auth_cap) { ci->i_auth_cap = tcap; change_auth_cap_ses(ci, tcap->session); } } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } else if (tsession) { /* add placeholder for the export tagert */ int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0; tcap = new_cap; ceph_add_cap(inode, tsession, t_cap_id, issued, 0, t_seq - 1, t_mseq, (u64)-1, flag, &new_cap); if (!list_empty(&ci->i_cap_flush_list) && ci->i_auth_cap == tcap) { spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &tcap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); } ceph_remove_cap(mdsc, cap, false); goto out_unlock; } spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); /* open target session */ tsession = ceph_mdsc_open_export_target_session(mdsc, target); if (!IS_ERR(tsession)) { if (mds > target) { mutex_lock(&session->s_mutex); mutex_lock_nested(&tsession->s_mutex, SINGLE_DEPTH_NESTING); } else { mutex_lock(&tsession->s_mutex); mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); tsession = NULL; target = -1; mutex_lock(&session->s_mutex); } goto retry; out_unlock: spin_unlock(&ci->i_ceph_lock); up_read(&mdsc->snap_rwsem); mutex_unlock(&session->s_mutex); if (tsession) { mutex_unlock(&tsession->s_mutex); ceph_put_mds_session(tsession); } if (new_cap) ceph_put_cap(mdsc, new_cap); } /* * Handle cap IMPORT. * * caller holds s_mutex. acquires i_ceph_lock */ static void handle_cap_import(struct ceph_mds_client *mdsc, struct inode *inode, struct ceph_mds_caps *im, struct ceph_mds_cap_peer *ph, struct ceph_mds_session *session, struct ceph_cap **target_cap, int *old_issued) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap *cap, *ocap, *new_cap = NULL; int mds = session->s_mds; int issued; unsigned caps = le32_to_cpu(im->caps); unsigned wanted = le32_to_cpu(im->wanted); unsigned seq = le32_to_cpu(im->seq); unsigned mseq = le32_to_cpu(im->migrate_seq); u64 realmino = le64_to_cpu(im->realm); u64 cap_id = le64_to_cpu(im->cap_id); u64 p_cap_id; int peer; if (ph) { p_cap_id = le64_to_cpu(ph->cap_id); peer = le32_to_cpu(ph->mds); } else { p_cap_id = 0; peer = -1; } doutc(cl, "%p %llx.%llx ci %p mds%d mseq %d peer %d\n", inode, ceph_vinop(inode), ci, mds, mseq, peer); retry: cap = __get_cap_for_mds(ci, mds); if (!cap) { if (!new_cap) { spin_unlock(&ci->i_ceph_lock); new_cap = ceph_get_cap(mdsc, NULL); spin_lock(&ci->i_ceph_lock); goto retry; } cap = new_cap; } else { if (new_cap) { ceph_put_cap(mdsc, new_cap); new_cap = NULL; } } __ceph_caps_issued(ci, &issued); issued |= __ceph_caps_dirty(ci); ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH, &new_cap); ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL; if (ocap && ocap->cap_id == p_cap_id) { doutc(cl, " remove export cap %p mds%d flags %d\n", ocap, peer, ph->flags); if ((ph->flags & CEPH_CAP_FLAG_AUTH) && (ocap->seq != le32_to_cpu(ph->seq) || ocap->mseq != le32_to_cpu(ph->mseq))) { pr_err_ratelimited_client(cl, "mismatched seq/mseq: " "%p %llx.%llx mds%d seq %d mseq %d" " importer mds%d has peer seq %d mseq %d\n", inode, ceph_vinop(inode), peer, ocap->seq, ocap->mseq, mds, le32_to_cpu(ph->seq), le32_to_cpu(ph->mseq)); } ceph_remove_cap(mdsc, ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE)); } *old_issued = issued; *target_cap = cap; } #ifdef CONFIG_FS_ENCRYPTION static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; ceph_decode_32_safe(p, end, extra->fscrypt_auth_len, bad); if (extra->fscrypt_auth_len) { ceph_decode_need(p, end, extra->fscrypt_auth_len, bad); extra->fscrypt_auth = kmalloc(extra->fscrypt_auth_len, GFP_KERNEL); if (!extra->fscrypt_auth) return -ENOMEM; ceph_decode_copy_safe(p, end, extra->fscrypt_auth, extra->fscrypt_auth_len, bad); } ceph_decode_32_safe(p, end, len, bad); if (len >= sizeof(u64)) { ceph_decode_64_safe(p, end, extra->fscrypt_file_size, bad); len -= sizeof(u64); } ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #else static int parse_fscrypt_fields(void **p, void *end, struct cap_extra_info *extra) { u32 len; /* Don't care about these fields unless we're encryption-capable */ ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); ceph_decode_32_safe(p, end, len, bad); if (len) ceph_decode_skip_n(p, end, len, bad); return 0; bad: return -EIO; } #endif /* * Handle a caps message from the MDS. * * Identify the appropriate session, inode, and call the right handler * based on the cap op. */ void ceph_handle_caps(struct ceph_mds_session *session, struct ceph_msg *msg) { struct ceph_mds_client *mdsc = session->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_cap *cap; struct ceph_mds_caps *h; struct ceph_mds_cap_peer *peer = NULL; struct ceph_snap_realm *realm = NULL; int op; int msg_version = le16_to_cpu(msg->hdr.version); u32 seq, mseq; struct ceph_vino vino; void *snaptrace; size_t snaptrace_len; void *p, *end; struct cap_extra_info extra_info = {}; bool queue_trunc; bool close_sessions = false; bool do_cap_release = false; doutc(cl, "from mds%d\n", session->s_mds); if (!ceph_inc_mds_stopping_blocker(mdsc, session)) return; /* decode */ end = msg->front.iov_base + msg->front.iov_len; if (msg->front.iov_len < sizeof(*h)) goto bad; h = msg->front.iov_base; op = le32_to_cpu(h->op); vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); mseq = le32_to_cpu(h->migrate_seq); snaptrace = h + 1; snaptrace_len = le32_to_cpu(h->snap_trace_len); p = snaptrace + snaptrace_len; if (msg_version >= 2) { u32 flock_len; ceph_decode_32_safe(&p, end, flock_len, bad); if (p + flock_len > end) goto bad; p += flock_len; } if (msg_version >= 3) { if (op == CEPH_CAP_OP_IMPORT) { if (p + sizeof(*peer) > end) goto bad; peer = p; p += sizeof(*peer); } else if (op == CEPH_CAP_OP_EXPORT) { /* recorded in unused fields */ peer = (void *)&h->size; } } if (msg_version >= 4) { ceph_decode_64_safe(&p, end, extra_info.inline_version, bad); ceph_decode_32_safe(&p, end, extra_info.inline_len, bad); if (p + extra_info.inline_len > end) goto bad; extra_info.inline_data = p; p += extra_info.inline_len; } if (msg_version >= 5) { struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; u32 epoch_barrier; ceph_decode_32_safe(&p, end, epoch_barrier, bad); ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); } if (msg_version >= 8) { u32 pool_ns_len; /* version >= 6 */ ceph_decode_skip_64(&p, end, bad); // flush_tid /* version >= 7 */ ceph_decode_skip_32(&p, end, bad); // caller_uid ceph_decode_skip_32(&p, end, bad); // caller_gid /* version >= 8 */ ceph_decode_32_safe(&p, end, pool_ns_len, bad); if (pool_ns_len > 0) { ceph_decode_need(&p, end, pool_ns_len, bad); extra_info.pool_ns = ceph_find_or_create_string(p, pool_ns_len); p += pool_ns_len; } } if (msg_version >= 9) { struct ceph_timespec *btime; if (p + sizeof(*btime) > end) goto bad; btime = p; ceph_decode_timespec64(&extra_info.btime, btime); p += sizeof(*btime); ceph_decode_64_safe(&p, end, extra_info.change_attr, bad); } if (msg_version >= 11) { /* version >= 10 */ ceph_decode_skip_32(&p, end, bad); // flags /* version >= 11 */ extra_info.dirstat_valid = true; ceph_decode_64_safe(&p, end, extra_info.nfiles, bad); ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad); } if (msg_version >= 12) { if (parse_fscrypt_fields(&p, end, &extra_info)) goto bad; } /* lookup ino */ inode = ceph_find_inode(mdsc->fsc->sb, vino); doutc(cl, " op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, vino.snap, inode); mutex_lock(&session->s_mutex); doutc(cl, " mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); if (!inode) { doutc(cl, " i don't have ino %llx\n", vino.ino); switch (op) { case CEPH_CAP_OP_IMPORT: case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } ci = ceph_inode(inode); /* these will work even if we don't have a cap yet */ switch (op) { case CEPH_CAP_OP_FLUSHSNAP_ACK: handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid), h, session); goto done; case CEPH_CAP_OP_EXPORT: handle_cap_export(inode, h, peer, session); goto done_unlocked; case CEPH_CAP_OP_IMPORT: realm = NULL; if (snaptrace_len) { down_write(&mdsc->snap_rwsem); if (ceph_update_snap_trace(mdsc, snaptrace, snaptrace + snaptrace_len, false, &realm)) { up_write(&mdsc->snap_rwsem); close_sessions = true; goto done; } downgrade_write(&mdsc->snap_rwsem); } else { down_read(&mdsc->snap_rwsem); } spin_lock(&ci->i_ceph_lock); handle_cap_import(mdsc, inode, h, peer, session, &cap, &extra_info.issued); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; } /* the rest require a cap */ spin_lock(&ci->i_ceph_lock); cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds); if (!cap) { doutc(cl, " no cap on %p ino %llx.%llx from mds%d\n", inode, ceph_ino(inode), ceph_snap(inode), session->s_mds); spin_unlock(&ci->i_ceph_lock); switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: do_cap_release = true; break; default: break; } goto flush_cap_releases; } /* note that each of these drops i_ceph_lock for us */ switch (op) { case CEPH_CAP_OP_REVOKE: case CEPH_CAP_OP_GRANT: __ceph_caps_issued(ci, &extra_info.issued); extra_info.issued |= __ceph_caps_dirty(ci); handle_cap_grant(inode, session, cap, h, msg->middle, &extra_info); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid), h, session, cap); break; case CEPH_CAP_OP_TRUNC: queue_trunc = handle_cap_trunc(inode, h, session, &extra_info); spin_unlock(&ci->i_ceph_lock); if (queue_trunc) ceph_queue_vmtruncate(inode); break; default: spin_unlock(&ci->i_ceph_lock); pr_err_client(cl, "unknown cap op %d %s\n", op, ceph_cap_op_name(op)); } done: mutex_unlock(&session->s_mutex); done_unlocked: iput(inode); out: ceph_dec_mds_stopping_blocker(mdsc); ceph_put_string(extra_info.pool_ns); /* Defer closing the sessions after s_mutex lock being released */ if (close_sessions) ceph_mdsc_close_sessions(mdsc); kfree(extra_info.fscrypt_auth); return; flush_cap_releases: /* * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ if (do_cap_release) { cap = ceph_get_cap(mdsc, NULL); cap->cap_ino = vino.ino; cap->queue_release = 1; cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; cap->issue_seq = seq; spin_lock(&session->s_cap_lock); __ceph_queue_cap_release(session, cap); spin_unlock(&session->s_cap_lock); } ceph_flush_cap_releases(mdsc, session); goto done; bad: pr_err_client(cl, "corrupt message\n"); ceph_msg_dump(msg); goto out; } /* * Delayed work handler to process end of delayed cap release LRU list. * * If new caps are added to the list while processing it, these won't get * processed in this run. In this case, the ci->i_hold_caps_max will be * returned so that the work can be scheduled accordingly. */ unsigned long ceph_check_delayed_caps(struct ceph_mds_client *mdsc) { struct ceph_client *cl = mdsc->fsc->client; struct inode *inode; struct ceph_inode_info *ci; struct ceph_mount_options *opt = mdsc->fsc->mount_options; unsigned long delay_max = opt->caps_wanted_delay_max * HZ; unsigned long loop_start = jiffies; unsigned long delay = 0; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_delay_lock); while (!list_empty(&mdsc->cap_delay_list)) { ci = list_first_entry(&mdsc->cap_delay_list, struct ceph_inode_info, i_cap_delay_list); if (time_before(loop_start, ci->i_hold_caps_max - delay_max)) { doutc(cl, "caps added recently. Exiting loop"); delay = ci->i_hold_caps_max; break; } if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 && time_before(jiffies, ci->i_hold_caps_max)) break; list_del_init(&ci->i_cap_delay_list); inode = igrab(&ci->netfs.inode); if (inode) { spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "on %p %llx.%llx\n", inode, ceph_vinop(inode)); ceph_check_caps(ci, 0); iput(inode); spin_lock(&mdsc->cap_delay_lock); } } spin_unlock(&mdsc->cap_delay_lock); doutc(cl, "done\n"); return delay; } /* * Flush all dirty caps to the mds */ static void flush_dirty_session_caps(struct ceph_mds_session *s) { struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_client *cl = mdsc->fsc->client; struct ceph_inode_info *ci; struct inode *inode; doutc(cl, "begin\n"); spin_lock(&mdsc->cap_dirty_lock); while (!list_empty(&s->s_cap_dirty)) { ci = list_first_entry(&s->s_cap_dirty, struct ceph_inode_info, i_dirty_item); inode = &ci->netfs.inode; ihold(inode); doutc(cl, "%p %llx.%llx\n", inode, ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_wait_on_async_create(inode); ceph_check_caps(ci, CHECK_CAPS_FLUSH); iput(inode); spin_lock(&mdsc->cap_dirty_lock); } spin_unlock(&mdsc->cap_dirty_lock); doutc(cl, "done\n"); } void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc) { ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true); } void __ceph_touch_fmode(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc, int fmode) { unsigned long now = jiffies; if (fmode & CEPH_FILE_MODE_RD) ci->i_last_rd = now; if (fmode & CEPH_FILE_MODE_WR) ci->i_last_wr = now; /* queue periodic check */ if (fmode && __ceph_is_any_real_caps(ci) && list_empty(&ci->i_cap_delay_list)) __cap_delay_requeue(mdsc, ci); } void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool already_opened = false; int i; if (count == 1) atomic64_inc(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { /* * If any of the mode ref is larger than 0, * that means it has been already opened by * others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) already_opened = true; if (bits & (1 << i)) ci->i_nr_by_mode[i] += count; } if (!already_opened) percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * Drop open file reference. If we were the last open file, * we may need to release capabilities to the MDS (or schedule * their delayed release). */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->netfs.inode.i_sb); int bits = (fmode << 1) | 1; bool is_closed = true; int i; if (count == 1) atomic64_dec(&mdsc->metric.opened_files); spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { BUG_ON(ci->i_nr_by_mode[i] < count); ci->i_nr_by_mode[i] -= count; } /* * If any of the mode ref is not 0 after * decreased, that means it is still opened * by others. Just skip checking the PIN ref. */ if (i && ci->i_nr_by_mode[i]) is_closed = false; } if (is_closed) percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } /* * For a soon-to-be unlinked file, drop the LINK caps. If it * looks like the link count will hit 0, drop any other caps (other * than PIN) we don't specifically want (due to the file still being * open). */ int ceph_drop_caps_for_unlink(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; spin_lock(&ci->i_ceph_lock); if (inode->i_nlink == 1) { drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN); if (__ceph_caps_dirty(ci)) { struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; __cap_delay_requeue_front(mdsc, ci); } } spin_unlock(&ci->i_ceph_lock); return drop; } /* * Helpers for embedding cap and dentry lease releases into mds * requests. * * @force is used by dentry_release (below) to force inclusion of a * record for the directory inode, even when there aren't any caps to * drop. */ int ceph_encode_inode_release(void **p, struct inode *inode, int mds, int drop, int unless, int force) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_cap *cap; struct ceph_mds_request_release *rel = *p; int used, dirty; int ret = 0; spin_lock(&ci->i_ceph_lock); used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); doutc(cl, "%p %llx.%llx mds%d used|dirty %s drop %s unless %s\n", inode, ceph_vinop(inode), mds, ceph_cap_string(used|dirty), ceph_cap_string(drop), ceph_cap_string(unless)); /* only drop unused, clean caps */ drop &= ~(used | dirty); cap = __get_cap_for_mds(ci, mds); if (cap && __cap_is_valid(cap)) { unless &= cap->issued; if (unless) { if (unless & CEPH_CAP_AUTH_EXCL) drop &= ~CEPH_CAP_AUTH_SHARED; if (unless & CEPH_CAP_LINK_EXCL) drop &= ~CEPH_CAP_LINK_SHARED; if (unless & CEPH_CAP_XATTR_EXCL) drop &= ~CEPH_CAP_XATTR_SHARED; if (unless & CEPH_CAP_FILE_EXCL) drop &= ~CEPH_CAP_FILE_SHARED; } if (force || (cap->issued & drop)) { if (cap->issued & drop) { int wanted = __ceph_caps_wanted(ci); doutc(cl, "%p %llx.%llx cap %p %s -> %s, " "wanted %s -> %s\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued), ceph_cap_string(cap->issued & ~drop), ceph_cap_string(cap->mds_wanted), ceph_cap_string(wanted)); cap->issued &= ~drop; cap->implemented &= ~drop; cap->mds_wanted = wanted; if (cap == ci->i_auth_cap && !(wanted & CEPH_CAP_ANY_FILE_WR)) ci->i_requested_max_size = 0; } else { doutc(cl, "%p %llx.%llx cap %p %s (force)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } rel->ino = cpu_to_le64(ceph_ino(inode)); rel->cap_id = cpu_to_le64(cap->cap_id); rel->seq = cpu_to_le32(cap->seq); rel->issue_seq = cpu_to_le32(cap->issue_seq); rel->mseq = cpu_to_le32(cap->mseq); rel->caps = cpu_to_le32(cap->implemented); rel->wanted = cpu_to_le32(cap->mds_wanted); rel->dname_len = 0; rel->dname_seq = 0; *p += sizeof(*rel); ret = 1; } else { doutc(cl, "%p %llx.%llx cap %p %s (noop)\n", inode, ceph_vinop(inode), cap, ceph_cap_string(cap->issued)); } } spin_unlock(&ci->i_ceph_lock); return ret; } /** * ceph_encode_dentry_release - encode a dentry release into an outgoing request * @p: outgoing request buffer * @dentry: dentry to release * @dir: dir to release it from * @mds: mds that we're speaking to * @drop: caps being dropped * @unless: unless we have these caps * * Encode a dentry release into an outgoing request buffer. Returns 1 if the * thing was released, or a negative error code otherwise. */ int ceph_encode_dentry_release(void **p, struct dentry *dentry, struct inode *dir, int mds, int drop, int unless) { struct dentry *parent = NULL; struct ceph_mds_request_release *rel = *p; struct ceph_dentry_info *di = ceph_dentry(dentry); struct ceph_client *cl; int force = 0; int ret; /* * force an record for the directory caps if we have a dentry lease. * this is racy (can't take i_ceph_lock and d_lock together), but it * doesn't have to be perfect; the mds will revoke anything we don't * release. */ spin_lock(&dentry->d_lock); if (di->lease_session && di->lease_session->s_mds == mds) force = 1; if (!dir) { parent = dget(dentry->d_parent); dir = d_inode(parent); } spin_unlock(&dentry->d_lock); ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force); dput(parent); cl = ceph_inode_to_client(dir); spin_lock(&dentry->d_lock); if (ret && di->lease_session && di->lease_session->s_mds == mds) { doutc(cl, "%p mds%d seq %d\n", dentry, mds, (int)di->lease_seq); rel->dname_seq = cpu_to_le32(di->lease_seq); __ceph_mdsc_drop_dentry_lease(dentry); spin_unlock(&dentry->d_lock); if (IS_ENCRYPTED(dir) && fscrypt_has_encryption_key(dir)) { int ret2 = ceph_encode_encrypted_fname(dir, dentry, *p); if (ret2 < 0) return ret2; rel->dname_len = cpu_to_le32(ret2); *p += ret2; } else { rel->dname_len = cpu_to_le32(dentry->d_name.len); memcpy(*p, dentry->d_name.name, dentry->d_name.len); *p += dentry->d_name.len; } } else { spin_unlock(&dentry->d_lock); } return ret; } static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_client *cl = mdsc->fsc->client; struct ceph_cap_snap *capsnap; int capsnap_release = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing capsnaps, ci is %p, %p %llx.%llx\n", ci, inode, ceph_vinop(inode)); while (!list_empty(&ci->i_cap_snaps)) { capsnap = list_first_entry(&ci->i_cap_snaps, struct ceph_cap_snap, ci_item); __ceph_remove_capsnap(inode, capsnap, NULL, NULL); ceph_put_snap_context(capsnap->context); ceph_put_cap_snap(capsnap); capsnap_release++; } wake_up_all(&ci->i_cap_wq); wake_up_all(&mdsc->cap_flushing_wq); return capsnap_release; } int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) { struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_client *cl = fsc->client; struct ceph_inode_info *ci = ceph_inode(inode); bool is_auth; bool dirty_dropped = false; int iputs = 0; lockdep_assert_held(&ci->i_ceph_lock); doutc(cl, "removing cap %p, ci is %p, %p %llx.%llx\n", cap, ci, inode, ceph_vinop(inode)); is_auth = (cap == ci->i_auth_cap); __ceph_remove_cap(cap, false); if (is_auth) { struct ceph_cap_flush *cf; if (ceph_inode_is_shutdown(inode)) { if (inode->i_data.nrpages > 0) *invalidate = true; if (ci->i_wrbuffer_ref > 0) mapping_set_error(&inode->i_data, -EIO); } spin_lock(&mdsc->cap_dirty_lock); /* trash all of the cap flushes for this inode */ while (!list_empty(&ci->i_cap_flush_list)) { cf = list_first_entry(&ci->i_cap_flush_list, struct ceph_cap_flush, i_list); list_del_init(&cf->g_list); list_del_init(&cf->i_list); if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_dirty_item)) { pr_warn_ratelimited_client(cl, " dropping dirty %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_vinop(inode)); ci->i_dirty_caps = 0; list_del_init(&ci->i_dirty_item); dirty_dropped = true; } if (!list_empty(&ci->i_flushing_item)) { pr_warn_ratelimited_client(cl, " dropping dirty+flushing %s state for %p %llx.%llx\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_vinop(inode)); ci->i_flushing_caps = 0; list_del_init(&ci->i_flushing_item); mdsc->num_cap_flushing--; dirty_dropped = true; } spin_unlock(&mdsc->cap_dirty_lock); if (dirty_dropped) { mapping_set_error(inode->i_mapping, -EIO); if (ci->i_wrbuffer_ref_head == 0 && ci->i_wr_ref == 0 && ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; } } if (atomic_read(&ci->i_filelock_ref) > 0) { /* make further file lock syscall return -EIO */ ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; pr_warn_ratelimited_client(cl, " dropping file locks for %p %llx.%llx\n", inode, ceph_vinop(inode)); } if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { cf = ci->i_prealloc_cap_flush; ci->i_prealloc_cap_flush = NULL; if (!cf->is_capsnap) ceph_free_cap_flush(cf); } if (!list_empty(&ci->i_cap_snaps)) iputs = remove_capsnaps(mdsc, inode); } if (dirty_dropped) ++iputs; return iputs; } |
12 12 12 12 12 12 11 12 12 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2007-2014 Nicira, Inc. */ #include "flow.h" #include "datapath.h" #include "flow_netlink.h" #include <linux/uaccess.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <net/llc_pdu.h> #include <linux/kernel.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/llc.h> #include <linux/module.h> #include <linux/in.h> #include <linux/rcupdate.h> #include <linux/cpumask.h> #include <linux/if_arp.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/sctp.h> #include <linux/tcp.h> #include <linux/udp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/rculist.h> #include <linux/sort.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #define TBL_MIN_BUCKETS 1024 #define MASK_ARRAY_SIZE_MIN 16 #define REHASH_INTERVAL (10 * 60 * HZ) #define MC_DEFAULT_HASH_ENTRIES 256 #define MC_HASH_SHIFT 8 #define MC_HASH_SEGS ((sizeof(uint32_t) * 8) / MC_HASH_SHIFT) static struct kmem_cache *flow_cache; struct kmem_cache *flow_stats_cache __read_mostly; static u16 range_n_bytes(const struct sw_flow_key_range *range) { return range->end - range->start; } void ovs_flow_mask_key(struct sw_flow_key *dst, const struct sw_flow_key *src, bool full, const struct sw_flow_mask *mask) { int start = full ? 0 : mask->range.start; int len = full ? sizeof *dst : range_n_bytes(&mask->range); const long *m = (const long *)((const u8 *)&mask->key + start); const long *s = (const long *)((const u8 *)src + start); long *d = (long *)((u8 *)dst + start); int i; /* If 'full' is true then all of 'dst' is fully initialized. Otherwise, * if 'full' is false the memory outside of the 'mask->range' is left * uninitialized. This can be used as an optimization when further * operations on 'dst' only use contents within 'mask->range'. */ for (i = 0; i < len; i += sizeof(long)) *d++ = *s++ & *m++; } struct sw_flow *ovs_flow_alloc(void) { struct sw_flow *flow; struct sw_flow_stats *stats; flow = kmem_cache_zalloc(flow_cache, GFP_KERNEL); if (!flow) return ERR_PTR(-ENOMEM); flow->stats_last_writer = -1; flow->cpu_used_mask = (struct cpumask *)&flow->stats[nr_cpu_ids]; /* Initialize the default stat node. */ stats = kmem_cache_alloc_node(flow_stats_cache, GFP_KERNEL | __GFP_ZERO, node_online(0) ? 0 : NUMA_NO_NODE); if (!stats) goto err; spin_lock_init(&stats->lock); RCU_INIT_POINTER(flow->stats[0], stats); cpumask_set_cpu(0, flow->cpu_used_mask); return flow; err: kmem_cache_free(flow_cache, flow); return ERR_PTR(-ENOMEM); } int ovs_flow_tbl_count(const struct flow_table *table) { return table->count; } static void flow_free(struct sw_flow *flow) { int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *) flow->sf_acts); /* We open code this to make sure cpu 0 is always considered */ for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, flow->cpu_used_mask)) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); } kmem_cache_free(flow_cache, flow); } static void rcu_free_flow_callback(struct rcu_head *rcu) { struct sw_flow *flow = container_of(rcu, struct sw_flow, rcu); flow_free(flow); } void ovs_flow_free(struct sw_flow *flow, bool deferred) { if (!flow) return; if (deferred) call_rcu(&flow->rcu, rcu_free_flow_callback); else flow_free(flow); } static void __table_instance_destroy(struct table_instance *ti) { kvfree(ti->buckets); kfree(ti); } static struct table_instance *table_instance_alloc(int new_size) { struct table_instance *ti = kmalloc(sizeof(*ti), GFP_KERNEL); int i; if (!ti) return NULL; ti->buckets = kvmalloc_array(new_size, sizeof(struct hlist_head), GFP_KERNEL); if (!ti->buckets) { kfree(ti); return NULL; } for (i = 0; i < new_size; i++) INIT_HLIST_HEAD(&ti->buckets[i]); ti->n_buckets = new_size; ti->node_ver = 0; get_random_bytes(&ti->hash_seed, sizeof(u32)); return ti; } static void __mask_array_destroy(struct mask_array *ma) { free_percpu(ma->masks_usage_stats); kfree(ma); } static void mask_array_rcu_cb(struct rcu_head *rcu) { struct mask_array *ma = container_of(rcu, struct mask_array, rcu); __mask_array_destroy(ma); } static void tbl_mask_array_reset_counters(struct mask_array *ma) { int i, cpu; /* As the per CPU counters are not atomic we can not go ahead and * reset them from another CPU. To be able to still have an approximate * zero based counter we store the value at reset, and subtract it * later when processing. */ for (i = 0; i < ma->max; i++) { ma->masks_usage_zero_cntr[i] = 0; for_each_possible_cpu(cpu) { struct mask_array_stats *stats; unsigned int start; u64 counter; stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); counter = stats->usage_cntrs[i]; } while (u64_stats_fetch_retry(&stats->syncp, start)); ma->masks_usage_zero_cntr[i] += counter; } } } static struct mask_array *tbl_mask_array_alloc(int size) { struct mask_array *new; size = max(MASK_ARRAY_SIZE_MIN, size); new = kzalloc(struct_size(new, masks, size) + sizeof(u64) * size, GFP_KERNEL); if (!new) return NULL; new->masks_usage_zero_cntr = (u64 *)((u8 *)new + struct_size(new, masks, size)); new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) + sizeof(u64) * size, __alignof__(u64)); if (!new->masks_usage_stats) { kfree(new); return NULL; } new->count = 0; new->max = size; return new; } static int tbl_mask_array_realloc(struct flow_table *tbl, int size) { struct mask_array *old; struct mask_array *new; new = tbl_mask_array_alloc(size); if (!new) return -ENOMEM; old = ovsl_dereference(tbl->mask_array); if (old) { int i; for (i = 0; i < old->max; i++) { if (ovsl_dereference(old->masks[i])) new->masks[new->count++] = old->masks[i]; } call_rcu(&old->rcu, mask_array_rcu_cb); } rcu_assign_pointer(tbl->mask_array, new); return 0; } static int tbl_mask_array_add_mask(struct flow_table *tbl, struct sw_flow_mask *new) { struct mask_array *ma = ovsl_dereference(tbl->mask_array); int err, ma_count = READ_ONCE(ma->count); if (ma_count >= ma->max) { err = tbl_mask_array_realloc(tbl, ma->max + MASK_ARRAY_SIZE_MIN); if (err) return err; ma = ovsl_dereference(tbl->mask_array); } else { /* On every add or delete we need to reset the counters so * every new mask gets a fair chance of being prioritized. */ tbl_mask_array_reset_counters(ma); } BUG_ON(ovsl_dereference(ma->masks[ma_count])); rcu_assign_pointer(ma->masks[ma_count], new); WRITE_ONCE(ma->count, ma_count + 1); return 0; } static void tbl_mask_array_del_mask(struct flow_table *tbl, struct sw_flow_mask *mask) { struct mask_array *ma = ovsl_dereference(tbl->mask_array); int i, ma_count = READ_ONCE(ma->count); /* Remove the deleted mask pointers from the array */ for (i = 0; i < ma_count; i++) { if (mask == ovsl_dereference(ma->masks[i])) goto found; } BUG(); return; found: WRITE_ONCE(ma->count, ma_count - 1); rcu_assign_pointer(ma->masks[i], ma->masks[ma_count - 1]); RCU_INIT_POINTER(ma->masks[ma_count - 1], NULL); kfree_rcu(mask, rcu); /* Shrink the mask array if necessary. */ if (ma->max >= (MASK_ARRAY_SIZE_MIN * 2) && ma_count <= (ma->max / 3)) tbl_mask_array_realloc(tbl, ma->max / 2); else tbl_mask_array_reset_counters(ma); } /* Remove 'mask' from the mask list, if it is not needed any more. */ static void flow_mask_remove(struct flow_table *tbl, struct sw_flow_mask *mask) { if (mask) { /* ovs-lock is required to protect mask-refcount and * mask list. */ ASSERT_OVSL(); BUG_ON(!mask->ref_count); mask->ref_count--; if (!mask->ref_count) tbl_mask_array_del_mask(tbl, mask); } } static void __mask_cache_destroy(struct mask_cache *mc) { free_percpu(mc->mask_cache); kfree(mc); } static void mask_cache_rcu_cb(struct rcu_head *rcu) { struct mask_cache *mc = container_of(rcu, struct mask_cache, rcu); __mask_cache_destroy(mc); } static struct mask_cache *tbl_mask_cache_alloc(u32 size) { struct mask_cache_entry __percpu *cache = NULL; struct mask_cache *new; /* Only allow size to be 0, or a power of 2, and does not exceed * percpu allocation size. */ if ((!is_power_of_2(size) && size != 0) || (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) return NULL; new = kzalloc(sizeof(*new), GFP_KERNEL); if (!new) return NULL; new->cache_size = size; if (new->cache_size > 0) { cache = __alloc_percpu(array_size(sizeof(struct mask_cache_entry), new->cache_size), __alignof__(struct mask_cache_entry)); if (!cache) { kfree(new); return NULL; } } new->mask_cache = cache; return new; } int ovs_flow_tbl_masks_cache_resize(struct flow_table *table, u32 size) { struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); struct mask_cache *new; if (size == mc->cache_size) return 0; if ((!is_power_of_2(size) && size != 0) || (size * sizeof(struct mask_cache_entry)) > PCPU_MIN_UNIT_SIZE) return -EINVAL; new = tbl_mask_cache_alloc(size); if (!new) return -ENOMEM; rcu_assign_pointer(table->mask_cache, new); call_rcu(&mc->rcu, mask_cache_rcu_cb); return 0; } int ovs_flow_tbl_init(struct flow_table *table) { struct table_instance *ti, *ufid_ti; struct mask_cache *mc; struct mask_array *ma; mc = tbl_mask_cache_alloc(MC_DEFAULT_HASH_ENTRIES); if (!mc) return -ENOMEM; ma = tbl_mask_array_alloc(MASK_ARRAY_SIZE_MIN); if (!ma) goto free_mask_cache; ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ti) goto free_mask_array; ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!ufid_ti) goto free_ti; rcu_assign_pointer(table->ti, ti); rcu_assign_pointer(table->ufid_ti, ufid_ti); rcu_assign_pointer(table->mask_array, ma); rcu_assign_pointer(table->mask_cache, mc); table->last_rehash = jiffies; table->count = 0; table->ufid_count = 0; return 0; free_ti: __table_instance_destroy(ti); free_mask_array: __mask_array_destroy(ma); free_mask_cache: __mask_cache_destroy(mc); return -ENOMEM; } static void flow_tbl_destroy_rcu_cb(struct rcu_head *rcu) { struct table_instance *ti; ti = container_of(rcu, struct table_instance, rcu); __table_instance_destroy(ti); } static void table_instance_flow_free(struct flow_table *table, struct table_instance *ti, struct table_instance *ufid_ti, struct sw_flow *flow) { hlist_del_rcu(&flow->flow_table.node[ti->node_ver]); table->count--; if (ovs_identifier_is_ufid(&flow->id)) { hlist_del_rcu(&flow->ufid_table.node[ufid_ti->node_ver]); table->ufid_count--; } flow_mask_remove(table, flow->mask); } /* Must be called with OVS mutex held. */ void table_instance_flow_flush(struct flow_table *table, struct table_instance *ti, struct table_instance *ufid_ti) { int i; for (i = 0; i < ti->n_buckets; i++) { struct hlist_head *head = &ti->buckets[i]; struct hlist_node *n; struct sw_flow *flow; hlist_for_each_entry_safe(flow, n, head, flow_table.node[ti->node_ver]) { table_instance_flow_free(table, ti, ufid_ti, flow); ovs_flow_free(flow, true); } } if (WARN_ON(table->count != 0 || table->ufid_count != 0)) { table->count = 0; table->ufid_count = 0; } } static void table_instance_destroy(struct table_instance *ti, struct table_instance *ufid_ti) { call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); call_rcu(&ufid_ti->rcu, flow_tbl_destroy_rcu_cb); } /* No need for locking this function is called from RCU callback or * error path. */ void ovs_flow_tbl_destroy(struct flow_table *table) { struct table_instance *ti = rcu_dereference_raw(table->ti); struct table_instance *ufid_ti = rcu_dereference_raw(table->ufid_ti); struct mask_cache *mc = rcu_dereference_raw(table->mask_cache); struct mask_array *ma = rcu_dereference_raw(table->mask_array); call_rcu(&mc->rcu, mask_cache_rcu_cb); call_rcu(&ma->rcu, mask_array_rcu_cb); table_instance_destroy(ti, ufid_ti); } struct sw_flow *ovs_flow_tbl_dump_next(struct table_instance *ti, u32 *bucket, u32 *last) { struct sw_flow *flow; struct hlist_head *head; int ver; int i; ver = ti->node_ver; while (*bucket < ti->n_buckets) { i = 0; head = &ti->buckets[*bucket]; hlist_for_each_entry_rcu(flow, head, flow_table.node[ver]) { if (i < *last) { i++; continue; } *last = i + 1; return flow; } (*bucket)++; *last = 0; } return NULL; } static struct hlist_head *find_bucket(struct table_instance *ti, u32 hash) { hash = jhash_1word(hash, ti->hash_seed); return &ti->buckets[hash & (ti->n_buckets - 1)]; } static void table_instance_insert(struct table_instance *ti, struct sw_flow *flow) { struct hlist_head *head; head = find_bucket(ti, flow->flow_table.hash); hlist_add_head_rcu(&flow->flow_table.node[ti->node_ver], head); } static void ufid_table_instance_insert(struct table_instance *ti, struct sw_flow *flow) { struct hlist_head *head; head = find_bucket(ti, flow->ufid_table.hash); hlist_add_head_rcu(&flow->ufid_table.node[ti->node_ver], head); } static void flow_table_copy_flows(struct table_instance *old, struct table_instance *new, bool ufid) { int old_ver; int i; old_ver = old->node_ver; new->node_ver = !old_ver; /* Insert in new table. */ for (i = 0; i < old->n_buckets; i++) { struct sw_flow *flow; struct hlist_head *head = &old->buckets[i]; if (ufid) hlist_for_each_entry_rcu(flow, head, ufid_table.node[old_ver], lockdep_ovsl_is_held()) ufid_table_instance_insert(new, flow); else hlist_for_each_entry_rcu(flow, head, flow_table.node[old_ver], lockdep_ovsl_is_held()) table_instance_insert(new, flow); } } static struct table_instance *table_instance_rehash(struct table_instance *ti, int n_buckets, bool ufid) { struct table_instance *new_ti; new_ti = table_instance_alloc(n_buckets); if (!new_ti) return NULL; flow_table_copy_flows(ti, new_ti, ufid); return new_ti; } int ovs_flow_tbl_flush(struct flow_table *flow_table) { struct table_instance *old_ti, *new_ti; struct table_instance *old_ufid_ti, *new_ufid_ti; new_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!new_ti) return -ENOMEM; new_ufid_ti = table_instance_alloc(TBL_MIN_BUCKETS); if (!new_ufid_ti) goto err_free_ti; old_ti = ovsl_dereference(flow_table->ti); old_ufid_ti = ovsl_dereference(flow_table->ufid_ti); rcu_assign_pointer(flow_table->ti, new_ti); rcu_assign_pointer(flow_table->ufid_ti, new_ufid_ti); flow_table->last_rehash = jiffies; table_instance_flow_flush(flow_table, old_ti, old_ufid_ti); table_instance_destroy(old_ti, old_ufid_ti); return 0; err_free_ti: __table_instance_destroy(new_ti); return -ENOMEM; } static u32 flow_hash(const struct sw_flow_key *key, const struct sw_flow_key_range *range) { const u32 *hash_key = (const u32 *)((const u8 *)key + range->start); /* Make sure number of hash bytes are multiple of u32. */ int hash_u32s = range_n_bytes(range) >> 2; return jhash2(hash_key, hash_u32s, 0); } static int flow_key_start(const struct sw_flow_key *key) { if (key->tun_proto) return 0; else return rounddown(offsetof(struct sw_flow_key, phy), sizeof(long)); } static bool cmp_key(const struct sw_flow_key *key1, const struct sw_flow_key *key2, int key_start, int key_end) { const long *cp1 = (const long *)((const u8 *)key1 + key_start); const long *cp2 = (const long *)((const u8 *)key2 + key_start); int i; for (i = key_start; i < key_end; i += sizeof(long)) if (*cp1++ ^ *cp2++) return false; return true; } static bool flow_cmp_masked_key(const struct sw_flow *flow, const struct sw_flow_key *key, const struct sw_flow_key_range *range) { return cmp_key(&flow->key, key, range->start, range->end); } static bool ovs_flow_cmp_unmasked_key(const struct sw_flow *flow, const struct sw_flow_match *match) { struct sw_flow_key *key = match->key; int key_start = flow_key_start(key); int key_end = match->range.end; BUG_ON(ovs_identifier_is_ufid(&flow->id)); return cmp_key(flow->id.unmasked_key, key, key_start, key_end); } static struct sw_flow *masked_flow_lookup(struct table_instance *ti, const struct sw_flow_key *unmasked, const struct sw_flow_mask *mask, u32 *n_mask_hit) { struct sw_flow *flow; struct hlist_head *head; u32 hash; struct sw_flow_key masked_key; ovs_flow_mask_key(&masked_key, unmasked, false, mask); hash = flow_hash(&masked_key, &mask->range); head = find_bucket(ti, hash); (*n_mask_hit)++; hlist_for_each_entry_rcu(flow, head, flow_table.node[ti->node_ver], lockdep_ovsl_is_held()) { if (flow->mask == mask && flow->flow_table.hash == hash && flow_cmp_masked_key(flow, &masked_key, &mask->range)) return flow; } return NULL; } /* Flow lookup does full lookup on flow table. It starts with * mask from index passed in *index. * This function MUST be called with BH disabled due to the use * of CPU specific variables. */ static struct sw_flow *flow_lookup(struct flow_table *tbl, struct table_instance *ti, struct mask_array *ma, const struct sw_flow_key *key, u32 *n_mask_hit, u32 *n_cache_hit, u32 *index) { struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats); struct sw_flow *flow; struct sw_flow_mask *mask; int i; if (likely(*index < ma->max)) { mask = rcu_dereference_ovsl(ma->masks[*index]); if (mask) { flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { u64_stats_update_begin(&stats->syncp); stats->usage_cntrs[*index]++; u64_stats_update_end(&stats->syncp); (*n_cache_hit)++; return flow; } } } for (i = 0; i < ma->max; i++) { if (i == *index) continue; mask = rcu_dereference_ovsl(ma->masks[i]); if (unlikely(!mask)) break; flow = masked_flow_lookup(ti, key, mask, n_mask_hit); if (flow) { /* Found */ *index = i; u64_stats_update_begin(&stats->syncp); stats->usage_cntrs[*index]++; u64_stats_update_end(&stats->syncp); return flow; } } return NULL; } /* * mask_cache maps flow to probable mask. This cache is not tightly * coupled cache, It means updates to mask list can result in inconsistent * cache entry in mask cache. * This is per cpu cache and is divided in MC_HASH_SEGS segments. * In case of a hash collision the entry is hashed in next segment. * */ struct sw_flow *ovs_flow_tbl_lookup_stats(struct flow_table *tbl, const struct sw_flow_key *key, u32 skb_hash, u32 *n_mask_hit, u32 *n_cache_hit) { struct mask_cache *mc = rcu_dereference(tbl->mask_cache); struct mask_array *ma = rcu_dereference(tbl->mask_array); struct table_instance *ti = rcu_dereference(tbl->ti); struct mask_cache_entry *entries, *ce; struct sw_flow *flow; u32 hash; int seg; *n_mask_hit = 0; *n_cache_hit = 0; if (unlikely(!skb_hash || mc->cache_size == 0)) { u32 mask_index = 0; u32 cache = 0; return flow_lookup(tbl, ti, ma, key, n_mask_hit, &cache, &mask_index); } /* Pre and post recirulation flows usually have the same skb_hash * value. To avoid hash collisions, rehash the 'skb_hash' with * 'recirc_id'. */ if (key->recirc_id) skb_hash = jhash_1word(skb_hash, key->recirc_id); ce = NULL; hash = skb_hash; entries = this_cpu_ptr(mc->mask_cache); /* Find the cache entry 'ce' to operate on. */ for (seg = 0; seg < MC_HASH_SEGS; seg++) { int index = hash & (mc->cache_size - 1); struct mask_cache_entry *e; e = &entries[index]; if (e->skb_hash == skb_hash) { flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, &e->mask_index); if (!flow) e->skb_hash = 0; return flow; } if (!ce || e->skb_hash < ce->skb_hash) ce = e; /* A better replacement cache candidate. */ hash >>= MC_HASH_SHIFT; } /* Cache miss, do full lookup. */ flow = flow_lookup(tbl, ti, ma, key, n_mask_hit, n_cache_hit, &ce->mask_index); if (flow) ce->skb_hash = skb_hash; *n_cache_hit = 0; return flow; } struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl, const struct sw_flow_key *key) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array); u32 __always_unused n_mask_hit; u32 __always_unused n_cache_hit; struct sw_flow *flow; u32 index = 0; /* This function gets called trough the netlink interface and therefore * is preemptible. However, flow_lookup() function needs to be called * with BH disabled due to CPU specific variables. */ local_bh_disable(); flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index); local_bh_enable(); return flow; } struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl, const struct sw_flow_match *match) { struct mask_array *ma = ovsl_dereference(tbl->mask_array); int i; /* Always called under ovs-mutex. */ for (i = 0; i < ma->max; i++) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ti); u32 __always_unused n_mask_hit; struct sw_flow_mask *mask; struct sw_flow *flow; mask = ovsl_dereference(ma->masks[i]); if (!mask) continue; flow = masked_flow_lookup(ti, match->key, mask, &n_mask_hit); if (flow && ovs_identifier_is_key(&flow->id) && ovs_flow_cmp_unmasked_key(flow, match)) { return flow; } } return NULL; } static u32 ufid_hash(const struct sw_flow_id *sfid) { return jhash(sfid->ufid, sfid->ufid_len, 0); } static bool ovs_flow_cmp_ufid(const struct sw_flow *flow, const struct sw_flow_id *sfid) { if (flow->id.ufid_len != sfid->ufid_len) return false; return !memcmp(flow->id.ufid, sfid->ufid, sfid->ufid_len); } bool ovs_flow_cmp(const struct sw_flow *flow, const struct sw_flow_match *match) { if (ovs_identifier_is_ufid(&flow->id)) return flow_cmp_masked_key(flow, match->key, &match->range); return ovs_flow_cmp_unmasked_key(flow, match); } struct sw_flow *ovs_flow_tbl_lookup_ufid(struct flow_table *tbl, const struct sw_flow_id *ufid) { struct table_instance *ti = rcu_dereference_ovsl(tbl->ufid_ti); struct sw_flow *flow; struct hlist_head *head; u32 hash; hash = ufid_hash(ufid); head = find_bucket(ti, hash); hlist_for_each_entry_rcu(flow, head, ufid_table.node[ti->node_ver], lockdep_ovsl_is_held()) { if (flow->ufid_table.hash == hash && ovs_flow_cmp_ufid(flow, ufid)) return flow; } return NULL; } int ovs_flow_tbl_num_masks(const struct flow_table *table) { struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); return READ_ONCE(ma->count); } u32 ovs_flow_tbl_masks_cache_size(const struct flow_table *table) { struct mask_cache *mc = rcu_dereference_ovsl(table->mask_cache); return READ_ONCE(mc->cache_size); } static struct table_instance *table_instance_expand(struct table_instance *ti, bool ufid) { return table_instance_rehash(ti, ti->n_buckets * 2, ufid); } /* Must be called with OVS mutex held. */ void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti = ovsl_dereference(table->ti); struct table_instance *ufid_ti = ovsl_dereference(table->ufid_ti); BUG_ON(table->count == 0); table_instance_flow_free(table, ti, ufid_ti, flow); } static struct sw_flow_mask *mask_alloc(void) { struct sw_flow_mask *mask; mask = kmalloc(sizeof(*mask), GFP_KERNEL); if (mask) mask->ref_count = 1; return mask; } static bool mask_equal(const struct sw_flow_mask *a, const struct sw_flow_mask *b) { const u8 *a_ = (const u8 *)&a->key + a->range.start; const u8 *b_ = (const u8 *)&b->key + b->range.start; return (a->range.end == b->range.end) && (a->range.start == b->range.start) && (memcmp(a_, b_, range_n_bytes(&a->range)) == 0); } static struct sw_flow_mask *flow_mask_find(const struct flow_table *tbl, const struct sw_flow_mask *mask) { struct mask_array *ma; int i; ma = ovsl_dereference(tbl->mask_array); for (i = 0; i < ma->max; i++) { struct sw_flow_mask *t; t = ovsl_dereference(ma->masks[i]); if (t && mask_equal(mask, t)) return t; } return NULL; } /* Add 'mask' into the mask list, if it is not already there. */ static int flow_mask_insert(struct flow_table *tbl, struct sw_flow *flow, const struct sw_flow_mask *new) { struct sw_flow_mask *mask; mask = flow_mask_find(tbl, new); if (!mask) { /* Allocate a new mask if none exists. */ mask = mask_alloc(); if (!mask) return -ENOMEM; mask->key = new->key; mask->range = new->range; /* Add mask to mask-list. */ if (tbl_mask_array_add_mask(tbl, mask)) { kfree(mask); return -ENOMEM; } } else { BUG_ON(!mask->ref_count); mask->ref_count++; } flow->mask = mask; return 0; } /* Must be called with OVS mutex held. */ static void flow_key_insert(struct flow_table *table, struct sw_flow *flow) { struct table_instance *new_ti = NULL; struct table_instance *ti; flow->flow_table.hash = flow_hash(&flow->key, &flow->mask->range); ti = ovsl_dereference(table->ti); table_instance_insert(ti, flow); table->count++; /* Expand table, if necessary, to make room. */ if (table->count > ti->n_buckets) new_ti = table_instance_expand(ti, false); else if (time_after(jiffies, table->last_rehash + REHASH_INTERVAL)) new_ti = table_instance_rehash(ti, ti->n_buckets, false); if (new_ti) { rcu_assign_pointer(table->ti, new_ti); call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); table->last_rehash = jiffies; } } /* Must be called with OVS mutex held. */ static void flow_ufid_insert(struct flow_table *table, struct sw_flow *flow) { struct table_instance *ti; flow->ufid_table.hash = ufid_hash(&flow->id); ti = ovsl_dereference(table->ufid_ti); ufid_table_instance_insert(ti, flow); table->ufid_count++; /* Expand table, if necessary, to make room. */ if (table->ufid_count > ti->n_buckets) { struct table_instance *new_ti; new_ti = table_instance_expand(ti, true); if (new_ti) { rcu_assign_pointer(table->ufid_ti, new_ti); call_rcu(&ti->rcu, flow_tbl_destroy_rcu_cb); } } } /* Must be called with OVS mutex held. */ int ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow, const struct sw_flow_mask *mask) { int err; err = flow_mask_insert(table, flow, mask); if (err) return err; flow_key_insert(table, flow); if (ovs_identifier_is_ufid(&flow->id)) flow_ufid_insert(table, flow); return 0; } static int compare_mask_and_count(const void *a, const void *b) { const struct mask_count *mc_a = a; const struct mask_count *mc_b = b; return (s64)mc_b->counter - (s64)mc_a->counter; } /* Must be called with OVS mutex held. */ void ovs_flow_masks_rebalance(struct flow_table *table) { struct mask_array *ma = rcu_dereference_ovsl(table->mask_array); struct mask_count *masks_and_count; struct mask_array *new; int masks_entries = 0; int i; /* Build array of all current entries with use counters. */ masks_and_count = kmalloc_array(ma->max, sizeof(*masks_and_count), GFP_KERNEL); if (!masks_and_count) return; for (i = 0; i < ma->max; i++) { struct sw_flow_mask *mask; int cpu; mask = rcu_dereference_ovsl(ma->masks[i]); if (unlikely(!mask)) break; masks_and_count[i].index = i; masks_and_count[i].counter = 0; for_each_possible_cpu(cpu) { struct mask_array_stats *stats; unsigned int start; u64 counter; stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); counter = stats->usage_cntrs[i]; } while (u64_stats_fetch_retry(&stats->syncp, start)); masks_and_count[i].counter += counter; } /* Subtract the zero count value. */ masks_and_count[i].counter -= ma->masks_usage_zero_cntr[i]; /* Rather than calling tbl_mask_array_reset_counters() * below when no change is needed, do it inline here. */ ma->masks_usage_zero_cntr[i] += masks_and_count[i].counter; } if (i == 0) goto free_mask_entries; /* Sort the entries */ masks_entries = i; sort(masks_and_count, masks_entries, sizeof(*masks_and_count), compare_mask_and_count, NULL); /* If the order is the same, nothing to do... */ for (i = 0; i < masks_entries; i++) { if (i != masks_and_count[i].index) break; } if (i == masks_entries) goto free_mask_entries; /* Rebuilt the new list in order of usage. */ new = tbl_mask_array_alloc(ma->max); if (!new) goto free_mask_entries; for (i = 0; i < masks_entries; i++) { int index = masks_and_count[i].index; if (ovsl_dereference(ma->masks[index])) new->masks[new->count++] = ma->masks[index]; } rcu_assign_pointer(table->mask_array, new); call_rcu(&ma->rcu, mask_array_rcu_cb); free_mask_entries: kfree(masks_and_count); } /* Initializes the flow module. * Returns zero if successful or a negative error code. */ int ovs_flow_init(void) { BUILD_BUG_ON(__alignof__(struct sw_flow_key) % __alignof__(long)); BUILD_BUG_ON(sizeof(struct sw_flow_key) % sizeof(long)); flow_cache = kmem_cache_create("sw_flow", sizeof(struct sw_flow) + (nr_cpu_ids * sizeof(struct sw_flow_stats *)) + cpumask_size(), 0, 0, NULL); if (flow_cache == NULL) return -ENOMEM; flow_stats_cache = kmem_cache_create("sw_flow_stats", sizeof(struct sw_flow_stats), 0, SLAB_HWCACHE_ALIGN, NULL); if (flow_stats_cache == NULL) { kmem_cache_destroy(flow_cache); flow_cache = NULL; return -ENOMEM; } return 0; } /* Uninitializes the flow module. */ void ovs_flow_exit(void) { kmem_cache_destroy(flow_stats_cache); kmem_cache_destroy(flow_cache); } |
1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 2 1 1 1 1 1 1 1 2 2 2 2 2 1 43 43 41 2 43 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 | // SPDX-License-Identifier: GPL-2.0 /* * SME code for cfg80211 * both driver SME event handling and the SME implementation * (for nl80211's connect() and wext) * * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> * Copyright (C) 2009, 2020, 2022-2023 Intel Corporation. All rights reserved. * Copyright 2017 Intel Deutschland GmbH */ #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/wireless.h> #include <linux/export.h> #include <net/iw_handler.h> #include <net/cfg80211.h> #include <net/rtnetlink.h> #include "nl80211.h" #include "reg.h" #include "rdev-ops.h" /* * Software SME in cfg80211, using auth/assoc/deauth calls to the * driver. This is for implementing nl80211's connect/disconnect * and wireless extensions (if configured.) */ struct cfg80211_conn { struct cfg80211_connect_params params; /* these are sub-states of the _CONNECTING sme_state */ enum { CFG80211_CONN_SCANNING, CFG80211_CONN_SCAN_AGAIN, CFG80211_CONN_AUTHENTICATE_NEXT, CFG80211_CONN_AUTHENTICATING, CFG80211_CONN_AUTH_FAILED_TIMEOUT, CFG80211_CONN_ASSOCIATE_NEXT, CFG80211_CONN_ASSOCIATING, CFG80211_CONN_ASSOC_FAILED, CFG80211_CONN_ASSOC_FAILED_TIMEOUT, CFG80211_CONN_DEAUTH, CFG80211_CONN_ABANDON, CFG80211_CONN_CONNECTED, } state; u8 bssid[ETH_ALEN], prev_bssid[ETH_ALEN]; const u8 *ie; size_t ie_len; bool auto_auth, prev_bssid_valid; }; static void cfg80211_sme_free(struct wireless_dev *wdev) { if (!wdev->conn) return; kfree(wdev->conn->ie); kfree(wdev->conn); wdev->conn = NULL; } static int cfg80211_conn_scan(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_scan_request *request; int n_channels, err; lockdep_assert_wiphy(wdev->wiphy); if (rdev->scan_req || rdev->scan_msg) return -EBUSY; if (wdev->conn->params.channel) n_channels = 1; else n_channels = ieee80211_get_num_supported_channels(wdev->wiphy); request = kzalloc(sizeof(*request) + sizeof(request->ssids[0]) + sizeof(request->channels[0]) * n_channels, GFP_KERNEL); if (!request) return -ENOMEM; if (wdev->conn->params.channel) { enum nl80211_band band = wdev->conn->params.channel->band; struct ieee80211_supported_band *sband = wdev->wiphy->bands[band]; if (!sband) { kfree(request); return -EINVAL; } request->channels[0] = wdev->conn->params.channel; request->rates[band] = (1 << sband->n_bitrates) - 1; } else { int i = 0, j; enum nl80211_band band; struct ieee80211_supported_band *bands; struct ieee80211_channel *channel; for (band = 0; band < NUM_NL80211_BANDS; band++) { bands = wdev->wiphy->bands[band]; if (!bands) continue; for (j = 0; j < bands->n_channels; j++) { channel = &bands->channels[j]; if (channel->flags & IEEE80211_CHAN_DISABLED) continue; request->channels[i++] = channel; } request->rates[band] = (1 << bands->n_bitrates) - 1; } n_channels = i; } request->n_channels = n_channels; request->ssids = (void *)&request->channels[n_channels]; request->n_ssids = 1; memcpy(request->ssids[0].ssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len); request->ssids[0].ssid_len = wdev->conn->params.ssid_len; eth_broadcast_addr(request->bssid); request->wdev = wdev; request->wiphy = &rdev->wiphy; request->scan_start = jiffies; rdev->scan_req = request; err = rdev_scan(rdev, request); if (!err) { wdev->conn->state = CFG80211_CONN_SCANNING; nl80211_send_scan_start(rdev, wdev); dev_hold(wdev->netdev); } else { rdev->scan_req = NULL; kfree(request); } return err; } static int cfg80211_conn_do_work(struct wireless_dev *wdev, enum nl80211_timeout_reason *treason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_connect_params *params; struct cfg80211_auth_request auth_req = {}; struct cfg80211_assoc_request req = {}; int err; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn) return 0; params = &wdev->conn->params; switch (wdev->conn->state) { case CFG80211_CONN_SCANNING: /* didn't find it during scan ... */ return -ENOENT; case CFG80211_CONN_SCAN_AGAIN: return cfg80211_conn_scan(wdev); case CFG80211_CONN_AUTHENTICATE_NEXT: if (WARN_ON(!rdev->ops->auth)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_AUTHENTICATING; auth_req.key = params->key; auth_req.key_len = params->key_len; auth_req.key_idx = params->key_idx; auth_req.auth_type = params->auth_type; auth_req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); auth_req.link_id = -1; err = cfg80211_mlme_auth(rdev, wdev->netdev, &auth_req); cfg80211_put_bss(&rdev->wiphy, auth_req.bss); return err; case CFG80211_CONN_AUTH_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_AUTH; return -ENOTCONN; case CFG80211_CONN_ASSOCIATE_NEXT: if (WARN_ON(!rdev->ops->assoc)) return -EOPNOTSUPP; wdev->conn->state = CFG80211_CONN_ASSOCIATING; if (wdev->conn->prev_bssid_valid) req.prev_bssid = wdev->conn->prev_bssid; req.ie = params->ie; req.ie_len = params->ie_len; req.use_mfp = params->mfp != NL80211_MFP_NO; req.crypto = params->crypto; req.flags = params->flags; req.ht_capa = params->ht_capa; req.ht_capa_mask = params->ht_capa_mask; req.vht_capa = params->vht_capa; req.vht_capa_mask = params->vht_capa_mask; req.link_id = -1; req.bss = cfg80211_get_bss(&rdev->wiphy, params->channel, params->bssid, params->ssid, params->ssid_len, IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY); if (!req.bss) { err = -ENOENT; } else { err = cfg80211_mlme_assoc(rdev, wdev->netdev, &req); cfg80211_put_bss(&rdev->wiphy, req.bss); } if (err) cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return err; case CFG80211_CONN_ASSOC_FAILED_TIMEOUT: *treason = NL80211_TIMEOUT_ASSOC; fallthrough; case CFG80211_CONN_ASSOC_FAILED: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); return -ENOTCONN; case CFG80211_CONN_DEAUTH: cfg80211_mlme_deauth(rdev, wdev->netdev, params->bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); fallthrough; case CFG80211_CONN_ABANDON: /* free directly, disconnected event already sent */ cfg80211_sme_free(wdev); return 0; default: return 0; } } void cfg80211_conn_work(struct work_struct *work) { struct cfg80211_registered_device *rdev = container_of(work, struct cfg80211_registered_device, conn_work); struct wireless_dev *wdev; u8 bssid_buf[ETH_ALEN], *bssid = NULL; enum nl80211_timeout_reason treason; wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (!wdev->netdev) continue; if (!netif_running(wdev->netdev)) continue; if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) continue; if (wdev->conn->params.bssid) { memcpy(bssid_buf, wdev->conn->params.bssid, ETH_ALEN); bssid = bssid_buf; } treason = NL80211_TIMEOUT_UNSPECIFIED; if (cfg80211_conn_do_work(wdev, &treason)) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = -1; cr.links[0].bssid = bssid; cr.timeout_reason = treason; __cfg80211_connect_result(wdev->netdev, &cr, false); } } wiphy_unlock(&rdev->wiphy); } static void cfg80211_step_auth_next(struct cfg80211_conn *conn, struct cfg80211_bss *bss) { memcpy(conn->bssid, bss->bssid, ETH_ALEN); conn->params.bssid = conn->bssid; conn->params.channel = bss->channel; conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; } /* Returned bss is reference counted and must be cleaned up appropriately. */ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; lockdep_assert_wiphy(wdev->wiphy); bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (!bss) return NULL; cfg80211_step_auth_next(wdev->conn, bss); schedule_work(&rdev->conn_work); return bss; } void cfg80211_sme_scan_done(struct net_device *dev) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn) return; if (wdev->conn->state != CFG80211_CONN_SCANNING && wdev->conn->state != CFG80211_CONN_SCAN_AGAIN) return; bss = cfg80211_get_conn_bss(wdev); if (bss) cfg80211_put_bss(&rdev->wiphy, bss); else schedule_work(&rdev->conn_work); } void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) { struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; u16 status_code = le16_to_cpu(mgmt->u.auth.status_code); lockdep_assert_wiphy(wdev->wiphy); if (!wdev->conn || wdev->conn->state == CFG80211_CONN_CONNECTED) return; if (status_code == WLAN_STATUS_NOT_SUPPORTED_AUTH_ALG && wdev->conn->auto_auth && wdev->conn->params.auth_type != NL80211_AUTHTYPE_NETWORK_EAP) { /* select automatically between only open, shared, leap */ switch (wdev->conn->params.auth_type) { case NL80211_AUTHTYPE_OPEN_SYSTEM: if (wdev->connect_keys) wdev->conn->params.auth_type = NL80211_AUTHTYPE_SHARED_KEY; else wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; case NL80211_AUTHTYPE_SHARED_KEY: wdev->conn->params.auth_type = NL80211_AUTHTYPE_NETWORK_EAP; break; default: /* huh? */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; break; } wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; schedule_work(&rdev->conn_work); } else if (status_code != WLAN_STATUS_SUCCESS) { struct cfg80211_connect_resp_params cr; memset(&cr, 0, sizeof(cr)); cr.status = status_code; cr.links[0].bssid = mgmt->bssid; cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); } } bool cfg80211_sme_rx_assoc_resp(struct wireless_dev *wdev, u16 status) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return false; if (status == WLAN_STATUS_SUCCESS) { wdev->conn->state = CFG80211_CONN_CONNECTED; return false; } if (wdev->conn->prev_bssid_valid) { /* * Some stupid APs don't accept reassoc, so we * need to fall back to trying regular assoc; * return true so no event is sent to userspace. */ wdev->conn->prev_bssid_valid = false; wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); return true; } wdev->conn->state = CFG80211_CONN_ASSOC_FAILED; schedule_work(&rdev->conn_work); return false; } void cfg80211_sme_deauth(struct wireless_dev *wdev) { cfg80211_sme_free(wdev); } void cfg80211_sme_auth_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_AUTH_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_disassoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_DEAUTH; schedule_work(&rdev->conn_work); } void cfg80211_sme_assoc_timeout(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ASSOC_FAILED_TIMEOUT; schedule_work(&rdev->conn_work); } void cfg80211_sme_abandon_assoc(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); if (!wdev->conn) return; wdev->conn->state = CFG80211_CONN_ABANDON; schedule_work(&rdev->conn_work); } static void cfg80211_wdev_release_bsses(struct wireless_dev *wdev) { unsigned int link; for_each_valid_link(wdev, link) { if (!wdev->links[link].client.current_bss) continue; cfg80211_unhold_bss(wdev->links[link].client.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->links[link].client.current_bss->pub); wdev->links[link].client.current_bss = NULL; } } void cfg80211_wdev_release_link_bsses(struct wireless_dev *wdev, u16 link_mask) { unsigned int link; for_each_valid_link(wdev, link) { if (!wdev->links[link].client.current_bss || !(link_mask & BIT(link))) continue; cfg80211_unhold_bss(wdev->links[link].client.current_bss); cfg80211_put_bss(wdev->wiphy, &wdev->links[link].client.current_bss->pub); wdev->links[link].client.current_bss = NULL; } } static int cfg80211_sme_get_conn_ies(struct wireless_dev *wdev, const u8 *ies, size_t ies_len, const u8 **out_ies, size_t *out_ies_len) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *buf; size_t offs; if (!rdev->wiphy.extended_capabilities_len || (ies && cfg80211_find_ie(WLAN_EID_EXT_CAPABILITY, ies, ies_len))) { *out_ies = kmemdup(ies, ies_len, GFP_KERNEL); if (!*out_ies) return -ENOMEM; *out_ies_len = ies_len; return 0; } buf = kmalloc(ies_len + rdev->wiphy.extended_capabilities_len + 2, GFP_KERNEL); if (!buf) return -ENOMEM; if (ies_len) { static const u8 before_extcapa[] = { /* not listing IEs expected to be created by driver */ WLAN_EID_RSN, WLAN_EID_QOS_CAPA, WLAN_EID_RRM_ENABLED_CAPABILITIES, WLAN_EID_MOBILITY_DOMAIN, WLAN_EID_SUPPORTED_REGULATORY_CLASSES, WLAN_EID_BSS_COEX_2040, }; offs = ieee80211_ie_split(ies, ies_len, before_extcapa, ARRAY_SIZE(before_extcapa), 0); memcpy(buf, ies, offs); /* leave a whole for extended capabilities IE */ memcpy(buf + offs + rdev->wiphy.extended_capabilities_len + 2, ies + offs, ies_len - offs); } else { offs = 0; } /* place extended capabilities IE (with only driver capabilities) */ buf[offs] = WLAN_EID_EXT_CAPABILITY; buf[offs + 1] = rdev->wiphy.extended_capabilities_len; memcpy(buf + offs + 2, rdev->wiphy.extended_capabilities, rdev->wiphy.extended_capabilities_len); *out_ies = buf; *out_ies_len = ies_len + rdev->wiphy.extended_capabilities_len + 2; return 0; } static int cfg80211_sme_connect(struct wireless_dev *wdev, struct cfg80211_connect_params *connect, const u8 *prev_bssid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bss *bss; int err; if (!rdev->ops->auth || !rdev->ops->assoc) return -EOPNOTSUPP; cfg80211_wdev_release_bsses(wdev); if (wdev->connected) { cfg80211_sme_free(wdev); wdev->connected = false; } if (wdev->conn) return -EINPROGRESS; wdev->conn = kzalloc(sizeof(*wdev->conn), GFP_KERNEL); if (!wdev->conn) return -ENOMEM; /* * Copy all parameters, and treat explicitly IEs, BSSID, SSID. */ memcpy(&wdev->conn->params, connect, sizeof(*connect)); if (connect->bssid) { wdev->conn->params.bssid = wdev->conn->bssid; memcpy(wdev->conn->bssid, connect->bssid, ETH_ALEN); } if (cfg80211_sme_get_conn_ies(wdev, connect->ie, connect->ie_len, &wdev->conn->ie, &wdev->conn->params.ie_len)) { kfree(wdev->conn); wdev->conn = NULL; return -ENOMEM; } wdev->conn->params.ie = wdev->conn->ie; if (connect->auth_type == NL80211_AUTHTYPE_AUTOMATIC) { wdev->conn->auto_auth = true; /* start with open system ... should mostly work */ wdev->conn->params.auth_type = NL80211_AUTHTYPE_OPEN_SYSTEM; } else { wdev->conn->auto_auth = false; } wdev->conn->params.ssid = wdev->u.client.ssid; wdev->conn->params.ssid_len = wdev->u.client.ssid_len; /* see if we have the bss already */ bss = cfg80211_get_bss(wdev->wiphy, wdev->conn->params.channel, wdev->conn->params.bssid, wdev->conn->params.ssid, wdev->conn->params.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY(wdev->conn->params.privacy)); if (prev_bssid) { memcpy(wdev->conn->prev_bssid, prev_bssid, ETH_ALEN); wdev->conn->prev_bssid_valid = true; } /* we're good if we have a matching bss struct */ if (bss) { enum nl80211_timeout_reason treason; cfg80211_step_auth_next(wdev->conn, bss); err = cfg80211_conn_do_work(wdev, &treason); cfg80211_put_bss(wdev->wiphy, bss); } else { /* otherwise we'll need to scan for the AP first */ err = cfg80211_conn_scan(wdev); /* * If we can't scan right now, then we need to scan again * after the current scan finished, since the parameters * changed (unless we find a good AP anyway). */ if (err == -EBUSY) { err = 0; wdev->conn->state = CFG80211_CONN_SCAN_AGAIN; } } if (err) cfg80211_sme_free(wdev); return err; } static int cfg80211_sme_disconnect(struct wireless_dev *wdev, u16 reason) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int err; if (!wdev->conn) return 0; if (!rdev->ops->deauth) return -EOPNOTSUPP; if (wdev->conn->state == CFG80211_CONN_SCANNING || wdev->conn->state == CFG80211_CONN_SCAN_AGAIN) { err = 0; goto out; } /* wdev->conn->params.bssid must be set if > SCANNING */ err = cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->conn->params.bssid, NULL, 0, reason, false); out: cfg80211_sme_free(wdev); return err; } /* * code shared for in-device and software SME */ static bool cfg80211_is_all_idle(void) { struct cfg80211_registered_device *rdev; struct wireless_dev *wdev; bool is_all_idle = true; /* * All devices must be idle as otherwise if you are actively * scanning some new beacon hints could be learned and would * count as new regulatory hints. * Also if there is any other active beaconing interface we * need not issue a disconnect hint and reset any info such * as chan dfs state, etc. */ for_each_rdev(rdev) { wiphy_lock(&rdev->wiphy); list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) { if (wdev->conn || wdev->connected || cfg80211_beaconing_iface_active(wdev)) is_all_idle = false; } wiphy_unlock(&rdev->wiphy); } return is_all_idle; } static void disconnect_work(struct work_struct *work) { rtnl_lock(); if (cfg80211_is_all_idle()) regulatory_hint_disconnect(); rtnl_unlock(); } DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); static void cfg80211_connect_result_release_bsses(struct wireless_dev *wdev, struct cfg80211_connect_resp_params *cr) { unsigned int link; for_each_valid_link(cr, link) { if (!cr->links[link].bss) continue; cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } } /* * API calls for drivers implementing connect/disconnect and * SME event handling */ /* This method must consume bss one way or another */ void __cfg80211_connect_result(struct net_device *dev, struct cfg80211_connect_resp_params *cr, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; const struct element *country_elem = NULL; const struct element *ssid; const u8 *country_data; u8 country_datalen; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif unsigned int link; const u8 *connected_addr; bool bss_not_found = false; lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (cr->valid_links) { if (WARN_ON(!cr->ap_mld_addr)) goto out; for_each_valid_link(cr, link) { if (WARN_ON(!cr->links[link].addr)) goto out; } if (WARN_ON(wdev->connect_keys)) goto out; } wdev->unprot_beacon_reported = 0; nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, GFP_KERNEL); connected_addr = cr->valid_links ? cr->ap_mld_addr : cr->links[0].bssid; #ifdef CONFIG_CFG80211_WEXT if (wextev && !cr->valid_links) { if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->req_ie_len; wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, cr->req_ie); } if (cr->resp_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = cr->resp_ie_len; wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, cr->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; if (connected_addr && cr->status == WLAN_STATUS_SUCCESS) { memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif if (cr->status == WLAN_STATUS_SUCCESS) { if (!wiphy_to_rdev(wdev->wiphy)->ops->connect) { for_each_valid_link(cr, link) { if (WARN_ON_ONCE(!cr->links[link].bss)) break; } } for_each_valid_link(cr, link) { /* don't do extra lookups for failures */ if (cr->links[link].status != WLAN_STATUS_SUCCESS) continue; if (cr->links[link].bss) continue; cr->links[link].bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->links[link].bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (!cr->links[link].bss) { bss_not_found = true; break; } cfg80211_hold_bss(bss_from_pub(cr->links[link].bss)); } } cfg80211_wdev_release_bsses(wdev); if (cr->status != WLAN_STATUS_SUCCESS) { kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; cfg80211_connect_result_release_bsses(wdev, cr); cfg80211_sme_free(wdev); return; } if (WARN_ON(bss_not_found)) { cfg80211_connect_result_release_bsses(wdev, cr); return; } memset(wdev->links, 0, sizeof(wdev->links)); for_each_valid_link(cr, link) { if (cr->links[link].status == WLAN_STATUS_SUCCESS) continue; cr->valid_links &= ~BIT(link); /* don't require bss pointer for failed links */ if (!cr->links[link].bss) continue; cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } wdev->valid_links = cr->valid_links; for_each_valid_link(cr, link) wdev->links[link].client.current_bss = bss_from_pub(cr->links[link].bss); wdev->connected = true; ether_addr_copy(wdev->u.client.connected_addr, connected_addr); if (cr->valid_links) { for_each_valid_link(cr, link) memcpy(wdev->links[link].addr, cr->links[link].addr, ETH_ALEN); } cfg80211_upload_connect_keys(wdev); rcu_read_lock(); for_each_valid_link(cr, link) { country_elem = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_COUNTRY); if (country_elem) break; } if (!country_elem) { rcu_read_unlock(); return; } country_datalen = country_elem->datalen; country_data = kmemdup(country_elem->data, country_datalen, GFP_ATOMIC); rcu_read_unlock(); if (!country_data) return; regulatory_hint_country_ie(wdev->wiphy, cr->links[link].bss->channel->band, country_data, country_datalen); kfree(country_data); if (!wdev->u.client.ssid_len) { rcu_read_lock(); for_each_valid_link(cr, link) { ssid = ieee80211_bss_get_elem(cr->links[link].bss, WLAN_EID_SSID); if (!ssid || !ssid->datalen) continue; memcpy(wdev->u.client.ssid, ssid->data, ssid->datalen); wdev->u.client.ssid_len = ssid->datalen; break; } rcu_read_unlock(); } return; out: for_each_valid_link(cr, link) cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); } static void cfg80211_update_link_bss(struct wireless_dev *wdev, struct cfg80211_bss **bss) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_internal_bss *ibss; if (!*bss) return; ibss = bss_from_pub(*bss); if (list_empty(&ibss->list)) { struct cfg80211_bss *found = NULL, *tmp = *bss; found = cfg80211_get_bss(wdev->wiphy, NULL, (*bss)->bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (found) { /* The same BSS is already updated so use it * instead, as it has latest info. */ *bss = found; } else { /* Update with BSS provided by driver, it will * be freshly added and ref cnted, we can free * the old one. * * signal_valid can be false, as we are not * expecting the BSS to be found. * * keep the old timestamp to avoid confusion */ cfg80211_bss_update(rdev, ibss, false, ibss->ts); } cfg80211_put_bss(wdev->wiphy, tmp); } } /* Consumes bss object(s) one way or another */ void cfg80211_connect_done(struct net_device *dev, struct cfg80211_connect_resp_params *params, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; size_t link_info_size = 0; unsigned int link; for_each_valid_link(params, link) { cfg80211_update_link_bss(wdev, ¶ms->links[link].bss); link_info_size += params->links[link].bssid ? ETH_ALEN : 0; link_info_size += params->links[link].addr ? ETH_ALEN : 0; } ev = kzalloc(sizeof(*ev) + (params->ap_mld_addr ? ETH_ALEN : 0) + params->req_ie_len + params->resp_ie_len + params->fils.kek_len + params->fils.pmk_len + (params->fils.pmkid ? WLAN_PMKID_LEN : 0) + link_info_size, gfp); if (!ev) { for_each_valid_link(params, link) cfg80211_put_bss(wdev->wiphy, params->links[link].bss); return; } ev->type = EVENT_CONNECT_RESULT; next = ((u8 *)ev) + sizeof(*ev); if (params->ap_mld_addr) { ev->cr.ap_mld_addr = next; memcpy((void *)ev->cr.ap_mld_addr, params->ap_mld_addr, ETH_ALEN); next += ETH_ALEN; } if (params->req_ie_len) { ev->cr.req_ie = next; ev->cr.req_ie_len = params->req_ie_len; memcpy((void *)ev->cr.req_ie, params->req_ie, params->req_ie_len); next += params->req_ie_len; } if (params->resp_ie_len) { ev->cr.resp_ie = next; ev->cr.resp_ie_len = params->resp_ie_len; memcpy((void *)ev->cr.resp_ie, params->resp_ie, params->resp_ie_len); next += params->resp_ie_len; } if (params->fils.kek_len) { ev->cr.fils.kek = next; ev->cr.fils.kek_len = params->fils.kek_len; memcpy((void *)ev->cr.fils.kek, params->fils.kek, params->fils.kek_len); next += params->fils.kek_len; } if (params->fils.pmk_len) { ev->cr.fils.pmk = next; ev->cr.fils.pmk_len = params->fils.pmk_len; memcpy((void *)ev->cr.fils.pmk, params->fils.pmk, params->fils.pmk_len); next += params->fils.pmk_len; } if (params->fils.pmkid) { ev->cr.fils.pmkid = next; memcpy((void *)ev->cr.fils.pmkid, params->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->cr.fils.update_erp_next_seq_num = params->fils.update_erp_next_seq_num; if (params->fils.update_erp_next_seq_num) ev->cr.fils.erp_next_seq_num = params->fils.erp_next_seq_num; ev->cr.valid_links = params->valid_links; for_each_valid_link(params, link) { if (params->links[link].bss) cfg80211_hold_bss( bss_from_pub(params->links[link].bss)); ev->cr.links[link].bss = params->links[link].bss; if (params->links[link].addr) { ev->cr.links[link].addr = next; memcpy((void *)ev->cr.links[link].addr, params->links[link].addr, ETH_ALEN); next += ETH_ALEN; } if (params->links[link].bssid) { ev->cr.links[link].bssid = next; memcpy((void *)ev->cr.links[link].bssid, params->links[link].bssid, ETH_ALEN); next += ETH_ALEN; } } ev->cr.status = params->status; ev->cr.timeout_reason = params->timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_connect_done); /* Consumes bss object one way or another */ void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info) { #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif unsigned int link; const u8 *connected_addr; lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) goto out; if (WARN_ON(!wdev->connected)) goto out; if (info->valid_links) { if (WARN_ON(!info->ap_mld_addr)) goto out; for_each_valid_link(info, link) { if (WARN_ON(!info->links[link].addr)) goto out; } } cfg80211_wdev_release_bsses(wdev); for_each_valid_link(info, link) { if (WARN_ON(!info->links[link].bss)) goto out; } memset(wdev->links, 0, sizeof(wdev->links)); wdev->valid_links = info->valid_links; for_each_valid_link(info, link) { cfg80211_hold_bss(bss_from_pub(info->links[link].bss)); wdev->links[link].client.current_bss = bss_from_pub(info->links[link].bss); } connected_addr = info->valid_links ? info->ap_mld_addr : info->links[0].bss->bssid; ether_addr_copy(wdev->u.client.connected_addr, connected_addr); if (info->valid_links) { for_each_valid_link(info, link) memcpy(wdev->links[link].addr, info->links[link].addr, ETH_ALEN); } wdev->unprot_beacon_reported = 0; nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (!info->valid_links) { if (info->req_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->req_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCREQIE, &wrqu, info->req_ie); } if (info->resp_ie) { memset(&wrqu, 0, sizeof(wrqu)); wrqu.data.length = info->resp_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, &wrqu, info->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; memcpy(wrqu.ap_addr.sa_data, connected_addr, ETH_ALEN); memcpy(wdev->wext.prev_bssid, connected_addr, ETH_ALEN); wdev->wext.prev_bssid_valid = true; wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); } #endif return; out: for_each_valid_link(info, link) cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } /* Consumes info->links.bss object(s) one way or another */ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; u8 *next; unsigned int link; size_t link_info_size = 0; bool bss_not_found = false; for_each_valid_link(info, link) { link_info_size += info->links[link].addr ? ETH_ALEN : 0; link_info_size += info->links[link].bssid ? ETH_ALEN : 0; if (info->links[link].bss) continue; info->links[link].bss = cfg80211_get_bss(wdev->wiphy, info->links[link].channel, info->links[link].bssid, wdev->u.client.ssid, wdev->u.client.ssid_len, wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); if (!info->links[link].bss) { bss_not_found = true; break; } } if (WARN_ON(bss_not_found)) goto out; ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len + info->fils.kek_len + info->fils.pmk_len + (info->fils.pmkid ? WLAN_PMKID_LEN : 0) + (info->ap_mld_addr ? ETH_ALEN : 0) + link_info_size, gfp); if (!ev) goto out; ev->type = EVENT_ROAMED; next = ((u8 *)ev) + sizeof(*ev); if (info->req_ie_len) { ev->rm.req_ie = next; ev->rm.req_ie_len = info->req_ie_len; memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); next += info->req_ie_len; } if (info->resp_ie_len) { ev->rm.resp_ie = next; ev->rm.resp_ie_len = info->resp_ie_len; memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); next += info->resp_ie_len; } if (info->fils.kek_len) { ev->rm.fils.kek = next; ev->rm.fils.kek_len = info->fils.kek_len; memcpy((void *)ev->rm.fils.kek, info->fils.kek, info->fils.kek_len); next += info->fils.kek_len; } if (info->fils.pmk_len) { ev->rm.fils.pmk = next; ev->rm.fils.pmk_len = info->fils.pmk_len; memcpy((void *)ev->rm.fils.pmk, info->fils.pmk, info->fils.pmk_len); next += info->fils.pmk_len; } if (info->fils.pmkid) { ev->rm.fils.pmkid = next; memcpy((void *)ev->rm.fils.pmkid, info->fils.pmkid, WLAN_PMKID_LEN); next += WLAN_PMKID_LEN; } ev->rm.fils.update_erp_next_seq_num = info->fils.update_erp_next_seq_num; if (info->fils.update_erp_next_seq_num) ev->rm.fils.erp_next_seq_num = info->fils.erp_next_seq_num; if (info->ap_mld_addr) { ev->rm.ap_mld_addr = next; memcpy((void *)ev->rm.ap_mld_addr, info->ap_mld_addr, ETH_ALEN); next += ETH_ALEN; } ev->rm.valid_links = info->valid_links; for_each_valid_link(info, link) { ev->rm.links[link].bss = info->links[link].bss; if (info->links[link].addr) { ev->rm.links[link].addr = next; memcpy((void *)ev->rm.links[link].addr, info->links[link].addr, ETH_ALEN); next += ETH_ALEN; } if (info->links[link].bssid) { ev->rm.links[link].bssid = next; memcpy((void *)ev->rm.links[link].bssid, info->links[link].bssid, ETH_ALEN); next += ETH_ALEN; } } spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); return; out: for_each_valid_link(info, link) cfg80211_put_bss(wdev->wiphy, info->links[link].bss); } EXPORT_SYMBOL(cfg80211_roamed); void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len) { lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT && wdev->iftype != NL80211_IFTYPE_AP && wdev->iftype != NL80211_IFTYPE_P2P_GO)) return; if (wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) { if (WARN_ON(!wdev->connected) || WARN_ON(!ether_addr_equal(wdev->u.client.connected_addr, peer_addr))) return; } nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, peer_addr, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *peer_addr, const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; if (WARN_ON(!peer_addr)) return; ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.peer_addr, peer_addr, ETH_ALEN); ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); ev->pa.td_bitmap_len = td_bitmap_len; memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending * connected/roamed events, they will be reported first. */ spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_port_authorized); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int i; #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; #endif lockdep_assert_wiphy(wdev->wiphy); if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; cfg80211_wdev_release_bsses(wdev); wdev->connected = false; wdev->u.client.ssid_len = 0; wdev->conn_owner_nlportid = 0; kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap); /* stop critical protocol if supported */ if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) { rdev->crit_proto_nlportid = 0; rdev_crit_proto_stop(rdev, wdev); } /* * Delete all the keys ... pairwise keys can't really * exist any more anyway, but default keys might. */ if (rdev->ops->del_key) { int max_key_idx = 5; if (wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION) || wiphy_ext_feature_isset( wdev->wiphy, NL80211_EXT_FEATURE_BEACON_PROTECTION_CLIENT)) max_key_idx = 7; for (i = 0; i <= max_key_idx; i++) rdev_del_key(rdev, dev, -1, i, false, NULL); } rdev_set_qos_map(rdev, dev, NULL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); wdev->wext.connect.ssid_len = 0; #endif schedule_work(&cfg80211_disconnect_work); } void cfg80211_disconnected(struct net_device *dev, u16 reason, const u8 *ie, size_t ie_len, bool locally_generated, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; ev = kzalloc(sizeof(*ev) + ie_len, gfp); if (!ev) return; ev->type = EVENT_DISCONNECTED; ev->dc.ie = ((u8 *)ev) + sizeof(*ev); ev->dc.ie_len = ie_len; memcpy((void *)ev->dc.ie, ie, ie_len); ev->dc.reason = reason; ev->dc.locally_generated = locally_generated; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } EXPORT_SYMBOL(cfg80211_disconnected); /* * API calls for nl80211/wext compatibility code */ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err; lockdep_assert_wiphy(wdev->wiphy); /* * If we have an ssid_len, we're trying to connect or are * already connected, so reject a new SSID unless it's the * same (which is the case for re-association.) */ if (wdev->u.client.ssid_len && (wdev->u.client.ssid_len != connect->ssid_len || memcmp(wdev->u.client.ssid, connect->ssid, wdev->u.client.ssid_len))) return -EALREADY; /* * If connected, reject (re-)association unless prev_bssid * matches the current BSSID. */ if (wdev->connected) { if (!prev_bssid) return -EALREADY; if (!ether_addr_equal(prev_bssid, wdev->u.client.connected_addr)) return -ENOTCONN; } /* * Reject if we're in the process of connecting with WEP, * this case isn't very interesting and trying to handle * it would make the code much more complex. */ if (wdev->connect_keys) return -EINPROGRESS; cfg80211_oper_and_ht_capa(&connect->ht_capa_mask, rdev->wiphy.ht_capa_mod_mask); cfg80211_oper_and_vht_capa(&connect->vht_capa_mask, rdev->wiphy.vht_capa_mod_mask); if (connkeys && connkeys->def >= 0) { int idx; u32 cipher; idx = connkeys->def; cipher = connkeys->params[idx].cipher; /* If given a WEP key we may need it for shared key auth */ if (cipher == WLAN_CIPHER_SUITE_WEP40 || cipher == WLAN_CIPHER_SUITE_WEP104) { connect->key_idx = idx; connect->key = connkeys->params[idx].key; connect->key_len = connkeys->params[idx].key_len; /* * If ciphers are not set (e.g. when going through * iwconfig), we have to set them appropriately here. */ if (connect->crypto.cipher_group == 0) connect->crypto.cipher_group = cipher; if (connect->crypto.n_ciphers_pairwise == 0) { connect->crypto.n_ciphers_pairwise = 1; connect->crypto.ciphers_pairwise[0] = cipher; } } } else { if (WARN_ON(connkeys)) return -EINVAL; /* connect can point to wdev->wext.connect which * can hold key data from a previous connection */ connect->key = NULL; connect->key_len = 0; connect->key_idx = 0; } wdev->connect_keys = connkeys; memcpy(wdev->u.client.ssid, connect->ssid, connect->ssid_len); wdev->u.client.ssid_len = connect->ssid_len; wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS : IEEE80211_BSS_TYPE_ESS; if (!rdev->ops->connect) err = cfg80211_sme_connect(wdev, connect, prev_bssid); else err = rdev_connect(rdev, dev, connect); if (err) { wdev->connect_keys = NULL; /* * This could be reassoc getting refused, don't clear * ssid_len in that case. */ if (!wdev->connected) wdev->u.client.ssid_len = 0; return err; } return 0; } int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err = 0; lockdep_assert_wiphy(wdev->wiphy); kfree_sensitive(wdev->connect_keys); wdev->connect_keys = NULL; wdev->conn_owner_nlportid = 0; if (wdev->conn) err = cfg80211_sme_disconnect(wdev, reason); else if (!rdev->ops->disconnect) cfg80211_mlme_down(rdev, dev); else if (wdev->u.client.ssid_len) err = rdev_disconnect(rdev, dev, reason); /* * Clear ssid_len unless we actually were fully connected, * in which case cfg80211_disconnected() will take care of * this later. */ if (!wdev->connected) wdev->u.client.ssid_len = 0; return err; } /* * Used to clean up after the connection / connection attempt owner socket * disconnects */ void cfg80211_autodisconnect_wk(struct work_struct *work) { struct wireless_dev *wdev = container_of(work, struct wireless_dev, disconnect_wk); struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); wiphy_lock(wdev->wiphy); if (wdev->conn_owner_nlportid) { switch (wdev->iftype) { case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, wdev->netdev, false); break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: cfg80211_stop_ap(rdev, wdev->netdev, -1, false); break; case NL80211_IFTYPE_MESH_POINT: cfg80211_leave_mesh(rdev, wdev->netdev); break; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: /* * Use disconnect_bssid if still connecting and * ops->disconnect not implemented. Otherwise we can * use cfg80211_disconnect. */ if (rdev->ops->disconnect || wdev->connected) cfg80211_disconnect(rdev, wdev->netdev, WLAN_REASON_DEAUTH_LEAVING, true); else cfg80211_mlme_deauth(rdev, wdev->netdev, wdev->disconnect_bssid, NULL, 0, WLAN_REASON_DEAUTH_LEAVING, false); break; default: break; } } wiphy_unlock(wdev->wiphy); } |
19 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Authentication token and access key management internal defs * * Copyright (C) 2003-5, 2007 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _INTERNAL_H #define _INTERNAL_H #include <linux/sched.h> #include <linux/wait_bit.h> #include <linux/cred.h> #include <linux/key-type.h> #include <linux/task_work.h> #include <linux/keyctl.h> #include <linux/refcount.h> #include <linux/watch_queue.h> #include <linux/compat.h> #include <linux/mm.h> #include <linux/vmalloc.h> struct iovec; #ifdef __KDEBUG #define kenter(FMT, ...) \ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) \ printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ printk(KERN_DEBUG " "FMT"\n", ##__VA_ARGS__) #else #define kenter(FMT, ...) \ no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) #define kleave(FMT, ...) \ no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__) #define kdebug(FMT, ...) \ no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif extern struct key_type key_type_dead; extern struct key_type key_type_user; extern struct key_type key_type_logon; /*****************************************************************************/ /* * Keep track of keys for a user. * * This needs to be separate to user_struct to avoid a refcount-loop * (user_struct pins some keyrings which pin this struct). * * We also keep track of keys under request from userspace for this UID here. */ struct key_user { struct rb_node node; struct mutex cons_lock; /* construction initiation lock */ spinlock_t lock; refcount_t usage; /* for accessing qnkeys & qnbytes */ atomic_t nkeys; /* number of keys */ atomic_t nikeys; /* number of instantiated keys */ kuid_t uid; int qnkeys; /* number of keys allocated to this user */ int qnbytes; /* number of bytes allocated to this user */ }; extern struct rb_root key_user_tree; extern spinlock_t key_user_lock; extern struct key_user root_key_user; extern struct key_user *key_user_lookup(kuid_t uid); extern void key_user_put(struct key_user *user); /* * Key quota limits. * - root has its own separate limits to everyone else */ extern unsigned key_quota_root_maxkeys; extern unsigned key_quota_root_maxbytes; extern unsigned key_quota_maxkeys; extern unsigned key_quota_maxbytes; #define KEYQUOTA_LINK_BYTES 4 /* a link in a keyring is worth 4 bytes */ extern struct kmem_cache *key_jar; extern struct rb_root key_serial_tree; extern spinlock_t key_serial_lock; extern struct mutex key_construction_mutex; extern wait_queue_head_t request_key_conswq; extern void key_set_index_key(struct keyring_index_key *index_key); extern struct key_type *key_type_lookup(const char *type); extern void key_type_put(struct key_type *ktype); extern int __key_link_lock(struct key *keyring, const struct keyring_index_key *index_key); extern int __key_move_lock(struct key *l_keyring, struct key *u_keyring, const struct keyring_index_key *index_key); extern int __key_link_begin(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit **_edit); extern int __key_link_check_live_key(struct key *keyring, struct key *key); extern void __key_link(struct key *keyring, struct key *key, struct assoc_array_edit **_edit); extern void __key_link_end(struct key *keyring, const struct keyring_index_key *index_key, struct assoc_array_edit *edit); extern key_ref_t find_key_to_update(key_ref_t keyring_ref, const struct keyring_index_key *index_key); struct keyring_search_context { struct keyring_index_key index_key; const struct cred *cred; struct key_match_data match_data; unsigned flags; #define KEYRING_SEARCH_NO_STATE_CHECK 0x0001 /* Skip state checks */ #define KEYRING_SEARCH_DO_STATE_CHECK 0x0002 /* Override NO_STATE_CHECK */ #define KEYRING_SEARCH_NO_UPDATE_TIME 0x0004 /* Don't update times */ #define KEYRING_SEARCH_NO_CHECK_PERM 0x0008 /* Don't check permissions */ #define KEYRING_SEARCH_DETECT_TOO_DEEP 0x0010 /* Give an error on excessive depth */ #define KEYRING_SEARCH_SKIP_EXPIRED 0x0020 /* Ignore expired keys (intention to replace) */ #define KEYRING_SEARCH_RECURSE 0x0040 /* Search child keyrings also */ int (*iterator)(const void *object, void *iterator_data); /* Internal stuff */ int skipped_ret; bool possessed; key_ref_t result; time64_t now; }; extern bool key_default_cmp(const struct key *key, const struct key_match_data *match_data); extern key_ref_t keyring_search_rcu(key_ref_t keyring_ref, struct keyring_search_context *ctx); extern key_ref_t search_cred_keyrings_rcu(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings_rcu(struct keyring_search_context *ctx); extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int look_up_user_keyrings(struct key **, struct key **); extern struct key *get_user_session_keyring_rcu(const struct cred *); extern int install_thread_keyring_to_cred(struct cred *); extern int install_process_keyring_to_cred(struct cred *); extern int install_session_keyring_to_cred(struct cred *, struct key *); extern struct key *request_key_and_link(struct key_type *type, const char *description, struct key_tag *domain_tag, const void *callout_info, size_t callout_len, void *aux, struct key *dest_keyring, unsigned long flags); extern bool lookup_user_key_possessed(const struct key *key, const struct key_match_data *match_data); extern long join_session_keyring(const char *name); extern void key_change_session_keyring(struct callback_head *twork); extern struct work_struct key_gc_work; extern unsigned key_gc_delay; extern void keyring_gc(struct key *keyring, time64_t limit); extern void keyring_restriction_gc(struct key *keyring, struct key_type *dead_type); void key_set_expiry(struct key *key, time64_t expiry); extern void key_schedule_gc(time64_t gc_at); extern void key_schedule_gc_links(void); extern void key_gc_keytype(struct key_type *ktype); extern int key_task_permission(const key_ref_t key_ref, const struct cred *cred, enum key_need_perm need_perm); static inline void notify_key(struct key *key, enum key_notification_subtype subtype, u32 aux) { #ifdef CONFIG_KEY_NOTIFICATIONS struct key_notification n = { .watch.type = WATCH_TYPE_KEY_NOTIFY, .watch.subtype = subtype, .watch.info = watch_sizeof(n), .key_id = key_serial(key), .aux = aux, }; post_watch_notification(key->watchers, &n.watch, current_cred(), n.key_id); #endif } /* * Check to see whether permission is granted to use a key in the desired way. */ static inline int key_permission(const key_ref_t key_ref, enum key_need_perm need_perm) { return key_task_permission(key_ref, current_cred(), need_perm); } extern struct key_type key_type_request_key_auth; extern struct key *request_key_auth_new(struct key *target, const char *op, const void *callout_info, size_t callout_len, struct key *dest_keyring); extern struct key *key_get_instantiation_authkey(key_serial_t target_id); /* * Determine whether a key is dead. */ static inline bool key_is_dead(const struct key *key, time64_t limit) { time64_t expiry = key->expiry; if (expiry != TIME64_MAX) { if (!(key->type->flags & KEY_TYPE_INSTANT_REAP)) expiry += key_gc_delay; if (expiry <= limit) return true; } return key->flags & ((1 << KEY_FLAG_DEAD) | (1 << KEY_FLAG_INVALIDATED)) || key->domain_tag->removed; } /* * keyctl() functions */ extern long keyctl_get_keyring_ID(key_serial_t, int); extern long keyctl_join_session_keyring(const char __user *); extern long keyctl_update_key(key_serial_t, const void __user *, size_t); extern long keyctl_revoke_key(key_serial_t); extern long keyctl_keyring_clear(key_serial_t); extern long keyctl_keyring_link(key_serial_t, key_serial_t); extern long keyctl_keyring_move(key_serial_t, key_serial_t, key_serial_t, unsigned int); extern long keyctl_keyring_unlink(key_serial_t, key_serial_t); extern long keyctl_describe_key(key_serial_t, char __user *, size_t); extern long keyctl_keyring_search(key_serial_t, const char __user *, const char __user *, key_serial_t); extern long keyctl_read_key(key_serial_t, char __user *, size_t); extern long keyctl_chown_key(key_serial_t, uid_t, gid_t); extern long keyctl_setperm_key(key_serial_t, key_perm_t); extern long keyctl_instantiate_key(key_serial_t, const void __user *, size_t, key_serial_t); extern long keyctl_negate_key(key_serial_t, unsigned, key_serial_t); extern long keyctl_set_reqkey_keyring(int); extern long keyctl_set_timeout(key_serial_t, unsigned); extern long keyctl_assume_authority(key_serial_t); extern long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen); extern long keyctl_session_to_parent(void); extern long keyctl_reject_key(key_serial_t, unsigned, unsigned, key_serial_t); extern long keyctl_instantiate_key_iov(key_serial_t, const struct iovec __user *, unsigned, key_serial_t); extern long keyctl_invalidate_key(key_serial_t); extern long keyctl_restrict_keyring(key_serial_t id, const char __user *_type, const char __user *_restriction); #ifdef CONFIG_PERSISTENT_KEYRINGS extern long keyctl_get_persistent(uid_t, key_serial_t); extern unsigned persistent_keyring_expiry; #else static inline long keyctl_get_persistent(uid_t uid, key_serial_t destring) { return -EOPNOTSUPP; } #endif #ifdef CONFIG_KEY_DH_OPERATIONS extern long keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params __user *); extern long __keyctl_dh_compute(struct keyctl_dh_params __user *, char __user *, size_t, struct keyctl_kdf_params *); #ifdef CONFIG_COMPAT extern long compat_keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct compat_keyctl_kdf_params __user *kdf); #endif #define KEYCTL_KDF_MAX_OUTPUT_LEN 1024 /* max length of KDF output */ #define KEYCTL_KDF_MAX_OI_LEN 64 /* max length of otherinfo */ #else static inline long keyctl_dh_compute(struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #ifdef CONFIG_COMPAT static inline long compat_keyctl_dh_compute( struct keyctl_dh_params __user *params, char __user *buffer, size_t buflen, struct keyctl_kdf_params __user *kdf) { return -EOPNOTSUPP; } #endif #endif #ifdef CONFIG_ASYMMETRIC_KEY_TYPE extern long keyctl_pkey_query(key_serial_t, const char __user *, struct keyctl_pkey_query __user *); extern long keyctl_pkey_verify(const struct keyctl_pkey_params __user *, const char __user *, const void __user *, const void __user *); extern long keyctl_pkey_e_d_s(int, const struct keyctl_pkey_params __user *, const char __user *, const void __user *, void __user *); #else static inline long keyctl_pkey_query(key_serial_t id, const char __user *_info, struct keyctl_pkey_query __user *_res) { return -EOPNOTSUPP; } static inline long keyctl_pkey_verify(const struct keyctl_pkey_params __user *params, const char __user *_info, const void __user *_in, const void __user *_in2) { return -EOPNOTSUPP; } static inline long keyctl_pkey_e_d_s(int op, const struct keyctl_pkey_params __user *params, const char __user *_info, const void __user *_in, void __user *_out) { return -EOPNOTSUPP; } #endif extern long keyctl_capabilities(unsigned char __user *_buffer, size_t buflen); #ifdef CONFIG_KEY_NOTIFICATIONS extern long keyctl_watch_key(key_serial_t, int, int); #else static inline long keyctl_watch_key(key_serial_t key_id, int watch_fd, int watch_id) { return -EOPNOTSUPP; } #endif /* * Debugging key validation */ #ifdef KEY_DEBUGGING extern void __key_check(const struct key *); static inline void key_check(const struct key *key) { if (key && (IS_ERR(key) || key->magic != KEY_DEBUG_MAGIC)) __key_check(key); } #else #define key_check(key) do {} while(0) #endif #endif /* _INTERNAL_H */ |
103 29 1 92 82 68 111 90 69 32 98 49 49 49 13 8 13 13 31 6 12 13 45 1 2 1 41 37 9 36 15 17 12 45 26 13 32 32 32 32 45 11 11 11 10 10 7 7 7 12 2 10 2 8 10 10 12 8 8 4 5 6 5 1 1 2 1 2 2 1 1 2 2 7 49 8 57 4 8 38 46 21 18 8 16 3 14 48 106 106 105 57 10 16 105 42 104 104 2 101 33 23 31 24 13 48 103 12 1 1 1 1 33 25 14 57 57 33 32 33 33 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 67 9 5 4 12 12 4 4 3 3 3 3 35 35 9 11 11 11 29 11 9 35 35 34 1 12 27 21 20 35 9 29 26 11 1 53 112 1 113 3 4 9 8 5 8 7 7 5 2 1 1 28 1 19 9 10 9 1 10 10 11 11 11 1 11 8 10 1 1 16 1 15 15 15 18 1 1 17 10 7 7 7 11 10 3 3 5 60 33 3 83 83 8 75 5 35 35 7 41 7 6 85 1 61 30 3 26 21 5 26 85 7 7 7 5 12 12 3 4 2 3 2 1 2 1 2 62 62 1 62 52 52 5 60 2 54 5 52 54 54 54 54 49 49 53 4 5 4 7 36 10 45 1 5 4 47 5 4 47 4 49 53 4 49 5 51 53 58 4 54 4 58 58 48 1 47 48 2 2 45 45 1 1 10 3 7 80 45 62 45 18 29 22 13 12 9 4 25 4 9 10 45 80 15 15 15 10 8 8 8 8 7 8 8 8 19 16 14 3 15 10 8 44 15 8 10 8 8 10 15 15 19 16 17 2 13 16 3 2 8 9 2 10 14 2 8 3 19 10 17 1 18 7 15 17 6 1 5 57 27 29 29 6 28 17 13 1 13 13 1 1 1 1 1 2 2 2 8 9 15 15 10 4 4 139 139 9 9 9 9 9 9 1 9 9 9 9 1 8 3 13 1 3 12 55 55 55 55 55 55 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 5 5 3 4 7 7 7 7 7 2 1 4 3 7 7 7 7 7 1 7 6 6 1 1 6 6 6 6 1 1 1 14 14 14 14 5 14 14 14 104 104 79 1 78 5 5 5 33 33 29 8 21 8 2 22 22 22 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 10592 10593 10594 10595 10596 10597 10598 10599 10600 10601 10602 10603 10604 10605 10606 10607 10608 10609 10610 10611 10612 10613 10614 10615 10616 10617 10618 10619 10620 10621 10622 10623 10624 10625 10626 10627 10628 10629 10630 10631 10632 10633 10634 10635 10636 10637 10638 10639 10640 10641 10642 10643 10644 10645 10646 10647 10648 10649 10650 10651 10652 10653 10654 10655 10656 10657 10658 10659 10660 10661 10662 10663 10664 10665 10666 10667 10668 10669 10670 10671 10672 10673 10674 10675 10676 10677 10678 10679 10680 10681 10682 10683 10684 10685 10686 10687 10688 10689 10690 10691 10692 10693 10694 10695 10696 10697 10698 10699 10700 10701 10702 10703 10704 10705 10706 10707 10708 10709 10710 10711 10712 10713 10714 10715 10716 10717 10718 10719 10720 10721 10722 10723 10724 10725 10726 10727 10728 10729 10730 10731 10732 10733 10734 10735 10736 10737 10738 10739 10740 10741 10742 10743 10744 10745 10746 10747 10748 10749 10750 10751 10752 10753 10754 10755 10756 10757 10758 10759 10760 10761 10762 10763 10764 10765 10766 10767 10768 10769 10770 10771 10772 10773 10774 10775 10776 10777 10778 10779 10780 10781 10782 10783 10784 10785 10786 10787 10788 10789 10790 10791 10792 10793 10794 10795 10796 10797 10798 10799 10800 10801 10802 10803 10804 10805 10806 10807 10808 10809 10810 10811 10812 10813 10814 10815 10816 10817 10818 10819 10820 10821 10822 10823 10824 10825 10826 10827 10828 10829 10830 10831 10832 10833 10834 10835 10836 10837 10838 10839 10840 10841 10842 10843 10844 10845 10846 10847 10848 10849 10850 10851 10852 10853 10854 10855 10856 10857 10858 10859 10860 10861 10862 10863 10864 10865 10866 10867 10868 10869 10870 10871 10872 10873 10874 10875 10876 10877 10878 10879 10880 10881 10882 10883 10884 10885 10886 10887 10888 10889 10890 10891 10892 10893 10894 10895 10896 10897 10898 10899 10900 10901 10902 10903 10904 10905 10906 10907 10908 10909 10910 10911 10912 10913 10914 10915 10916 10917 10918 10919 10920 10921 10922 10923 10924 10925 10926 10927 10928 10929 10930 10931 10932 10933 10934 10935 10936 10937 10938 10939 10940 10941 10942 10943 10944 10945 10946 10947 10948 10949 10950 10951 10952 10953 10954 10955 10956 10957 10958 10959 10960 10961 10962 10963 10964 10965 10966 10967 10968 10969 10970 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> #include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> #include <asm/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "print-tree.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" #include "zoned.h" #include "subpage.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "dir-item.h" #include "file-item.h" #include "uuid-tree.h" #include "ioctl.h" #include "file.h" #include "acl.h" #include "relocation.h" #include "verity.h" #include "super.h" #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; struct btrfs_ordered_extent *ordered; bool data_space_reserved; bool nocow_done; }; struct btrfs_dio_private { /* Range of I/O */ u64 file_offset; u32 bytes; /* This must be last */ struct btrfs_bio bbio; }; static struct bio_set btrfs_dio_bioset; struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; }; /* * Used by data_reloc_print_warning_inode() to pass needed info for filename * resolution and output of error message. */ struct data_reloc_warn { struct btrfs_path path; struct btrfs_fs_info *fs_info; u64 extent_item_size; u64 logical; int mirror_num; }; static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) { struct data_reloc_warn *warn = warn_ctx; struct btrfs_fs_info *fs_info = warn->fs_info; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key key; unsigned int nofs_flag; u32 nlink; int ret; local_root = btrfs_get_fs_root(fs_info, root, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; } /* This makes the path point to (inum INODE_ITEM ioff). */ key.objectid = inum; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); if (ret) { btrfs_put_root(local_root); btrfs_release_path(&warn->path); goto err; } eb = warn->path.nodes[0]; inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(&warn->path); nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, &warn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; /* * -ENOMEM, not a critical error, just output an generic error * without filename. */ btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", warn->logical, warn->mirror_num, root, inum, offset); return ret; } ret = paths_from_inode(inum, ipath); if (ret < 0) goto err; /* * We deliberately ignore the bit ipath might have been too small to * hold all of the paths here */ for (int i = 0; i < ipath->fspath->elem_cnt; i++) { btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", warn->logical, warn->mirror_num, root, inum, offset, fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); } btrfs_put_root(local_root); free_ipath(ipath); return 0; err: btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", warn->logical, warn->mirror_num, root, inum, offset, ret); free_ipath(ipath); return ret; } /* * Do extra user-friendly error output (e.g. lookup all the affected files). * * Return true if we succeeded doing the backref lookup. * Return false if such lookup failed, and has to fallback to the old error message. */ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, const u8 *csum, const u8 *csum_expected, int mirror_num) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_path path = { 0 }; struct btrfs_key found_key = { 0 }; struct extent_buffer *eb; struct btrfs_extent_item *ei; const u32 csum_size = fs_info->csum_size; u64 logical; u64 flags; u32 item_size; int ret; mutex_lock(&fs_info->reloc_mutex); logical = btrfs_get_reloc_bg_bytenr(fs_info); mutex_unlock(&fs_info->reloc_mutex); if (logical == U64_MAX) { btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", inode->root->root_key.objectid, btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); return; } logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", inode->root->root_key.objectid, btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); if (ret < 0) { btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", logical, ret); return; } eb = path.nodes[0]; ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); item_size = btrfs_item_size(eb, path.slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsigned long ptr = 0; u64 ref_root; u8 ref_level; while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); if (ret < 0) { btrfs_warn_rl(fs_info, "failed to resolve tree backref for logical %llu: %d", logical, ret); break; } if (ret > 0) break; btrfs_warn_rl(fs_info, "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", logical, mirror_num, (ref_level ? "node" : "leaf"), ref_level, ref_root); } btrfs_release_path(&path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; struct data_reloc_warn reloc_warn = { 0 }; btrfs_release_path(&path); ctx.bytenr = found_key.objectid; ctx.extent_item_pos = logical - found_key.objectid; ctx.fs_info = fs_info; reloc_warn.logical = logical; reloc_warn.extent_item_size = found_key.offset; reloc_warn.mirror_num = mirror_num; reloc_warn.fs_info = fs_info; iterate_extent_inodes(&ctx, true, data_reloc_print_warning_inode, &reloc_warn); } } static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", root->root_key.objectid, btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } } /* * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) down_write(&inode->i_mmap_lock); return 0; } /* * Unock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. */ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(&inode->vfs_inode); else inode_unlock(&inode->vfs_inode); } /* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, struct page *locked_page, u64 offset, u64 bytes) { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; u64 page_start = 0, page_end = 0; struct page *page; if (locked_page) { page_start = page_offset(locked_page); page_end = page_start + PAGE_SIZE - 1; } while (index <= end_index) { /* * For locked page, we will call btrfs_mark_ordered_io_finished * through btrfs_mark_ordered_io_finished() on it * in run_delalloc_range() for the error handling, which will * clear page Ordered and run the ordered extent accounting. * * Here we can't just clear the Ordered bit, or * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } page = find_get_page(inode->vfs_inode.i_mapping, index); index++; if (!page) continue; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, offset, bytes); put_page(page); } if (locked_page) { /* The locked page covers the full range, nothing needs to be done */ if (bytes + offset <= page_start + PAGE_SIZE) return; /* * In case this page belongs to the delalloc range being * instantiated then skip it, since the first page of a range is * going to be properly cleaned up by the caller of * run_delalloc_range */ if (page_start >= offset && page_end <= (offset + bytes - 1)) { bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; offset = page_offset(locked_page) + PAGE_SIZE; } } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { int err; if (args->default_acl) { err = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); if (err) return err; } if (args->acl) { err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); if (err) return err; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); return btrfs_xattr_security_init(trans, args->inode, args->dir, &args->dentry->d_name); } /* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, struct page **compressed_pages, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct page *page = NULL; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; u64 i_size; ASSERT((compressed_size > 0 && compressed_pages) || (compressed_size == 0 && !compressed_pages)); if (compressed_size && compressed_pages) cur_size = compressed_size; if (!extent_inserted) { struct btrfs_key key; size_t datasize; key.objectid = btrfs_ino(inode); key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) goto fail; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { struct page *cpage; int i = 0; while (compressed_size > 0) { cpage = compressed_pages[i]; cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); kunmap_local(kaddr); i++; ptr += cur_size; compressed_size -= cur_size; } btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* * We align size to sectorsize for inline extents just for simplicity * sake. */ ret = btrfs_inode_set_file_extent_range(inode, 0, ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* * We're an inline extent, so nobody can extend the file past i_size * without locking a page we already have locked. * * We must do any i_size and inode updates before we unlock the pages. * Otherwise we could end up racing with unlink. */ i_size = i_size_read(&inode->vfs_inode); if (update_i_size && size > i_size) { i_size_write(&inode->vfs_inode, size); i_size = size; } inode->disk_i_size = i_size; fail: return ret; } /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. */ static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct page **compressed_pages, bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; /* * We can create an inline extent if it ends at or beyond the current * i_size, is no larger than a sector (decompressed), and the (possibly * compressed) data fits in a leaf and the configured maximum inline * size. */ if (size < i_size_read(&inode->vfs_inode) || size > fs_info->sectorsize || data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || data_len > fs_info->max_inline) return 1; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } trans->block_rsv = &inode->block_rsv; drop_args.path = path; drop_args.start = 0; drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_pages, update_i_size); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); if (ret && ret != -ENOSPC) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. * And at reserve time, it's always aligned to page size, so * just free one page here. */ btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; } struct async_extent { u64 start; u64 ram_size; u64 compressed_size; struct page **pages; unsigned long nr_pages; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; struct async_cow *async_cow; }; struct async_cow { atomic_t num_chunks; struct async_chunk chunks[]; }; static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct page **pages, unsigned long nr_pages, int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); BUG_ON(!async_extent); /* -ENOMEM */ async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; async_extent->pages = pages; async_extent->nr_pages = nr_pages; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } /* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), KERN_ERR "BTRFS: unexpected compression for ino %llu\n", btrfs_ino(inode)); return 0; } /* * Special check for subpage. * * We lock the full page then run each delalloc range in the page, thus * for the following case, we will hit some subpage specific corner case: * * 0 32K 64K * | |///////| |///////| * \- A \- B * * In above case, both range A and range B will try to unlock the full * page [0, 64K), causing the one finished later will have page * unlocked already, triggering various page lock requirement BUG_ON()s. * * So here we add an artificial limit that subpage compression can only * if the range is fully page aligned. * * In theory we only need to ensure the first page is fully covered, but * the tailing partial page will be locked until the full compression * finishes, delaying the write of other range. * * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range * first to prevent any submitted async extent to unlock the full page. * By this, we can ensure for subpage case that only the last async_cow * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(end + 1)) return 0; } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* defrag ioctl */ if (inode->defrag_compress) return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) return btrfs_compress_heuristic(&inode->vfs_inode, start, end); return 0; } static inline void inode_should_defrag(struct btrfs_inode *inode, u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) btrfs_add_inode_defrag(NULL, inode, small_write); } /* * Work queue call back to started compression on a file and pages. * * This is done inside an ordered work queue, and the compression is spread * across many cpus. The actual IO submission is step two, and the ordered work * queue takes care of making sure that happens in the same order things were * put onto the queue by writepages and friends. * * If this code finds it can't get good compression, it puts an entry onto the * work queue to write the uncompressed bytes. This makes sure that both * compressed inodes and uncompressed inodes are written in the same order that * the flusher thread sent them down. */ static void compress_file_range(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; u64 i_size; int ret = 0; struct page **pages; unsigned long nr_pages; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int poff; int i; int compress_type = fs_info->compress_type; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to call clear_page_dirty_for_io on each page in the range. * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us. */ barrier(); i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: pages = NULL; nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1; nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES); /* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner. */ if (actual_end <= start) goto cleanup_and_bail_uncompressed; total_compressed = actual_end - start; /* * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* * For subpage case, we require full page alignment for the sector * aligned range. * Thus we must also check against @actual_end, not just @end. */ if (blocksize < PAGE_SIZE) { if (!PAGE_ALIGNED(start) || !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; ret = 0; /* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios. */ if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. */ goto cleanup_and_bail_uncompressed; } if (inode->defrag_compress) compress_type = inode->defrag_compress; else if (inode->prop_compress) compress_type = inode->prop_compress; /* Compression level is applied here. */ ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4), mapping, start, pages, &nr_pages, &total_in, &total_compressed); if (ret) goto mark_incompressible; /* * Zero the tail end of the last page, as we might be sending it down * to disk. */ poff = offset_in_page(total_compressed); if (poff) memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff); /* * Try to create an inline extent. * * If we didn't compress the entire range, try to create an uncompressed * inline extent, else a compressed one. * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE) { if (total_in < actual_end) { ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); } else { ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, false); } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING; if (ret < 0) mapping_set_error(mapping, -EIO); /* * inline extent creation worked or returned error, * we don't need to create any more async work items. * Unlock and free up our temp pages. * * We use DO_ACCOUNTING here because we need the * delalloc_release_metadata to be done _after_ we drop * our outstanding extent for clearing delalloc for this * range. */ extent_clear_unlock_delalloc(inode, start, end, NULL, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); goto free_pages; } } /* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things. */ total_compressed = ALIGN(total_compressed, blocksize); /* * One last check to make sure the compression is really a win, compare * the page count read with the blocks on disk, compression must free at * least one sector. */ total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) goto mark_incompressible; /* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ add_async_extent(async_chunk, start, total_in, total_compressed, pages, nr_pages, compress_type); if (start + total_in < end) { start += total_in; cond_resched(); goto again; } return; mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); put_page(pages[i]); } kfree(pages); } } static void free_async_extent_pages(struct async_extent *async_extent) { int i; if (!async_extent->pages) return; for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); put_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; async_extent->pages = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, struct page *locked_page) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end, .no_cgroup_owner = 1, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); if (locked_page) { const u64 page_start = page_offset(locked_page); set_page_writeback(locked_page); end_page_writeback(locked_page); btrfs_mark_ordered_io_finished(inode, locked_page, page_start, PAGE_SIZE, !ret); mapping_set_error(locked_page->mapping, ret); unlock_page(locked_page); } } } static void submit_one_async_extent(struct async_chunk *async_chunk, struct async_extent *async_extent, u64 *alloc_hint) { struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; struct btrfs_key ins; struct page *locked_page = NULL; struct extent_map *em; int ret = 0; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; if (async_chunk->blkcg_css) kthread_associate_blkcg(async_chunk->blkcg_css); /* * If async_chunk->locked_page is in the async_extent range, we need to * handle it. */ if (async_chunk->locked_page) { u64 locked_page_start = page_offset(async_chunk->locked_page); u64 locked_page_end = locked_page_start + PAGE_SIZE - 1; if (!(start >= locked_page_end || end <= locked_page_start)) locked_page = async_chunk->locked_page; } lock_extent(io_tree, start, end, NULL); if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { submit_uncompressed_range(inode, async_extent, locked_page); goto done; } ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { /* * Here we used to try again by going back to non-compressed * path for ENOSPC. But we can't reserve space even for * compressed size, how could it work for uncompressed size * which requires larger size? So here we directly go error * path. */ goto out_free; } /* Here we're doing allocation and writeback of the compressed pages */ em = create_io_em(inode, start, async_extent->ram_size, /* len */ start, /* orig_start */ ins.objectid, /* block_start */ ins.offset, /* block_len */ ins.offset, /* orig_block_len */ async_extent->ram_size, /* ram_bytes */ async_extent->compress_type, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, /* file_offset */ async_extent->ram_size, /* num_bytes */ async_extent->ram_size, /* ram_bytes */ ins.objectid, /* disk_bytenr */ ins.offset, /* disk_num_bytes */ 0, /* offset */ 1 << BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, async_extent->pages, /* compressed_pages */ async_extent->nr_pages, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_free: mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", root->root_key.objectid, btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; read_lock(&em_tree->lock); em = search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ if (em->block_start >= EXTENT_MAP_LAST_BYTE) { free_extent_map(em); em = search_extent_mapping(em_tree, 0, 0); if (em && em->block_start < EXTENT_MAP_LAST_BYTE) alloc_hint = em->block_start; if (em) free_extent_map(em); } else { alloc_hint = em->block_start; free_extent_map(em); } } read_unlock(&em_tree->lock); return alloc_hint; } /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * * locked_page is the page that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * * When this function fails, it unlocks all pages except @locked_page. * * When this function successfully creates an inline extent, it returns 1 and * unlocks all pages including locked_page and starts I/O on them. * (In reality inline extents are limited to a single page, so locked_page is * the only page handled anyway). * * When this function succeed and creates a normal extent, the page locking * status depends on the passed in flags: * * - If @keep_locked is set, all pages are kept locked. * - Else all pages except for @locked_page are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are kept * intact. So, the caller must clean them up by calling * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, u64 *done_offset, bool keep_locked, bool no_inline) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsigned long page_ops; bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(inode, start, end, num_bytes, SZ_64K); /* * Due to the page size limit, for subpage we can only trigger the * writeback for the dirty sectors of page, that means data writeback * is doing more writeback than what we want. * * This is especially unexpected for some call sites like fallocate, * where we only increase i_size after everything is done. * This means we can trigger inline extent even if we didn't want to. * So here we skip inline extent creation completely. */ if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) { u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret == 0) { /* * We use DO_ACCOUNTING here because we need the * delalloc_release_metadata to be run _after_ we drop * our outstanding extent for clearing delalloc for this * range. */ extent_clear_unlock_delalloc(inode, start, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); /* * locked_page is locked by the caller of * writepage_delalloc(), not locked by * __process_pages_contig(). * * We can't let __process_pages_contig() to unlock it, * as it doesn't have any subpage::writers recorded. * * Here we manually unlock the page, since the caller * can't determine if it's an inline extent or a * compressed extent. */ unlock_page(locked_page); ret = 1; goto done; } else if (ret < 0) { goto out_unlock; } } alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items. */ if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; cur_alloc_size = num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned * file systems, which is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at * least one zone to finish before retrying the * allocation. Otherwise ask the caller to write out * the already allocated blocks before coming back to * us, or return -ENOSPC if it can't handle retries. */ ASSERT(btrfs_is_zoned(fs_info)); if (start == orig_start) { wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, TASK_UNINTERRUPTIBLE); continue; } if (done_offset) { *done_offset = start - 1; return 0; } ret = -ENOSPC; } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; extent_reserved = true; ram_size = ins.offset; em = create_io_em(inode, start, ins.offset, /* len */ start, /* orig_start */ ins.objectid, /* block_start */ ins.offset, /* block_len */ ins.offset, /* orig_block_len */ ram_size, /* ram_bytes */ BTRFS_COMPRESS_NONE, /* compress_type */ BTRFS_ORDERED_REGULAR /* type */); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_reserve; } free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, ram_size, ram_size, ins.objectid, cur_alloc_size, 0, 1 << BTRFS_ORDERED_REGULAR, BTRFS_COMPRESS_NONE); if (IS_ERR(ordered)) { ret = PTR_ERR(ordered); goto out_drop_extent_cache; } if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(ordered); /* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent. */ if (ret) btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* * We're not doing compressed IO, don't unlock the first page * (which the caller expects to stay locked), don't clear any * dirty bits and don't set any writeback bits * * Do set the Ordered (Private2) bit so we know this page was * properly setup for writepage. */ page_ops = (keep_locked ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; extent_reserved = false; /* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit. */ if (ret) goto out_unlock; } done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below. */ clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (1). We have already instantiated the ordered extents * for this region. They are cleaned up by * btrfs_cleanup_ordered_extents() in e.g, * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup * function. * * However, in case of @keep_locked, we still need to unlock the pages * (except @locked_page) to ensure all the pages are unlocked. */ if (keep_locked && orig_start < start) { if (!locked_page) mapping_set_error(inode->vfs_inode.i_mapping, ret); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_page, 0, page_ops); } /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_page, clear_bits, page_ops); start += cur_alloc_size; } /* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ if (start < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits, page_ops); } return ret; } /* * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. * * If called with @do_free == true then it'll try to finish the work and free * the work struct eventually. */ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); struct async_extent *async_extent; unsigned long nr_pages; u64 alloc_hint = 0; if (do_free) { struct async_chunk *async_chunk; struct async_cow *async_cow; async_chunk = container_of(work, struct async_chunk, work); btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); async_cow = async_chunk->async_cow; if (atomic_dec_and_test(&async_cow->num_chunks)) kvfree(async_cow); return; } nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { async_extent = list_entry(async_chunk->extents.next, struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); } static bool run_delalloc_compressed(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!ctx) return false; unlock_extent(&inode->io_tree, start, end, NULL); set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { u64 cur_end = min(end, start + SZ_512K - 1); /* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; INIT_LIST_HEAD(&async_chunk[i].extents); /* * The locked_page comes all the way from writepage and its * the original page we were actually given. As we spread * this large delalloc region across multiple async_chunk * structs, only the first struct needs a pointer to locked_page * * This way we don't need racey decisions about who is supposed * to unlock it. */ if (locked_page) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page. */ wbc_account_cgroup_owner(wbc, locked_page, cur_end - start); async_chunk[i].locked_page = locked_page; locked_page = NULL; } else { async_chunk[i].locked_page = NULL; } if (blkcg_css != blkcg_root_css) { css_get(blkcg_css); async_chunk[i].blkcg_css = blkcg_css; async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; } else { async_chunk[i].blkcg_css = NULL; } btrfs_init_work(&async_chunk[i].work, compress_file_range, submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); start = cur_end + 1; } return true; } /* * Run the delalloc range from start to end, and write back any dirty pages * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { u64 done_offset = end; int ret; while (start <= end) { ret = cow_file_range(inode, locked_page, start, end, &done_offset, true, false); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_page, start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, bool nowait) { struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr); struct btrfs_ordered_sum *sums; int ret; LIST_HEAD(list); ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, &list, 0, nowait); if (ret == 0 && list_empty(&list)) return 0; while (!list_empty(&list)) { sums = list_entry(list.next, struct btrfs_ordered_sum, list); list_del(&sums->list); kfree(sums); } if (ret < 0) return ret; return 1; } static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; u64 range_start = start; u64 count; int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was * made we had not enough available data space and therefore we did not * reserve data space for it, since we though we could do NOCOW for the * respective file range (either there is prealloc extent or the inode * has the NOCOW bit set). * * However when we need to fallback to COW mode (because for example the * block group for the corresponding extent was turned to RO mode by a * scrub or relocation) we need to do the following: * * 1) We increment the bytes_may_use counter of the data space info. * If COW succeeds, it allocates a new data extent and after doing * that it decrements the space info's bytes_may_use counter and * increments its bytes_reserved counter by the same amount (we do * this at btrfs_add_reserved_bytes()). So we need to increment the * bytes_may_use counter to compensate (when space is reserved at * buffered write time, the bytes_may_use counter is incremented); * * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so * that if the COW path fails for any reason, it decrements (through * extent_clear_unlock_delalloc()) the bytes_may_use counter of the * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free * space cache inode or an inode of the data relocation tree, we must * also increment bytes_may_use of the data space_info for the same * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; if (is_space_ino || is_reloc_ino) bytes = range_bytes; spin_lock(&sinfo->lock); btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, NULL); } /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. */ ret = cow_file_range(inode, locked_page, start, end, NULL, false, true); ASSERT(ret != 1); return ret; } struct can_nocow_file_extent_args { /* Input fields. */ /* Start file offset of the range we want to NOCOW. */ u64 start; /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; bool strict; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. */ bool free_path; /* Output fields. Only set when can_nocow_file_extent() returns 1. */ u64 disk_bytenr; u64 disk_num_bytes; u64 extent_offset; /* Number of bytes that can be written to in NOCOW mode. */ u64 num_bytes; }; /* * Check if we can NOCOW the file extent that the path points to. * This function may return with the path released, so the caller should check * if path->nodes[0] is NULL or not if it needs to use the path afterwards. * * Returns: < 0 on error * 0 if we can not NOCOW * 1 if we can NOCOW */ static int can_nocow_file_extent(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_inode *inode, struct can_nocow_file_extent_args *args) { const bool is_freespace_inode = btrfs_is_free_space_inode(inode); struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; u64 extent_end; u8 extent_type; int can_nocow = 0; int ret = 0; bool nowait = path->nowait; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; /* Can't access these fields unless we know it's not an inline extent. */ args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); args->extent_offset = btrfs_file_extent_offset(leaf, fi); if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; /* * If the extent was created before the generation where the last snapshot * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ if (!args->strict && btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; /* An explicit hole, must COW. */ if (args->disk_bytenr == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out; extent_end = btrfs_file_extent_end(path); /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid * blocking other tasks for too long. */ btrfs_release_path(path); ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), key->offset - args->extent_offset, args->disk_bytenr, args->strict, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; if (args->free_path) { /* * We don't need the path anymore, plus through the * csum_exist_in_range() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ btrfs_free_path(path); path = NULL; } /* If there are pending snapshots for this root, we must COW. */ if (args->writeback_path && !is_freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out; args->disk_bytenr += args->extent_offset; args->disk_bytenr += args->start - key->offset; args->num_bytes = min(args->end + 1, extent_end) - args->start; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; can_nocow = 1; out: if (args->free_path && path) btrfs_free_path(path); return ret < 0 ? ret : can_nocow; } /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; u64 cur_offset = start; int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; /* * Normally on a zoned device we're only doing COW writes, but in case * of relocation on a zoned filesystem serializes I/O so that we're only * writing sequentially and can end up here as well. */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (1) { struct btrfs_block_group *nocow_bg = NULL; struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; u64 extent_end; u64 ram_bytes; u64 nocow_end; int extent_type; bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; /* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.objectid == ino && found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } check_prev = false; next_slot: /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto error; if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; /* * If the found extent starts after requested offset, then * adjust extent_end to be right before this extent begins */ if (found_key.offset > cur_offset) { extent_end = found_key.offset; extent_type = 0; goto must_cow; } /* * Found extent which begins before our range and potentially * intersect it */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); /* If this is triggered then we have a memory corruption. */ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { ret = -EUCLEAN; goto error; } ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); extent_end = btrfs_file_extent_end(path); /* * If the extent we got ends before our current offset, skip to * the next extent. */ if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); if (ret < 0) goto error; if (ret == 0) goto must_cow; ret = 0; nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); if (!nocow_bg) { must_cow: /* * If we can't perform NOCOW writeback for the range, * then record the beginning of the range that needs to * be COWed. It will be written out before the next * NOCOW range if we find one, or when exiting this * loop. */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; if (cur_offset > end) break; if (!path->nodes[0]) continue; path->slots[0]++; goto next_slot; } /* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_page, cow_start, found_key.offset - 1); cow_start = (u64)-1; if (ret) { btrfs_dec_nocow_writers(nocow_bg); goto error; } } nocow_end = cur_offset + nocow_args.num_bytes - 1; is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; if (is_prealloc) { u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; em = create_io_em(inode, cur_offset, nocow_args.num_bytes, orig_start, nocow_args.disk_bytenr, /* block_start */ nocow_args.num_bytes, /* block_len */ nocow_args.disk_num_bytes, /* orig_block_len */ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { btrfs_dec_nocow_writers(nocow_bg); ret = PTR_ERR(em); goto error; } free_extent_map(em); } ordered = btrfs_alloc_ordered_extent(inode, cur_offset, nocow_args.num_bytes, nocow_args.num_bytes, nocow_args.disk_bytenr, nocow_args.num_bytes, 0, is_prealloc ? (1 << BTRFS_ORDERED_PREALLOC) : (1 << BTRFS_ORDERED_NOCOW), BTRFS_COMPRESS_NONE); btrfs_dec_nocow_writers(nocow_bg); if (IS_ERR(ordered)) { if (is_prealloc) { btrfs_drop_extent_map_range(inode, cur_offset, nocow_end, false); } ret = PTR_ERR(ordered); goto error; } if (btrfs_is_data_reloc_root(root)) /* * Error handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler * from freeing metadata of created ordered extent. */ ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_SET_ORDERED); cur_offset = extent_end; /* * btrfs_reloc_clone_csums() error, now we're OK to call error * handler, as metadata for created ordered extent will only * be freed by btrfs_finish_ordered_io(). */ if (ret) goto error; if (cur_offset > end) break; } btrfs_release_path(path); if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; if (cow_start != (u64)-1) { cur_offset = end; ret = fallback_to_cow(inode, locked_page, cow_start, end); cow_start = (u64)-1; if (ret) goto error; } btrfs_free_path(path); return 0; error: /* * If an error happened while a COW region is outstanding, cur_offset * needs to be reset to cow_start to ensure the COW region is unlocked * as well. */ if (cow_start != (u64)-1) cur_offset = cow_start; if (cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_free_path(path); return ret; } static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } return false; } /* * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* * The range must cover part of the @locked_page, or a return of 1 * can confuse the caller. */ ASSERT(!(end <= page_offset(locked_page) || start >= page_offset(locked_page) + PAGE_SIZE)); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_page, start, end); goto out; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && run_delalloc_compressed(inode, locked_page, start, end, wbc)) return 1; if (zoned) ret = run_delalloc_cow(inode, locked_page, start, end, wbc, true); else ret = cow_file_range(inode, locked_page, start, end, NULL, false, false); out: if (ret < 0) btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); return ret; } void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; size = orig->end - orig->start + 1; if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; /* * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; num_extents += count_max_extents(fs_info, new_size); if (count_max_extents(fs_info, size) >= num_extents) return; } spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); } /* * Handle merged delayed allocation extents so we can keep track of new extents * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; if (new->start > other->start) new_size = new->end - other->start + 1; else new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -1); spin_unlock(&inode->lock); return; } /* * We have to add up either side to figure out how many extents were * accounted for before we merged into one big extent. If the number of * extents we accounted for is <= the amount we need for the new range * then we can return, otherwise drop. Think of it like this * * [ 4k][MAX_SIZE] * * So we've grown the extent by a MAX_SIZE extent, this would mean we * need 2 outstanding extents, on one side we have 1 and the other side * we have 1 so they are == and we can return. But in this case * * [MAX_SIZE+4k][MAX_SIZE+4k] * * Each range on their own accounts for 2 extents, but merged together * they are only 3 extents worth of accounting, so we need to drop in * this case. */ old_size = other->end - other->start + 1; num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; num_extents += count_max_extents(fs_info, old_size); if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -1); spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; spin_lock(&root->delalloc_lock); if (list_empty(&inode->delalloc_inodes)) { list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); BUG_ON(!list_empty(&root->delalloc_root)); list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } } spin_unlock(&root->delalloc_lock); } void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = root->fs_info; if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); BUG_ON(list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } static void btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { spin_lock(&root->delalloc_lock); __btrfs_del_delalloc_inode(root, inode); spin_unlock(&root->delalloc_lock); } /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { struct btrfs_fs_info *fs_info = inode->root->fs_info; if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) return; percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); spin_unlock(&inode->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); inode->new_delalloc_bytes += state->end + 1 - state->start; spin_unlock(&inode->lock); } } /* * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); } /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we * don't need to call delalloc_release_metadata if there is an * error. */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) return; if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; if (do_list && inode->delalloc_bytes == 0 && test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags)) btrfs_del_delalloc_inode(root, inode); spin_unlock(&inode->lock); } if ((state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, struct btrfs_ordered_extent *ordered) { u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; u64 len = bbio->bio.bi_iter.bi_size; struct btrfs_ordered_extent *new; int ret; /* Must always be called for the beginning of an ordered extent. */ if (WARN_ON_ONCE(start != ordered->disk_bytenr)) return -EINVAL; /* No need to split if the ordered extent covers the entire bio. */ if (ordered->disk_num_bytes == len) { refcount_inc(&ordered->refs); bbio->ordered = ordered; return 0; } /* * Don't split the extent_map for NOCOW extents, as we're writing into * a pre-existing one. */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { ret = split_extent_map(bbio->inode, bbio->file_offset, ordered->num_bytes, len, ordered->disk_bytenr); if (ret) return ret; } new = btrfs_split_ordered_extent(ordered, len); if (IS_ERR(new)) return PTR_ERR(new); bbio->ordered = new; return 0; } /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; struct btrfs_root *csum_root = NULL; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; if (!csum_root) csum_root = btrfs_csum_root(trans->fs_info, sum->logical); ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; } return 0; } static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, const u64 start, const u64 len, struct extent_state **cached_state) { u64 search_start = start; const u64 end = start + len - 1; while (search_start < end) { const u64 search_len = end - search_start + 1; struct extent_map *em; u64 em_len; int ret = 0; em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); if (em->block_start != EXTENT_MAP_HOLE) goto next; em_len = em->len; if (em->start < search_start) em_len -= search_start - em->start; if (em_len > search_len) em_len = search_len; ret = set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, EXTENT_DELALLOC_NEW, cached_state); next: search_start = extent_map_end(em); free_extent_map(em); if (ret) return ret; } return 0; } int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); if (start >= i_size_read(&inode->vfs_inode) && !(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case so just * set the delalloc new bit for the range directly. */ extra_bits |= EXTENT_DELALLOC_NEW; } else { int ret; ret = btrfs_find_new_delalloc_bytes(inode, start, end + 1 - start, cached_state); if (ret) return ret; } return set_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; struct btrfs_inode *inode; struct btrfs_work work; }; static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup = container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct page *page = fixup->page; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = page_offset(page); u64 page_end = page_offset(page) + PAGE_SIZE - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before * we take the page lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, PAGE_SIZE); again: lock_page(page); /* * Before we queued this fixup, we took a reference on the page. * page->mapping may go NULL, but it shouldn't be moved to a different * address space. */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { /* * Unfortunately this is a little tricky, either * * 1) We got here and our page had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. * 2) Our page was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but * because the page was already dealt with we don't want to * mark the page with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC * when the page was already properly dealt with. */ if (!ret) { btrfs_delalloc_release_extents(inode, PAGE_SIZE); btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); } ret = 0; goto out_page; } /* * We can't mess with the page state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) goto out_page; lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PageOrdered(page)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); if (ret) goto out_reserved; /* * Everything went as planned, we're now the owner of a dirty page with * delayed allocation bits set and space reserved for our COW * destination. * * The page was dirty when we started, nothing should have cleaned it. */ BUG_ON(!PageDirty(page)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ mapping_set_error(page->mapping, ret); btrfs_mark_ordered_io_finished(inode, page, page_start, PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); /* * As a precaution, do a delayed iput in case it would be the last iput * that could need flushing space. Recursing back to fixup worker would * deadlock. */ btrfs_add_delayed_iput(inode); } /* * There are a few paths in the higher layers of the kernel that directly * set the page dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the page. */ int btrfs_writepage_cow_fixup(struct page *page) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_writepage_fixup *fixup; /* This page has ordered extent covering it already */ if (PageOrdered(page)) return 0; /* * PageChecked is set below when we create a fixup worker for this page, * don't try to create another one if we're already PageChecked() * * The extent_io writepage code will redirty the page if we send back * EAGAIN. */ if (PageChecked(page)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); if (!fixup) return -EAGAIN; /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the page lock, and we can't trust * page->mapping outside of the page lock. */ ihold(inode); btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want * to drop it from the cache until it is completely in the btree. * * So, tell btrfs_drop_extents to leave this extent in the cache. * the caller is expected to unpin it and allow it to be merged * with the others. */ drop_args.path = path; drop_args.start = file_pos; drop_args.end = file_pos + num_bytes; drop_args.replace_extent = true; drop_args.extent_item_size = sizeof(*stack_fi); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); write_extent_buffer(leaf, stack_fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); btrfs_mark_buffer_dirty(trans, leaf); btrfs_release_path(path); /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearning the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { u64 inline_size = round_down(drop_args.bytes_found, sectorsize); inline_size = drop_args.bytes_found - inline_size; btrfs_update_inode_bytes(inode, sectorsize, inline_size); drop_args.bytes_found -= inline_size; num_bytes -= sectorsize; } if (update_inode_bytes) btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), file_pos - offset, qgroup_reserved, &ins); out: btrfs_free_path(path); return ret; } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); spin_lock(&cache->lock); cache->delalloc_bytes -= len; spin_unlock(&cache->lock); btrfs_put_block_group(cache); } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; bool update_inode_bytes; u64 num_bytes = oe->num_bytes; u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { num_bytes = oe->truncated_len; ram_bytes = num_bytes; } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ /* * For delalloc, when completing an ordered extent we update the inode's * bytes when clearing the range in the inode's io tree, so pass false * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } /* * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 start, end; int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool clear_reserved_extent = true; unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) { ret = -EIO; goto out; } if (btrfs_is_zoned(fs_info)) btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; /* Truncated the entire extent, don't bother adding */ if (!logical_len) goto out; } if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ btrfs_inode_safe_disk_i_size_write(inode, 0); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; ret = btrfs_update_inode_fallback(trans, inode); if (ret) /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } clear_bits |= EXTENT_LOCKED; lock_extent(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); if (ret) goto out; if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + logical_len); btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } } unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } /* * If this is a new delalloc range, clear its new delalloc flag to * update the inode's number of bytes. This needs to be done first * before updating the inode item. */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { u64 unwritten_start = start; /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered * extent, and mark the inode with the error if it wasn't * already set. Any error during writeback would have already * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) mapping_set_error(ordered_extent->inode->i_mapping, -EIO); if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop extent maps for the part of the extent we didn't write. */ btrfs_drop_extent_map_range(inode, unwritten_start, end, false); /* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. * * If we made it past insert_reserved_file_extent before we * errored out then we don't need to do this as the accounting * has already been done. */ if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { /* * Discard the range before returning it back to the * free space pool */ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) btrfs_discard_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, 1); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. */ btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid, ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } } /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); return ret; } int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. */ int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); char *kaddr; ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; return 0; } /* * Verify the checksum of a single data sector. * * @bbio: btrfs_io_bio which contains the csum * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) * @bv: bio_vec to check * * Check if the checksum on a data block is valid. When a checksum mismatch is * detected, report the error and fill the corrupted range with zero. * * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 file_offset = bbio->file_offset + bio_offset; u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(bv->bv_len == fs_info->sectorsize); if (!bbio->csum) return true; if (btrfs_is_data_reloc_root(inode->root) && test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, NULL)) { /* Skip the range without csum for data reloc inode */ clear_extent_bits(&inode->io_tree, file_offset, end, EXTENT_NODATASUM); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, csum_expected)) goto zeroit; return true; zeroit: btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); memzero_bvec(bv); return false; } /* * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * * This function uses the generic vfs_inode::i_count to track whether we should * just decrement it (in case it's > 1) or if this is the last iput then link * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. */ void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq * context. */ spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { /* * btrfs_put_ordered_extent() can run in irq context (see bio.c), which * calls btrfs_add_delayed_iput() and that needs to lock * fs_info->delayed_iput_lock. So we need to disable irqs here to * prevent a deadlock. */ spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); if (need_resched()) { spin_unlock_irq(&fs_info->delayed_iput_lock); cond_resched(); spin_lock_irq(&fs_info->delayed_iput_lock); } } spin_unlock_irq(&fs_info->delayed_iput_lock); } /* * Wait for flushing all delayed iputs * * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. * * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { int ret = wait_event_killable(fs_info->delayed_iputs_wait, atomic_read(&fs_info->nr_delayed_iputs) == 0); if (ret) return -EINTR; return 0; } /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); return ret; } return 0; } /* * We have done the delete so we can go ahead and remove the orphan item for * this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); } /* * this cleans up any orphans that may be left on the list from the last use * of this root. */ int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* * if ret == 0 means we found what we were searching for, which * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches */ if (ret > 0) { ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; } /* pull out the item */ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ btrfs_release_path(path); /* * this is where we are basically btrfs_lookup, without the * crossing root thing. we store the inode number in the * offset of the orphan item. */ if (found_key.offset == last_objectid) { /* * We found the same inode as before. This means we were * not able to remove its items via eviction triggered * by an iput(). A transaction abort may have happened, * due to -ENOSPC for example, so try to grab the error * that lead to a transaction abort, if any. */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } last_objectid = found_key.offset; found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(fs_info->sb, last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; if (ret != -ENOENT) goto out; } if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; /* * This is an orphan in the tree root. Currently these * could come from 2 sources: * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode * We need to distinguish those two, as the orphan item * for a root must not get deleted before the deletion * of the snapshot/subvolume's tree completes. * * btrfs_find_orphan_roots() ran before us, which has * found all deleted roots and loaded them into * fs_info->fs_roots_radix. So here we can find if an * orphan item corresponds to a deleted root by looking * up the root from that radix tree. */ spin_lock(&fs_info->fs_roots_radix_lock); dead_root = radix_tree_lookup(&fs_info->fs_roots_radix, (unsigned long)found_key.objectid); if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0) is_dead_root = 1; spin_unlock(&fs_info->fs_roots_radix_lock); if (is_dead_root) { /* prevent this orphan from being found again */ key.offset = found_key.objectid - 1; continue; } } /* * If we have an inode with links, there are a couple of * possibilities: * * 1. We were halfway through creating fsverity metadata for the * file. In that case, the orphan item represents incomplete * fsverity metadata which must be cleaned up with * btrfs_drop_verity_items and deleting the orphan item. * 2. Old kernels (before v3.12) used to create an * orphan item for truncate indicating that there were possibly * extent items past i_size that needed to be deleted. In v3.12, * truncate was changed to update i_size in sync with the extent * items, but the (useless) orphan item was still created. Since * v4.18, we don't create the orphan item for truncate at all. * * So, this item could mean that we need to do a truncate, but * only if this filesystem was last used on a pre-v3.12 kernel * and was not cleanly unmounted. The odds of that are quite * slim, and it's a pain to do the truncate now, so just delete * the orphan item. * * It's also possible that this orphan item was supposed to be * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ if (!inode || inode->i_nlink) { if (inode) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); inode = NULL; if (ret) goto out; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", found_key.objectid); ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); if (ret) goto out; continue; } nr_unlink++; /* this will do delete_inode and everything for us */ iput(inode); } /* release the path since we're done with it */ btrfs_release_path(path); if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) { trans = btrfs_join_transaction(root); if (!IS_ERR(trans)) btrfs_end_transaction(trans); } if (nr_unlink) btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink); out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); btrfs_free_path(path); return ret; } /* * very simple check to peek ahead in the leaf looking for xattrs. If we * don't find any xattrs, we know there can't be any acls. * * slot is the slot the inode is in, objectid is the objectid of the inode */ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int slot, u64 objectid, int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; static u64 xattr_access = 0; static u64 xattr_default = 0; int scanned = 0; if (!xattr_access) { xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, strlen(XATTR_NAME_POSIX_ACL_ACCESS)); xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; *first_xattr_slot = -1; while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); /* we found a different objectid, there must not be acls */ if (found_key.objectid != objectid) return 0; /* we found an xattr, assume we've got an acl */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) return 1; } /* * we found a key greater than an xattr key, there can't * be any acls later on */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) return 0; slot++; scanned++; /* * it goes inode, inode backrefs, xattrs, extents, * so if there are a ton of hard links to an inode there can * be a lot of backrefs. Don't waste time searching too hard, * this is just an optimization */ if (scanned >= 8) break; } /* we hit the end of the leaf before we found an xattr or * something larger than an xattr. We have to assume the inode * has acls */ if (*first_xattr_slot == -1) *first_xattr_slot = slot; return 1; } /* * read an inode from the btree into the in-memory inode */ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *in_path) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_path *path = in_path; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key location; unsigned long ptr; int maybe_acls; u32 rdev; int ret; bool filled = false; int first_xattr_slot; ret = btrfs_fill_inode(inode, &rdev); if (!ret) filled = true; if (!path) { path = btrfs_alloc_path(); if (!path) return -ENOMEM; } memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { if (path != in_path) btrfs_free_path(path); return ret; } leaf = path->nodes[0]; if (filled) goto cache_index; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); inode->i_mode = btrfs_inode_mode(leaf, inode_item); set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, round_up(i_size_read(inode), fs_info->sectorsize)); inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); inode_set_iversion_queried(inode, btrfs_inode_sequence(leaf, inode_item)); inode->i_generation = BTRFS_I(inode)->generation; inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); BTRFS_I(inode)->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); cache_index: /* * If we were modified in the current generation and evicted from memory * and then re-read we need to do a full sync since we don't have any * idea about which extents were modified before we were evicted from * cache. * * This is required for both inode re-read from disk and delayed inode * in delayed_nodes_tree. */ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation * against the inode was last made. So here we assume the inode might * have been evicted, and therefore the exact value of last_unlink_trans * lost, and set it to last_trans to avoid metadata inconsistencies * between the inode and its parent if the inode is fsync'ed and the log * replayed. For example, in the scenario: * * touch mydir/foo * ln mydir/foo mydir/bar * sync * unlink mydir/bar * echo 2 > /proc/sys/vm/drop_caches # evicts inode * xfs_io -c fsync mydir/foo * <power failure> * mount fs, triggers fsync log replay * * We must make sure that when we fsync our inode foo we also log its * parent inode, otherwise after log replay the parent still has the * dentry with the "bar" name but our inode foo has a link count of 1 * and doesn't have an inode ref with the name "bar" anymore. * * Setting last_unlink_trans to last_trans is a pessimistic approach, * but it guarantees correctness at the expense of occasional full * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation * of the last transaction where this inode was used for a reflink * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. */ BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; path->slots[0]++; if (inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); if (location.objectid != btrfs_ino(BTRFS_I(inode))) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); if (location.type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* * try to precache a NULL acl entry for files that don't have * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } if (path != in_path) btrfs_free_path(path); if (!maybe_acls) cache_no_acl(inode); switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: inode->i_fop = &btrfs_dir_file_operations; inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; break; default: inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); return 0; } /* * given a leaf and an inode, copy the inode fields into the leaf */ static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, struct inode *inode) { struct btrfs_map_token token; u64 flags; btrfs_init_map_token(&token, leaf); btrfs_set_token_inode_uid(&token, item, i_uid_read(inode)); btrfs_set_token_inode_gid(&token, item, i_gid_read(inode)); btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size); btrfs_set_token_inode_mode(&token, item, inode->i_mode); btrfs_set_token_inode_nlink(&token, item, inode->i_nlink); btrfs_set_token_timespec_sec(&token, &item->atime, inode_get_atime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->atime, inode_get_atime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->mtime, inode_get_mtime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->mtime, inode_get_mtime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->ctime, inode_get_ctime_sec(inode)); btrfs_set_token_timespec_nsec(&token, &item->ctime, inode_get_ctime_nsec(inode)); btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec); btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec); btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode)); btrfs_set_token_inode_generation(&token, item, BTRFS_I(inode)->generation); btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode)); btrfs_set_token_inode_transid(&token, item, trans->transid); btrfs_set_token_inode_rdev(&token, item, inode->i_rdev); flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, BTRFS_I(inode)->ro_flags); btrfs_set_token_inode_flags(&token, item, flags); btrfs_set_token_inode_block_group(&token, item, 0); } /* * copy everything in the in-memory inode into the btree. */ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; struct btrfs_path *path; struct extent_buffer *leaf; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_lookup_inode(trans, inode->root, path, &inode->location, 1); if (ret) { if (ret > 0) ret = -ENOENT; goto failed; } leaf = path->nodes[0]; inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_mark_buffer_dirty(trans, leaf); btrfs_set_inode_last_trans(trans, inode); ret = 0; failed: btrfs_free_path(path); return ret; } /* * copy everything in the in-memory inode into the btree. */ int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; /* * If the inode is a free space inode, we can deadlock during commit * if we put it into the delayed code. * * The data relocation inode should also be directly updated * without delay */ if (!btrfs_is_free_space_inode(inode) && !btrfs_is_data_reloc_root(root) && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, inode); if (!ret) btrfs_set_inode_last_trans(trans, inode); return ret; } return btrfs_update_inode_item(trans, inode); } int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { int ret; ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC) return btrfs_update_inode_item(trans, inode); return ret; } /* * unlink helper that gets used here in inode.c and in the tree logging * recovery code. It remove a link in a directory with a given name, and * also drops the back refs in the inode to the directory */ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; int ret = 0; struct btrfs_dir_item *di; u64 index; u64 ino = btrfs_ino(inode); u64 dir_ino = btrfs_ino(dir); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; } ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) goto err; btrfs_release_path(path); /* * If we don't have dir index, we have to get it by looking up * the inode ref, since we get the inode ref, remove it directly, * it is unnecessary to do delayed deletion. * * But if we have dir index, needn't search inode ref to get it. * Since the inode ref is close to the inode item, it is better * that we delay to delete it, and just do this deletion when * we update the inode item. */ if (inode->dir_index) { ret = btrfs_delayed_delete_inode_ref(inode); if (!ret) { index = inode->dir_index; goto skip_backref; } } ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } skip_backref: if (rename_ctx) rename_ctx->index = index; ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto err; } /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. */ if (!rename_ctx) { btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* * If we have a pending delayed iput we could end up with the final iput * being run in btrfs-cleaner context. If we have enough of these built * up we can end up burning a lot of time in btrfs-cleaner without any * way to throttle the unlinks. Since we're currently holding a ref on * the inode we can run the delayed iput here without any issues as the * final iput won't be done until after we drop the ref we're currently * holding. */ btrfs_run_delayed_iput(fs_info, inode); err: btrfs_free_path(path); if (ret) goto out; btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode(trans, dir); out: return ret; } int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, const struct fscrypt_str *name) { int ret; ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode); } return ret; } /* * helper to start transaction for unlink and rmdir. * * unlink and rmdir are special in btrfs, they do not always free space, so * if we cannot make our reservations the normal way try and see if there is * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. */ static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { struct btrfs_root *root = dir->root; return btrfs_start_transaction_fallback_global_rsv(root, BTRFS_UNLINK_METADATA_UNITS); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) { struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; struct fscrypt_name fname; ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fscrypt_free; } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), false); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); if (ret) goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) goto end_trans; } end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); fscrypt_free: fscrypt_free_filename(&fname); return ret; } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; u64 index; int ret; u64 objectid; u64 dir_ino = btrfs_ino(dir); struct fscrypt_name fname; ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) return ret; /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { objectid = inode->location.objectid; } else { WARN_ON(1); fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; } leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &key); WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); ret = btrfs_delete_one_dir_name(trans, root, path, di); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_release_path(path); /* * This is a placeholder inode for a subvolume we didn't have a * reference to at the time of the snapshot creation. In the meantime * we could have renamed the real subvol link into our snapshot, so * depending on btrfs_del_root_ref to return -ENOENT here is incorrect. * Instead simply lookup the dir_index_item for this entry so we can * remove it. Otherwise we know we have a ref to the root and we can * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; else ret = PTR_ERR(di); btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); index = key.offset; btrfs_release_path(path); } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); inode_inc_iversion(&dir->vfs_inode); inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode)); ret = btrfs_update_inode_fallback(trans, dir); if (ret) btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); fscrypt_free_filename(&fname); return ret; } /* * Helper to check if the subvolume references other subvolumes or if it's * default. */ static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { ret = -EPERM; btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); goto out; } btrfs_release_path(path); } key.objectid = root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) goto out; BUG_ON(ret == 0); ret = 0; if (path->slots[0] > 0) { path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); if (key.objectid == root->root_key.objectid && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } out: btrfs_free_path(path); return ret; } /* Delete all dentries for inodes belonging to the root */ static void btrfs_prune_dentries(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; struct rb_node *prev; struct btrfs_inode *entry; struct inode *inode; u64 objectid = 0; if (!BTRFS_FS_ERROR(fs_info)) WARN_ON(btrfs_root_refs(&root->root_item) != 0); spin_lock(&root->inode_lock); again: node = root->inode_tree.rb_node; prev = NULL; while (node) { prev = node; entry = rb_entry(node, struct btrfs_inode, rb_node); if (objectid < btrfs_ino(entry)) node = node->rb_left; else if (objectid > btrfs_ino(entry)) node = node->rb_right; else break; } if (!node) { while (prev) { entry = rb_entry(prev, struct btrfs_inode, rb_node); if (objectid <= btrfs_ino(entry)) { node = prev; break; } prev = rb_next(prev); } } while (node) { entry = rb_entry(node, struct btrfs_inode, rb_node); objectid = btrfs_ino(entry) + 1; inode = igrab(&entry->vfs_inode); if (inode) { spin_unlock(&root->inode_lock); if (atomic_read(&inode->i_count) > 1) d_prune_aliases(inode); /* * btrfs_drop_inode will have it removed from the inode * cache when its usage count hits zero. */ iput(inode); cond_resched(); spin_lock(&root->inode_lock); goto again; } if (cond_resched_lock(&root->inode_lock)) goto again; node = rb_next(node); } spin_unlock(&root->inode_lock); } int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); struct btrfs_root *root = dir->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; struct btrfs_block_rsv block_rsv; u64 root_flags; int ret; /* * Don't allow to delete a subvolume with send in progress. This is * inside the inode lock so the error handling that has to drop the bit * again is not run concurrently. */ spin_lock(&dest->root_item_lock); if (dest->send_in_progress) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu during send", dest->root_key.objectid); return -EPERM; } if (atomic_read(&dest->nr_swapfiles)) { spin_unlock(&dest->root_item_lock); btrfs_warn(fs_info, "attempt to delete subvolume %llu with active swapfile", root->root_key.objectid); return -EPERM; } root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags | BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); down_write(&fs_info->subvol_sem); ret = may_destroy_subvol(dest); if (ret) goto out_up_write; btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); /* * One for dir inode, * two for dir entries, * two for root ref/backref. */ ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true); if (ret) goto out_up_write; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; btrfs_record_snapshot_destroy(trans, dir); ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } ret = btrfs_record_root_in_trans(trans, dest); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); btrfs_set_root_drop_level(&dest->root_item, 0); btrfs_set_root_refs(&dest->root_item, 0); if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) { ret = btrfs_insert_orphan_item(trans, fs_info->tree_root, dest->root_key.objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) { ret = btrfs_uuid_tree_remove(trans, dest->root_item.received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, dest->root_key.objectid); if (ret && ret != -ENOENT) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } free_anon_bdev(dest->anon_dev); dest->anon_dev = 0; out_end_trans: trans->block_rsv = NULL; trans->bytes_reserved = 0; ret = btrfs_end_transaction(trans); inode->i_flags |= S_DEAD; out_release: btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (ret) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); btrfs_set_root_flags(&dest->root_item, root_flags & ~BTRFS_ROOT_SUBVOL_DEAD); spin_unlock(&dest->root_item_lock); } else { d_invalidate(dentry); btrfs_prune_dentries(dest); ASSERT(dest->send_in_progress == 0); } return ret; } static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) { if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) { btrfs_err(fs_info, "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (err) return err; /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_notrans; } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } err = btrfs_orphan_add(trans, BTRFS_I(inode)); if (err) goto out; last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), &fname.disk_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* * Propagate the last_unlink_trans value of the deleted dir to * its parent directory. This is to prevent an unrecoverable * log tree in the case we do something like this: * 1) create dir foo * 2) create snapshot under dir foo * 3) delete the snapshot * 4) rmdir foo * 5) mkdir foo * 6) fsync foo or some file inside foo */ if (last_unlink_trans >= trans->transid) BTRFS_I(dir)->last_unlink_trans = last_unlink_trans; } out: btrfs_end_transaction(trans); out_notrans: btrfs_btree_balance_dirty(fs_info); fscrypt_free_filename(&fname); return err; } /* * Read, zero a chunk and write a block. * * @inode - inode that we're zeroing * @from - the offset to start zeroing * @len - the length to zero, 0 to zero the entire range respective to the * offset * @front - zero up to the offset instead of from the offset on * * This will find the block for the "from" offset and cow the block and zero the * part we want to zero. This is used with truncate and hole punching. */ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (blocksize - 1); struct page *page; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; u64 block_start; u64 block_end; if (IS_ALIGNED(offset, blocksize) && (!len || IS_ALIGNED(len, blocksize))) goto out; block_start = round_down(from, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, blocksize, false); if (ret < 0) { if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) { /* For nocow case, no need to reserve data space */ only_release_metadata = true; } else { goto out; } } ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, block_start, blocksize); goto out; } again: page = find_or_create_page(mapping, index, mask); if (!page) { btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); ret = -ENOMEM; goto out; } if (!PageUptodate(page)) { ret = btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping) { unlock_page(page); put_page(page); goto again; } if (!PageUptodate(page)) { ret = -EIO; goto out_unlock; } } /* * We unlock the page after the io is completed and then re-lock it * above. release_folio() could have come in between that and cleared * PagePrivate(), but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ ret = set_page_extent_mapped(page); if (ret < 0) goto out_unlock; wait_on_page_writeback(page); lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } clear_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } if (offset != blocksize) { if (!len) len = blocksize - offset; if (front) memzero_page(page, (block_start - page_offset(page)), offset); else memzero_page(page, (block_start - page_offset(page)) + offset, len); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) set_extent_bit(&inode->io_tree, block_start, block_end, EXTENT_NORESERVE, NULL); out_unlock: if (ret) { if (only_release_metadata) btrfs_delalloc_release_metadata(inode, blocksize, true); else btrfs_delalloc_release_space(inode, data_reserved, block_start, blocksize, true); } btrfs_delalloc_release_extents(inode, blocksize); unlock_page(page); put_page(page); out: if (only_release_metadata) btrfs_check_nocow_unlock(inode); extent_changeset_free(data_reserved); return ret; } static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_drop_extents_args drop_args = { 0 }; int ret; /* * If NO_HOLES is enabled, we don't need to do anything. * Later, up in the call chain, either btrfs_set_inode_last_sub_trans() * or btrfs_update_inode() will be called, which guarantee that the next * fsync will know this inode was changed and needs to be logged. */ if (btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; /* * 1 - for the one we're dropping * 1 - for the one we're adding * 1 - for updating the inode. */ trans = btrfs_start_transaction(root, 3); if (IS_ERR(trans)) return PTR_ERR(trans); drop_args.start = offset; drop_args.end = offset + len; drop_args.drop_cache = true; ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); return ret; } ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len); if (ret) { btrfs_abort_transaction(trans, ret); } else { btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); return ret; } /* * This function puts in dummy file extents for the area we're creating a hole * for. So if we are truncating this file to a larger size we need to insert * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for * the range between oldsize and size */ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; u64 hole_start = ALIGN(oldsize, fs_info->sectorsize); u64 block_end = ALIGN(size, fs_info->sectorsize); u64 last_byte; u64 cur_offset; u64 hole_size; int err = 0; /* * If our size started in the middle of a block we need to zero out the * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ err = btrfs_truncate_block(inode, oldsize, 0, 0); if (err) return err; if (size <= hole_start) return 0; btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1, &cached_state); cur_offset = hole_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; break; } last_byte = min(extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { struct extent_map *hole_em; err = maybe_insert_hole(inode, cur_offset, hole_size); if (err) break; err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; hole_em = alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, false); btrfs_set_inode_full_sync(inode); goto next; } hole_em->start = cur_offset; hole_em->len = hole_size; hole_em->orig_start = cur_offset; hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); free_extent_map(hole_em); } else { err = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); if (err) break; } next: free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } free_extent_map(em); unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return err; } static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); loff_t newsize = attr->ia_size; int mask = attr->ia_valid; int ret; /* * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a * special case where we need to update the times despite not having * these flags set. For all other operations the VFS set these flags * explicitly if it wants a timestamp update. */ if (newsize != oldsize) { inode_inc_iversion(inode); if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); } } if (newsize > oldsize) { /* * Don't do an expanding truncate while snapshotting is ongoing. * This is to ensure the snapshot captures a fully consistent * state of this file - if the snapshot captures this expanding * truncation, it must capture all writes that happened before * this truncation. */ btrfs_drew_write_lock(&root->snapshot_lock); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize); if (ret) { btrfs_drew_write_unlock(&root->snapshot_lock); return ret; } trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { btrfs_drew_write_unlock(&root->snapshot_lock); return PTR_ERR(trans); } i_size_write(inode, newsize); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); pagecache_isize_extended(inode, oldsize, newsize); ret = btrfs_update_inode(trans, BTRFS_I(inode)); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_end_transaction(trans); } else { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); if (btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, ALIGN(newsize, fs_info->sectorsize), (u64)-1); if (ret) return ret; } /* * We're truncating a file that used to have good data down to * zero. Make sure any new writes to the file get on disk * on close. */ if (newsize == 0) set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); inode_dio_wait(inode); ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; /* * Truncate failed, so fix up the in-memory size. We * adjusted disk_i_size down as we removed extents, so * wait for disk_i_size to be stable and then update the * in-memory size to match. */ err = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (err) return err; i_size_write(inode, BTRFS_I(inode)->disk_i_size); } } return ret; } static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; int err; if (btrfs_root_readonly(root)) return -EROFS; err = setattr_prepare(idmap, dentry, attr); if (err) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { err = btrfs_setsize(inode, attr); if (err) return err; } if (attr->ia_valid) { setattr_copy(idmap, inode, attr); inode_inc_iversion(inode); err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(idmap, dentry, inode->i_mode); } return err; } /* * While truncating the inode pages during eviction, we get the VFS * calling btrfs_invalidate_folio() against each folio of the inode. This * is slow because the calls to btrfs_invalidate_folio() result in a * huge amount of calls to lock_extent() and clear_extent_bit(), * which keep merging and splitting extent_state structures over and over, * wasting lots of time. * * Therefore if the inode is being evicted, let btrfs_invalidate_folio() * skip all those expensive operations on a per folio basis and do only * the ordered io finishing, while we release here the extent_map and * extent_state structures, without the excessive merging and splitting. */ static void evict_inode_truncate_pages(struct inode *inode) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct rb_node *node; ASSERT(inode->i_state & I_FREEING); truncate_inode_pages_final(&inode->i_data); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* * Keep looping until we have no more ranges in the io tree. * We can have ongoing bios started by readahead that have * their endio callback (extent_io.c:end_bio_extent_readpage) * still in progress (unlocked the pages in the bio but did not yet * unlocked the ranges in the io tree). Therefore this means some * ranges can still be locked and eviction started because before * submitting those bios, which are executed by a separate task (work * queue kthread), inode references (inode->i_count) were not taken * (which would be dropped in the end io callback of each bio). * Therefore here we effectively end up waiting for those bios and * anyone else holding locked ranges without having bumped the inode's * reference count - if we don't do it, when they access the inode's * io_tree to unlock a range it may be too late, leading to an * use-after-free issue. */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; u64 start; u64 end; unsigned state_flags; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); start = state->start; end = state->end; state_flags = state->state; spin_unlock(&io_tree->lock); lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, * and its reserved space won't be freed by delayed_ref. * So we need to free its reserved space here. * (Refer to comment in btrfs_invalidate_folio, case 2) * * Note, end is the bytenr of last byte, so we need + 1 here. */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, &cached_state); cond_resched(); spin_lock(&io_tree->lock); } spin_unlock(&io_tree->lock); } static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1); int ret; /* * Eviction should be taking place at some place safe because of our * delayed iputs. However the normal flushing code will run delayed * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock. * * We reserve the delayed_refs_extra here again because we can't use * btrfs_start_transaction(root, 0) for the same deadlocky reason as * above. We reserve our extra bit here because we generate a ton of * delayed refs activity by truncating. * * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can, * if we fail to make this reservation we can re-try without the * delayed_refs_extra so we can make some forward progress. */ ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size, BTRFS_RESERVE_FLUSH_EVICT); if (ret) { btrfs_warn(fs_info, "could not allocate space for delete; will truncate on mount"); return ERR_PTR(-ENOSPC); } delayed_refs_extra = 0; } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return trans; if (delayed_refs_extra) { trans->block_rsv = &fs_info->trans_block_rsv; trans->bytes_reserved = delayed_refs_extra; btrfs_block_rsv_migrate(rsv, trans->block_rsv, delayed_refs_extra, true); } return trans; } void btrfs_evict_inode(struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_block_rsv *rsv = NULL; int ret; trace_btrfs_inode_evict(inode); if (!root) { fsverity_cleanup_inode(inode); clear_inode(inode); return; } evict_inode_truncate_pages(inode); if (inode->i_nlink && ((btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) || btrfs_is_free_space_inode(BTRFS_I(inode)))) goto out; if (is_bad_inode(inode)) goto out; if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto out; if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0 && root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID); goto out; } /* * This makes sure the inode item in tree is uptodate and the space for * the inode update is released. */ ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode)); if (ret) goto out; /* * This drops any pending insert or delete operations we have for this * inode. We could have a delayed dir index deletion queued up, but * we're removing the inode completely so that'll be taken care of in * the truncate. */ btrfs_kill_delayed_inode_items(BTRFS_I(inode)); rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) goto out; rsv->size = btrfs_calc_metadata_size(fs_info, 1); rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); while (1) { struct btrfs_truncate_control control = { .inode = BTRFS_I(inode), .ino = btrfs_ino(BTRFS_I(inode)), .new_size = 0, .min_type = 0, }; trans = evict_refill_and_join(root, rsv); if (IS_ERR(trans)) goto out; trans->block_rsv = rsv; ret = btrfs_truncate_inode_items(trans, root, &control); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); /* * We have not added new delayed items for our inode after we * have flushed its delayed items, so no need to throttle on * delayed items. However we have modified extent buffers. */ btrfs_btree_balance_dirty_nodelay(fs_info); if (ret && ret != -ENOSPC && ret != -EAGAIN) goto out; else if (!ret) break; } /* * Errors here aren't a big deal, it just means we leave orphan items in * the tree. They will be cleaned up on the next mount. If the inode * number gets reused, cleanup deletes the orphan item without doing * anything, and unlink reuses the existing orphan item. * * If it turns out that we are dropping too many of these, we might want * to add a mechanism for retrying these after a commit. */ trans = evict_refill_and_join(root, rsv); if (!IS_ERR(trans)) { trans->block_rsv = rsv; btrfs_orphan_del(trans, BTRFS_I(inode)); trans->block_rsv = &fs_info->trans_block_rsv; btrfs_end_transaction(trans); } out: btrfs_free_block_rsv(fs_info, rsv); /* * If we didn't successfully delete, the orphan item will still be in * the tree and we'll retry on the next mount. Again, we might also want * to retry these periodically in the future. */ btrfs_remove_delayed_node(BTRFS_I(inode)); fsverity_cleanup_inode(inode); clear_inode(inode); } /* * Return the key found in the dir entry in the location pointer, fill @type * with BTRFS_FT_*, and return 0. * * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) goto out; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. */ ASSERT(ret == 0); /* This needs to handle no-key deletions later on */ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); if (location->type != BTRFS_INODE_ITEM_KEY && location->type != BTRFS_ROOT_ITEM_KEY) { ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } /* * when we hit a tree root in a directory, the btrfs part of the inode * needs to be changed to reflect the root directory of the tree root. This * is kind of like crossing a mount point. */ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) { struct btrfs_path *path; struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; struct btrfs_key key; int ret; int err = 0; struct fscrypt_name fname; ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); if (ret) return ret; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; goto out; } err = -ENOENT; key.objectid = dir->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret) { if (ret < 0) err = ret; goto out; } leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; ret = memcmp_extent_buffer(leaf, fname.disk_name.name, (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; btrfs_release_path(path); new_root = btrfs_get_fs_root(fs_info, location->objectid, true); if (IS_ERR(new_root)) { err = PTR_ERR(new_root); goto out; } *sub_root = new_root; location->objectid = btrfs_root_dirid(&new_root->root_item); location->type = BTRFS_INODE_ITEM_KEY; location->offset = 0; err = 0; out: btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } static void inode_tree_add(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; struct rb_node *new = &inode->rb_node; u64 ino = btrfs_ino(inode); if (inode_unhashed(&inode->vfs_inode)) return; parent = NULL; spin_lock(&root->inode_lock); p = &root->inode_tree.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_inode, rb_node); if (ino < btrfs_ino(entry)) p = &parent->rb_left; else if (ino > btrfs_ino(entry)) p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING))); rb_replace_node(parent, new, &root->inode_tree); RB_CLEAR_NODE(parent); spin_unlock(&root->inode_lock); return; } } rb_link_node(new, parent, p); rb_insert_color(new, &root->inode_tree); spin_unlock(&root->inode_lock); } static void inode_tree_del(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&inode->rb_node)) { rb_erase(&inode->rb_node, &root->inode_tree); RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); if (empty && btrfs_root_refs(&root->root_item) == 0) { spin_lock(&root->inode_lock); empty = RB_EMPTY_ROOT(&root->inode_tree); spin_unlock(&root->inode_lock); if (empty) btrfs_add_dead_root(root); } } static int btrfs_init_locked_inode(struct inode *inode, void *p) { struct btrfs_iget_args *args = p; inode->i_ino = args->ino; BTRFS_I(inode)->location.objectid = args->ino; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.offset = 0; BTRFS_I(inode)->root = btrfs_grab_root(args->root); BUG_ON(args->root && !BTRFS_I(inode)->root); if (args->root && args->root == args->root->fs_info->tree_root && args->ino != BTRFS_BTREE_INODE_OBJECTID) set_bit(BTRFS_INODE_FREE_SPACE_INODE, &BTRFS_I(inode)->runtime_flags); return 0; } static int btrfs_find_actor(struct inode *inode, void *opaque) { struct btrfs_iget_args *args = opaque; return args->ino == BTRFS_I(inode)->location.objectid && args->root == BTRFS_I(inode)->root; } static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; unsigned long hashval = btrfs_inode_hash(ino, root); args.ino = ino; args.root = root; inode = iget5_locked(s, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); return inode; } /* * Get an inode object given its inode number and corresponding root. * Path can be preallocated to prevent recursing back to iget through * allocator. NULL is also valid but may require an additional allocation * later. */ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, struct btrfs_root *root, struct btrfs_path *path) { struct inode *inode; inode = btrfs_iget_locked(s, ino, root); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { int ret; ret = btrfs_read_locked_inode(inode, path); if (!ret) { inode_tree_add(BTRFS_I(inode)); unlock_new_inode(inode); } else { iget_failed(inode); /* * ret > 0 can come from btrfs_search_slot called by * btrfs_read_locked_inode, this means the inode item * was not found. */ if (ret > 0) ret = -ENOENT; inode = ERR_PTR(ret); } } return inode; } struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root) { return btrfs_iget_path(s, ino, root, NULL); } static struct inode *new_simple_dir(struct inode *dir, struct btrfs_key *key, struct btrfs_root *root) { struct timespec64 ts; struct inode *inode = new_inode(dir->i_sb); if (!inode) return ERR_PTR(-ENOMEM); BTRFS_I(inode)->root = btrfs_grab_root(root); memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; ts = inode_set_ctime_current(inode); inode_set_mtime_to_ts(inode, ts); inode_set_atime_to_ts(inode, inode_get_atime(dir)); BTRFS_I(inode)->i_otime_sec = ts.tv_sec; BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; inode->i_uid = dir->i_uid; inode->i_gid = dir->i_gid; return inode; } static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN); static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE); static_assert(BTRFS_FT_DIR == FT_DIR); static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV); static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV); static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); static inline u8 btrfs_inode_type(struct inode *inode) { return fs_umode_to_ftype(inode->i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location; u8 di_type = 0; int ret = 0; if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(dir->i_sb, location.objectid, root); if (IS_ERR(inode)) return inode; /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", inode->i_mode, btrfs_inode_type(inode), di_type); iput(inode); return ERR_PTR(-EUCLEAN); } return inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) inode = ERR_PTR(ret); else inode = new_simple_dir(dir, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); btrfs_put_root(sub_root); if (IS_ERR(inode)) return inode; down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { iput(inode); inode = ERR_PTR(ret); } } return inode; } static int btrfs_dentry_delete(const struct dentry *dentry) { struct btrfs_root *root; struct inode *inode = d_inode(dentry); if (!inode && !IS_ROOT(dentry)) inode = d_inode(dentry->d_parent); if (inode) { root = BTRFS_I(inode)->root; if (btrfs_root_refs(&root->root_item) == 0) return 1; if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return 1; } return 0; } static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct inode *inode = btrfs_lookup_dentry(dir, dentry); if (inode == ERR_PTR(-ENOENT)) inode = NULL; return d_splice_alias(inode, dentry); } /* * Find the highest existing sequence number in a directory and then set the * in-memory index_cnt variable to the first free sequence number. */ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; struct btrfs_path *path; struct extent_buffer *leaf; int ret; key.objectid = btrfs_ino(inode); key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) return -ENOMEM; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* FIXME: we should be able to handle this */ if (ret == 0) goto out; ret = 0; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; goto out; } inode->index_cnt = found_key.offset + 1; out: btrfs_free_path(path); return ret; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; btrfs_inode_lock(dir, 0); if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) goto out; } } /* index_cnt is the index number of next new entry, so decrement it. */ *index = dir->index_cnt - 1; out: btrfs_inode_unlock(dir, 0); return ret; } /* * All this infrastructure exists because dir_emit can fault, and we are holding * the tree lock when doing readdir. For now just allocate a buffer and copy * our information into that, and then dir_emit from the buffer. This is * similar to what NFS does, only we don't keep the buffer around in pagecache * because I'm afraid I'll mess that up. Long term we need to make filldir do * copy_to_user_inatomic so we don't have to worry about page faulting under the * tree lock. */ static int btrfs_opendir(struct inode *inode, struct file *file) { struct btrfs_file_private *private; u64 last_index; int ret; ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index); if (ret) return ret; private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL); if (!private) return -ENOMEM; private->last_index = last_index; private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!private->filldir_buf) { kfree(private); return -ENOMEM; } file->private_data = private; return 0; } static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence) { struct btrfs_file_private *private = file->private_data; int ret; ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)), &private->last_index); if (ret) return ret; return generic_file_llseek(file, offset, whence); } struct dir_entry { u64 ino; u64 offset; unsigned type; int name_len; }; static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx) { while (entries--) { struct dir_entry *entry = addr; char *name = (char *)(entry + 1); ctx->pos = get_unaligned(&entry->offset); if (!dir_emit(ctx, name, get_unaligned(&entry->name_len), get_unaligned(&entry->ino), get_unaligned(&entry->type))) return 1; addr += sizeof(struct dir_entry) + get_unaligned(&entry->name_len); ctx->pos++; } return 0; } static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_file_private *private = file->private_data; struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_path *path; void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); int ret; char *name_ptr; int name_len; int entries = 0; int total_len = 0; bool put = false; struct btrfs_key location; if (!dir_emit_dots(file, ctx)) return 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; addr = private->filldir_buf; path->reada = READA_FORWARD; put = btrfs_readdir_get_delayed_items(inode, private->last_index, &ins_list, &del_list); again: key.type = BTRFS_DIR_INDEX_KEY; key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; u8 ftype; if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) continue; if (found_key.offset > private->last_index) break; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) continue; di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos; addr = private->filldir_buf; entries = 0; total_len = 0; goto again; } ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; name_ptr = (char *)(entry + 1); read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), name_len); put_unaligned(name_len, &entry->name_len); put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; } /* Catch error encountered during iteration */ if (ret < 0) goto err; btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); if (ret) goto nopos; ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list); if (ret) goto nopos; /* * Stop new entries from being returned after we return the last * entry. * * New directory entries are assigned a strictly increasing * offset. This means that new entries created during readdir * are *guaranteed* to be seen in the future by that readdir. * This has broken buggy programs which operate on names as * they're returned by readdir. Until we re-use freed offsets * we have this hack to stop new entries from being returned * under the assumption that they'll never reach this huge * offset. * * This is being careful not to overflow 32bit loff_t unless the * last entry requires it because doing so has broken 32bit apps * in the past. */ if (ctx->pos >= INT_MAX) ctx->pos = LLONG_MAX; else ctx->pos = INT_MAX; nopos: ret = 0; err: if (put) btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list); btrfs_free_path(path); return ret; } /* * This is somewhat expensive, updating the tree every time the * inode changes. But, it is most likely to find the inode in cache. * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ static int btrfs_dirty_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_update_inode(trans, inode); if (ret == -ENOSPC || ret == -EDQUOT) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); ret = btrfs_update_inode(trans, inode); } btrfs_end_transaction(trans); if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; } /* * This is a copy of file_update_time. We need this so we can return error on * ENOSPC for updating the inode in the case of file write and mmap writes. */ static int btrfs_update_time(struct inode *inode, int flags) { struct btrfs_root *root = BTRFS_I(inode)->root; bool dirty; if (btrfs_root_readonly(root)) return -EROFS; dirty = inode_update_timestamps(inode, flags); return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* * helper to find a free sequence number in a given directory. This current * code is very simple, later versions will do smarter things in the btree */ int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index) { int ret = 0; if (dir->index_cnt == (u64)-1) { ret = btrfs_inode_delayed_dir_index_count(dir); if (ret) { ret = btrfs_set_inode_index_count(dir); if (ret) return ret; } } *index = dir->index_cnt; dir->index_cnt++; return ret; } static int btrfs_insert_inode_locked(struct inode *inode) { struct btrfs_iget_args args; args.ino = BTRFS_I(inode)->location.objectid; args.root = BTRFS_I(inode)->root; return insert_inode_locked4(inode, btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), btrfs_find_actor, &args); } int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items) { struct inode *dir = args->dir; struct inode *inode = args->inode; int ret; if (!args->orphan) { ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, &args->fname); if (ret) return ret; } ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); if (ret) { fscrypt_free_filename(&args->fname); return ret; } /* 1 to add inode item */ *trans_num_items = 1; /* 1 to add compression property */ if (BTRFS_I(dir)->prop_compress) (*trans_num_items)++; /* 1 to add default ACL xattr */ if (args->default_acl) (*trans_num_items)++; /* 1 to add access ACL xattr */ if (args->acl) (*trans_num_items)++; #ifdef CONFIG_SECURITY /* 1 to add LSM xattr */ if (dir->i_security) (*trans_num_items)++; #endif if (args->orphan) { /* 1 to add orphan item */ (*trans_num_items)++; } else { /* * 1 to add dir item * 1 to add dir index * 1 to update parent inode item * * No need for 1 unit for the inode ref item because it is * inserted in a batch together with the inode item at * btrfs_create_new_inode(). */ *trans_num_items += 3; } return 0; } void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); fscrypt_free_filename(&args->fname); } /* * Inherit flags from the parent inode. * * Currently only the compression flags and the cow flags are inherited. */ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { inode->flags &= ~BTRFS_INODE_COMPRESS; inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { inode->flags &= ~BTRFS_INODE_NOCOMPRESS; inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { inode->flags |= BTRFS_INODE_NODATACOW; if (S_ISREG(inode->vfs_inode.i_mode)) inode->flags |= BTRFS_INODE_NODATASUM; } btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { struct timespec64 ts; struct inode *dir = args->dir; struct inode *inode = args->inode; const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; struct btrfs_path *path; u64 objectid; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; struct btrfs_item_batch batch; unsigned long ptr; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (!args->subvol) BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); root = BTRFS_I(inode)->root; ret = btrfs_get_free_objectid(root, &objectid); if (ret) goto out; inode->i_ino = objectid; if (args->orphan) { /* * O_TMPFILE, set link count to 0, so that after this point, we * fill in an inode item with the correct link count. */ set_nlink(inode, 0); } else { trace_btrfs_inode_request(dir); ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); if (ret) goto out; } /* index_cnt is ignored for everything but a dir. */ BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't * change it now without compatibility issues. */ if (!args->subvol) btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; } location = &BTRFS_I(inode)->location; location->objectid = objectid; location->offset = 0; location->type = BTRFS_INODE_ITEM_KEY; ret = btrfs_insert_inode_locked(inode); if (ret < 0) { if (!args->orphan) BTRFS_I(dir)->index_cnt--; goto out; } /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the * old info in the log. */ btrfs_set_inode_full_sync(BTRFS_I(inode)); key[0].objectid = objectid; key[0].type = BTRFS_INODE_ITEM_KEY; key[0].offset = 0; sizes[0] = sizeof(struct btrfs_inode_item); if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will * be packed into one item. Extended refs will kick in if we * add more hard links than can fit in the ref item. */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; if (args->subvol) { key[1].offset = objectid; sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); sizes[1] = name->len + sizeof(*ref); } } batch.keys = &key[0]; batch.data_sizes = &sizes[0]; batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); if (ret != 0) { btrfs_abort_transaction(trans, ret); goto discard; } ts = simple_inode_init_ts(inode); BTRFS_I(inode)->i_otime_sec = ts.tv_sec; BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; /* * We're going to fill the inode item now, so at this point the inode * must be fully initialized. */ inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); if (!args->orphan) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); ptr = (unsigned long)(ref + 1); if (args->subvol) { btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); write_extent_buffer(path->nodes[0], name->name, ptr, name->len); } } btrfs_mark_buffer_dirty(trans, path->nodes[0]); /* * We don't need the path anymore, plus inheriting properties, adding * ACLs, security xattrs, orphan item or adding the link, will result in * allocating yet another path. So just free our path. */ btrfs_free_path(path); path = NULL; if (args->subvol) { struct inode *parent; /* * Subvolumes inherit properties from their parent subvolume, * not the directory they were created in. */ parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root); if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { ret = btrfs_inode_inherit_props(trans, inode, parent); iput(parent); } } else { ret = btrfs_inode_inherit_props(trans, inode, dir); } if (ret) { btrfs_err(fs_info, "error inheriting props for ino %llu (root %llu): %d", btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); } /* * Subvolumes don't inherit ACLs or get passed to the LSM. This is * probably a bug. */ if (!args->subvol) { ret = btrfs_init_inode_security(trans, args); if (ret) { btrfs_abort_transaction(trans, ret); goto discard; } } inode_tree_add(BTRFS_I(inode)); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); btrfs_update_root_times(trans, root); if (args->orphan) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); goto discard; } return 0; discard: /* * discard_new_inode() calls iput(), but the caller owns the reference * to the inode. */ ihold(inode); discard_new_inode(inode); out: btrfs_free_path(path); return ret; } /* * utility function to add 'inode' into 'parent_inode' with * a give name and a given sequence number. * if 'add_backref' is true, also insert a backref from the * inode to the parent directory. */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; struct btrfs_root *root = parent_inode->root; u64 ino = btrfs_ino(inode); u64 parent_ino = btrfs_ino(parent_inode); if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { memcpy(&key, &inode->root->root_key, sizeof(key)); } else { key.objectid = ino; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; } if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, index, name); } else if (add_backref) { ret = btrfs_insert_inode_ref(trans, root, name, ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { btrfs_abort_transaction(trans, ret); return ret; } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime * and ctime of the parent directory with the current time, since the * log replay procedure is responsible for setting them to their correct * values (the ones it had when the fsync was done). */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags)) inode_set_mtime_to_ts(&parent_inode->vfs_inode, inode_set_ctime_current(&parent_inode->vfs_inode)); ret = btrfs_update_inode(trans, parent_inode); if (ret) btrfs_abort_transaction(trans, ret); return ret; fail_dir_item: if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { u64 local_index; int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, &local_index); if (err) btrfs_abort_transaction(trans, err); } /* Return the original error code */ return ret; } static int btrfs_create_common(struct inode *dir, struct dentry *dentry, struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = dentry, .inode = inode, }; unsigned int trans_num_items; struct btrfs_trans_handle *trans; int err; err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_new_inode_args; } err = btrfs_create_new_inode(trans, &new_inode_args); if (!err) d_instantiate_new(dentry, inode); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: if (err) iput(inode); return err; } static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, mode); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, rdev); return btrfs_create_common(dir, dentry, inode); } static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; return btrfs_create_common(dir, dentry, inode); } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct btrfs_trans_handle *trans = NULL; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct fscrypt_name fname; u64 index; int err; int drop_inode = 0; /* do not allow sys_link's with other subvols of the same device */ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid) return -EXDEV; if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); if (err) goto fail; err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; /* * 2 items for inode and inode ref * 2 items for dir items * 1 item for parent inode * 1 item for orphan item deletion if O_TMPFILE */ trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6); if (IS_ERR(trans)) { err = PTR_ERR(trans); trans = NULL; goto fail; } /* There are several dir indexes for this inode, clear the cache. */ BTRFS_I(inode)->dir_index = 0ULL; inc_nlink(inode); inode_inc_iversion(inode); inode_set_ctime_current(inode); ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), &fname.disk_name, 1, index); if (err) { drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, BTRFS_I(inode)); if (err) goto fail; if (inode->i_nlink == 1) { /* * If new hard link count is 1, it's a file created * with open(2) O_TMPFILE flag. */ err = btrfs_orphan_del(trans, BTRFS_I(inode)); if (err) goto fail; } d_instantiate(dentry, inode); btrfs_log_new_name(trans, old_dentry, NULL, 0, parent); } fail: fscrypt_free_filename(&fname); if (trans) btrfs_end_transaction(trans); if (drop_inode) { inode_dec_link_count(inode); iput(inode); } btrfs_btree_balance_dirty(fs_info); return err; } static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; return btrfs_create_common(dir, dentry, inode); } static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, struct btrfs_file_extent_item *item) { int ret; struct extent_buffer *leaf = path->nodes[0]; char *tmp; size_t max_size; unsigned long inline_size; unsigned long ptr; int compress_type; compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); tmp = kmalloc(inline_size, GFP_NOFS); if (!tmp) return -ENOMEM; ptr = btrfs_file_extent_inline_start(item); read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end * of the uncompressed data and the end of max_size in case the decompressed * data ends up shorter than ram_bytes. That doesn't cover the hole between * the end of an inline extent and the beginning of the next block, so we * cover that region here. */ if (max_size < PAGE_SIZE) memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct page *page) { struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; if (!page || PageUptodate(page)) return 0; ASSERT(page_offset(page) == 0); fi = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, page, fi); copy_size = min_t(u64, PAGE_SIZE, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_page(page); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); if (copy_size < PAGE_SIZE) memzero_page(page, copy_size, PAGE_SIZE - copy_size); return 0; } /* * Lookup the first extent overlapping a range in a file. * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * * Return the first &struct extent_map which overlaps the given range, reading * it from the B-tree and caching it if necessary. Note that there may be more * extents which overlap the given range after the returned extent_map. * * If @page is not NULL and the extent is inline, this also reads the extent * data directly into the page and marks the extent up to date in the io_tree. * * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); int extent_type = -1; struct btrfs_path *path = NULL; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *item; struct extent_buffer *leaf; struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) free_extent_map(em); else if (em->block_start == EXTENT_MAP_INLINE && page) free_extent_map(em); else goto out; } em = alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; em->orig_start = EXTENT_MAP_HOLE; em->len = (u64)-1; em->block_len = (u64)-1; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } /* Chances are we'll be called again, so go ahead and do readahead */ path->reada = READA_FORWARD; /* * The same explanation in load_free_space_cache applies here as well, * we only read when we're loading the free space cache, and at that * point the commit_root has everything we need. */ if (btrfs_is_free_space_inode(inode)) { path->search_commit_root = 1; path->skip_locking = 1; } ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; ret = 0; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid || found_key.type != BTRFS_EXTENT_DATA_KEY) { /* * If we backup past the first extent we want to move forward * and see if there is an extent in front of us, otherwise we'll * say there is a hole for our whole search range which can * cause problems. */ extent_end = start; goto next; } extent_type = btrfs_file_extent_type(leaf, item); extent_start = found_key.offset; extent_end = btrfs_file_extent_end(path); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); goto out; } trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, extent_start); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, path->slots[0], extent_start); } next: if (start >= extent_end) { path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; else if (ret > 0) goto not_found; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != objectid || found_key.type != BTRFS_EXTENT_DATA_KEY) goto not_found; if (start + len <= found_key.offset) goto not_found; if (start > found_key.offset) goto next; /* New extent overlaps with existing one */ em->start = start; em->orig_start = start; em->len = found_key.offset - start; em->block_start = EXTENT_MAP_HOLE; goto insert; } btrfs_extent_item_to_extent_map(inode, path, item, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { /* * Inline extent can only exist at file offset 0. This is * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); /* * btrfs_extent_item_to_extent_map() should have properly * initialized em members already. * * Other members are not utilized for inline extents. */ ASSERT(em->block_start == EXTENT_MAP_INLINE); ASSERT(em->len == fs_info->sectorsize); ret = read_inline_extent(inode, path, page); if (ret < 0) goto out; goto insert; } not_found: em->start = start; em->orig_start = start; em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); ret = -EIO; goto out; } write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); if (ret) { free_extent_map(em); return ERR_PTR(ret); } return em; } static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, struct btrfs_dio_data *dio_data, const u64 start, const u64 len, const u64 orig_start, const u64 block_start, const u64 block_len, const u64 orig_block_len, const u64 ram_bytes, const int type) { struct extent_map *em = NULL; struct btrfs_ordered_extent *ordered; if (type != BTRFS_ORDERED_NOCOW) { em = create_io_em(inode, start, len, orig_start, block_start, block_len, orig_block_len, ram_bytes, BTRFS_COMPRESS_NONE, /* compress_type */ type); if (IS_ERR(em)) goto out; } ordered = btrfs_alloc_ordered_extent(inode, start, len, len, block_start, block_len, 0, (1 << type) | (1 << BTRFS_ORDERED_DIRECT), BTRFS_COMPRESS_NONE); if (IS_ERR(ordered)) { if (em) { free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + len - 1, false); } em = ERR_CAST(ordered); } else { ASSERT(!dio_data->ordered); dio_data->ordered = ordered; } out: return em; } static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_map *em; struct btrfs_key ins; u64 alloc_hint; int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { ASSERT(btrfs_is_zoned(fs_info)); wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, TASK_UNINTERRUPTIBLE); goto again; } if (ret) return ERR_PTR(ret); em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start, ins.objectid, ins.offset, ins.offset, ins.offset, BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); return em; } static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_block_group *block_group; bool readonly = false; block_group = btrfs_lookup_block_group(fs_info, bytenr); if (!block_group || block_group->ro) readonly = true; if (block_group) btrfs_put_block_group(block_group); return readonly; } /* * Check if we can do nocow write into the range [@offset, @offset + @len) * * @offset: File offset * @len: The length to write, will be updated to the nocow writeable * range * @orig_start: (optional) Return the original file offset of the file extent * @orig_len: (optional) Return the original on-disk length of the file extent * @ram_bytes: (optional) Return the ram_bytes of the file extent * @strict: if true, omit optimizations that might force us into unnecessary * cow. e.g., don't trust generation number. * * Return: * >0 and update @len if we can do nocow write * 0 if we can't do nocow write * <0 if error happened * * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. */ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->nowait = nowait; ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(BTRFS_I(inode)), offset, 0); if (ret < 0) goto out; if (ret == 1) { if (path->slots[0] == 0) { /* can't find the item, must cow */ ret = 0; goto out; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ goto out; } if (key.offset > offset) { /* Wrong offset, must cow */ goto out; } if (btrfs_file_extent_end(path) <= offset) goto out; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); if (ram_bytes) *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); nocow_args.start = offset; nocow_args.end = offset + *len - 1; nocow_args.strict = strict; nocow_args.free_path = true; ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. */ ret = 0; goto out; } ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) goto out; if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); if (ret) { ret = -EAGAIN; goto out; } } if (orig_start) *orig_start = key.offset - nocow_args.extent_offset; if (orig_block_len) *orig_block_len = nocow_args.disk_num_bytes; *len = nocow_args.num_bytes; ret = 1; out: btrfs_free_path(path); return ret; } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, struct extent_state **cached_state, unsigned int iomap_flags) { const bool writing = (iomap_flags & IOMAP_WRITE); const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; int ret = 0; while (1) { if (nowait) { if (!try_lock_extent(io_tree, lockstart, lockend, cached_state)) return -EAGAIN; } else { lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered * extents in this range. */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart, lockend - lockstart + 1); /* * We need to make sure there are no buffered pages in this * range either, we could have raced between the invalidate in * generic_file_direct_write and locking the extent. The * invalidate needs to happen so that reads after a write do not * get stale data. */ if (!ordered && (!writing || !filemap_range_has_page(inode->i_mapping, lockstart, lockend))) break; unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { btrfs_put_ordered_extent(ordered); ret = -EAGAIN; break; } /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it * to complete and retry, because if we do so we can * deadlock with concurrent buffered writes on page * locks. This happens only if our DIO read covers more * than one extent map, if at this point has already * created an ordered extent for a previous extent map * and locked its range in the inode's io tree, and a * concurrent write against that previous extent map's * range and this range started (we unlock the ranges * in the io tree only when the bios complete and * buffered writes always lock pages before attempting * to lock range in the io tree). */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered); else ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* * We could trigger writeback for this range (and wait * for it to complete) and then invalidate the pages for * this range (through invalidate_inode_pages2_range()), * but that can lead us to a deadlock with a concurrent * call to readahead (a buffered read or a defrag call * triggered a readahead) on a page lock due to an * ordered dio extent we created before but did not have * yet a corresponding bio submitted (whence it can not * complete), which makes readahead wait for that * ordered extent to complete while holding a lock on * that page. */ ret = nowait ? -EAGAIN : -ENOTBLK; } if (ret) break; cond_resched(); } return ret; } /* The callers of this must take lock_extent() */ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type) { struct extent_map *em; int ret; ASSERT(type == BTRFS_ORDERED_PREALLOC || type == BTRFS_ORDERED_COMPRESSED || type == BTRFS_ORDERED_NOCOW || type == BTRFS_ORDERED_REGULAR); em = alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); em->start = start; em->orig_start = orig_start; em->len = len; em->block_len = block_len; em->block_start = block_start; em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; set_bit(EXTENT_FLAG_PINNED, &em->flags); if (type == BTRFS_ORDERED_PREALLOC) { set_bit(EXTENT_FLAG_FILLING, &em->flags); } else if (type == BTRFS_ORDERED_COMPRESSED) { set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); em->compress_type = compress_type; } ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { free_extent_map(em); return ERR_PTR(ret); } /* em got 2 refs now, callers needs to do free_extent_map once. */ return em; } static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 *lenp, unsigned int iomap_flags) { const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; u64 len = *lenp; u64 prev_len; int ret = 0; /* * We don't allocate a new extent in the following cases * * 1) The inode is marked as NODATACOW. In this case we'll just use the * existing extent. * 2) The extent is marked as PREALLOC. We're good to go here and can * just use the extent. * */ if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) type = BTRFS_ORDERED_PREALLOC; else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, &orig_block_len, &ram_bytes, false, false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; } } prev_len = len; if (can_nocow) { struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) ret = -EAGAIN; goto out; } space_reserved = true; em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, orig_start, block_start, len, orig_block_len, ram_bytes, type); btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); *map = em2; em = em2; } if (IS_ERR(em2)) { ret = PTR_ERR(em2); goto out; } dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; if (nowait) { ret = -EAGAIN; goto out; } /* * If we could not allocate data space before locking the file * range and we can't do a NOCOW write, then we have to fail. */ if (!dio_data->data_space_reserved) { ret = -ENOSPC; goto out; } /* * We have to COW and we have already reserved data space before, * so now we reserve only metadata. */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, false); if (ret < 0) goto out; space_reserved = true; em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) btrfs_delalloc_release_metadata(BTRFS_I(inode), prev_len - len, true); } /* * We have created our ordered extent, so we can now release our reservation * for an outstanding extent. */ btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len); /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ if (start + len > i_size_read(inode)) i_size_write(inode, start + len); out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } *lenp = len; return ret; } static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = iter->private; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; const u64 data_alloc_len = length; bool unlock_extents = false; /* * We could potentially fault if we have a buffer > PAGE_SIZE, and if * we're NOWAIT we may submit a bio for a partial range and return * EIOCBQUEUED, which would result in an errant short read. * * The best way to handle this would be to allow for partial completions * of iocb's, so we could submit the partial bio, return and fault in * the rest of the pages, and then submit the io for the rest of the * range. However we don't have that currently, so simply return * -EAGAIN at this point so that the normal path is used. */ if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE) return -EAGAIN; /* * Cap the size of reads to that usually seen in buffered I/O as we need * to allocate a contiguous array for the checksums. */ if (!write) len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; /* * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't * enough if we've written compressed pages to this area, so we need to * flush the dirty pages again to make absolutely sure that any * outstanding dirty pages are on disk - the first flush only starts * compression on the data, while keeping the pages locked, so by the * time the second flush returns we know bios for the compressed pages * were submitted and finished, and the pages no longer under writeback. * * If we have a NOWAIT request and we have any pages in the range that * are locked, likely due to compression still in progress, we don't want * to block on page locks. We also don't want to block on pages marked as * dirty or under writeback (same as for the non-compression case). * iomap_dio_rw() did the same check, but after that and before we got * here, mmap'ed writes may have happened or buffered reads started * (readpage() and readahead(), which lock pages), as we haven't locked * the file range yet. */ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { if (flags & IOMAP_NOWAIT) { if (filemap_range_needs_writeback(inode->i_mapping, lockstart, lockend)) return -EAGAIN; } else { ret = filemap_fdatawrite_range(inode->i_mapping, start, start + length - 1); if (ret) return ret; } } memset(dio_data, 0, sizeof(*dio_data)); /* * We always try to allocate data space and must do it before locking * the file range, to avoid deadlocks with concurrent writes to the same * range if the range has several extents and the writes don't expand the * current i_size (the inode lock is taken in shared mode). If we fail to * allocate data space here we continue and later, after locking the * file range, we fail with ENOSPC only if we figure out we can not do a * NOCOW write. */ if (write && !(flags & IOMAP_NOWAIT)) { ret = btrfs_check_data_free_space(BTRFS_I(inode), &dio_data->data_reserved, start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; else if (ret && !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) goto err; } /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered IO, or we are doing a * NOWAIT read/write and we need to block. */ ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); if (ret < 0) goto err; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; } /* * Ok for INLINE and COMPRESSED extents we need to fallback on buffered * io. INLINE is special, and we could probably kludge it in here, but * it's still buffered so for safety lets just fall back to the generic * buffered path. * * For COMPRESSED we _have_ to read the entire extent in so we can * decompress it, so there will be buffering required no matter what we * do, so go ahead and fallback to buffered. * * We return -ENOTBLK because that's what makes DIO go ahead and go back * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can * block with buffered IO (no support for NOWAIT semantics at * the moment) but also to avoid returning short reads to user * space - this happens if we were able to read some data from * previous non-compressed extents and then when we fallback to * buffered IO, at btrfs_file_read_iter() by calling * filemap_read(), we fail to fault in pages for the read buffer, * in which case filemap_read() returns a short read (the number * of bytes previously read is > 0, so it does not return -EFAULT). */ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK; goto unlock_err; } len = min(len, em->len - (start - em->start)); /* * If we have a NOWAIT request and the range contains multiple extents * (or a mix of extents and holes), then we return -EAGAIN to make the * caller fallback to a context where it can do a blocking (without * NOWAIT) request. This way we avoid doing partial IO and returning * success to the caller, which is not optimal for writes and for reads * it can result in unexpected behaviour for an application. * * When doing a read, because we use IOMAP_DIO_PARTIAL when calling * iomap_dio_rw(), we can end up returning less data then what the caller * asked for, resulting in an unexpected, and incorrect, short read. * That is, the caller asked to read N bytes and we return less than that, * which is wrong unless we are crossing EOF. This happens if we get a * page fault error when trying to fault in pages for the buffer that is * associated to the struct iov_iter passed to iomap_dio_rw(), and we * have previously submitted bios for other extents in the range, in * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of * those bios have completed by the time we get the page fault error, * which we return back to our caller - we should only return EIOCBQUEUED * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { free_extent_map(em); ret = -EAGAIN; goto unlock_err; } if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, start, &len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); if (dio_data->data_space_reserved) { u64 release_offset; u64 release_len = 0; if (dio_data->nocow_done) { release_offset = start; release_len = data_alloc_len; } else if (len < data_alloc_len) { release_offset = start + len; release_len = data_alloc_len - len; } if (release_len > 0) btrfs_free_reserved_data_space(BTRFS_I(inode), dio_data->data_reserved, release_offset, release_len); } } else { /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ lockstart = start + len; if (lockstart < lockend) unlock_extents = true; } if (unlock_extents) unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); else free_extent_state(cached_state); /* * Translate extent map information to iomap. * We trim the extents (and move the addr) even though iomap code does * that, since we have locked only the parts we are performing I/O in. */ if ((em->block_start == EXTENT_MAP_HOLE) || (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { iomap->addr = em->block_start + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; free_extent_map(em); return 0; unlock_err: unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), dio_data->data_reserved, start, data_alloc_len); extent_changeset_free(dio_data->data_reserved); } return ret; } static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_dio_data *dio_data = iter->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); return 0; } if (submitted < length) { pos += submitted; length -= submitted; if (write) btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { btrfs_put_ordered_extent(dio_data->ordered); dio_data->ordered = NULL; } if (write) extent_changeset_free(dio_data->data_reserved); return ret; } static void btrfs_dio_end_io(struct btrfs_bio *bbio) { struct btrfs_dio_private *dip = container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_inode *inode = bbio->inode; struct bio *bio = &bbio->bio; if (bio->bi_status) { btrfs_warn(inode->root->fs_info, "direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d", btrfs_ino(inode), bio->bi_opf, dip->file_offset, dip->bytes, bio->bi_status); } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { btrfs_finish_ordered_extent(bbio->ordered, NULL, dip->file_offset, dip->bytes, !bio->bi_status); } else { unlock_extent(&inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; iomap_dio_bio_end_io(bio); } static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, loff_t file_offset) { struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_dio_private *dip = container_of(bbio, struct btrfs_dio_private, bbio); struct btrfs_dio_data *dio_data = iter->private; btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info, btrfs_dio_end_io, bio->bi_private); bbio->inode = BTRFS_I(iter->inode); bbio->file_offset = file_offset; dip->file_offset = file_offset; dip->bytes = bio->bi_iter.bi_size; dio_data->submitted += bio->bi_iter.bi_size; /* * Check if we are doing a partial write. If we are, we need to split * the ordered extent to match the submitted bio. Hang on to the * remaining unfinishable ordered_extent in dio_data so that it can be * cancelled in iomap_end to avoid a deadlock wherein faulting the * remaining pages is blocked on the outstanding ordered extent. */ if (iter->flags & IOMAP_WRITE) { int ret; ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { btrfs_finish_ordered_extent(dio_data->ordered, NULL, file_offset, dip->bytes, !ret); bio->bi_status = errno_to_blk_status(ret); iomap_dio_bio_end_io(bio); return; } } btrfs_submit_bio(bbio, 0); } static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_dio_submit_io, .bio_set = &btrfs_dio_bioset, }; ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data = { 0 }; return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); } struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) { struct btrfs_dio_data data = { 0 }; return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, IOMAP_DIO_PARTIAL, &data, done_before); } static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; /* * fiemap_prep() called filemap_write_and_wait() for the whole possible * file range (0 to LLONG_MAX), but that is not enough if we have * compression enabled. The first filemap_fdatawrite_range() only kicks * in the compression of data (in an async thread) and will return * before the compression is done and writeback is started. A second * filemap_fdatawrite_range() is needed to wait for the compression to * complete and writeback to start. We also need to wait for ordered * extents to complete, because our fiemap implementation uses mainly * file extent items to list the extents, searching for extent maps * only for file ranges with holes or prealloc extents to figure out * if we have delalloc in those ranges. */ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); if (ret) return ret; } return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { return extent_writepages(mapping, wbc); } static void btrfs_readahead(struct readahead_control *rac) { extent_readahead(rac); } /* * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage * spinlock. */ static void wait_subpage_spinlock(struct page *page) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct btrfs_subpage *subpage; if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); subpage = (struct btrfs_subpage *)page->private; /* * This may look insane as we just acquire the spinlock and release it, * without doing anything. But we just want to make sure no one is * still holding the subpage spinlock. * And since the page is not dirty nor writeback, and we have page * locked, the only possible way to hold a spinlock is from the endio * function to clear page writeback. * * Here we just acquire the spinlock so that all existing callers * should exit and we're safe to release/invalidate the page. */ spin_lock_irq(&subpage->lock); spin_unlock_irq(&subpage->lock); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { int ret = try_release_extent_mapping(&folio->page, gfp_flags); if (ret == 1) { wait_subpage_spinlock(&folio->page); clear_page_extent_mapped(&folio->page); } return ret; } static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { if (folio_test_writeback(folio) || folio_test_dirty(folio)) return false; return __btrfs_release_folio(folio, gfp_flags); } #ifdef CONFIG_MIGRATION static int btrfs_migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { int ret = filemap_migrate_folio(mapping, dst, src, mode); if (ret != MIGRATEPAGE_SUCCESS) return ret; if (folio_test_ordered(src)) { folio_clear_ordered(src); folio_set_ordered(dst); } return MIGRATEPAGE_SUCCESS; } #else #define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, size_t length) { struct btrfs_inode *inode = BTRFS_I(folio->mapping->host); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 page_start = folio_pos(folio); u64 page_end = page_start + folio_size(folio) - 1; u64 cur; int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * We have folio locked so no new ordered extent can be created on this * page, nor bio can be submitted for this folio. * * But already submitted bio can still be finished on this folio. * Furthermore, endio function won't skip folio which has Ordered * (Private2) already cleared, so it's possible for endio and * invalidate_folio to do the same ordered extent accounting twice * on one folio. * * So here we wait for any submitted bios to finish, so that we won't * do double ordered extent accounting on the same folio. */ folio_wait_writeback(folio); wait_subpage_spinlock(&folio->page); /* * For subpage case, we have call sites like * btrfs_punch_hole_lock_range() which passes range not aligned to * sectorsize. * If the range doesn't cover the full folio, we don't need to and * shouldn't clear page extent mapped, as folio->private can still * record subpage dirty bits for other part of the range. * * For cases that invalidate the full folio even the range doesn't * cover the full folio, like invalidating the last folio, we're * still safe to wait for ordered extent to finish. */ if (!(offset == 0 && length == folio_size(folio))) { btrfs_release_folio(folio, GFP_NOFS); return; } if (!inode_evicting) lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { struct btrfs_ordered_extent *ordered; u64 range_end; u32 range_len; u32 extra_flags = 0; ordered = btrfs_lookup_first_ordered_range(inode, cur, page_end + 1 - cur); if (!ordered) { range_end = page_end; /* * No ordered extent covering this range, we are safe * to delete all extent states in the range. */ extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } if (ordered->file_offset > cur) { /* * There is a range between [cur, oe->file_offset) not * covered by any ordered extent. * We are safe to delete all extent states, and handle * the ordered extent in the next iteration. */ range_end = ordered->file_offset - 1; extra_flags = EXTENT_CLEAR_ALL_BITS; goto next; } range_end = min(ordered->file_offset + ordered->num_bytes - 1, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. * We can't delete the extent states as * btrfs_finish_ordered_io() may still use some of them. */ goto next; } btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); /* * IO on this page will never be started, so we need to account * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW * here, must leave that up for the ordered extent completion. * * This will also unlock the range for incoming * btrfs_finish_ordered_io(). */ if (!inode_evicting) clear_extent_bit(tree, cur, range_end, EXTENT_DELALLOC | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); ordered->truncated_len = min(ordered->truncated_len, cur - ordered->file_offset); spin_unlock_irq(&inode->ordered_tree_lock); /* * If the ordered extent has finished, we're safe to delete all * the extent states of the range, otherwise * btrfs_finish_ordered_io() will get executed by endio for * other pages, so we can't delete extent states. */ if (btrfs_dec_test_ordered_pending(inode, &ordered, cur, range_end + 1 - cur)) { btrfs_finish_ordered_io(ordered); /* * The ordered extent has finished, now we're again * safe to delete all extent states of the range. */ extra_flags = EXTENT_CLEAR_ALL_BITS; } next: if (ordered) btrfs_put_ordered_extent(ordered); /* * Qgroup reserved space handler * Sector(s) here will be either: * * 1) Already written to disk or bio already finished * Then its QGROUP_RESERVED bit in io_tree is already cleared. * Qgroup will be handled by its qgroup_record then. * btrfs_qgroup_free_data() call will do nothing here. * * 2) Not written to disk yet * Then btrfs_qgroup_free_data() call will clear the * QGROUP_RESERVED bit of its io_tree, and free the qgroup * reserved data space. * Since the IO will never happen for this page. */ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | extra_flags, &cached_state); } cur = range_end + 1; } /* * We have iterated through all ordered extents of the page, the page * should not have Ordered (Private2) anymore, or the above iteration * did something wrong. */ ASSERT(!folio_test_ordered(folio)); btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); clear_page_extent_mapped(&folio->page); } /* * btrfs_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate_setsize() writes the inode size before removing pages, once we have * the page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; vm_fault_t ret; int ret2; int reserved = 0; u64 reserved_space; u64 page_start; u64 page_end; u64 end; reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); page_start = page_offset(page); page_end = page_start + PAGE_SIZE - 1; end = page_end; /* * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, page_start, reserved_space); if (!ret2) { ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; } if (ret2) { ret = vmf_error(ret2); if (reserved) goto out; goto out_noreserve; } ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: down_read(&BTRFS_I(inode)->i_mmap_lock); lock_page(page); size = i_size_read(inode); if ((page->mapping != inode->i_mapping) || (page_start >= size)) { /* page got truncated out from underneath us */ goto out_unlock; } wait_on_page_writeback(page); lock_extent(io_tree, page_start, page_end, &cached_state); ret2 = set_page_extent_mapped(page); if (ret2 < 0) { ret = vmf_error(ret2); unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } /* * we can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); if (ordered) { unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } if (page->index == ((size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < PAGE_SIZE) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, PAGE_SIZE - reserved_space, true); } } /* * page_mkwrite gets called when the page is firstly dirtied after it's * faulted in, but write(2) could also dirty a page and set delalloc * bits, thus in this case for space account reason, we still need to * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret2) { unlock_extent(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) zero_start = offset_in_page(size); else zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, (ret != 0)); out_noreserve: sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return ret; } static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { .inode = inode, .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; u64 mask = fs_info->sectorsize - 1; const u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { ret = btrfs_wait_ordered_range(&inode->vfs_inode, inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; } /* * Yes ladies and gentlemen, this is indeed ugly. We have a couple of * things going on here: * * 1) We need to reserve space to update our inode. * * 2) We need to have something to cache all the space that is going to * be free'd up by the truncate operation, but also have some slack * space reserved in case it uses space during the truncate (thank you * very much snapshotting). * * And we need these to be separate. The fact is we can use a lot of * space doing the truncate, and we have no earthly idea how much space * we will use, so we need the truncate reservation to be separate so it * doesn't end up using space reserved for updating the inode. We also * need to be able to stop the transaction and start a new one, which * means we need to be able to update the inode several times, and we * have no idea of knowing how many times that will be, so we can't just * reserve 1 item for the entirety of the operation, so that has to be * done separately as well. * * So that leaves us with * * 1) rsv - for the truncate reservation, which we will steal from the * transaction reservation. * 2) fs_info->trans_block_rsv - this will have 1 items worth left for * updating the inode. */ rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) return -ENOMEM; rsv->size = min_size; rsv->failfast = true; /* * 1 for the truncate slack space * 1 for updating the inode. */ trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } /* Migrate the slack space for the truncate to our reserve */ ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); /* * We have reserved 2 metadata units when we started the transaction and * min_size matches 1 unit, so this should never fail, but if it does, * it's not critical we just fail truncation. */ if (WARN_ON(ret)) { btrfs_end_transaction(trans); goto out; } trans->block_rsv = rsv; while (1) { struct extent_state *cached_state = NULL; const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. */ btrfs_drop_extent_map_range(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; ret = btrfs_update_inode(trans, inode); if (ret) break; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; break; } btrfs_block_rsv_release(fs_info, rsv, -1, NULL); ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); /* * We have reserved 2 metadata units when we started the * transaction and min_size matches 1 unit, so this should never * fail, but if it does, it's not critical we just fail truncation. */ if (WARN_ON(ret)) break; trans->block_rsv = rsv; } /* * We can't call btrfs_truncate_block inside a trans handle as we could * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we * know we've truncated everything except the last little bit, and can * do btrfs_truncate_block and then update the disk_i_size. */ if (ret == BTRFS_NEED_TRUNCATE_BLOCK) { btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out; } btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; ret2 = btrfs_update_inode(trans, inode); if (ret2 && !ret) ret = ret2; ret2 = btrfs_end_transaction(trans); if (ret2 && !ret) ret = ret2; btrfs_btree_balance_dirty(fs_info); } out: btrfs_free_block_rsv(fs_info, rsv); /* * So if we truncate and then write and fsync we normally would just * write the extents that changed, which is a problem if we need to * first truncate that entire inode. So set this flag so we write out * all of the extents in the inode to the sync log so we're completely * safe. * * If no extents were dropped or trimmed we don't need to force the next * fsync to truncate all the inode's items from the log and re-log them * all. This means the truncate operation did not change the file size, * or changed it to a smaller size but there was only an implicit hole * between the old i_size and the new i_size, and there were no prealloc * extents beyond i_size to drop. */ if (control.extents_found > 0) btrfs_set_inode_full_sync(inode); return ret; } struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { /* * Subvolumes don't inherit the sgid bit or the parent's gid if * the parent's sgid bit is set. This is probably a bug. */ inode_init_owner(idmap, inode, NULL, S_IFDIR | (~current_umask() & S_IRWXUGO)); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; } return inode; } struct inode *btrfs_alloc_inode(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->root = NULL; ei->generation = 0; ei->last_trans = 0; ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->ro_flags = 0; ei->csum_bytes = 0; ei->index_cnt = (u64)-1; ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_reflink_trans = 0; ei->last_log_commit = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, BTRFS_BLOCK_RSV_DELALLOC); ei->runtime_flags = 0; ei->prop_compress = BTRFS_COMPRESS_NONE; ei->defrag_compress = BTRFS_COMPRESS_NONE; ei->delayed_node = NULL; ei->i_otime_sec = 0; ei->i_otime_nsec = 0; inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; ei->ordered_tree_last = NULL; INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); init_rwsem(&ei->i_mmap_lock); return inode; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } void btrfs_destroy_inode(struct inode *vfs_inode) { struct btrfs_ordered_extent *ordered; struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_root *root = inode->root; bool freespace_inode; WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); WARN_ON(vfs_inode->i_data.nrpages); WARN_ON(inode->block_rsv.reserved); WARN_ON(inode->block_rsv.size); WARN_ON(inode->outstanding_extents); if (!S_ISDIR(vfs_inode->i_mode)) { WARN_ON(inode->delalloc_bytes); WARN_ON(inode->new_delalloc_bytes); } WARN_ON(inode->csum_bytes); WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also * created the same inode and we need to destroy the one we already * created. */ if (!root) return; /* * If this is a free space inode do not take the ordered extents lockdep * map. */ freespace_inode = btrfs_is_free_space_inode(inode); while (1) { ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); if (!freespace_inode) btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } btrfs_qgroup_check_reserved_leak(inode); inode_tree_del(inode); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; if (root == NULL) return 1; /* the snap/subvol tree is on deleting */ if (btrfs_root_refs(&root->root_item) == 0) return 1; else return generic_drop_inode(inode); } static void init_once(void *foo) { struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); } void __cold btrfs_destroy_cachep(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); } int __init btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, init_once); if (!btrfs_inode_cachep) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bbio.bio), BIOSET_NEED_BVECS)) goto fail; return 0; fail: btrfs_destroy_cachep(); return -ENOMEM; } static int btrfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { u64 delalloc_bytes; u64 inode_bytes; struct inode *inode = d_inode(path->dentry); u32 blocksize = inode->i_sb->s_blocksize; u32 bi_flags = BTRFS_I(inode)->flags; u32 bi_ro_flags = BTRFS_I(inode)->ro_flags; stat->result_mask |= STATX_BTIME; stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec; stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec; if (bi_flags & BTRFS_INODE_APPEND) stat->attributes |= STATX_ATTR_APPEND; if (bi_flags & BTRFS_INODE_COMPRESS) stat->attributes |= STATX_ATTR_COMPRESSED; if (bi_flags & BTRFS_INODE_IMMUTABLE) stat->attributes |= STATX_ATTR_IMMUTABLE; if (bi_flags & BTRFS_INODE_NODUMP) stat->attributes |= STATX_ATTR_NODUMP; if (bi_ro_flags & BTRFS_INODE_RO_VERITY) stat->attributes |= STATX_ATTR_VERITY; stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_COMPRESSED | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); generic_fillattr(idmap, request_mask, inode, stat); stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; inode_bytes = inode_get_bytes(inode); spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_bytes, blocksize) + ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT; return 0; } static int btrfs_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct btrfs_rename_ctx old_rename_ctx; struct btrfs_rename_ctx new_rename_ctx; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; int ret2; bool need_abort = false; struct fscrypt_name old_fname, new_fname; struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the * same inode namespace. Two subvolumes (represented as directory) can * be exchanged as they're a logical link and have a fixed inode number. */ if (root != dest && (old_ino != BTRFS_FIRST_FREE_OBJECTID || new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); if (ret) return ret; ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); if (ret) { fscrypt_free_filename(&old_fname); return ret; } old_name = &old_fname.disk_name; new_name = &new_fname.disk_name; /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) down_read(&fs_info->subvol_sem); /* * For each inode: * 1 to remove old dir item * 1 to remove old dir index * 1 to add new dir item * 1 to add new dir index * 1 to update parent inode * * If the parents are the same, we only need to account for one */ trans_num_items = (old_dir == new_dir ? 9 : 10); if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* * 1 to remove old root ref * 1 to remove old root backref * 1 to add new root ref * 1 to add new root backref */ trans_num_items += 4; } else { /* * 1 to update inode item * 1 to remove old inode ref * 1 to add new inode ref */ trans_num_items += 3; } if (new_ino == BTRFS_FIRST_FREE_OBJECTID) trans_num_items += 4; else trans_num_items += 3; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail; } /* * We need to find a free sequence number both in the source and * in the destination directory for the exchange. */ ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx); if (ret) goto out_fail; ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx); if (ret) goto out_fail; BTRFS_I(old_inode)->dir_index = 0ULL; BTRFS_I(new_inode)->dir_index = 0ULL; /* Reference for the source. */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) goto out_fail; need_abort = true; } /* And now for the dest. */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { if (need_abort) btrfs_abort_transaction(trans, ret); goto out_fail; } } /* Update inode version and ctime/mtime. */ inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) { btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); btrfs_record_unlink_dir(trans, BTRFS_I(new_dir), BTRFS_I(new_inode), true); } /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, BTRFS_I(new_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = old_idx; if (new_inode->i_nlink == 1) BTRFS_I(new_inode)->dir_index = new_idx; /* * Now pin the logs of the roots. We do it to ensure that no other task * can sync the logs while we are in progress with the rename, because * that could result in an inconsistency in case any of the inodes that * are part of this rename operation were logged before. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_pin_log_trans(root); if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_pin_log_trans(dest); /* Do the log updates for all inodes. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), old_rename_ctx.index, new_dentry->d_parent); if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir), new_rename_ctx.index, old_dentry->d_parent); /* Now unpin the logs. */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(root); if (new_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_end_log_trans(dest); out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); fscrypt_free_filename(&new_fname); fscrypt_free_filename(&old_fname); return ret; } static struct inode *new_whiteout_inode(struct mnt_idmap *idmap, struct inode *dir) { struct inode *inode; inode = new_inode(dir->i_sb); if (inode) { inode_init_owner(idmap, inode, dir, S_IFCHR | WHITEOUT_MODE); inode->i_op = &btrfs_special_inode_operations; init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); } return inode; } static int btrfs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_new_inode_args whiteout_args = { .dir = old_dir, .dentry = old_dentry, }; struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = d_inode(new_dentry); struct inode *old_inode = d_inode(old_dentry); struct btrfs_rename_ctx rename_ctx; u64 index = 0; int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID)) return -ENOTEMPTY; if (S_ISDIR(old_inode->i_mode) && new_inode && new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); if (ret) return ret; ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); if (ret) { fscrypt_free_filename(&old_fname); return ret; } /* check for collisions, even if the name isn't there */ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { goto out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ goto out_fscrypt_names; } } ret = 0; /* * we're using rename to replace one file with another. Start IO on it * now so we don't add too much work to the end of the transaction */ if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); if (flags & RENAME_WHITEOUT) { whiteout_args.inode = new_whiteout_inode(idmap, old_dir); if (!whiteout_args.inode) { ret = -ENOMEM; goto out_fscrypt_names; } ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); if (ret) goto out_whiteout_inode; } else { /* 1 to update the old parent inode. */ trans_num_items = 1; } if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { /* Close the race window with snapshot create/destroy ioctl */ down_read(&fs_info->subvol_sem); /* * 1 to remove old root ref * 1 to remove old root backref * 1 to add new root ref * 1 to add new root backref */ trans_num_items += 4; } else { /* * 1 to update inode * 1 to remove old inode ref * 1 to add new inode ref */ trans_num_items += 3; } /* * 1 to remove old dir item * 1 to remove old dir index * 1 to add new dir item * 1 to add new dir index */ trans_num_items += 4; /* 1 to update new parent inode if it's not the same as the old parent */ if (new_dir != old_dir) trans_num_items++; if (new_inode) { /* * 1 to update inode * 1 to remove inode ref * 1 to remove dir item * 1 to remove dir index * 1 to possibly add orphan item */ trans_num_items += 5; } trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; } if (dest != root) { ret = btrfs_record_root_in_trans(trans, dest); if (ret) goto out_fail; } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) goto out_fail; BTRFS_I(old_inode)->dir_index = 0ULL; if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; } inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), BTRFS_I(old_inode), true); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, BTRFS_I(old_inode)); } if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (new_inode) { inode_inc_iversion(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, BTRFS_I(d_inode(new_dentry))); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } if (old_inode->i_nlink == 1) BTRFS_I(old_inode)->dir_index = index; if (old_ino != BTRFS_FIRST_FREE_OBJECTID) btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir), rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } else { unlock_new_inode(whiteout_args.inode); iput(whiteout_args.inode); whiteout_args.inode = NULL; } } out_fail: ret2 = btrfs_end_transaction(trans); ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); if (flags & RENAME_WHITEOUT) btrfs_new_inode_args_destroy(&whiteout_args); out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); out_fscrypt_names: fscrypt_free_filename(&old_fname); fscrypt_free_filename(&new_fname); return ret; } static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { int ret; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); else ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir, new_dentry, flags); btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); return ret; } struct btrfs_delalloc_work { struct inode *inode; struct completion completion; struct list_head list; struct btrfs_work work; }; static void btrfs_run_delalloc_work(struct btrfs_work *work) { struct btrfs_delalloc_work *delalloc_work; struct inode *inode; delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; filemap_flush(inode->i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); iput(inode); complete(&delalloc_work->completion); } static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode) { struct btrfs_delalloc_work *work; work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL); return work; } /* * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); int ret = 0; bool full_flush = wbc->nr_to_write == LONG_MAX; mutex_lock(&root->delalloc_mutex); spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); list_move_tail(&binode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) continue; inode = igrab(&binode->vfs_inode); if (!inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &binode->runtime_flags); if (full_flush) { work = btrfs_alloc_delalloc_work(inode); if (!work) { iput(inode); ret = -ENOMEM; goto out; } list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } cond_resched(); spin_lock(&root->delalloc_lock); } spin_unlock(&root->delalloc_lock); out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); wait_for_completion(&work->completion); kfree(work); } if (!list_empty(&splice)) { spin_lock(&root->delalloc_lock); list_splice_tail(&splice, &root->delalloc_inodes); spin_unlock(&root->delalloc_lock); } mutex_unlock(&root->delalloc_mutex); return ret; } int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; struct btrfs_fs_info *fs_info = root->fs_info; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = nr, .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; struct btrfs_root *root; LIST_HEAD(splice); int ret; if (BTRFS_FS_ERROR(fs_info)) return -EROFS; mutex_lock(&fs_info->delalloc_root_mutex); spin_lock(&fs_info->delalloc_root_lock); list_splice_init(&fs_info->delalloc_roots, &splice); while (!list_empty(&splice)) { /* * Reset nr_to_write here so we know that we're doing a full * flush. */ if (nr == LONG_MAX) wbc.nr_to_write = LONG_MAX; root = list_first_entry(&splice, struct btrfs_root, delalloc_root); root = btrfs_grab_root(root); BUG_ON(!root); list_move_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context); btrfs_put_root(root); if (ret < 0 || wbc.nr_to_write <= 0) goto out; spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); ret = 0; out: if (!list_empty(&splice)) { spin_lock(&fs_info->delalloc_root_lock); list_splice_tail(&splice, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } mutex_unlock(&fs_info->delalloc_root_mutex); return ret; } static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = dentry, }; unsigned int trans_num_items; int err; int name_len; int datasize; unsigned long ptr; struct btrfs_file_extent_item *ei; struct extent_buffer *leaf; name_len = strlen(symname); if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO); inode->i_op = &btrfs_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_aops; btrfs_i_size_write(BTRFS_I(inode), name_len); inode_set_bytes(inode, name_len); new_inode_args.inode = inode; err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) goto out_inode; /* 1 additional item for the inline extent */ trans_num_items++; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_new_inode_args; } err = btrfs_create_new_inode(trans, &new_inode_args); if (err) goto out; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; btrfs_abort_transaction(trans, err); discard_new_inode(inode); inode = NULL; goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { btrfs_abort_transaction(trans, err); btrfs_free_path(path); discard_new_inode(inode); inode = NULL; goto out; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_compression(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); ptr = btrfs_file_extent_inline_start(ei); write_extent_buffer(leaf, symname, ptr, name_len); btrfs_mark_buffer_dirty(trans, leaf); btrfs_free_path(path); d_instantiate_new(dentry, inode); err = 0; out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: if (err) iput(inode); return err; } static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_trans_handle *trans_in, struct btrfs_inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; struct btrfs_replace_extent_info extent_info; struct btrfs_trans_handle *trans = trans_in; struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len); btrfs_set_stack_file_extent_num_bytes(&stack_fi, len); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len); btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); if (ret < 0) return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, file_offset, &stack_fi, true, qgroup_released); if (ret) goto free_qgroup; return trans; } extent_info.disk_offset = start; extent_info.disk_len = len; extent_info.data_offset = 0; extent_info.data_len = len; extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto free_qgroup; } ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); if (ret) goto free_qgroup; return trans; free_qgroup: /* * We have released qgroup data range at the beginning of the function, * and normally qgroup_released bytes will be freed when committing * transaction. * But if we error out early, we have to free what we have released * or we leak qgroup data reservation. */ btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, qgroup_released, BTRFS_QGROUP_RSV_DATA); return ERR_PTR(ret); } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 cur_offset = start; u64 clear_offset = start; u64 i_size; u64 cur_bytes; u64 last_alloc = (u64)-1; int ret = 0; bool own_trans = true; u64 end = start + num_bytes - 1; if (trans) own_trans = false; while (num_bytes > 0) { cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really * small allocations, so if the allocator is returning small * chunks lets make its job easier by only searching for those * sized chunks. */ cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); if (ret) break; /* * We've reserved this space, and thus converted it from * ->bytes_may_use to ->bytes_reserved. Any error that happens * from here on out we will only need to clear our reservation * for the remaining unreserved area, so advance our * clear_offset by our extent size. */ clear_offset += ins.offset; last_alloc = ins.offset; trans = insert_prealloc_file_extent(trans, BTRFS_I(inode), &ins, cur_offset); /* * Now that we inserted the prealloc extent we can finally * decrement the number of reservations in the block group. * If we did it before, we could race with relocation and have * relocation miss the reserved extent, making it fail later. */ btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); break; } em = alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); btrfs_set_inode_full_sync(BTRFS_I(inode)); goto next; } em->start = cur_offset; em->orig_start = cur_offset; em->len = ins.offset; em->block_start = ins.objectid; em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; set_bit(EXTENT_FLAG_PREALLOC, &em->flags); em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; *alloc_hint = ins.objectid + ins.offset; inode_inc_iversion(inode); inode_set_ctime_current(inode); BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && (actual_len > inode->i_size) && (cur_offset > inode->i_size)) { if (cur_offset > actual_len) i_size = actual_len; else i_size = cur_offset; i_size_write(inode, i_size); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); } ret = btrfs_update_inode(trans, BTRFS_I(inode)); if (ret) { btrfs_abort_transaction(trans, ret); if (own_trans) btrfs_end_transaction(trans); break; } if (own_trans) { btrfs_end_transaction(trans); trans = NULL; } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, end - clear_offset + 1); return ret; } int btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint) { return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, min_size, actual_len, alloc_hint, NULL); } int btrfs_prealloc_file_range_trans(struct inode *inode, struct btrfs_trans_handle *trans, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint) { return __btrfs_prealloc_file_range(inode, mode, start, num_bytes, min_size, actual_len, alloc_hint, trans); } static int btrfs_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct btrfs_root *root = BTRFS_I(inode)->root; umode_t mode = inode->i_mode; if (mask & MAY_WRITE && (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { if (btrfs_root_readonly(root)) return -EROFS; if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) return -EACCES; } return generic_permission(idmap, inode, mask); } static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode; struct btrfs_new_inode_args new_inode_args = { .dir = dir, .dentry = file->f_path.dentry, .orphan = true, }; unsigned int trans_num_items; int ret; inode = new_inode(dir->i_sb); if (!inode) return -ENOMEM; inode_init_owner(idmap, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; new_inode_args.inode = inode; ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) goto out_inode; trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_new_inode_args; } ret = btrfs_create_new_inode(trans, &new_inode_args); /* * We set number of links to 0 in btrfs_create_new_inode(), and here we * set it to 1 because d_tmpfile() will issue a warning if the count is * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); if (!ret) { d_tmpfile(file, inode); unlock_new_inode(inode); mark_inode_dirty(inode); } btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out_new_inode_args: btrfs_new_inode_args_destroy(&new_inode_args); out_inode: if (ret) iput(inode); return finish_open_simple(file, ret); } void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; struct page *page; u32 len; ASSERT(end + 1 - start <= U32_MAX); len = end + 1 - start; while (index <= end_index) { page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ btrfs_page_set_writeback(fs_info, page, start, len); put_page(page); index++; } } int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: return BTRFS_ENCODED_IO_COMPRESSION_NONE; case BTRFS_COMPRESS_ZLIB: return BTRFS_ENCODED_IO_COMPRESSION_ZLIB; case BTRFS_COMPRESS_LZO: /* * The LZO format depends on the sector size. 64K is the maximum * sector size that we support. */ if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K) return -EINVAL; return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + (fs_info->sectorsize_bits - 12); case BTRFS_COMPRESS_ZSTD: return BTRFS_ENCODED_IO_COMPRESSION_ZSTD; default: return -EUCLEAN; } } static ssize_t btrfs_encoded_read_inline( struct kiocb *iocb, struct iov_iter *iter, u64 start, u64 lockend, struct extent_state **cached_state, u64 extent_start, size_t count, struct btrfs_ioctl_encoded_io_args *encoded, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; unsigned long ptr; void *tmp; ssize_t ret; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), extent_start, 0); if (ret) { if (ret > 0) { /* The extent item disappeared? */ ret = -EIO; } goto out; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); ram_bytes = btrfs_file_extent_ram_bytes(leaf, item); ptr = btrfs_file_extent_inline_start(item); encoded->len = min_t(u64, extent_start + ram_bytes, inode->vfs_inode.i_size) - iocb->ki_pos; ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) goto out; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); if (inline_size > count) { ret = -ENOBUFS; goto out; } count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; } else { count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; ptr += iocb->ki_pos - extent_start; } tmp = kmalloc(count, GFP_NOFS); if (!tmp) { ret = -ENOMEM; goto out; } read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); if (ret != count) ret = -EFAULT; kfree(tmp); out: btrfs_free_path(path); return ret; } struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; }; static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) { struct btrfs_encoded_read_private *priv = bbio->private; if (bbio->bio.bi_status) { /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the * atomic_dec_return() or io_wait_event() in * btrfs_encoded_read_regular_fill_pages() to ensure that this * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); } if (!atomic_dec_return(&priv->pending)) wake_up(&priv->wait); bio_put(&bbio->bio); } int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { .pending = ATOMIC_INIT(1), }; unsigned long i = 0; struct btrfs_bio *bbio; init_waitqueue_head(&priv.wait); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; do { size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE); if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) { atomic_inc(&priv.pending); btrfs_submit_bio(bbio, 0); bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info, btrfs_encoded_read_endio, &priv); bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; bbio->inode = inode; continue; } i++; disk_bytenr += bytes; disk_io_size -= bytes; } while (disk_io_size); atomic_inc(&priv.pending); btrfs_submit_bio(bbio, 0); if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. */ return blk_status_to_errno(READ_ONCE(priv.status)); } static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, u64 start, u64 lockend, struct extent_state **cached_state, u64 disk_bytenr, u64 disk_io_size, size_t count, bool compressed, bool *unlocked) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct extent_io_tree *io_tree = &inode->io_tree; struct page **pages; unsigned long nr_pages, i; u64 cur; size_t page_offset; ssize_t ret; nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; ret = btrfs_alloc_page_array(nr_pages, pages); if (ret) { ret = -ENOMEM; goto out; } ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, disk_io_size, pages); if (ret) goto out; unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { i = 0; page_offset = 0; } else { i = (iocb->ki_pos - start) >> PAGE_SHIFT; page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1); } cur = 0; while (cur < count) { size_t bytes = min_t(size_t, count - cur, PAGE_SIZE - page_offset); if (copy_page_to_iter(pages[i], page_offset, bytes, iter) != bytes) { ret = -EFAULT; goto out; } i++; cur += bytes; page_offset = 0; } ret = count; out: for (i = 0; i < nr_pages; i++) { if (pages[i]) __free_page(pages[i]); } kfree(pages); return ret; } ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; ssize_t ret; size_t count = iov_iter_count(iter); u64 start, lockend, disk_bytenr, disk_io_size; struct extent_state *cached_state = NULL; struct extent_map *em; bool unlocked = false; file_accessed(iocb->ki_filp); btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); /* * We don't know how long the extent containing iocb->ki_pos is, but if * it's compressed we know that it won't be longer than this. */ lockend = start + BTRFS_MAX_UNCOMPRESSED - 1; for (;;) { struct btrfs_ordered_extent *ordered; ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, lockend - start + 1); if (ret) goto out_unlock_inode; lock_extent(io_tree, start, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); unlock_extent(io_tree, start, lockend, &cached_state); cond_resched(); } em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_unlock_extent; } if (em->block_start == EXTENT_MAP_INLINE) { u64 extent_start = em->start; /* * For inline extents we get everything we need out of the * extent item. */ free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, &cached_state, extent_start, count, encoded, &unlocked); goto out; } /* * We only want to return up to EOF even if the extent extends beyond * that. */ encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->block_start == EXTENT_MAP_HOLE || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { disk_bytenr = em->block_start; /* * Bail if the buffer isn't large enough to return the whole * compressed extent. */ if (em->block_len > count) { ret = -ENOBUFS; goto out_em; } disk_io_size = em->block_len; count = em->block_len; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, em->compress_type); if (ret < 0) goto out_em; encoded->compression = ret; } else { disk_bytenr = em->block_start + (start - em->start); if (encoded->len > count) encoded->len = count; /* * Don't read beyond what we locked. This also limits the page * allocations that we'll do. */ disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start; count = start + disk_io_size - iocb->ki_pos; encoded->len = count; encoded->unencoded_len = count; disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize); } free_extent_map(em); em = NULL; if (disk_bytenr == EXTENT_MAP_HOLE) { unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) ret = -EFAULT; } else { ret = btrfs_encoded_read_regular(iocb, iter, start, lockend, &cached_state, disk_bytenr, disk_io_size, count, encoded->compression, &unlocked); } out: if (ret >= 0) iocb->ki_pos += encoded->len; out_em: free_extent_map(em); out_unlock_extent: if (!unlocked) unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_changeset *data_reserved = NULL; struct extent_state *cached_state = NULL; struct btrfs_ordered_extent *ordered; int compression; size_t orig_count; u64 start, end; u64 num_bytes, ram_bytes, disk_num_bytes; unsigned long nr_pages, i; struct page **pages; struct btrfs_key ins; bool extent_reserved = false; struct extent_map *em; ssize_t ret; switch (encoded->compression) { case BTRFS_ENCODED_IO_COMPRESSION_ZLIB: compression = BTRFS_COMPRESS_ZLIB; break; case BTRFS_ENCODED_IO_COMPRESSION_ZSTD: compression = BTRFS_COMPRESS_ZSTD; break; case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K: case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K: /* The sector size must match for LZO. */ if (encoded->compression - BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 != fs_info->sectorsize_bits) return -EINVAL; compression = BTRFS_COMPRESS_LZO; break; default: return -EINVAL; } if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE) return -EINVAL; orig_count = iov_iter_count(from); /* The extent size must be sane. */ if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED || orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0) return -EINVAL; /* * The compressed data must be smaller than the decompressed data. * * It's of course possible for data to compress to larger or the same * size, but the buffered I/O path falls back to no compression for such * data, and we don't want to break any assumptions by creating these * extents. * * Note that this is less strict than the current check we have that the * compressed data must be at least one sector smaller than the * decompressed data. We only want to enforce the weaker requirement * from old kernels that it is at least one byte smaller. */ if (orig_count >= encoded->unencoded_len) return -EINVAL; /* The extent must start on a sector boundary. */ start = iocb->ki_pos; if (!IS_ALIGNED(start, fs_info->sectorsize)) return -EINVAL; /* * The extent must end on a sector boundary. However, we allow a write * which ends at or extends i_size to have an unaligned length; we round * up the extent size and set i_size to the unaligned end. */ if (start + encoded->len < inode->vfs_inode.i_size && !IS_ALIGNED(start + encoded->len, fs_info->sectorsize)) return -EINVAL; /* Finally, the offset in the unencoded data must be sector-aligned. */ if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize)) return -EINVAL; num_bytes = ALIGN(encoded->len, fs_info->sectorsize); ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize); end = start + num_bytes - 1; /* * If the extent cannot be inline, the compressed data on disk must be * sector-aligned. For convenience, we extend it with zeroes if it * isn't. */ disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize); nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE); pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT); if (!pages) return -ENOMEM; for (i = 0; i < nr_pages; i++) { size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from)); char *kaddr; pages[i] = alloc_page(GFP_KERNEL_ACCOUNT); if (!pages[i]) { ret = -ENOMEM; goto out_pages; } kaddr = kmap_local_page(pages[i]); if (copy_from_iter(kaddr, bytes, from) != bytes) { kunmap_local(kaddr); ret = -EFAULT; goto out_pages; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); kunmap_local(kaddr); } for (;;) { struct btrfs_ordered_extent *ordered; ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes); if (ret) goto out_pages; ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); if (ret) goto out_pages; lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } /* * We don't use the higher-level delalloc space functions because our * num_bytes and disk_num_bytes are different. */ ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes); if (ret) goto out_unlock; ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); if (ret) goto out_free_data_space; ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, false); if (ret) goto out_qgroup_free_data; /* Try an inline extent first. */ if (start == 0 && encoded->unencoded_len == encoded->len && encoded->unencoded_offset == 0) { ret = cow_file_range_inline(inode, encoded->len, orig_count, compression, pages, true); if (ret <= 0) { if (ret == 0) ret = orig_count; goto out_delalloc_release; } } ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes, disk_num_bytes, 0, 0, &ins, 1, 1); if (ret) goto out_delalloc_release; extent_reserved = true; em = create_io_em(inode, start, num_bytes, start - encoded->unencoded_offset, ins.objectid, ins.offset, ins.offset, ram_bytes, compression, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserved; } free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes, ins.objectid, ins.offset, encoded->unencoded_offset, (1 << BTRFS_ORDERED_ENCODED) | (1 << BTRFS_ORDERED_COMPRESSED), compression); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); goto out_free_reserved; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false); ret = orig_count; goto out; out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented * bytes_may_use. */ if (!extent_reserved) btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); out_unlock: unlock_extent(io_tree, start, end, &cached_state); out_pages: for (i = 0; i < nr_pages; i++) { if (pages[i]) __free_page(pages[i]); } kvfree(pages); out: if (ret >= 0) iocb->ki_pos += encoded->len; return ret; } #ifdef CONFIG_SWAP /* * Add an entry indicating a block group or device which is pinned by a * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a * negative errno on failure. */ static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr, bool is_block_group) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp, *entry; struct rb_node **p; struct rb_node *parent = NULL; sp = kmalloc(sizeof(*sp), GFP_NOFS); if (!sp) return -ENOMEM; sp->ptr = ptr; sp->inode = inode; sp->is_block_group = is_block_group; sp->bg_extent_count = 1; spin_lock(&fs_info->swapfile_pins_lock); p = &fs_info->swapfile_pins.rb_node; while (*p) { parent = *p; entry = rb_entry(parent, struct btrfs_swapfile_pin, node); if (sp->ptr < entry->ptr || (sp->ptr == entry->ptr && sp->inode < entry->inode)) { p = &(*p)->rb_left; } else if (sp->ptr > entry->ptr || (sp->ptr == entry->ptr && sp->inode > entry->inode)) { p = &(*p)->rb_right; } else { if (is_block_group) entry->bg_extent_count++; spin_unlock(&fs_info->swapfile_pins_lock); kfree(sp); return 1; } } rb_link_node(&sp->node, parent, p); rb_insert_color(&sp->node, &fs_info->swapfile_pins); spin_unlock(&fs_info->swapfile_pins_lock); return 0; } /* Free all of the entries pinned by this swapfile. */ static void btrfs_free_swapfile_pins(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct btrfs_swapfile_pin *sp; struct rb_node *node, *next; spin_lock(&fs_info->swapfile_pins_lock); node = rb_first(&fs_info->swapfile_pins); while (node) { next = rb_next(node); sp = rb_entry(node, struct btrfs_swapfile_pin, node); if (sp->inode == inode) { rb_erase(&sp->node, &fs_info->swapfile_pins); if (sp->is_block_group) { btrfs_dec_block_group_swap_extents(sp->ptr, sp->bg_extent_count); btrfs_put_block_group(sp->ptr); } kfree(sp); } node = next; } spin_unlock(&fs_info->swapfile_pins_lock); } struct btrfs_swap_info { u64 start; u64 block_start; u64 block_len; u64 lowest_ppage; u64 highest_ppage; unsigned long nr_pages; int nr_extents; }; static int btrfs_add_swap_extent(struct swap_info_struct *sis, struct btrfs_swap_info *bsi) { unsigned long nr_pages; unsigned long max_pages; u64 first_ppage, first_ppage_reported, next_ppage; int ret; /* * Our swapfile may have had its size extended after the swap header was * written. In that case activating the swapfile should not go beyond * the max size set in the swap header. */ if (bsi->nr_pages >= sis->max) return 0; max_pages = sis->max - bsi->nr_pages; first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT; next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT; if (first_ppage >= next_ppage) return 0; nr_pages = next_ppage - first_ppage; nr_pages = min(nr_pages, max_pages); first_ppage_reported = first_ppage; if (bsi->start == 0) first_ppage_reported++; if (bsi->lowest_ppage > first_ppage_reported) bsi->lowest_ppage = first_ppage_reported; if (bsi->highest_ppage < (next_ppage - 1)) bsi->highest_ppage = next_ppage - 1; ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage); if (ret < 0) return ret; bsi->nr_extents += ret; bsi->nr_pages += nr_pages; return 0; } static void btrfs_swap_deactivate(struct file *file) { struct inode *inode = file_inode(file); btrfs_free_swapfile_pins(inode); atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles); } static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { struct inode *inode = file_inode(file); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, }; int ret = 0; u64 isize; u64 start; /* * If the swap file was just created, make sure delalloc is done. If the * file changes again after this, the user is doing something stupid and * we don't really care. */ ret = btrfs_wait_ordered_range(inode, 0, (u64)-1); if (ret) return ret; /* * The inode is locked, so these flags won't change after we check them. */ if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) { btrfs_warn(fs_info, "swapfile must not be compressed"); return -EINVAL; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); return -EINVAL; } if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { btrfs_warn(fs_info, "swapfile must not be checksummed"); return -EINVAL; } /* * Balance or device remove/replace/resize can move stuff around from * under us. The exclop protection makes sure they aren't running/won't * run concurrently while we are mapping the swap extents, and * fs_info->swapfile_pins prevents them from running while the swap * file is active and moving the extents. Note that this also prevents * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; } /* * Prevent snapshot creation while we are activating the swap file. * We do not want to race with snapshot creation. If snapshot creation * already started before we bumped nr_swapfiles from 0 to 1 and * completes before the first write into the swap file after it is * activated, than that write would fallback to COW. */ if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) { btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because snapshot creation is in progress"); return -EINVAL; } /* * Snapshots can create extents which require COW even if NODATACOW is * set. We use this counter to prevent snapshots. We must increment it * before walking the extents because we don't want a concurrent * snapshot to run after we've already checked the extents. * * It is possible that subvolume is marked for deletion but still not * removed yet. To prevent this race, we check the root status before * activating the swapfile. */ spin_lock(&root->root_item_lock); if (btrfs_root_dead(root)) { spin_unlock(&root->root_item_lock); btrfs_exclop_finish(fs_info); btrfs_warn(fs_info, "cannot activate swapfile because subvolume %llu is being deleted", root->root_key.objectid); return -EPERM; } atomic_inc(&root->nr_swapfiles); spin_unlock(&root->root_item_lock); isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); lock_extent(io_tree, 0, isize - 1, &cached_state); start = 0; while (start < isize) { u64 logical_block_start, physical_block_start; struct btrfs_block_group *bg; u64 len = isize - start; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->block_start == EXTENT_MAP_HOLE) { btrfs_warn(fs_info, "swapfile must not have holes"); ret = -EINVAL; goto out; } if (em->block_start == EXTENT_MAP_INLINE) { /* * It's unlikely we'll ever actually find ourselves * here, as a file small enough to fit inline won't be * big enough to store more than the swap header, but in * case something changes in the future, let's catch it * here rather than later. */ btrfs_warn(fs_info, "swapfile must not be inline"); ret = -EINVAL; goto out; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; } logical_block_start = em->block_start + (start - em->start); len = min(len, em->len - (start - em->start)); free_extent_map(em); em = NULL; ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true); if (ret < 0) { goto out; } else if (ret) { ret = 0; } else { btrfs_warn(fs_info, "swapfile must not be copy-on-write"); ret = -EINVAL; goto out; } em = btrfs_get_chunk_map(fs_info, logical_block_start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; goto out; } if (device == NULL) { device = em->map_lookup->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; } else if (device != em->map_lookup->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } physical_block_start = (em->map_lookup->stripes[0].physical + (logical_block_start - em->start)); len = min(len, em->len - (logical_block_start - em->start)); free_extent_map(em); em = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { btrfs_warn(fs_info, "could not find block group containing swapfile"); ret = -EINVAL; goto out; } if (!btrfs_inc_block_group_swap_extents(bg)) { btrfs_warn(fs_info, "block group for swapfile at %llu is read-only%s", bg->start, atomic_read(&fs_info->scrubs_running) ? " (scrub running)" : ""); btrfs_put_block_group(bg); ret = -EINVAL; goto out; } ret = btrfs_add_swapfile_pin(inode, bg, true); if (ret) { btrfs_put_block_group(bg); if (ret == 1) ret = 0; else goto out; } if (bsi.block_len && bsi.block_start + bsi.block_len == physical_block_start) { bsi.block_len += len; } else { if (bsi.block_len) { ret = btrfs_add_swap_extent(sis, &bsi); if (ret) goto out; } bsi.start = start; bsi.block_start = physical_block_start; bsi.block_len = len; } start += len; } if (bsi.block_len) ret = btrfs_add_swap_extent(sis, &bsi); out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); btrfs_drew_write_unlock(&root->snapshot_lock); btrfs_exclop_finish(fs_info); if (ret) return ret; if (device) sis->bdev = device->bdev; *span = bsi.highest_ppage - bsi.lowest_ppage + 1; sis->max = bsi.nr_pages; sis->pages = bsi.nr_pages - 1; sis->highest_bit = bsi.nr_pages - 1; return bsi.nr_extents; } #else static void btrfs_swap_deactivate(struct file *file) { } static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return -EOPNOTSUPP; } #endif /* * Update the number of bytes used in the VFS' inode. When we replace extents in * a range (clone, dedupe, fallocate's zero range), we must update the number of * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls * always get a correct value. */ void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes) { if (add_bytes == del_bytes) return; spin_lock(&inode->lock); if (del_bytes > 0) inode_sub_bytes(&inode->vfs_inode, del_bytes); if (add_bytes > 0) inode_add_bytes(&inode->vfs_inode, add_bytes); spin_unlock(&inode->lock); } /* * Verify that there are no ordered extents for a given file range. * * @inode: The target inode. * @start: Start offset of the file range, should be sector size aligned. * @end: End offset (inclusive) of the file range, its value +1 should be * sector size aligned. * * This should typically be used for cases where we locked an inode's VFS lock in * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, * we have flushed all delalloc in the range, we have waited for all ordered * extents in the range to complete and finally we have locked the file range in * the inode's io_tree. */ void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_root *root = inode->root; struct btrfs_ordered_extent *ordered; if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); if (ordered) { btrfs_err(root->fs_info, "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", start, end, btrfs_ino(inode), root->root_key.objectid, ordered->file_offset, ordered->file_offset + ordered->num_bytes - 1); btrfs_put_ordered_extent(ordered); } ASSERT(ordered == NULL); } static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, .create = btrfs_create, .unlink = btrfs_unlink, .link = btrfs_link, .mkdir = btrfs_mkdir, .rmdir = btrfs_rmdir, .rename = btrfs_rename2, .symlink = btrfs_symlink, .setattr = btrfs_setattr, .mknod = btrfs_mknod, .listxattr = btrfs_listxattr, .permission = btrfs_permission, .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, .fileattr_get = btrfs_fileattr_get, .fileattr_set = btrfs_fileattr_set, }; static const struct file_operations btrfs_dir_file_operations = { .llseek = btrfs_dir_llseek, .read = generic_read_dir, .iterate_shared = btrfs_real_readdir, .open = btrfs_opendir, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif .release = btrfs_release_file, .fsync = btrfs_sync_file, }; /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume * these extents won't change over the life of the file and they * use the bmap result to do IO directly to the drive. * * the btrfs bmap call would return logical addresses that aren't * suitable for IO and they also will change frequently as COW * operations happen. So, swapfile + btrfs == corruption. * * For now we're avoiding this by dropping bmap. */ static const struct address_space_operations btrfs_aops = { .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .invalidate_folio = btrfs_invalidate_folio, .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .listxattr = btrfs_listxattr, .permission = btrfs_permission, .fiemap = btrfs_fiemap, .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .fileattr_get = btrfs_fileattr_get, .fileattr_set = btrfs_fileattr_set, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, .get_inode_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .listxattr = btrfs_listxattr, .update_time = btrfs_update_time, }; const struct dentry_operations btrfs_dentry_operations = { .d_delete = btrfs_dentry_delete, }; |
2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | /* * Copyright (c) 2006 Oracle. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include "rds.h" #include "ib.h" DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats); static const char *const rds_ib_stat_names[] = { "ib_connect_raced", "ib_listen_closed_stale", "ib_evt_handler_call", "ib_tasklet_call", "ib_tx_cq_event", "ib_tx_ring_full", "ib_tx_throttle", "ib_tx_sg_mapping_failure", "ib_tx_stalled", "ib_tx_credit_updates", "ib_rx_cq_event", "ib_rx_ring_empty", "ib_rx_refill_from_cq", "ib_rx_refill_from_thread", "ib_rx_alloc_limit", "ib_rx_total_frags", "ib_rx_total_incs", "ib_rx_credit_updates", "ib_ack_sent", "ib_ack_send_failure", "ib_ack_send_delayed", "ib_ack_send_piggybacked", "ib_ack_received", "ib_rdma_mr_8k_alloc", "ib_rdma_mr_8k_free", "ib_rdma_mr_8k_used", "ib_rdma_mr_8k_pool_flush", "ib_rdma_mr_8k_pool_wait", "ib_rdma_mr_8k_pool_depleted", "ib_rdma_mr_1m_alloc", "ib_rdma_mr_1m_free", "ib_rdma_mr_1m_used", "ib_rdma_mr_1m_pool_flush", "ib_rdma_mr_1m_pool_wait", "ib_rdma_mr_1m_pool_depleted", "ib_rdma_mr_8k_reused", "ib_rdma_mr_1m_reused", "ib_atomic_cswp", "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail) { struct rds_ib_statistics stats = {0, }; uint64_t *src; uint64_t *sum; size_t i; int cpu; if (avail < ARRAY_SIZE(rds_ib_stat_names)) goto out; for_each_online_cpu(cpu) { src = (uint64_t *)&(per_cpu(rds_ib_stats, cpu)); sum = (uint64_t *)&stats; for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++) *(sum++) += *(src++); } rds_stats_info_copy(iter, (uint64_t *)&stats, rds_ib_stat_names, ARRAY_SIZE(rds_ib_stat_names)); out: return ARRAY_SIZE(rds_ib_stat_names); } |
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2016 Mellanox Technologies. All rights reserved. * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com> */ #include "devl_internal.h" static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, .name = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_INT_ERR_RESET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MAX_MACS, .name = DEVLINK_PARAM_GENERIC_MAX_MACS_NAME, .type = DEVLINK_PARAM_GENERIC_MAX_MACS_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_SRIOV, .name = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_SRIOV_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_REGION_SNAPSHOT, .name = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_NAME, .type = DEVLINK_PARAM_GENERIC_REGION_SNAPSHOT_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_IGNORE_ARI, .name = DEVLINK_PARAM_GENERIC_IGNORE_ARI_NAME, .type = DEVLINK_PARAM_GENERIC_IGNORE_ARI_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MAX, .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MAX_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_MSIX_VEC_PER_PF_MIN, .name = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_NAME, .type = DEVLINK_PARAM_GENERIC_MSIX_VEC_PER_PF_MIN_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_FW_LOAD_POLICY, .name = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_NAME, .type = DEVLINK_PARAM_GENERIC_FW_LOAD_POLICY_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_RESET_DEV_ON_DRV_PROBE, .name = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_NAME, .type = DEVLINK_PARAM_GENERIC_RESET_DEV_ON_DRV_PROBE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ROCE, .name = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ROCE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_REMOTE_DEV_RESET, .name = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_REMOTE_DEV_RESET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH, .name = DEVLINK_PARAM_GENERIC_ENABLE_ETH_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_ETH_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA, .name = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_RDMA_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET, .name = DEVLINK_PARAM_GENERIC_ENABLE_VNET_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_VNET_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_ENABLE_IWARP, .name = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_NAME, .type = DEVLINK_PARAM_GENERIC_ENABLE_IWARP_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_IO_EQ_SIZE, .name = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_IO_EQ_SIZE_TYPE, }, { .id = DEVLINK_PARAM_GENERIC_ID_EVENT_EQ_SIZE, .name = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_NAME, .type = DEVLINK_PARAM_GENERIC_EVENT_EQ_SIZE_TYPE, }, }; static int devlink_param_generic_verify(const struct devlink_param *param) { /* verify it match generic parameter by id and name */ if (param->id > DEVLINK_PARAM_GENERIC_ID_MAX) return -EINVAL; if (strcmp(param->name, devlink_param_generic[param->id].name)) return -ENOENT; WARN_ON(param->type != devlink_param_generic[param->id].type); return 0; } static int devlink_param_driver_verify(const struct devlink_param *param) { int i; if (param->id <= DEVLINK_PARAM_GENERIC_ID_MAX) return -EINVAL; /* verify no such name in generic params */ for (i = 0; i <= DEVLINK_PARAM_GENERIC_ID_MAX; i++) if (!strcmp(param->name, devlink_param_generic[i].name)) return -EEXIST; return 0; } static struct devlink_param_item * devlink_param_find_by_name(struct xarray *params, const char *param_name) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(params, param_id, param_item) { if (!strcmp(param_item->param->name, param_name)) return param_item; } return NULL; } static struct devlink_param_item * devlink_param_find_by_id(struct xarray *params, u32 param_id) { return xa_load(params, param_id); } static bool devlink_param_cmode_is_supported(const struct devlink_param *param, enum devlink_param_cmode cmode) { return test_bit(cmode, ¶m->supported_cmodes); } static int devlink_param_get(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { if (!param->get) return -EOPNOTSUPP; return param->get(devlink, param->id, ctx); } static int devlink_param_set(struct devlink *devlink, const struct devlink_param *param, struct devlink_param_gset_ctx *ctx) { if (!param->set) return -EOPNOTSUPP; return param->set(devlink, param->id, ctx); } static int devlink_param_type_to_nla_type(enum devlink_param_type param_type) { switch (param_type) { case DEVLINK_PARAM_TYPE_U8: return NLA_U8; case DEVLINK_PARAM_TYPE_U16: return NLA_U16; case DEVLINK_PARAM_TYPE_U32: return NLA_U32; case DEVLINK_PARAM_TYPE_STRING: return NLA_STRING; case DEVLINK_PARAM_TYPE_BOOL: return NLA_FLAG; default: return -EINVAL; } } static int devlink_nl_param_value_fill_one(struct sk_buff *msg, enum devlink_param_type type, enum devlink_param_cmode cmode, union devlink_param_value val) { struct nlattr *param_value_attr; param_value_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM_VALUE); if (!param_value_attr) goto nla_put_failure; if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_CMODE, cmode)) goto value_nest_cancel; switch (type) { case DEVLINK_PARAM_TYPE_U8: if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu8)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U16: if (nla_put_u16(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu16)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_U32: if (nla_put_u32(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vu32)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_STRING: if (nla_put_string(msg, DEVLINK_ATTR_PARAM_VALUE_DATA, val.vstr)) goto value_nest_cancel; break; case DEVLINK_PARAM_TYPE_BOOL: if (val.vbool && nla_put_flag(msg, DEVLINK_ATTR_PARAM_VALUE_DATA)) goto value_nest_cancel; break; } nla_nest_end(msg, param_value_attr); return 0; value_nest_cancel: nla_nest_cancel(msg, param_value_attr); nla_put_failure: return -EMSGSIZE; } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) { union devlink_param_value param_value[DEVLINK_PARAM_CMODE_MAX + 1]; bool param_value_set[DEVLINK_PARAM_CMODE_MAX + 1] = {}; const struct devlink_param *param = param_item->param; struct devlink_param_gset_ctx ctx; struct nlattr *param_values_list; struct nlattr *param_attr; int nla_type; void *hdr; int err; int i; /* Get value from driver part to driverinit configuration mode */ for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { if (!devlink_param_cmode_is_supported(param, i)) continue; if (i == DEVLINK_PARAM_CMODE_DRIVERINIT) { if (param_item->driverinit_value_new_valid) param_value[i] = param_item->driverinit_value_new; else if (param_item->driverinit_value_valid) param_value[i] = param_item->driverinit_value; else return -EOPNOTSUPP; } else { ctx.cmode = i; err = devlink_param_get(devlink, param, &ctx); if (err) return err; param_value[i] = ctx.val; } param_value_set[i] = true; } hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) return -EMSGSIZE; if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; if (cmd == DEVLINK_CMD_PORT_PARAM_GET || cmd == DEVLINK_CMD_PORT_PARAM_NEW || cmd == DEVLINK_CMD_PORT_PARAM_DEL) if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) goto genlmsg_cancel; param_attr = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM); if (!param_attr) goto genlmsg_cancel; if (nla_put_string(msg, DEVLINK_ATTR_PARAM_NAME, param->name)) goto param_nest_cancel; if (param->generic && nla_put_flag(msg, DEVLINK_ATTR_PARAM_GENERIC)) goto param_nest_cancel; nla_type = devlink_param_type_to_nla_type(param->type); if (nla_type < 0) goto param_nest_cancel; if (nla_put_u8(msg, DEVLINK_ATTR_PARAM_TYPE, nla_type)) goto param_nest_cancel; param_values_list = nla_nest_start_noflag(msg, DEVLINK_ATTR_PARAM_VALUES_LIST); if (!param_values_list) goto param_nest_cancel; for (i = 0; i <= DEVLINK_PARAM_CMODE_MAX; i++) { if (!param_value_set[i]) continue; err = devlink_nl_param_value_fill_one(msg, param->type, i, param_value[i]); if (err) goto values_list_nest_cancel; } nla_nest_end(msg, param_values_list); nla_nest_end(msg, param_attr); genlmsg_end(msg, hdr); return 0; values_list_nest_cancel: nla_nest_end(msg, param_values_list); param_nest_cancel: nla_nest_cancel(msg, param_attr); genlmsg_cancel: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } static void devlink_param_notify(struct devlink *devlink, unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd) { struct sk_buff *msg; int err; WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && cmd != DEVLINK_CMD_PORT_PARAM_NEW && cmd != DEVLINK_CMD_PORT_PARAM_DEL); /* devlink_notify_register() / devlink_notify_unregister() * will replay the notifications if the params are added/removed * outside of the lifetime of the instance. */ if (!devl_is_registered(devlink)) return; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; } genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } static void devlink_params_notify(struct devlink *devlink, enum devlink_command cmd) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(&devlink->params, param_id, param_item) devlink_param_notify(devlink, 0, param_item, cmd); } void devlink_params_notify_register(struct devlink *devlink) { devlink_params_notify(devlink, DEVLINK_CMD_PARAM_NEW); } void devlink_params_notify_unregister(struct devlink *devlink) { devlink_params_notify(devlink, DEVLINK_CMD_PARAM_DEL); } static int devlink_nl_param_get_dump_one(struct sk_buff *msg, struct devlink *devlink, struct netlink_callback *cb, int flags) { struct devlink_nl_dump_state *state = devlink_dump_state(cb); struct devlink_param_item *param_item; unsigned long param_id; int err = 0; xa_for_each_start(&devlink->params, param_id, param_item, state->idx) { err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, flags); if (err == -EOPNOTSUPP) { err = 0; } else if (err) { state->idx = param_id; break; } } return err; } int devlink_nl_param_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { return devlink_nl_dumpit(skb, cb, devlink_nl_param_get_dump_one); } static int devlink_param_type_get_from_info(struct genl_info *info, enum devlink_param_type *param_type) { if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_TYPE)) return -EINVAL; switch (nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_TYPE])) { case NLA_U8: *param_type = DEVLINK_PARAM_TYPE_U8; break; case NLA_U16: *param_type = DEVLINK_PARAM_TYPE_U16; break; case NLA_U32: *param_type = DEVLINK_PARAM_TYPE_U32; break; case NLA_STRING: *param_type = DEVLINK_PARAM_TYPE_STRING; break; case NLA_FLAG: *param_type = DEVLINK_PARAM_TYPE_BOOL; break; default: return -EINVAL; } return 0; } static int devlink_param_value_get_from_info(const struct devlink_param *param, struct genl_info *info, union devlink_param_value *value) { struct nlattr *param_data; int len; param_data = info->attrs[DEVLINK_ATTR_PARAM_VALUE_DATA]; if (param->type != DEVLINK_PARAM_TYPE_BOOL && !param_data) return -EINVAL; switch (param->type) { case DEVLINK_PARAM_TYPE_U8: if (nla_len(param_data) != sizeof(u8)) return -EINVAL; value->vu8 = nla_get_u8(param_data); break; case DEVLINK_PARAM_TYPE_U16: if (nla_len(param_data) != sizeof(u16)) return -EINVAL; value->vu16 = nla_get_u16(param_data); break; case DEVLINK_PARAM_TYPE_U32: if (nla_len(param_data) != sizeof(u32)) return -EINVAL; value->vu32 = nla_get_u32(param_data); break; case DEVLINK_PARAM_TYPE_STRING: len = strnlen(nla_data(param_data), nla_len(param_data)); if (len == nla_len(param_data) || len >= __DEVLINK_PARAM_MAX_STRING_VALUE) return -EINVAL; strcpy(value->vstr, nla_data(param_data)); break; case DEVLINK_PARAM_TYPE_BOOL: if (param_data && nla_len(param_data)) return -EINVAL; value->vbool = nla_get_flag(param_data); break; } return 0; } static struct devlink_param_item * devlink_param_get_from_info(struct xarray *params, struct genl_info *info) { char *param_name; if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_NAME)) return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); return devlink_param_find_by_name(params, param_name); } int devlink_nl_param_get_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; param_item = devlink_param_get_from_info(&devlink->params, info); if (!param_item) return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, info->snd_portid, info->snd_seq, 0); if (err) { nlmsg_free(msg); return err; } return genlmsg_reply(msg, info); } static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, unsigned int port_index, struct xarray *params, struct genl_info *info, enum devlink_command cmd) { enum devlink_param_type param_type; struct devlink_param_gset_ctx ctx; enum devlink_param_cmode cmode; struct devlink_param_item *param_item; const struct devlink_param *param; union devlink_param_value value; int err = 0; param_item = devlink_param_get_from_info(params, info); if (!param_item) return -EINVAL; param = param_item->param; err = devlink_param_type_get_from_info(info, ¶m_type); if (err) return err; if (param_type != param->type) return -EINVAL; err = devlink_param_value_get_from_info(param, info, &value); if (err) return err; if (param->validate) { err = param->validate(devlink, param->id, value, info->extack); if (err) return err; } if (GENL_REQ_ATTR_CHECK(info, DEVLINK_ATTR_PARAM_VALUE_CMODE)) return -EINVAL; cmode = nla_get_u8(info->attrs[DEVLINK_ATTR_PARAM_VALUE_CMODE]); if (!devlink_param_cmode_is_supported(param, cmode)) return -EOPNOTSUPP; if (cmode == DEVLINK_PARAM_CMODE_DRIVERINIT) { param_item->driverinit_value_new = value; param_item->driverinit_value_new_valid = true; } else { if (!param->set) return -EOPNOTSUPP; ctx.val = value; ctx.cmode = cmode; err = devlink_param_set(devlink, param, &ctx); if (err) return err; } devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } int devlink_nl_param_set_doit(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->params, info, DEVLINK_CMD_PARAM_NEW); } int devlink_nl_port_param_get_dumpit(struct sk_buff *msg, struct netlink_callback *cb) { NL_SET_ERR_MSG(cb->extack, "Port params are not supported"); return msg->len; } int devlink_nl_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } int devlink_nl_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { NL_SET_ERR_MSG(info->extack, "Port params are not supported"); return -EINVAL; } static int devlink_param_verify(const struct devlink_param *param) { if (!param || !param->name || !param->supported_cmodes) return -EINVAL; if (param->generic) return devlink_param_generic_verify(param); else return devlink_param_driver_verify(param); } static int devlink_param_register(struct devlink *devlink, const struct devlink_param *param) { struct devlink_param_item *param_item; int err; WARN_ON(devlink_param_verify(param)); WARN_ON(devlink_param_find_by_name(&devlink->params, param->name)); if (param->supported_cmodes == BIT(DEVLINK_PARAM_CMODE_DRIVERINIT)) WARN_ON(param->get || param->set); else WARN_ON(!param->get || !param->set); param_item = kzalloc(sizeof(*param_item), GFP_KERNEL); if (!param_item) return -ENOMEM; param_item->param = param; err = xa_insert(&devlink->params, param->id, param_item, GFP_KERNEL); if (err) goto err_xa_insert; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); return 0; err_xa_insert: kfree(param_item); return err; } static void devlink_param_unregister(struct devlink *devlink, const struct devlink_param *param) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_id(&devlink->params, param->id); if (WARN_ON(!param_item)) return; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_DEL); xa_erase(&devlink->params, param->id); kfree(param_item); } /** * devl_params_register - register configuration parameters * * @devlink: devlink * @params: configuration parameters array * @params_count: number of parameters provided * * Register the configuration parameters supported by the driver. */ int devl_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { const struct devlink_param *param = params; int i, err; lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) { err = devlink_param_register(devlink, param); if (err) goto rollback; } return 0; rollback: if (!i) return err; for (param--; i > 0; i--, param--) devlink_param_unregister(devlink, param); return err; } EXPORT_SYMBOL_GPL(devl_params_register); int devlink_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { int err; devl_lock(devlink); err = devl_params_register(devlink, params, params_count); devl_unlock(devlink); return err; } EXPORT_SYMBOL_GPL(devlink_params_register); /** * devl_params_unregister - unregister configuration parameters * @devlink: devlink * @params: configuration parameters to unregister * @params_count: number of parameters provided */ void devl_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { const struct devlink_param *param = params; int i; lockdep_assert_held(&devlink->lock); for (i = 0; i < params_count; i++, param++) devlink_param_unregister(devlink, param); } EXPORT_SYMBOL_GPL(devl_params_unregister); void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { devl_lock(devlink); devl_params_unregister(devlink, params, params_count); devl_unlock(devlink); } EXPORT_SYMBOL_GPL(devlink_params_unregister); /** * devl_param_driverinit_value_get - get configuration parameter * value for driver initializing * * @devlink: devlink * @param_id: parameter ID * @val: pointer to store the value of parameter in driverinit * configuration mode * * This function should be used by the driver to get driverinit * configuration for initialization after reload command. * * Note that lockless call of this function relies on the * driver to maintain following basic sane behavior: * 1) Driver ensures a call to this function cannot race with * registering/unregistering the parameter with the same parameter ID. * 2) Driver ensures a call to this function cannot race with * devl_param_driverinit_value_set() call with the same parameter ID. * 3) Driver ensures a call to this function cannot race with * reload operation. * If the driver is not able to comply, it has to take the devlink->lock * while calling this. */ int devl_param_driverinit_value_get(struct devlink *devlink, u32 param_id, union devlink_param_value *val) { struct devlink_param_item *param_item; if (WARN_ON(!devlink_reload_supported(devlink->ops))) return -EOPNOTSUPP; param_item = devlink_param_find_by_id(&devlink->params, param_id); if (!param_item) return -EINVAL; if (!param_item->driverinit_value_valid) return -EOPNOTSUPP; if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT))) return -EOPNOTSUPP; *val = param_item->driverinit_value; return 0; } EXPORT_SYMBOL_GPL(devl_param_driverinit_value_get); /** * devl_param_driverinit_value_set - set value of configuration * parameter for driverinit * configuration mode * * @devlink: devlink * @param_id: parameter ID * @init_val: value of parameter to set for driverinit configuration mode * * This function should be used by the driver to set driverinit * configuration mode default value. */ void devl_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val) { struct devlink_param_item *param_item; devl_assert_locked(devlink); param_item = devlink_param_find_by_id(&devlink->params, param_id); if (WARN_ON(!param_item)) return; if (WARN_ON(!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT))) return; param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devl_param_driverinit_value_set); void devlink_params_driverinit_load_new(struct devlink *devlink) { struct devlink_param_item *param_item; unsigned long param_id; xa_for_each(&devlink->params, param_id, param_item) { if (!devlink_param_cmode_is_supported(param_item->param, DEVLINK_PARAM_CMODE_DRIVERINIT) || !param_item->driverinit_value_new_valid) continue; param_item->driverinit_value = param_item->driverinit_value_new; param_item->driverinit_value_valid = true; param_item->driverinit_value_new_valid = false; } } /** * devl_param_value_changed - notify devlink on a parameter's value * change. Should be called by the driver * right after the change. * * @devlink: devlink * @param_id: parameter ID * * This function should be used by the driver to notify devlink on value * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function */ void devl_param_value_changed(struct devlink *devlink, u32 param_id) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_id(&devlink->params, param_id); WARN_ON(!param_item); devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devl_param_value_changed); |
2 2 3 1 2 1 1 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> static struct ax25_protocol *protocol_list; static DEFINE_RWLOCK(protocol_list_lock); static HLIST_HEAD(ax25_linkfail_list); static DEFINE_SPINLOCK(linkfail_lock); static struct listen_struct { struct listen_struct *next; ax25_address callsign; struct net_device *dev; } *listen_list = NULL; static DEFINE_SPINLOCK(listen_lock); /* * Do not register the internal protocols AX25_P_TEXT, AX25_P_SEGMENT, * AX25_P_IP or AX25_P_ARP ... */ void ax25_register_pid(struct ax25_protocol *ap) { write_lock_bh(&protocol_list_lock); ap->next = protocol_list; protocol_list = ap; write_unlock_bh(&protocol_list_lock); } EXPORT_SYMBOL_GPL(ax25_register_pid); void ax25_protocol_release(unsigned int pid) { struct ax25_protocol *protocol; write_lock_bh(&protocol_list_lock); protocol = protocol_list; if (protocol == NULL) goto out; if (protocol->pid == pid) { protocol_list = protocol->next; goto out; } while (protocol != NULL && protocol->next != NULL) { if (protocol->next->pid == pid) { protocol->next = protocol->next->next; goto out; } protocol = protocol->next; } out: write_unlock_bh(&protocol_list_lock); } EXPORT_SYMBOL(ax25_protocol_release); void ax25_linkfail_register(struct ax25_linkfail *lf) { spin_lock_bh(&linkfail_lock); hlist_add_head(&lf->lf_node, &ax25_linkfail_list); spin_unlock_bh(&linkfail_lock); } EXPORT_SYMBOL(ax25_linkfail_register); void ax25_linkfail_release(struct ax25_linkfail *lf) { spin_lock_bh(&linkfail_lock); hlist_del_init(&lf->lf_node); spin_unlock_bh(&linkfail_lock); } EXPORT_SYMBOL(ax25_linkfail_release); int ax25_listen_register(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; if (ax25_listen_mine(callsign, dev)) return 0; if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) return -ENOMEM; listen->callsign = *callsign; listen->dev = dev; spin_lock_bh(&listen_lock); listen->next = listen_list; listen_list = listen; spin_unlock_bh(&listen_lock); return 0; } EXPORT_SYMBOL(ax25_listen_register); void ax25_listen_release(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *s, *listen; spin_lock_bh(&listen_lock); listen = listen_list; if (listen == NULL) { spin_unlock_bh(&listen_lock); return; } if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) { listen_list = listen->next; spin_unlock_bh(&listen_lock); kfree(listen); return; } while (listen != NULL && listen->next != NULL) { if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) { s = listen->next; listen->next = listen->next->next; spin_unlock_bh(&listen_lock); kfree(s); return; } listen = listen->next; } spin_unlock_bh(&listen_lock); } EXPORT_SYMBOL(ax25_listen_release); int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) { int (*res)(struct sk_buff *, ax25_cb *) = NULL; struct ax25_protocol *protocol; read_lock(&protocol_list_lock); for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) if (protocol->pid == pid) { res = protocol->func; break; } read_unlock(&protocol_list_lock); return res; } int ax25_listen_mine(const ax25_address *callsign, struct net_device *dev) { struct listen_struct *listen; spin_lock_bh(&listen_lock); for (listen = listen_list; listen != NULL; listen = listen->next) if (ax25cmp(&listen->callsign, callsign) == 0 && (listen->dev == dev || listen->dev == NULL)) { spin_unlock_bh(&listen_lock); return 1; } spin_unlock_bh(&listen_lock); return 0; } void ax25_link_failed(ax25_cb *ax25, int reason) { struct ax25_linkfail *lf; spin_lock_bh(&linkfail_lock); hlist_for_each_entry(lf, &ax25_linkfail_list, lf_node) lf->func(ax25, reason); spin_unlock_bh(&linkfail_lock); } int ax25_protocol_is_registered(unsigned int pid) { struct ax25_protocol *protocol; int res = 0; read_lock_bh(&protocol_list_lock); for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) if (protocol->pid == pid) { res = 1; break; } read_unlock_bh(&protocol_list_lock); return res; } |
19 10 1959 93 116 16 70 355 39 30 34 34 25 34 1278 736 296 99 20 106 152 2 3845 356 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2001 Jens Axboe <axboe@suse.de> */ #ifndef __LINUX_BIO_H #define __LINUX_BIO_H #include <linux/mempool.h> /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ #include <linux/blk_types.h> #include <linux/uio.h> #define BIO_MAX_VECS 256U struct queue_limits; static inline unsigned int bio_max_segs(unsigned int nr_segs) { return min(nr_segs, BIO_MAX_VECS); } #define bio_prio(bio) (bio)->bi_ioprio #define bio_set_prio(bio, prio) ((bio)->bi_ioprio = prio) #define bio_iter_iovec(bio, iter) \ bvec_iter_bvec((bio)->bi_io_vec, (iter)) #define bio_iter_page(bio, iter) \ bvec_iter_page((bio)->bi_io_vec, (iter)) #define bio_iter_len(bio, iter) \ bvec_iter_len((bio)->bi_io_vec, (iter)) #define bio_iter_offset(bio, iter) \ bvec_iter_offset((bio)->bi_io_vec, (iter)) #define bio_page(bio) bio_iter_page((bio), (bio)->bi_iter) #define bio_offset(bio) bio_iter_offset((bio), (bio)->bi_iter) #define bio_iovec(bio) bio_iter_iovec((bio), (bio)->bi_iter) #define bvec_iter_sectors(iter) ((iter).bi_size >> 9) #define bvec_iter_end_sector(iter) ((iter).bi_sector + bvec_iter_sectors((iter))) #define bio_sectors(bio) bvec_iter_sectors((bio)->bi_iter) #define bio_end_sector(bio) bvec_iter_end_sector((bio)->bi_iter) /* * Return the data direction, READ or WRITE. */ #define bio_data_dir(bio) \ (op_is_write(bio_op(bio)) ? WRITE : READ) /* * Check whether this bio carries any data or not. A NULL bio is allowed. */ static inline bool bio_has_data(struct bio *bio) { if (bio && bio->bi_iter.bi_size && bio_op(bio) != REQ_OP_DISCARD && bio_op(bio) != REQ_OP_SECURE_ERASE && bio_op(bio) != REQ_OP_WRITE_ZEROES) return true; return false; } static inline bool bio_no_advance_iter(const struct bio *bio) { return bio_op(bio) == REQ_OP_DISCARD || bio_op(bio) == REQ_OP_SECURE_ERASE || bio_op(bio) == REQ_OP_WRITE_ZEROES; } static inline void *bio_data(struct bio *bio) { if (bio_has_data(bio)) return page_address(bio_page(bio)) + bio_offset(bio); return NULL; } static inline bool bio_next_segment(const struct bio *bio, struct bvec_iter_all *iter) { if (iter->idx >= bio->bi_vcnt) return false; bvec_advance(&bio->bi_io_vec[iter->idx], iter); return true; } /* * drivers should _never_ use the all version - the bio may have been split * before it got to the driver and the driver won't own all of it */ #define bio_for_each_segment_all(bvl, bio, iter) \ for (bvl = bvec_init_iter_all(&iter); bio_next_segment((bio), &iter); ) static inline void bio_advance_iter(const struct bio *bio, struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; if (bio_no_advance_iter(bio)) iter->bi_size -= bytes; else bvec_iter_advance(bio->bi_io_vec, iter, bytes); /* TODO: It is reasonable to complete bio with error here. */ } /* @bytes should be less or equal to bvec[i->bi_idx].bv_len */ static inline void bio_advance_iter_single(const struct bio *bio, struct bvec_iter *iter, unsigned int bytes) { iter->bi_sector += bytes >> 9; if (bio_no_advance_iter(bio)) iter->bi_size -= bytes; else bvec_iter_advance_single(bio->bi_io_vec, iter, bytes); } void __bio_advance(struct bio *, unsigned bytes); /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance * @nbytes: number of bytes to complete * * This updates bi_sector, bi_size and bi_idx; if the number of bytes to * complete doesn't align with a bvec boundary, then bv_len and bv_offset will * be updated on the last bvec as well. * * @bio will then represent the remaining, uncompleted portion of the io. */ static inline void bio_advance(struct bio *bio, unsigned int nbytes) { if (nbytes == bio->bi_iter.bi_size) { bio->bi_iter.bi_size = 0; return; } __bio_advance(bio, nbytes); } #define __bio_for_each_segment(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bio_iter_iovec((bio), (iter))), 1); \ bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) #define bio_for_each_segment(bvl, bio, iter) \ __bio_for_each_segment(bvl, bio, iter, (bio)->bi_iter) #define __bio_for_each_bvec(bvl, bio, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \ bio_advance_iter_single((bio), &(iter), (bvl).bv_len)) /* iterate over multi-page bvec */ #define bio_for_each_bvec(bvl, bio, iter) \ __bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter) /* * Iterate over all multi-page bvecs. Drivers shouldn't use this version for the * same reasons as bio_for_each_segment_all(). */ #define bio_for_each_bvec_all(bvl, bio, i) \ for (i = 0, bvl = bio_first_bvec_all(bio); \ i < (bio)->bi_vcnt; i++, bvl++) #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; struct bvec_iter iter; /* * We special case discard/write same/write zeroes, because they * interpret bi_size differently: */ switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: return 0; default: break; } bio_for_each_segment(bv, bio, iter) segs++; return segs; } /* * get a reference to a bio, so it won't disappear. the intended use is * something like: * * bio_get(bio); * submit_bio(rw, bio); * if (bio->bi_flags ...) * do_something * bio_put(bio); * * without the bio_get(), it could potentially complete I/O before submit_bio * returns. and then bio would be freed memory when if (bio->bi_flags ...) * runs */ static inline void bio_get(struct bio *bio) { bio->bi_flags |= (1 << BIO_REFFED); smp_mb__before_atomic(); atomic_inc(&bio->__bi_cnt); } static inline void bio_cnt_set(struct bio *bio, unsigned int count) { if (count != 1) { bio->bi_flags |= (1 << BIO_REFFED); smp_mb(); } atomic_set(&bio->__bi_cnt, count); } static inline bool bio_flagged(struct bio *bio, unsigned int bit) { return bio->bi_flags & (1U << bit); } static inline void bio_set_flag(struct bio *bio, unsigned int bit) { bio->bi_flags |= (1U << bit); } static inline void bio_clear_flag(struct bio *bio, unsigned int bit) { bio->bi_flags &= ~(1U << bit); } static inline struct bio_vec *bio_first_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); return bio->bi_io_vec; } static inline struct page *bio_first_page_all(struct bio *bio) { return bio_first_bvec_all(bio)->bv_page; } static inline struct folio *bio_first_folio_all(struct bio *bio) { return page_folio(bio_first_page_all(bio)); } static inline struct bio_vec *bio_last_bvec_all(struct bio *bio) { WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); return &bio->bi_io_vec[bio->bi_vcnt - 1]; } /** * struct folio_iter - State for iterating all folios in a bio. * @folio: The current folio we're iterating. NULL after the last folio. * @offset: The byte offset within the current folio. * @length: The number of bytes in this iteration (will not cross folio * boundary). */ struct folio_iter { struct folio *folio; size_t offset; size_t length; /* private: for use by the iterator */ struct folio *_next; size_t _seg_count; int _i; }; static inline void bio_first_folio(struct folio_iter *fi, struct bio *bio, int i) { struct bio_vec *bvec = bio_first_bvec_all(bio) + i; fi->folio = page_folio(bvec->bv_page); fi->offset = bvec->bv_offset + PAGE_SIZE * (bvec->bv_page - &fi->folio->page); fi->_seg_count = bvec->bv_len; fi->length = min(folio_size(fi->folio) - fi->offset, fi->_seg_count); fi->_next = folio_next(fi->folio); fi->_i = i; } static inline void bio_next_folio(struct folio_iter *fi, struct bio *bio) { fi->_seg_count -= fi->length; if (fi->_seg_count) { fi->folio = fi->_next; fi->offset = 0; fi->length = min(folio_size(fi->folio), fi->_seg_count); fi->_next = folio_next(fi->folio); } else if (fi->_i + 1 < bio->bi_vcnt) { bio_first_folio(fi, bio, fi->_i + 1); } else { fi->folio = NULL; } } /** * bio_for_each_folio_all - Iterate over each folio in a bio. * @fi: struct folio_iter which is updated for each folio. * @bio: struct bio to iterate over. */ #define bio_for_each_folio_all(fi, bio) \ for (bio_first_folio(&fi, bio, 0); fi.folio; bio_next_folio(&fi, bio)) enum bip_flags { BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */ BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */ BIP_CTRL_NOCHECK = 1 << 2, /* disable HBA integrity checking */ BIP_DISK_NOCHECK = 1 << 3, /* disable disk integrity checking */ BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; /* * bio integrity payload */ struct bio_integrity_payload { struct bio *bip_bio; /* parent bio */ struct bvec_iter bip_iter; unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_max_vcnt; /* integrity bio_vec slots */ unsigned short bip_flags; /* control flags */ struct bvec_iter bio_iter; /* for rewinding parent bio */ struct work_struct bip_work; /* I/O completion */ struct bio_vec *bip_vec; struct bio_vec bip_inline_vecs[];/* embedded bvec array */ }; #if defined(CONFIG_BLK_DEV_INTEGRITY) static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) { if (bio->bi_opf & REQ_INTEGRITY) return bio->bi_integrity; return NULL; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); if (bip) return bip->bip_flags & flag; return false; } static inline sector_t bip_get_seed(struct bio_integrity_payload *bip) { return bip->bip_iter.bi_sector; } static inline void bip_set_seed(struct bio_integrity_payload *bip, sector_t seed) { bip->bip_iter.bi_sector = seed; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ void bio_trim(struct bio *bio, sector_t offset, sector_t size); extern struct bio *bio_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs); struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, unsigned *segs, struct bio_set *bs, unsigned max_bytes); /** * bio_next_split - get next @sectors from a bio, splitting if necessary * @bio: bio to split * @sectors: number of sectors to split from the front of @bio * @gfp: gfp mask * @bs: bio set to allocate from * * Return: a bio representing the next @sectors of @bio - if the bio is smaller * than @sectors, returns the original bio unchanged. */ static inline struct bio *bio_next_split(struct bio *bio, int sectors, gfp_t gfp, struct bio_set *bs) { if (sectors >= bio_sectors(bio)) return bio; return bio_split(bio, sectors, gfp, bs); } enum { BIOSET_NEED_BVECS = BIT(0), BIOSET_NEED_RESCUER = BIT(1), BIOSET_PERCPU_CACHE = BIT(2), }; extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags); extern void bioset_exit(struct bio_set *); extern int biovec_init_pool(mempool_t *pool, int pool_entries); struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs); struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask); extern void bio_put(struct bio *); struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, gfp_t gfp, struct bio_set *bs); int bio_init_clone(struct block_device *bdev, struct bio *bio, struct bio *bio_src, gfp_t gfp); extern struct bio_set fs_bio_set; static inline struct bio *bio_alloc(struct block_device *bdev, unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp_mask) { return bio_alloc_bioset(bdev, nr_vecs, opf, gfp_mask, &fs_bio_set); } void submit_bio(struct bio *bio); extern void bio_endio(struct bio *); static inline void bio_io_error(struct bio *bio) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); } static inline void bio_wouldblock_error(struct bio *bio) { bio_set_flag(bio, BIO_QUIET); bio->bi_status = BLK_STS_AGAIN; bio_endio(bio); } /* * Calculate number of bvec segments that should be allocated to fit data * pointed by @iter. If @iter is backed by bvec it's going to be reused * instead of allocating a new one. */ static inline int bio_iov_vecs_to_alloc(struct iov_iter *iter, int max_segs) { if (iov_iter_is_bvec(iter)) return 0; return iov_iter_npages(iter, max_segs); } struct request_queue; extern int submit_bio_wait(struct bio *bio); void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, unsigned short max_vecs, blk_opf_t opf); extern void bio_uninit(struct bio *); void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf); void bio_chain(struct bio *, struct bio *); int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len, unsigned off); bool __must_check bio_add_folio(struct bio *bio, struct folio *folio, size_t len, size_t off); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off); void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, size_t off); int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter); void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter); void __bio_release_pages(struct bio *bio, bool mark_dirty); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter); extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); static inline void zero_fill_bio(struct bio *bio) { zero_fill_bio_iter(bio, bio->bi_iter); } static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { if (bio_flagged(bio, BIO_PAGE_PINNED)) __bio_release_pages(bio, mark_dirty); } #define bio_dev(bio) \ disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css); void bio_clone_blkg_association(struct bio *dst, struct bio *src); void blkcg_punt_bio_submit(struct bio *bio); #else /* CONFIG_BLK_CGROUP */ static inline void bio_associate_blkg(struct bio *bio) { } static inline void bio_associate_blkg_from_css(struct bio *bio, struct cgroup_subsys_state *css) { } static inline void bio_clone_blkg_association(struct bio *dst, struct bio *src) { } static inline void blkcg_punt_bio_submit(struct bio *bio) { submit_bio(bio); } #endif /* CONFIG_BLK_CGROUP */ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) bio_clear_flag(bio, BIO_BPS_THROTTLED); bio->bi_bdev = bdev; bio_associate_blkg(bio); } /* * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * * A bio_list anchors a singly-linked list of bios chained through the bi_next * member of the bio. The bio_list also caches the last list member to allow * fast access to the tail. */ struct bio_list { struct bio *head; struct bio *tail; }; static inline int bio_list_empty(const struct bio_list *bl) { return bl->head == NULL; } static inline void bio_list_init(struct bio_list *bl) { bl->head = bl->tail = NULL; } #define BIO_EMPTY_LIST { NULL, NULL } #define bio_list_for_each(bio, bl) \ for (bio = (bl)->head; bio; bio = bio->bi_next) static inline unsigned bio_list_size(const struct bio_list *bl) { unsigned sz = 0; struct bio *bio; bio_list_for_each(bio, bl) sz++; return sz; } static inline void bio_list_add(struct bio_list *bl, struct bio *bio) { bio->bi_next = NULL; if (bl->tail) bl->tail->bi_next = bio; else bl->head = bio; bl->tail = bio; } static inline void bio_list_add_head(struct bio_list *bl, struct bio *bio) { bio->bi_next = bl->head; bl->head = bio; if (!bl->tail) bl->tail = bio; } static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) return; if (bl->tail) bl->tail->bi_next = bl2->head; else bl->head = bl2->head; bl->tail = bl2->tail; } static inline void bio_list_merge_head(struct bio_list *bl, struct bio_list *bl2) { if (!bl2->head) return; if (bl->head) bl2->tail->bi_next = bl->head; else bl->tail = bl2->tail; bl->head = bl2->head; } static inline struct bio *bio_list_peek(struct bio_list *bl) { return bl->head; } static inline struct bio *bio_list_pop(struct bio_list *bl) { struct bio *bio = bl->head; if (bio) { bl->head = bl->head->bi_next; if (!bl->head) bl->tail = NULL; bio->bi_next = NULL; } return bio; } static inline struct bio *bio_list_get(struct bio_list *bl) { struct bio *bio = bl->head; bl->head = bl->tail = NULL; return bio; } /* * Increment chain count for the bio. Make sure the CHAIN flag update * is visible before the raised count. */ static inline void bio_inc_remaining(struct bio *bio) { bio_set_flag(bio, BIO_CHAIN); smp_mb__before_atomic(); atomic_inc(&bio->__bi_remaining); } /* * bio_set is used to allow other portions of the IO system to * allocate their own private memory pools for bio and iovec structures. * These memory pools in turn all allocate from the bio_slab * and the bvec_slabs[]. */ #define BIO_POOL_SIZE 2 struct bio_set { struct kmem_cache *bio_slab; unsigned int front_pad; /* * per-cpu bio alloc cache */ struct bio_alloc_cache __percpu *cache; mempool_t bio_pool; mempool_t bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t bio_integrity_pool; mempool_t bvec_integrity_pool; #endif unsigned int back_pad; /* * Deadlock avoidance for stacking block drivers: see comments in * bio_alloc_bioset() for details */ spinlock_t rescue_lock; struct bio_list rescue_list; struct work_struct rescue_work; struct workqueue_struct *rescue_workqueue; /* * Hot un-plug notifier for the per-cpu cache, if used */ struct hlist_node cpuhp_dead; }; static inline bool bioset_initialized(struct bio_set *bs) { return bs->bio_slab != NULL; } #if defined(CONFIG_BLK_DEV_INTEGRITY) #define bip_for_each_vec(bvl, bip, iter) \ for_each_bvec(bvl, (bip)->bip_vec, iter, (bip)->bip_iter) #define bio_for_each_integrity_vec(_bvl, _bio, _iter) \ for_each_bio(_bio) \ bip_for_each_vec(_bvl, _bio->bi_integrity, _iter) extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int); extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int); extern bool bio_integrity_prep(struct bio *); extern void bio_integrity_advance(struct bio *, unsigned int); extern void bio_integrity_trim(struct bio *); extern int bio_integrity_clone(struct bio *, struct bio *, gfp_t); extern int bioset_integrity_create(struct bio_set *, int); extern void bioset_integrity_free(struct bio_set *); extern void bio_integrity_init(void); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void *bio_integrity(struct bio *bio) { return NULL; } static inline int bioset_integrity_create(struct bio_set *bs, int pool_size) { return 0; } static inline void bioset_integrity_free (struct bio_set *bs) { return; } static inline bool bio_integrity_prep(struct bio *bio) { return true; } static inline int bio_integrity_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp_mask) { return 0; } static inline void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { return; } static inline void bio_integrity_trim(struct bio *bio) { return; } static inline void bio_integrity_init(void) { return; } static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { return false; } static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, unsigned int nr) { return ERR_PTR(-EINVAL); } static inline int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { return 0; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ /* * Mark a bio as polled. Note that for async polled IO, the caller must * expect -EWOULDBLOCK if we cannot allocate a request (or other resources). * We cannot block waiting for requests on polled IO, as those completions * must be found by the caller. This is different than IRQ driven IO, where * it's safe to wait for IO to complete. */ static inline void bio_set_polled(struct bio *bio, struct kiocb *kiocb) { bio->bi_opf |= REQ_POLLED; if (kiocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; } static inline void bio_clear_polled(struct bio *bio) { bio->bi_opf &= ~REQ_POLLED; } struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); #endif /* __LINUX_BIO_H */ |
26 27 3 2 26 26 26 26 26 2 1 3 21 1 1 1 26 3 28 1 23 2 25 4 28 6 6 6 1 1 39 39 1 1 40 40 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 | // SPDX-License-Identifier: GPL-2.0 /* * main.c - Multi purpose firmware loading support * * Copyright (c) 2003 Manuel Estrada Sainz * * Please see Documentation/driver-api/firmware/ for more information. * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/device.h> #include <linux/kernel_read_file.h> #include <linux/module.h> #include <linux/init.h> #include <linux/initrd.h> #include <linux/timer.h> #include <linux/vmalloc.h> #include <linux/interrupt.h> #include <linux/bitops.h> #include <linux/mutex.h> #include <linux/workqueue.h> #include <linux/highmem.h> #include <linux/firmware.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/file.h> #include <linux/list.h> #include <linux/fs.h> #include <linux/async.h> #include <linux/pm.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> #include <linux/reboot.h> #include <linux/security.h> #include <linux/zstd.h> #include <linux/xz.h> #include <generated/utsrelease.h> #include "../base.h" #include "firmware.h" #include "fallback.h" MODULE_AUTHOR("Manuel Estrada Sainz"); MODULE_DESCRIPTION("Multi purpose firmware loading support"); MODULE_LICENSE("GPL"); struct firmware_cache { /* firmware_buf instance will be added into the below list */ spinlock_t lock; struct list_head head; int state; #ifdef CONFIG_FW_CACHE /* * Names of firmware images which have been cached successfully * will be added into the below list so that device uncache * helper can trace which firmware images have been cached * before. */ spinlock_t name_lock; struct list_head fw_names; struct delayed_work work; struct notifier_block pm_notify; #endif }; struct fw_cache_entry { struct list_head list; const char *name; }; struct fw_name_devm { unsigned long magic; const char *name; }; static inline struct fw_priv *to_fw_priv(struct kref *ref) { return container_of(ref, struct fw_priv, ref); } #define FW_LOADER_NO_CACHE 0 #define FW_LOADER_START_CACHE 1 /* fw_lock could be moved to 'struct fw_sysfs' but since it is just * guarding for corner cases a global lock should be OK */ DEFINE_MUTEX(fw_lock); struct firmware_cache fw_cache; bool fw_load_abort_all; void fw_state_init(struct fw_priv *fw_priv) { struct fw_state *fw_st = &fw_priv->fw_st; init_completion(&fw_st->completion); fw_st->status = FW_STATUS_UNKNOWN; } static inline int fw_state_wait(struct fw_priv *fw_priv) { return __fw_state_wait_common(fw_priv, MAX_SCHEDULE_TIMEOUT); } static void fw_cache_piggyback_on_request(struct fw_priv *fw_priv); static struct fw_priv *__allocate_fw_priv(const char *fw_name, struct firmware_cache *fwc, void *dbuf, size_t size, size_t offset, u32 opt_flags) { struct fw_priv *fw_priv; /* For a partial read, the buffer must be preallocated. */ if ((opt_flags & FW_OPT_PARTIAL) && !dbuf) return NULL; /* Only partial reads are allowed to use an offset. */ if (offset != 0 && !(opt_flags & FW_OPT_PARTIAL)) return NULL; fw_priv = kzalloc(sizeof(*fw_priv), GFP_ATOMIC); if (!fw_priv) return NULL; fw_priv->fw_name = kstrdup_const(fw_name, GFP_ATOMIC); if (!fw_priv->fw_name) { kfree(fw_priv); return NULL; } kref_init(&fw_priv->ref); fw_priv->fwc = fwc; fw_priv->data = dbuf; fw_priv->allocated_size = size; fw_priv->offset = offset; fw_priv->opt_flags = opt_flags; fw_state_init(fw_priv); #ifdef CONFIG_FW_LOADER_USER_HELPER INIT_LIST_HEAD(&fw_priv->pending_list); #endif pr_debug("%s: fw-%s fw_priv=%p\n", __func__, fw_name, fw_priv); return fw_priv; } static struct fw_priv *__lookup_fw_priv(const char *fw_name) { struct fw_priv *tmp; struct firmware_cache *fwc = &fw_cache; list_for_each_entry(tmp, &fwc->head, list) if (!strcmp(tmp->fw_name, fw_name)) return tmp; return NULL; } /* Returns 1 for batching firmware requests with the same name */ int alloc_lookup_fw_priv(const char *fw_name, struct firmware_cache *fwc, struct fw_priv **fw_priv, void *dbuf, size_t size, size_t offset, u32 opt_flags) { struct fw_priv *tmp; spin_lock(&fwc->lock); /* * Do not merge requests that are marked to be non-cached or * are performing partial reads. */ if (!(opt_flags & (FW_OPT_NOCACHE | FW_OPT_PARTIAL))) { tmp = __lookup_fw_priv(fw_name); if (tmp) { kref_get(&tmp->ref); spin_unlock(&fwc->lock); *fw_priv = tmp; pr_debug("batched request - sharing the same struct fw_priv and lookup for multiple requests\n"); return 1; } } tmp = __allocate_fw_priv(fw_name, fwc, dbuf, size, offset, opt_flags); if (tmp) { INIT_LIST_HEAD(&tmp->list); if (!(opt_flags & FW_OPT_NOCACHE)) list_add(&tmp->list, &fwc->head); } spin_unlock(&fwc->lock); *fw_priv = tmp; return tmp ? 0 : -ENOMEM; } static void __free_fw_priv(struct kref *ref) __releases(&fwc->lock) { struct fw_priv *fw_priv = to_fw_priv(ref); struct firmware_cache *fwc = fw_priv->fwc; pr_debug("%s: fw-%s fw_priv=%p data=%p size=%u\n", __func__, fw_priv->fw_name, fw_priv, fw_priv->data, (unsigned int)fw_priv->size); list_del(&fw_priv->list); spin_unlock(&fwc->lock); if (fw_is_paged_buf(fw_priv)) fw_free_paged_buf(fw_priv); else if (!fw_priv->allocated_size) vfree(fw_priv->data); kfree_const(fw_priv->fw_name); kfree(fw_priv); } void free_fw_priv(struct fw_priv *fw_priv) { struct firmware_cache *fwc = fw_priv->fwc; spin_lock(&fwc->lock); if (!kref_put(&fw_priv->ref, __free_fw_priv)) spin_unlock(&fwc->lock); } #ifdef CONFIG_FW_LOADER_PAGED_BUF bool fw_is_paged_buf(struct fw_priv *fw_priv) { return fw_priv->is_paged_buf; } void fw_free_paged_buf(struct fw_priv *fw_priv) { int i; if (!fw_priv->pages) return; vunmap(fw_priv->data); for (i = 0; i < fw_priv->nr_pages; i++) __free_page(fw_priv->pages[i]); kvfree(fw_priv->pages); fw_priv->pages = NULL; fw_priv->page_array_size = 0; fw_priv->nr_pages = 0; fw_priv->data = NULL; fw_priv->size = 0; } int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { /* If the array of pages is too small, grow it */ if (fw_priv->page_array_size < pages_needed) { int new_array_size = max(pages_needed, fw_priv->page_array_size * 2); struct page **new_pages; new_pages = kvmalloc_array(new_array_size, sizeof(void *), GFP_KERNEL); if (!new_pages) return -ENOMEM; memcpy(new_pages, fw_priv->pages, fw_priv->page_array_size * sizeof(void *)); memset(&new_pages[fw_priv->page_array_size], 0, sizeof(void *) * (new_array_size - fw_priv->page_array_size)); kvfree(fw_priv->pages); fw_priv->pages = new_pages; fw_priv->page_array_size = new_array_size; } while (fw_priv->nr_pages < pages_needed) { fw_priv->pages[fw_priv->nr_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); if (!fw_priv->pages[fw_priv->nr_pages]) return -ENOMEM; fw_priv->nr_pages++; } return 0; } int fw_map_paged_buf(struct fw_priv *fw_priv) { /* one pages buffer should be mapped/unmapped only once */ if (!fw_priv->pages) return 0; vunmap(fw_priv->data); fw_priv->data = vmap(fw_priv->pages, fw_priv->nr_pages, 0, PAGE_KERNEL_RO); if (!fw_priv->data) return -ENOMEM; return 0; } #endif /* * ZSTD-compressed firmware support */ #ifdef CONFIG_FW_LOADER_COMPRESS_ZSTD static int fw_decompress_zstd(struct device *dev, struct fw_priv *fw_priv, size_t in_size, const void *in_buffer) { size_t len, out_size, workspace_size; void *workspace, *out_buf; zstd_dctx *ctx; int err; if (fw_priv->allocated_size) { out_size = fw_priv->allocated_size; out_buf = fw_priv->data; } else { zstd_frame_header params; if (zstd_get_frame_header(¶ms, in_buffer, in_size) || params.frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN) { dev_dbg(dev, "%s: invalid zstd header\n", __func__); return -EINVAL; } out_size = params.frameContentSize; out_buf = vzalloc(out_size); if (!out_buf) return -ENOMEM; } workspace_size = zstd_dctx_workspace_bound(); workspace = kvzalloc(workspace_size, GFP_KERNEL); if (!workspace) { err = -ENOMEM; goto error; } ctx = zstd_init_dctx(workspace, workspace_size); if (!ctx) { dev_dbg(dev, "%s: failed to initialize context\n", __func__); err = -EINVAL; goto error; } len = zstd_decompress_dctx(ctx, out_buf, out_size, in_buffer, in_size); if (zstd_is_error(len)) { dev_dbg(dev, "%s: failed to decompress: %d\n", __func__, zstd_get_error_code(len)); err = -EINVAL; goto error; } if (!fw_priv->allocated_size) fw_priv->data = out_buf; fw_priv->size = len; err = 0; error: kvfree(workspace); if (err && !fw_priv->allocated_size) vfree(out_buf); return err; } #endif /* CONFIG_FW_LOADER_COMPRESS_ZSTD */ /* * XZ-compressed firmware support */ #ifdef CONFIG_FW_LOADER_COMPRESS_XZ /* show an error and return the standard error code */ static int fw_decompress_xz_error(struct device *dev, enum xz_ret xz_ret) { if (xz_ret != XZ_STREAM_END) { dev_warn(dev, "xz decompression failed (xz_ret=%d)\n", xz_ret); return xz_ret == XZ_MEM_ERROR ? -ENOMEM : -EINVAL; } return 0; } /* single-shot decompression onto the pre-allocated buffer */ static int fw_decompress_xz_single(struct device *dev, struct fw_priv *fw_priv, size_t in_size, const void *in_buffer) { struct xz_dec *xz_dec; struct xz_buf xz_buf; enum xz_ret xz_ret; xz_dec = xz_dec_init(XZ_SINGLE, (u32)-1); if (!xz_dec) return -ENOMEM; xz_buf.in_size = in_size; xz_buf.in = in_buffer; xz_buf.in_pos = 0; xz_buf.out_size = fw_priv->allocated_size; xz_buf.out = fw_priv->data; xz_buf.out_pos = 0; xz_ret = xz_dec_run(xz_dec, &xz_buf); xz_dec_end(xz_dec); fw_priv->size = xz_buf.out_pos; return fw_decompress_xz_error(dev, xz_ret); } /* decompression on paged buffer and map it */ static int fw_decompress_xz_pages(struct device *dev, struct fw_priv *fw_priv, size_t in_size, const void *in_buffer) { struct xz_dec *xz_dec; struct xz_buf xz_buf; enum xz_ret xz_ret; struct page *page; int err = 0; xz_dec = xz_dec_init(XZ_DYNALLOC, (u32)-1); if (!xz_dec) return -ENOMEM; xz_buf.in_size = in_size; xz_buf.in = in_buffer; xz_buf.in_pos = 0; fw_priv->is_paged_buf = true; fw_priv->size = 0; do { if (fw_grow_paged_buf(fw_priv, fw_priv->nr_pages + 1)) { err = -ENOMEM; goto out; } /* decompress onto the new allocated page */ page = fw_priv->pages[fw_priv->nr_pages - 1]; xz_buf.out = kmap_local_page(page); xz_buf.out_pos = 0; xz_buf.out_size = PAGE_SIZE; xz_ret = xz_dec_run(xz_dec, &xz_buf); kunmap_local(xz_buf.out); fw_priv->size += xz_buf.out_pos; /* partial decompression means either end or error */ if (xz_buf.out_pos != PAGE_SIZE) break; } while (xz_ret == XZ_OK); err = fw_decompress_xz_error(dev, xz_ret); if (!err) err = fw_map_paged_buf(fw_priv); out: xz_dec_end(xz_dec); return err; } static int fw_decompress_xz(struct device *dev, struct fw_priv *fw_priv, size_t in_size, const void *in_buffer) { /* if the buffer is pre-allocated, we can perform in single-shot mode */ if (fw_priv->data) return fw_decompress_xz_single(dev, fw_priv, in_size, in_buffer); else return fw_decompress_xz_pages(dev, fw_priv, in_size, in_buffer); } #endif /* CONFIG_FW_LOADER_COMPRESS_XZ */ /* direct firmware loading support */ static char fw_path_para[256]; static const char * const fw_path[] = { fw_path_para, "/lib/firmware/updates/" UTS_RELEASE, "/lib/firmware/updates", "/lib/firmware/" UTS_RELEASE, "/lib/firmware" }; /* * Typical usage is that passing 'firmware_class.path=$CUSTOMIZED_PATH' * from kernel command line because firmware_class is generally built in * kernel instead of module. */ module_param_string(path, fw_path_para, sizeof(fw_path_para), 0644); MODULE_PARM_DESC(path, "customized firmware image search path with a higher priority than default path"); static int fw_get_filesystem_firmware(struct device *device, struct fw_priv *fw_priv, const char *suffix, int (*decompress)(struct device *dev, struct fw_priv *fw_priv, size_t in_size, const void *in_buffer)) { size_t size; int i, len, maxlen = 0; int rc = -ENOENT; char *path, *nt = NULL; size_t msize = INT_MAX; void *buffer = NULL; /* Already populated data member means we're loading into a buffer */ if (!decompress && fw_priv->data) { buffer = fw_priv->data; msize = fw_priv->allocated_size; } path = __getname(); if (!path) return -ENOMEM; wait_for_initramfs(); for (i = 0; i < ARRAY_SIZE(fw_path); i++) { size_t file_size = 0; size_t *file_size_ptr = NULL; /* skip the unset customized path */ if (!fw_path[i][0]) continue; /* strip off \n from customized path */ maxlen = strlen(fw_path[i]); if (i == 0) { nt = strchr(fw_path[i], '\n'); if (nt) maxlen = nt - fw_path[i]; } len = snprintf(path, PATH_MAX, "%.*s/%s%s", maxlen, fw_path[i], fw_priv->fw_name, suffix); if (len >= PATH_MAX) { rc = -ENAMETOOLONG; break; } fw_priv->size = 0; /* * The total file size is only examined when doing a partial * read; the "full read" case needs to fail if the whole * firmware was not completely loaded. */ if ((fw_priv->opt_flags & FW_OPT_PARTIAL) && buffer) file_size_ptr = &file_size; /* load firmware files from the mount namespace of init */ rc = kernel_read_file_from_path_initns(path, fw_priv->offset, &buffer, msize, file_size_ptr, READING_FIRMWARE); if (rc < 0) { if (rc != -ENOENT) dev_warn(device, "loading %s failed with error %d\n", path, rc); else dev_dbg(device, "loading %s failed for no such file or directory.\n", path); continue; } size = rc; rc = 0; dev_dbg(device, "Loading firmware from %s\n", path); if (decompress) { dev_dbg(device, "f/w decompressing %s\n", fw_priv->fw_name); rc = decompress(device, fw_priv, size, buffer); /* discard the superfluous original content */ vfree(buffer); buffer = NULL; if (rc) { fw_free_paged_buf(fw_priv); continue; } } else { dev_dbg(device, "direct-loading %s\n", fw_priv->fw_name); if (!fw_priv->data) fw_priv->data = buffer; fw_priv->size = size; } fw_state_done(fw_priv); break; } __putname(path); return rc; } /* firmware holds the ownership of pages */ static void firmware_free_data(const struct firmware *fw) { /* Loaded directly? */ if (!fw->priv) { vfree(fw->data); return; } free_fw_priv(fw->priv); } /* store the pages buffer info firmware from buf */ static void fw_set_page_data(struct fw_priv *fw_priv, struct firmware *fw) { fw->priv = fw_priv; fw->size = fw_priv->size; fw->data = fw_priv->data; pr_debug("%s: fw-%s fw_priv=%p data=%p size=%u\n", __func__, fw_priv->fw_name, fw_priv, fw_priv->data, (unsigned int)fw_priv->size); } #ifdef CONFIG_FW_CACHE static void fw_name_devm_release(struct device *dev, void *res) { struct fw_name_devm *fwn = res; if (fwn->magic == (unsigned long)&fw_cache) pr_debug("%s: fw_name-%s devm-%p released\n", __func__, fwn->name, res); kfree_const(fwn->name); } static int fw_devm_match(struct device *dev, void *res, void *match_data) { struct fw_name_devm *fwn = res; return (fwn->magic == (unsigned long)&fw_cache) && !strcmp(fwn->name, match_data); } static struct fw_name_devm *fw_find_devm_name(struct device *dev, const char *name) { struct fw_name_devm *fwn; fwn = devres_find(dev, fw_name_devm_release, fw_devm_match, (void *)name); return fwn; } static bool fw_cache_is_setup(struct device *dev, const char *name) { struct fw_name_devm *fwn; fwn = fw_find_devm_name(dev, name); if (fwn) return true; return false; } /* add firmware name into devres list */ static int fw_add_devm_name(struct device *dev, const char *name) { struct fw_name_devm *fwn; if (fw_cache_is_setup(dev, name)) return 0; fwn = devres_alloc(fw_name_devm_release, sizeof(struct fw_name_devm), GFP_KERNEL); if (!fwn) return -ENOMEM; fwn->name = kstrdup_const(name, GFP_KERNEL); if (!fwn->name) { devres_free(fwn); return -ENOMEM; } fwn->magic = (unsigned long)&fw_cache; devres_add(dev, fwn); return 0; } #else static bool fw_cache_is_setup(struct device *dev, const char *name) { return false; } static int fw_add_devm_name(struct device *dev, const char *name) { return 0; } #endif int assign_fw(struct firmware *fw, struct device *device) { struct fw_priv *fw_priv = fw->priv; int ret; mutex_lock(&fw_lock); if (!fw_priv->size || fw_state_is_aborted(fw_priv)) { mutex_unlock(&fw_lock); return -ENOENT; } /* * add firmware name into devres list so that we can auto cache * and uncache firmware for device. * * device may has been deleted already, but the problem * should be fixed in devres or driver core. */ /* don't cache firmware handled without uevent */ if (device && (fw_priv->opt_flags & FW_OPT_UEVENT) && !(fw_priv->opt_flags & FW_OPT_NOCACHE)) { ret = fw_add_devm_name(device, fw_priv->fw_name); if (ret) { mutex_unlock(&fw_lock); return ret; } } /* * After caching firmware image is started, let it piggyback * on request firmware. */ if (!(fw_priv->opt_flags & FW_OPT_NOCACHE) && fw_priv->fwc->state == FW_LOADER_START_CACHE) fw_cache_piggyback_on_request(fw_priv); /* pass the pages buffer to driver at the last minute */ fw_set_page_data(fw_priv, fw); mutex_unlock(&fw_lock); return 0; } /* prepare firmware and firmware_buf structs; * return 0 if a firmware is already assigned, 1 if need to load one, * or a negative error code */ static int _request_firmware_prepare(struct firmware **firmware_p, const char *name, struct device *device, void *dbuf, size_t size, size_t offset, u32 opt_flags) { struct firmware *firmware; struct fw_priv *fw_priv; int ret; *firmware_p = firmware = kzalloc(sizeof(*firmware), GFP_KERNEL); if (!firmware) { dev_err(device, "%s: kmalloc(struct firmware) failed\n", __func__); return -ENOMEM; } if (firmware_request_builtin_buf(firmware, name, dbuf, size)) { dev_dbg(device, "using built-in %s\n", name); return 0; /* assigned */ } ret = alloc_lookup_fw_priv(name, &fw_cache, &fw_priv, dbuf, size, offset, opt_flags); /* * bind with 'priv' now to avoid warning in failure path * of requesting firmware. */ firmware->priv = fw_priv; if (ret > 0) { ret = fw_state_wait(fw_priv); if (!ret) { fw_set_page_data(fw_priv, firmware); return 0; /* assigned */ } } if (ret < 0) return ret; return 1; /* need to load */ } /* * Batched requests need only one wake, we need to do this step last due to the * fallback mechanism. The buf is protected with kref_get(), and it won't be * released until the last user calls release_firmware(). * * Failed batched requests are possible as well, in such cases we just share * the struct fw_priv and won't release it until all requests are woken * and have gone through this same path. */ static void fw_abort_batch_reqs(struct firmware *fw) { struct fw_priv *fw_priv; /* Loaded directly? */ if (!fw || !fw->priv) return; fw_priv = fw->priv; mutex_lock(&fw_lock); if (!fw_state_is_aborted(fw_priv)) fw_state_aborted(fw_priv); mutex_unlock(&fw_lock); } #if defined(CONFIG_FW_LOADER_DEBUG) #include <crypto/hash.h> #include <crypto/sha2.h> static void fw_log_firmware_info(const struct firmware *fw, const char *name, struct device *device) { struct shash_desc *shash; struct crypto_shash *alg; u8 *sha256buf; char *outbuf; alg = crypto_alloc_shash("sha256", 0, 0); if (IS_ERR(alg)) return; sha256buf = kmalloc(SHA256_DIGEST_SIZE, GFP_KERNEL); outbuf = kmalloc(SHA256_BLOCK_SIZE + 1, GFP_KERNEL); shash = kmalloc(sizeof(*shash) + crypto_shash_descsize(alg), GFP_KERNEL); if (!sha256buf || !outbuf || !shash) goto out_free; shash->tfm = alg; if (crypto_shash_digest(shash, fw->data, fw->size, sha256buf) < 0) goto out_shash; for (int i = 0; i < SHA256_DIGEST_SIZE; i++) sprintf(&outbuf[i * 2], "%02x", sha256buf[i]); outbuf[SHA256_BLOCK_SIZE] = 0; dev_dbg(device, "Loaded FW: %s, sha256: %s\n", name, outbuf); out_shash: crypto_free_shash(alg); out_free: kfree(shash); kfree(outbuf); kfree(sha256buf); } #else static void fw_log_firmware_info(const struct firmware *fw, const char *name, struct device *device) {} #endif /* called from request_firmware() and request_firmware_work_func() */ static int _request_firmware(const struct firmware **firmware_p, const char *name, struct device *device, void *buf, size_t size, size_t offset, u32 opt_flags) { struct firmware *fw = NULL; struct cred *kern_cred = NULL; const struct cred *old_cred; bool nondirect = false; int ret; if (!firmware_p) return -EINVAL; if (!name || name[0] == '\0') { ret = -EINVAL; goto out; } ret = _request_firmware_prepare(&fw, name, device, buf, size, offset, opt_flags); if (ret <= 0) /* error or already assigned */ goto out; /* * We are about to try to access the firmware file. Because we may have been * called by a driver when serving an unrelated request from userland, we use * the kernel credentials to read the file. */ kern_cred = prepare_kernel_cred(&init_task); if (!kern_cred) { ret = -ENOMEM; goto out; } old_cred = override_creds(kern_cred); ret = fw_get_filesystem_firmware(device, fw->priv, "", NULL); /* Only full reads can support decompression, platform, and sysfs. */ if (!(opt_flags & FW_OPT_PARTIAL)) nondirect = true; #ifdef CONFIG_FW_LOADER_COMPRESS_ZSTD if (ret == -ENOENT && nondirect) ret = fw_get_filesystem_firmware(device, fw->priv, ".zst", fw_decompress_zstd); #endif #ifdef CONFIG_FW_LOADER_COMPRESS_XZ if (ret == -ENOENT && nondirect) ret = fw_get_filesystem_firmware(device, fw->priv, ".xz", fw_decompress_xz); #endif if (ret == -ENOENT && nondirect) ret = firmware_fallback_platform(fw->priv); if (ret) { if (!(opt_flags & FW_OPT_NO_WARN)) dev_warn(device, "Direct firmware load for %s failed with error %d\n", name, ret); if (nondirect) ret = firmware_fallback_sysfs(fw, name, device, opt_flags, ret); } else ret = assign_fw(fw, device); revert_creds(old_cred); put_cred(kern_cred); out: if (ret < 0) { fw_abort_batch_reqs(fw); release_firmware(fw); fw = NULL; } else { fw_log_firmware_info(fw, name, device); } *firmware_p = fw; return ret; } /** * request_firmware() - send firmware request and wait for it * @firmware_p: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded * * @firmware_p will be used to return a firmware image by the name * of @name for device @device. * * Should be called from user context where sleeping is allowed. * * @name will be used as $FIRMWARE in the uevent environment and * should be distinctive enough not to be confused with any other * firmware image for this or any other device. * * Caller must hold the reference count of @device. * * The function can be called safely inside device's suspend and * resume callback. **/ int request_firmware(const struct firmware **firmware_p, const char *name, struct device *device) { int ret; /* Need to pin this module until return */ __module_get(THIS_MODULE); ret = _request_firmware(firmware_p, name, device, NULL, 0, 0, FW_OPT_UEVENT); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL(request_firmware); /** * firmware_request_nowarn() - request for an optional fw module * @firmware: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded * * This function is similar in behaviour to request_firmware(), except it * doesn't produce warning messages when the file is not found. The sysfs * fallback mechanism is enabled if direct filesystem lookup fails. However, * failures to find the firmware file with it are still suppressed. It is * therefore up to the driver to check for the return value of this call and to * decide when to inform the users of errors. **/ int firmware_request_nowarn(const struct firmware **firmware, const char *name, struct device *device) { int ret; /* Need to pin this module until return */ __module_get(THIS_MODULE); ret = _request_firmware(firmware, name, device, NULL, 0, 0, FW_OPT_UEVENT | FW_OPT_NO_WARN); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(firmware_request_nowarn); /** * request_firmware_direct() - load firmware directly without usermode helper * @firmware_p: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded * * This function works pretty much like request_firmware(), but this doesn't * fall back to usermode helper even if the firmware couldn't be loaded * directly from fs. Hence it's useful for loading optional firmwares, which * aren't always present, without extra long timeouts of udev. **/ int request_firmware_direct(const struct firmware **firmware_p, const char *name, struct device *device) { int ret; __module_get(THIS_MODULE); ret = _request_firmware(firmware_p, name, device, NULL, 0, 0, FW_OPT_UEVENT | FW_OPT_NO_WARN | FW_OPT_NOFALLBACK_SYSFS); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(request_firmware_direct); /** * firmware_request_platform() - request firmware with platform-fw fallback * @firmware: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded * * This function is similar in behaviour to request_firmware, except that if * direct filesystem lookup fails, it will fallback to looking for a copy of the * requested firmware embedded in the platform's main (e.g. UEFI) firmware. **/ int firmware_request_platform(const struct firmware **firmware, const char *name, struct device *device) { int ret; /* Need to pin this module until return */ __module_get(THIS_MODULE); ret = _request_firmware(firmware, name, device, NULL, 0, 0, FW_OPT_UEVENT | FW_OPT_FALLBACK_PLATFORM); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL_GPL(firmware_request_platform); /** * firmware_request_cache() - cache firmware for suspend so resume can use it * @name: name of firmware file * @device: device for which firmware should be cached for * * There are some devices with an optimization that enables the device to not * require loading firmware on system reboot. This optimization may still * require the firmware present on resume from suspend. This routine can be * used to ensure the firmware is present on resume from suspend in these * situations. This helper is not compatible with drivers which use * request_firmware_into_buf() or request_firmware_nowait() with no uevent set. **/ int firmware_request_cache(struct device *device, const char *name) { int ret; mutex_lock(&fw_lock); ret = fw_add_devm_name(device, name); mutex_unlock(&fw_lock); return ret; } EXPORT_SYMBOL_GPL(firmware_request_cache); /** * request_firmware_into_buf() - load firmware into a previously allocated buffer * @firmware_p: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded and DMA region allocated * @buf: address of buffer to load firmware into * @size: size of buffer * * This function works pretty much like request_firmware(), but it doesn't * allocate a buffer to hold the firmware data. Instead, the firmware * is loaded directly into the buffer pointed to by @buf and the @firmware_p * data member is pointed at @buf. * * This function doesn't cache firmware either. */ int request_firmware_into_buf(const struct firmware **firmware_p, const char *name, struct device *device, void *buf, size_t size) { int ret; if (fw_cache_is_setup(device, name)) return -EOPNOTSUPP; __module_get(THIS_MODULE); ret = _request_firmware(firmware_p, name, device, buf, size, 0, FW_OPT_UEVENT | FW_OPT_NOCACHE); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL(request_firmware_into_buf); /** * request_partial_firmware_into_buf() - load partial firmware into a previously allocated buffer * @firmware_p: pointer to firmware image * @name: name of firmware file * @device: device for which firmware is being loaded and DMA region allocated * @buf: address of buffer to load firmware into * @size: size of buffer * @offset: offset into file to read * * This function works pretty much like request_firmware_into_buf except * it allows a partial read of the file. */ int request_partial_firmware_into_buf(const struct firmware **firmware_p, const char *name, struct device *device, void *buf, size_t size, size_t offset) { int ret; if (fw_cache_is_setup(device, name)) return -EOPNOTSUPP; __module_get(THIS_MODULE); ret = _request_firmware(firmware_p, name, device, buf, size, offset, FW_OPT_UEVENT | FW_OPT_NOCACHE | FW_OPT_PARTIAL); module_put(THIS_MODULE); return ret; } EXPORT_SYMBOL(request_partial_firmware_into_buf); /** * release_firmware() - release the resource associated with a firmware image * @fw: firmware resource to release **/ void release_firmware(const struct firmware *fw) { if (fw) { if (!firmware_is_builtin(fw)) firmware_free_data(fw); kfree(fw); } } EXPORT_SYMBOL(release_firmware); /* Async support */ struct firmware_work { struct work_struct work; struct module *module; const char *name; struct device *device; void *context; void (*cont)(const struct firmware *fw, void *context); u32 opt_flags; }; static void request_firmware_work_func(struct work_struct *work) { struct firmware_work *fw_work; const struct firmware *fw; fw_work = container_of(work, struct firmware_work, work); _request_firmware(&fw, fw_work->name, fw_work->device, NULL, 0, 0, fw_work->opt_flags); fw_work->cont(fw, fw_work->context); put_device(fw_work->device); /* taken in request_firmware_nowait() */ module_put(fw_work->module); kfree_const(fw_work->name); kfree(fw_work); } /** * request_firmware_nowait() - asynchronous version of request_firmware * @module: module requesting the firmware * @uevent: sends uevent to copy the firmware image if this flag * is non-zero else the firmware copy must be done manually. * @name: name of firmware file * @device: device for which firmware is being loaded * @gfp: allocation flags * @context: will be passed over to @cont, and * @fw may be %NULL if firmware request fails. * @cont: function will be called asynchronously when the firmware * request is over. * * Caller must hold the reference count of @device. * * Asynchronous variant of request_firmware() for user contexts: * - sleep for as small periods as possible since it may * increase kernel boot time of built-in device drivers * requesting firmware in their ->probe() methods, if * @gfp is GFP_KERNEL. * * - can't sleep at all if @gfp is GFP_ATOMIC. **/ int request_firmware_nowait( struct module *module, bool uevent, const char *name, struct device *device, gfp_t gfp, void *context, void (*cont)(const struct firmware *fw, void *context)) { struct firmware_work *fw_work; fw_work = kzalloc(sizeof(struct firmware_work), gfp); if (!fw_work) return -ENOMEM; fw_work->module = module; fw_work->name = kstrdup_const(name, gfp); if (!fw_work->name) { kfree(fw_work); return -ENOMEM; } fw_work->device = device; fw_work->context = context; fw_work->cont = cont; fw_work->opt_flags = FW_OPT_NOWAIT | (uevent ? FW_OPT_UEVENT : FW_OPT_USERHELPER); if (!uevent && fw_cache_is_setup(device, name)) { kfree_const(fw_work->name); kfree(fw_work); return -EOPNOTSUPP; } if (!try_module_get(module)) { kfree_const(fw_work->name); kfree(fw_work); return -EFAULT; } get_device(fw_work->device); INIT_WORK(&fw_work->work, request_firmware_work_func); schedule_work(&fw_work->work); return 0; } EXPORT_SYMBOL(request_firmware_nowait); #ifdef CONFIG_FW_CACHE static ASYNC_DOMAIN_EXCLUSIVE(fw_cache_domain); /** * cache_firmware() - cache one firmware image in kernel memory space * @fw_name: the firmware image name * * Cache firmware in kernel memory so that drivers can use it when * system isn't ready for them to request firmware image from userspace. * Once it returns successfully, driver can use request_firmware or its * nowait version to get the cached firmware without any interacting * with userspace * * Return 0 if the firmware image has been cached successfully * Return !0 otherwise * */ static int cache_firmware(const char *fw_name) { int ret; const struct firmware *fw; pr_debug("%s: %s\n", __func__, fw_name); ret = request_firmware(&fw, fw_name, NULL); if (!ret) kfree(fw); pr_debug("%s: %s ret=%d\n", __func__, fw_name, ret); return ret; } static struct fw_priv *lookup_fw_priv(const char *fw_name) { struct fw_priv *tmp; struct firmware_cache *fwc = &fw_cache; spin_lock(&fwc->lock); tmp = __lookup_fw_priv(fw_name); spin_unlock(&fwc->lock); return tmp; } /** * uncache_firmware() - remove one cached firmware image * @fw_name: the firmware image name * * Uncache one firmware image which has been cached successfully * before. * * Return 0 if the firmware cache has been removed successfully * Return !0 otherwise * */ static int uncache_firmware(const char *fw_name) { struct fw_priv *fw_priv; struct firmware fw; pr_debug("%s: %s\n", __func__, fw_name); if (firmware_request_builtin(&fw, fw_name)) return 0; fw_priv = lookup_fw_priv(fw_name); if (fw_priv) { free_fw_priv(fw_priv); return 0; } return -EINVAL; } static struct fw_cache_entry *alloc_fw_cache_entry(const char *name) { struct fw_cache_entry *fce; fce = kzalloc(sizeof(*fce), GFP_ATOMIC); if (!fce) goto exit; fce->name = kstrdup_const(name, GFP_ATOMIC); if (!fce->name) { kfree(fce); fce = NULL; goto exit; } exit: return fce; } static int __fw_entry_found(const char *name) { struct firmware_cache *fwc = &fw_cache; struct fw_cache_entry *fce; list_for_each_entry(fce, &fwc->fw_names, list) { if (!strcmp(fce->name, name)) return 1; } return 0; } static void fw_cache_piggyback_on_request(struct fw_priv *fw_priv) { const char *name = fw_priv->fw_name; struct firmware_cache *fwc = fw_priv->fwc; struct fw_cache_entry *fce; spin_lock(&fwc->name_lock); if (__fw_entry_found(name)) goto found; fce = alloc_fw_cache_entry(name); if (fce) { list_add(&fce->list, &fwc->fw_names); kref_get(&fw_priv->ref); pr_debug("%s: fw: %s\n", __func__, name); } found: spin_unlock(&fwc->name_lock); } static void free_fw_cache_entry(struct fw_cache_entry *fce) { kfree_const(fce->name); kfree(fce); } static void __async_dev_cache_fw_image(void *fw_entry, async_cookie_t cookie) { struct fw_cache_entry *fce = fw_entry; struct firmware_cache *fwc = &fw_cache; int ret; ret = cache_firmware(fce->name); if (ret) { spin_lock(&fwc->name_lock); list_del(&fce->list); spin_unlock(&fwc->name_lock); free_fw_cache_entry(fce); } } /* called with dev->devres_lock held */ static void dev_create_fw_entry(struct device *dev, void *res, void *data) { struct fw_name_devm *fwn = res; const char *fw_name = fwn->name; struct list_head *head = data; struct fw_cache_entry *fce; fce = alloc_fw_cache_entry(fw_name); if (fce) list_add(&fce->list, head); } static int devm_name_match(struct device *dev, void *res, void *match_data) { struct fw_name_devm *fwn = res; return (fwn->magic == (unsigned long)match_data); } static void dev_cache_fw_image(struct device *dev, void *data) { LIST_HEAD(todo); struct fw_cache_entry *fce; struct fw_cache_entry *fce_next; struct firmware_cache *fwc = &fw_cache; devres_for_each_res(dev, fw_name_devm_release, devm_name_match, &fw_cache, dev_create_fw_entry, &todo); list_for_each_entry_safe(fce, fce_next, &todo, list) { list_del(&fce->list); spin_lock(&fwc->name_lock); /* only one cache entry for one firmware */ if (!__fw_entry_found(fce->name)) { list_add(&fce->list, &fwc->fw_names); } else { free_fw_cache_entry(fce); fce = NULL; } spin_unlock(&fwc->name_lock); if (fce) async_schedule_domain(__async_dev_cache_fw_image, (void *)fce, &fw_cache_domain); } } static void __device_uncache_fw_images(void) { struct firmware_cache *fwc = &fw_cache; struct fw_cache_entry *fce; spin_lock(&fwc->name_lock); while (!list_empty(&fwc->fw_names)) { fce = list_entry(fwc->fw_names.next, struct fw_cache_entry, list); list_del(&fce->list); spin_unlock(&fwc->name_lock); uncache_firmware(fce->name); free_fw_cache_entry(fce); spin_lock(&fwc->name_lock); } spin_unlock(&fwc->name_lock); } /** * device_cache_fw_images() - cache devices' firmware * * If one device called request_firmware or its nowait version * successfully before, the firmware names are recored into the * device's devres link list, so device_cache_fw_images can call * cache_firmware() to cache these firmwares for the device, * then the device driver can load its firmwares easily at * time when system is not ready to complete loading firmware. */ static void device_cache_fw_images(void) { struct firmware_cache *fwc = &fw_cache; DEFINE_WAIT(wait); pr_debug("%s\n", __func__); /* cancel uncache work */ cancel_delayed_work_sync(&fwc->work); fw_fallback_set_cache_timeout(); mutex_lock(&fw_lock); fwc->state = FW_LOADER_START_CACHE; dpm_for_each_dev(NULL, dev_cache_fw_image); mutex_unlock(&fw_lock); /* wait for completion of caching firmware for all devices */ async_synchronize_full_domain(&fw_cache_domain); fw_fallback_set_default_timeout(); } /** * device_uncache_fw_images() - uncache devices' firmware * * uncache all firmwares which have been cached successfully * by device_uncache_fw_images earlier */ static void device_uncache_fw_images(void) { pr_debug("%s\n", __func__); __device_uncache_fw_images(); } static void device_uncache_fw_images_work(struct work_struct *work) { device_uncache_fw_images(); } /** * device_uncache_fw_images_delay() - uncache devices firmwares * @delay: number of milliseconds to delay uncache device firmwares * * uncache all devices's firmwares which has been cached successfully * by device_cache_fw_images after @delay milliseconds. */ static void device_uncache_fw_images_delay(unsigned long delay) { queue_delayed_work(system_power_efficient_wq, &fw_cache.work, msecs_to_jiffies(delay)); } static int fw_pm_notify(struct notifier_block *notify_block, unsigned long mode, void *unused) { switch (mode) { case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: case PM_RESTORE_PREPARE: /* * Here, kill pending fallback requests will only kill * non-uevent firmware request to avoid stalling suspend. */ kill_pending_fw_fallback_reqs(false); device_cache_fw_images(); break; case PM_POST_SUSPEND: case PM_POST_HIBERNATION: case PM_POST_RESTORE: /* * In case that system sleep failed and syscore_suspend is * not called. */ mutex_lock(&fw_lock); fw_cache.state = FW_LOADER_NO_CACHE; mutex_unlock(&fw_lock); device_uncache_fw_images_delay(10 * MSEC_PER_SEC); break; } return 0; } /* stop caching firmware once syscore_suspend is reached */ static int fw_suspend(void) { fw_cache.state = FW_LOADER_NO_CACHE; return 0; } static struct syscore_ops fw_syscore_ops = { .suspend = fw_suspend, }; static int __init register_fw_pm_ops(void) { int ret; spin_lock_init(&fw_cache.name_lock); INIT_LIST_HEAD(&fw_cache.fw_names); INIT_DELAYED_WORK(&fw_cache.work, device_uncache_fw_images_work); fw_cache.pm_notify.notifier_call = fw_pm_notify; ret = register_pm_notifier(&fw_cache.pm_notify); if (ret) return ret; register_syscore_ops(&fw_syscore_ops); return ret; } static inline void unregister_fw_pm_ops(void) { unregister_syscore_ops(&fw_syscore_ops); unregister_pm_notifier(&fw_cache.pm_notify); } #else static void fw_cache_piggyback_on_request(struct fw_priv *fw_priv) { } static inline int register_fw_pm_ops(void) { return 0; } static inline void unregister_fw_pm_ops(void) { } #endif static void __init fw_cache_init(void) { spin_lock_init(&fw_cache.lock); INIT_LIST_HEAD(&fw_cache.head); fw_cache.state = FW_LOADER_NO_CACHE; } static int fw_shutdown_notify(struct notifier_block *unused1, unsigned long unused2, void *unused3) { /* * Kill all pending fallback requests to avoid both stalling shutdown, * and avoid a deadlock with the usermode_lock. */ kill_pending_fw_fallback_reqs(true); return NOTIFY_DONE; } static struct notifier_block fw_shutdown_nb = { .notifier_call = fw_shutdown_notify, }; static int __init firmware_class_init(void) { int ret; /* No need to unfold these on exit */ fw_cache_init(); ret = register_fw_pm_ops(); if (ret) return ret; ret = register_reboot_notifier(&fw_shutdown_nb); if (ret) goto out; return register_sysfs_loader(); out: unregister_fw_pm_ops(); return ret; } static void __exit firmware_class_exit(void) { unregister_fw_pm_ops(); unregister_reboot_notifier(&fw_shutdown_nb); unregister_sysfs_loader(); } fs_initcall(firmware_class_init); module_exit(firmware_class_exit); |
7532 1 5627 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vmalloc #if !defined(_TRACE_VMALLOC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VMALLOC_H #include <linux/tracepoint.h> /** * alloc_vmap_area - called when a new vmap allocation occurs * @addr: an allocated address * @size: a requested size * @align: a requested alignment * @vstart: a requested start range * @vend: a requested end range * @failed: an allocation failed or not * * This event is used for a debug purpose, it can give an extra * information for a developer about how often it occurs and which * parameters are passed for further validation. */ TRACE_EVENT(alloc_vmap_area, TP_PROTO(unsigned long addr, unsigned long size, unsigned long align, unsigned long vstart, unsigned long vend, int failed), TP_ARGS(addr, size, align, vstart, vend, failed), TP_STRUCT__entry( __field(unsigned long, addr) __field(unsigned long, size) __field(unsigned long, align) __field(unsigned long, vstart) __field(unsigned long, vend) __field(int, failed) ), TP_fast_assign( __entry->addr = addr; __entry->size = size; __entry->align = align; __entry->vstart = vstart; __entry->vend = vend; __entry->failed = failed; ), TP_printk("va_start: %lu size=%lu align=%lu vstart=0x%lx vend=0x%lx failed=%d", __entry->addr, __entry->size, __entry->align, __entry->vstart, __entry->vend, __entry->failed) ); /** * purge_vmap_area_lazy - called when vmap areas were lazily freed * @start: purging start address * @end: purging end address * @npurged: numbed of purged vmap areas * * This event is used for a debug purpose. It gives some * indication about start:end range and how many objects * are released. */ TRACE_EVENT(purge_vmap_area_lazy, TP_PROTO(unsigned long start, unsigned long end, unsigned int npurged), TP_ARGS(start, end, npurged), TP_STRUCT__entry( __field(unsigned long, start) __field(unsigned long, end) __field(unsigned int, npurged) ), TP_fast_assign( __entry->start = start; __entry->end = end; __entry->npurged = npurged; ), TP_printk("start=0x%lx end=0x%lx num_purged=%u", __entry->start, __entry->end, __entry->npurged) ); /** * free_vmap_area_noflush - called when a vmap area is freed * @va_start: a start address of VA * @nr_lazy: number of current lazy pages * @nr_lazy_max: number of maximum lazy pages * * This event is used for a debug purpose. It gives some * indication about a VA that is released, number of current * outstanding areas and a maximum allowed threshold before * dropping all of them. */ TRACE_EVENT(free_vmap_area_noflush, TP_PROTO(unsigned long va_start, unsigned long nr_lazy, unsigned long nr_lazy_max), TP_ARGS(va_start, nr_lazy, nr_lazy_max), TP_STRUCT__entry( __field(unsigned long, va_start) __field(unsigned long, nr_lazy) __field(unsigned long, nr_lazy_max) ), TP_fast_assign( __entry->va_start = va_start; __entry->nr_lazy = nr_lazy; __entry->nr_lazy_max = nr_lazy_max; ), TP_printk("va_start=0x%lx nr_lazy=%lu nr_lazy_max=%lu", __entry->va_start, __entry->nr_lazy, __entry->nr_lazy_max) ); #endif /* _TRACE_VMALLOC_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
8 2 4 3 22 7 19 10 8 1 24 12 1 33 4 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 | // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2021 Cong Wang <cong.wang@bytedance.com> */ #include <linux/skmsg.h> #include <linux/bpf.h> #include <net/sock.h> #include <net/af_unix.h> #define unix_sk_has_data(__sk, __psock) \ ({ !skb_queue_empty(&__sk->sk_receive_queue) || \ !skb_queue_empty(&__psock->ingress_skb) || \ !list_empty(&__psock->ingress_msg); \ }) static int unix_msg_wait_data(struct sock *sk, struct sk_psock *psock, long timeo) { DEFINE_WAIT_FUNC(wait, woken_wake_function); struct unix_sock *u = unix_sk(sk); int ret = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) return 1; if (!timeo) return ret; add_wait_queue(sk_sleep(sk), &wait); sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); if (!unix_sk_has_data(sk, psock)) { mutex_unlock(&u->iolock); wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); mutex_lock(&u->iolock); ret = unix_sk_has_data(sk, psock); } sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); remove_wait_queue(sk_sleep(sk), &wait); return ret; } static int __unix_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags) { if (sk->sk_type == SOCK_DGRAM) return __unix_dgram_recvmsg(sk, msg, len, flags); else return __unix_stream_recvmsg(sk, msg, len, flags); } static int unix_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, int *addr_len) { struct unix_sock *u = unix_sk(sk); struct sk_psock *psock; int copied; if (!len) return 0; psock = sk_psock_get(sk); if (unlikely(!psock)) return __unix_recvmsg(sk, msg, len, flags); mutex_lock(&u->iolock); if (!skb_queue_empty(&sk->sk_receive_queue) && sk_psock_queue_empty(psock)) { mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } msg_bytes_ready: copied = sk_msg_recvmsg(sk, psock, msg, len, flags); if (!copied) { long timeo; int data; timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); data = unix_msg_wait_data(sk, psock, timeo); if (data) { if (!sk_psock_queue_empty(psock)) goto msg_bytes_ready; mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return __unix_recvmsg(sk, msg, len, flags); } copied = -EAGAIN; } mutex_unlock(&u->iolock); sk_psock_put(sk, psock); return copied; } static struct proto *unix_dgram_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_dgram_prot_lock); static struct proto unix_dgram_bpf_prot; static struct proto *unix_stream_prot_saved __read_mostly; static DEFINE_SPINLOCK(unix_stream_prot_lock); static struct proto unix_stream_bpf_prot; static void unix_dgram_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; } static void unix_stream_bpf_rebuild_protos(struct proto *prot, const struct proto *base) { *prot = *base; prot->close = sock_map_close; prot->recvmsg = unix_bpf_recvmsg; prot->sock_is_readable = sk_msg_is_readable; prot->unhash = sock_map_unhash; } static void unix_dgram_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_dgram_prot_saved))) { spin_lock_bh(&unix_dgram_prot_lock); if (likely(ops != unix_dgram_prot_saved)) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, ops); smp_store_release(&unix_dgram_prot_saved, ops); } spin_unlock_bh(&unix_dgram_prot_lock); } } static void unix_stream_bpf_check_needs_rebuild(struct proto *ops) { if (unlikely(ops != smp_load_acquire(&unix_stream_prot_saved))) { spin_lock_bh(&unix_stream_prot_lock); if (likely(ops != unix_stream_prot_saved)) { unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, ops); smp_store_release(&unix_stream_prot_saved, ops); } spin_unlock_bh(&unix_stream_prot_lock); } } int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { if (sk->sk_type != SOCK_DGRAM) return -EOPNOTSUPP; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_dgram_bpf_prot); return 0; } int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) { struct sock *sk_pair; if (restore) { sk->sk_write_space = psock->saved_write_space; sock_replace_proto(sk, psock->sk_proto); return 0; } sk_pair = unix_peer(sk); sock_hold(sk_pair); psock->sk_pair = sk_pair; unix_stream_bpf_check_needs_rebuild(psock->sk_proto); sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; } void __init unix_bpf_build_proto(void) { unix_dgram_bpf_rebuild_protos(&unix_dgram_bpf_prot, &unix_dgram_proto); unix_stream_bpf_rebuild_protos(&unix_stream_bpf_prot, &unix_stream_proto); } |
13 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 | // SPDX-License-Identifier: GPL-2.0 /* Shared Memory Communications Direct over ISM devices (SMC-D) * * Functions for ISM device. * * Copyright IBM Corp. 2018 */ #include <linux/if_vlan.h> #include <linux/spinlock.h> #include <linux/mutex.h> #include <linux/slab.h> #include <asm/page.h> #include "smc.h" #include "smc_core.h" #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" #include "linux/ism.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), .mutex = __MUTEX_INITIALIZER(smcd_dev_list.mutex) }; static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; #if IS_ENABLED(CONFIG_ISM) static void smcd_register_dev(struct ism_dev *ism); static void smcd_unregister_dev(struct ism_dev *ism); static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask); static struct ism_client smc_ism_client = { .name = "SMC-D", .add = smcd_register_dev, .remove = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; #endif /* Test if an ISM communication is possible - same CPC */ int smc_ism_cantalk(u64 peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, vlan_id); } void smc_ism_get_system_eid(u8 **eid) { if (!smc_ism_v2_capable) *eid = NULL; else *eid = smc_ism_v2_system_eid; } u16 smc_ism_get_chid(struct smcd_dev *smcd) { return smcd->ops->get_chid(smcd); } /* HW supports ISM V2 and thus System EID is defined */ bool smc_ism_is_v2_capable(void) { return smc_ism_v2_capable; } /* Set a connection using this DMBE. */ void smc_ism_set_conn(struct smc_connection *conn) { unsigned long flags; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = conn; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Unset a connection using this DMBE. */ void smc_ism_unset_conn(struct smc_connection *conn) { unsigned long flags; if (!conn->rmb_desc) return; spin_lock_irqsave(&conn->lgr->smcd->lock, flags); conn->lgr->smcd->conn[conn->rmb_desc->sba_idx] = NULL; spin_unlock_irqrestore(&conn->lgr->smcd->lock, flags); } /* Register a VLAN identifier with the ISM device. Use a reference count * and add a VLAN identifier only when the first DMB using this VLAN is * registered. */ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *new_vlan, *vlan; unsigned long flags; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; /* create new vlan entry, in case we need it */ new_vlan = kzalloc(sizeof(*new_vlan), GFP_KERNEL); if (!new_vlan) return -ENOMEM; new_vlan->vlanid = vlanid; refcount_set(&new_vlan->refcnt, 1); /* if there is an existing entry, increase count and return */ spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { refcount_inc(&vlan->refcnt); kfree(new_vlan); goto out; } } /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ if (smcd->ops->add_vlan_id(smcd, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; } list_add_tail(&new_vlan->list, &smcd->vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } /* Unregister a VLAN identifier with the ISM device. Use a reference count * and remove a VLAN identifier only when the last DMB using this VLAN is * unregistered. */ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) { struct smc_ism_vlanid *vlan; unsigned long flags; bool found = false; int rc = 0; if (!vlanid) /* No valid vlan id */ return -EINVAL; spin_lock_irqsave(&smcd->lock, flags); list_for_each_entry(vlan, &smcd->vlan, list) { if (vlan->vlanid == vlanid) { if (!refcount_dec_and_test(&vlan->refcnt)) goto out; found = true; break; } } if (!found) { rc = -ENOENT; goto out; /* VLAN id not in table */ } /* Found and the last reference just gone */ if (smcd->ops->del_vlan_id(smcd, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); out: spin_unlock_irqrestore(&smcd->lock, flags); return rc; } int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) { struct smcd_dmb dmb; int rc = 0; if (!dmb_desc->dma_addr) return rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; dmb.sba_idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; rc = smcd->ops->unregister_dmb(smcd, &dmb); if (!rc || rc == ISM_ERROR) { dmb_desc->cpu_addr = NULL; dmb_desc->dma_addr = 0; } return rc; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { #if IS_ENABLED(CONFIG_ISM) struct smcd_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; dmb.sba_idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; dmb.rgid = lgr->peer_gid; rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, &smc_ism_client); if (!rc) { dmb_desc->sba_idx = dmb.sba_idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; } return rc; #else return 0; #endif } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, struct sk_buff *skb, struct netlink_callback *cb) { char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; struct nlattr *attrs; struct ism_dev *ism; int use_cnt = 0; void *nlh; ism = smcd->priv; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); if (!nlh) goto errmsg; attrs = nla_nest_start(skb, SMC_GEN_DEV_SMCD); if (!attrs) goto errout; use_cnt = atomic_read(&smcd->lgr_cnt); if (nla_put_u32(skb, SMC_NLA_DEV_USE_CNT, use_cnt)) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_VENDOR, smc_pci_dev.pci_vendor)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_DEVICE, smc_pci_dev.pci_device)) goto errattr; if (nla_put_string(skb, SMC_NLA_DEV_PCI_ID, smc_pci_dev.pci_id)) goto errattr; port_attrs = nla_nest_start(skb, SMC_NLA_DEV_PORT); if (!port_attrs) goto errattr; if (nla_put_u8(skb, SMC_NLA_DEV_PORT_PNET_USR, smcd->pnetid_by_user)) goto errportattr; memcpy(smc_pnet, smcd->pnetid, SMC_MAX_PNETID_LEN); smc_pnet[SMC_MAX_PNETID_LEN] = 0; if (nla_put_string(skb, SMC_NLA_DEV_PORT_PNETID, smc_pnet)) goto errportattr; nla_nest_end(skb, port_attrs); nla_nest_end(skb, attrs); genlmsg_end(skb, nlh); return 0; errportattr: nla_nest_cancel(skb, port_attrs); errattr: nla_nest_cancel(skb, attrs); errout: nlmsg_cancel(skb, nlh); errmsg: return -EMSGSIZE; } static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, struct sk_buff *skb, struct netlink_callback *cb) { struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb); int snum = cb_ctx->pos[0]; struct smcd_dev *smcd; int num = 0; mutex_lock(&dev_list->mutex); list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; next: num++; } errout: mutex_unlock(&dev_list->mutex); cb_ctx->pos[0] = num; } int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) { smc_nl_prep_smcd_dev(&smcd_dev_list, skb, cb); return skb->len; } #if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; struct ism_event event; }; #define ISM_EVENT_REQUEST 0x0001 #define ISM_EVENT_RESPONSE 0x0002 #define ISM_EVENT_REQUEST_IR 0x00000001 #define ISM_EVENT_CODE_SHUTDOWN 0x80 #define ISM_EVENT_CODE_TESTLINK 0x83 union smcd_sw_event_info { u64 info; struct { u8 uid[SMC_LGR_ID_SIZE]; unsigned short vlan_id; u16 code; }; }; static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { union smcd_sw_event_info ev_info; ev_info.info = wrk->event.info; switch (wrk->event.code) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, wrk->event.tok, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST) { ev_info.code = ISM_EVENT_RESPONSE; wrk->smcd->ops->signal_event(wrk->smcd, wrk->event.tok, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_TESTLINK, ev_info.info); } break; } } /* worker for SMC-D events */ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); switch (wrk->event.type) { case ISM_EVENT_GID: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, wrk->event.tok, VLAN_VID_MASK); break; case ISM_EVENT_DMB: break; case ISM_EVENT_SWR: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, const struct smcd_ops *ops, int max_dmbs) { struct smcd_dev *smcd; smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; smcd->conn = devm_kcalloc(parent, max_dmbs, sizeof(struct smc_connection *), GFP_KERNEL); if (!smcd->conn) return NULL; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) return NULL; smcd->ops = ops; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); INIT_LIST_HEAD(&smcd->vlan); INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; } static void smcd_register_dev(struct ism_dev *ism) { const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd; if (!ops) return; smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, ISM_NR_DMBS); if (!smcd) return; smcd->priv = ism; ism_set_priv(ism, &smc_ism_client, smcd); if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); mutex_lock(&smcd_dev_list.mutex); if (list_empty(&smcd_dev_list.list)) { u8 *system_eid = NULL; system_eid = smcd->ops->get_system_eid(); if (smcd->ops->supports_v2()) { smc_ism_v2_capable = true; memcpy(smc_ism_v2_system_eid, system_eid, SMC_MAX_EID_LEN); } } /* sort list: devices without pnetid before devices with pnetid */ if (smcd->pnetid[0]) list_add_tail(&smcd->list, &smcd_dev_list.list); else list_add(&smcd->list, &smcd_dev_list.list); mutex_unlock(&smcd_dev_list.mutex); pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", dev_name(&ism->dev), smcd->pnetid, smcd->pnetid_by_user ? " (user defined)" : ""); return; } static void smcd_unregister_dev(struct ism_dev *ism) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); pr_warn_ratelimited("smc: removing smcd device %s\n", dev_name(&ism->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); } /* SMCD Device event handler. Called from ISM device interrupt handler. * Parameters are ism device pointer, * - event->type (0 --> DMB, 1 --> GID), * - event->code (event code), * - event->tok (either DMB token when event type 0, or GID when event type 1) * - event->time (time of day) * - event->info (debug info). * * Context: * - Function called in IRQ context from ISM device driver event handler. */ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_ism_event_work *wrk; if (smcd->going_away) return; /* copy event to event work queue, and let it be handled there */ wrk = kmalloc(sizeof(*wrk), GFP_ATOMIC); if (!wrk) return; INIT_WORK(&wrk->work, smc_ism_event_work); wrk->smcd = smcd; wrk->event = *event; queue_work(smcd->event_wq, &wrk->work); } /* SMCD Device interrupt handler. Called from ISM device interrupt handler. * Parameters are the ism device pointer, DMB number, and the DMBE bitmask. * Find the connection and schedule the tasklet for this connection. * * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, u16 dmbemask) { struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); struct smc_connection *conn = NULL; unsigned long flags; spin_lock_irqsave(&smcd->lock, flags); conn = smcd->conn[dmbno]; if (conn && !conn->killed) tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } #endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; #if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; if (lgr->peer_shutdown) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; rc = lgr->smcd->ops->signal_event(lgr->smcd, lgr->peer_gid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); #endif return rc; } int smc_ism_init(void) { int rc = 0; #if IS_ENABLED(CONFIG_ISM) smc_ism_v2_capable = false; memset(smc_ism_v2_system_eid, 0, SMC_MAX_EID_LEN); rc = ism_register_client(&smc_ism_client); #endif return rc; } void smc_ism_exit(void) { #if IS_ENABLED(CONFIG_ISM) ism_unregister_client(&smc_ism_client); #endif } |
12 3 12 12 1 12 12 10 3 10 3 3 3 3 3 3 11 11 11 12 2 3 3 2 1 3 3 21 2 3 21 49 49 49 48 49 46 26 26 26 26 4 13 25 7 7 16 16 7 9 16 7 9 16 15 21 1 21 21 16 14 14 1 14 14 14 14 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 | /* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ #include <linux/uaccess.h> #include <linux/string.h> #include <linux/time.h> #include "reiserfs.h" #include <linux/buffer_head.h> /* * copy copy_count entries from source directory item to dest buffer * (creating new item if needed) */ static void leaf_copy_dir_entries(struct buffer_info *dest_bi, struct buffer_head *source, int last_first, int item_num, int from, int copy_count) { struct buffer_head *dest = dest_bi->bi_bh; /* * either the number of target item, or if we must create a * new item, the number of the item we will create it next to */ int item_num_in_dest; struct item_head *ih; struct reiserfs_de_head *deh; int copy_records_len; /* length of all records in item to be copied */ char *records; ih = item_head(source, item_num); RFALSE(!is_direntry_le_ih(ih), "vs-10000: item must be directory item"); /* * length of all record to be copied and first byte of * the last of them */ deh = B_I_DEH(source, ih); if (copy_count) { copy_records_len = (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)) - deh_location(&deh[from + copy_count - 1]); records = source->b_data + ih_location(ih) + deh_location(&deh[from + copy_count - 1]); } else { copy_records_len = 0; records = NULL; } /* when copy last to first, dest buffer can contain 0 items */ item_num_in_dest = (last_first == LAST_TO_FIRST) ? ((B_NR_ITEMS(dest)) ? 0 : -1) : (B_NR_ITEMS(dest) - 1); /* * if there are no items in dest or the first/last item in * dest is not item of the same directory */ if ((item_num_in_dest == -1) || (last_first == FIRST_TO_LAST && le_ih_k_offset(ih) == DOT_OFFSET) || (last_first == LAST_TO_FIRST && comp_short_le_keys /*COMP_SHORT_KEYS */ (&ih->ih_key, leaf_key(dest, item_num_in_dest)))) { /* create new item in dest */ struct item_head new_ih; /* form item header */ memcpy(&new_ih.ih_key, &ih->ih_key, KEY_SIZE); put_ih_version(&new_ih, KEY_FORMAT_3_5); /* calculate item len */ put_ih_item_len(&new_ih, DEH_SIZE * copy_count + copy_records_len); put_ih_entry_count(&new_ih, 0); if (last_first == LAST_TO_FIRST) { /* form key by the following way */ if (from < ih_entry_count(ih)) { set_le_ih_k_offset(&new_ih, deh_offset(&deh[from])); } else { /* * no entries will be copied to this * item in this function */ set_le_ih_k_offset(&new_ih, U32_MAX); /* * this item is not yet valid, but we * want I_IS_DIRECTORY_ITEM to return 1 * for it, so we -1 */ } set_le_key_k_type(KEY_FORMAT_3_5, &new_ih.ih_key, TYPE_DIRENTRY); } /* insert item into dest buffer */ leaf_insert_into_buf(dest_bi, (last_first == LAST_TO_FIRST) ? 0 : B_NR_ITEMS(dest), &new_ih, NULL, 0); } else { /* prepare space for entries */ leaf_paste_in_buffer(dest_bi, (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0, MAX_US_INT, DEH_SIZE * copy_count + copy_records_len, records, 0); } item_num_in_dest = (last_first == FIRST_TO_LAST) ? (B_NR_ITEMS(dest) - 1) : 0; leaf_paste_entries(dest_bi, item_num_in_dest, (last_first == FIRST_TO_LAST) ? ih_entry_count(item_head(dest, item_num_in_dest)) : 0, copy_count, deh + from, records, DEH_SIZE * copy_count + copy_records_len); } /* * Copy the first (if last_first == FIRST_TO_LAST) or last * (last_first == LAST_TO_FIRST) item or part of it or nothing * (see the return 0 below) from SOURCE to the end (if last_first) * or beginning (!last_first) of the DEST */ /* returns 1 if anything was copied, else 0 */ static int leaf_copy_boundary_item(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int bytes_or_entries) { struct buffer_head *dest = dest_bi->bi_bh; /* number of items in the source and destination buffers */ int dest_nr_item, src_nr_item; struct item_head *ih; struct item_head *dih; dest_nr_item = B_NR_ITEMS(dest); /* * if ( DEST is empty or first item of SOURCE and last item of * DEST are the items of different objects or of different types ) * then there is no need to treat this item differently from the * other items that we copy, so we return */ if (last_first == FIRST_TO_LAST) { ih = item_head(src, 0); dih = item_head(dest, dest_nr_item - 1); /* there is nothing to merge */ if (!dest_nr_item || (!op_is_left_mergeable(&ih->ih_key, src->b_size))) return 0; RFALSE(!ih_item_len(ih), "vs-10010: item can not have empty length"); if (is_direntry_le_ih(ih)) { if (bytes_or_entries == -1) /* copy all entries to dest */ bytes_or_entries = ih_entry_count(ih); leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, 0, 0, bytes_or_entries); return 1; } /* * copy part of the body of the first item of SOURCE * to the end of the body of the last item of the DEST * part defined by 'bytes_or_entries'; if bytes_or_entries * == -1 copy whole body; don't create new item header */ if (bytes_or_entries == -1) bytes_or_entries = ih_item_len(ih); #ifdef CONFIG_REISERFS_CHECK else { if (bytes_or_entries == ih_item_len(ih) && is_indirect_le_ih(ih)) if (get_ih_free_space(ih)) reiserfs_panic(sb_from_bi(dest_bi), "vs-10020", "last unformatted node " "must be filled " "entirely (%h)", ih); } #endif /* * merge first item (or its part) of src buffer with the last * item of dest buffer. Both are of the same file */ leaf_paste_in_buffer(dest_bi, dest_nr_item - 1, ih_item_len(dih), bytes_or_entries, ih_item_body(src, ih), 0); if (is_indirect_le_ih(dih)) { RFALSE(get_ih_free_space(dih), "vs-10030: merge to left: last unformatted node of non-last indirect item %h must have zerto free space", ih); if (bytes_or_entries == ih_item_len(ih)) set_ih_free_space(dih, get_ih_free_space(ih)); } return 1; } /* copy boundary item to right (last_first == LAST_TO_FIRST) */ /* * (DEST is empty or last item of SOURCE and first item of DEST * are the items of different object or of different types) */ src_nr_item = B_NR_ITEMS(src); ih = item_head(src, src_nr_item - 1); dih = item_head(dest, 0); if (!dest_nr_item || !op_is_left_mergeable(&dih->ih_key, src->b_size)) return 0; if (is_direntry_le_ih(ih)) { /* * bytes_or_entries = entries number in last * item body of SOURCE */ if (bytes_or_entries == -1) bytes_or_entries = ih_entry_count(ih); leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, src_nr_item - 1, ih_entry_count(ih) - bytes_or_entries, bytes_or_entries); return 1; } /* * copy part of the body of the last item of SOURCE to the * begin of the body of the first item of the DEST; part defined * by 'bytes_or_entries'; if byte_or_entriess == -1 copy whole body; * change first item key of the DEST; don't create new item header */ RFALSE(is_indirect_le_ih(ih) && get_ih_free_space(ih), "vs-10040: merge to right: last unformatted node of non-last indirect item must be filled entirely (%h)", ih); if (bytes_or_entries == -1) { /* bytes_or_entries = length of last item body of SOURCE */ bytes_or_entries = ih_item_len(ih); RFALSE(le_ih_k_offset(dih) != le_ih_k_offset(ih) + op_bytes_number(ih, src->b_size), "vs-10050: items %h and %h do not match", ih, dih); /* change first item key of the DEST */ set_le_ih_k_offset(dih, le_ih_k_offset(ih)); /* item becomes non-mergeable */ /* or mergeable if left item was */ set_le_ih_k_type(dih, le_ih_k_type(ih)); } else { /* merge to right only part of item */ RFALSE(ih_item_len(ih) <= bytes_or_entries, "vs-10060: no so much bytes %lu (needed %lu)", (unsigned long)ih_item_len(ih), (unsigned long)bytes_or_entries); /* change first item key of the DEST */ if (is_direct_le_ih(dih)) { RFALSE(le_ih_k_offset(dih) <= (unsigned long)bytes_or_entries, "vs-10070: dih %h, bytes_or_entries(%d)", dih, bytes_or_entries); set_le_ih_k_offset(dih, le_ih_k_offset(dih) - bytes_or_entries); } else { RFALSE(le_ih_k_offset(dih) <= (bytes_or_entries / UNFM_P_SIZE) * dest->b_size, "vs-10080: dih %h, bytes_or_entries(%d)", dih, (bytes_or_entries / UNFM_P_SIZE) * dest->b_size); set_le_ih_k_offset(dih, le_ih_k_offset(dih) - ((bytes_or_entries / UNFM_P_SIZE) * dest->b_size)); } } leaf_paste_in_buffer(dest_bi, 0, 0, bytes_or_entries, ih_item_body(src, ih) + ih_item_len(ih) - bytes_or_entries, 0); return 1; } /* * copy cpy_mun items from buffer src to buffer dest * last_first == FIRST_TO_LAST means, that we copy cpy_num items beginning * from first-th item in src to tail of dest * last_first == LAST_TO_FIRST means, that we copy cpy_num items beginning * from first-th item in src to head of dest */ static void leaf_copy_items_entirely(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int first, int cpy_num) { struct buffer_head *dest; int nr, free_space; int dest_before; int last_loc, last_inserted_loc, location; int i, j; struct block_head *blkh; struct item_head *ih; RFALSE(last_first != LAST_TO_FIRST && last_first != FIRST_TO_LAST, "vs-10090: bad last_first parameter %d", last_first); RFALSE(B_NR_ITEMS(src) - first < cpy_num, "vs-10100: too few items in source %d, required %d from %d", B_NR_ITEMS(src), cpy_num, first); RFALSE(cpy_num < 0, "vs-10110: can not copy negative amount of items"); RFALSE(!dest_bi, "vs-10120: can not copy negative amount of items"); dest = dest_bi->bi_bh; RFALSE(!dest, "vs-10130: can not copy negative amount of items"); if (cpy_num == 0) return; blkh = B_BLK_HEAD(dest); nr = blkh_nr_item(blkh); free_space = blkh_free_space(blkh); /* * we will insert items before 0-th or nr-th item in dest buffer. * It depends of last_first parameter */ dest_before = (last_first == LAST_TO_FIRST) ? 0 : nr; /* location of head of first new item */ ih = item_head(dest, dest_before); RFALSE(blkh_free_space(blkh) < cpy_num * IH_SIZE, "vs-10140: not enough free space for headers %d (needed %d)", B_FREE_SPACE(dest), cpy_num * IH_SIZE); /* prepare space for headers */ memmove(ih + cpy_num, ih, (nr - dest_before) * IH_SIZE); /* copy item headers */ memcpy(ih, item_head(src, first), cpy_num * IH_SIZE); free_space -= (IH_SIZE * cpy_num); set_blkh_free_space(blkh, free_space); /* location of unmovable item */ j = location = (dest_before == 0) ? dest->b_size : ih_location(ih - 1); for (i = dest_before; i < nr + cpy_num; i++) { location -= ih_item_len(ih + i - dest_before); put_ih_location(ih + i - dest_before, location); } /* prepare space for items */ last_loc = ih_location(&ih[nr + cpy_num - 1 - dest_before]); last_inserted_loc = ih_location(&ih[cpy_num - 1]); /* check free space */ RFALSE(free_space < j - last_inserted_loc, "vs-10150: not enough free space for items %d (needed %d)", free_space, j - last_inserted_loc); memmove(dest->b_data + last_loc, dest->b_data + last_loc + j - last_inserted_loc, last_inserted_loc - last_loc); /* copy items */ memcpy(dest->b_data + last_inserted_loc, item_body(src, (first + cpy_num - 1)), j - last_inserted_loc); /* sizes, item number */ set_blkh_nr_item(blkh, nr + cpy_num); set_blkh_free_space(blkh, free_space - (j - last_inserted_loc)); do_balance_mark_leaf_dirty(dest_bi->tb, dest, 0); if (dest_bi->bi_parent) { struct disk_child *t_dc; t_dc = B_N_CHILD(dest_bi->bi_parent, dest_bi->bi_position); RFALSE(dc_block_number(t_dc) != dest->b_blocknr, "vs-10160: block number in bh does not match to field in disk_child structure %lu and %lu", (long unsigned)dest->b_blocknr, (long unsigned)dc_block_number(t_dc)); put_dc_size(t_dc, dc_size(t_dc) + (j - last_inserted_loc + IH_SIZE * cpy_num)); do_balance_mark_internal_dirty(dest_bi->tb, dest_bi->bi_parent, 0); } } /* * This function splits the (liquid) item into two items (useful when * shifting part of an item into another node.) */ static void leaf_item_bottle(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int item_num, int cpy_bytes) { struct buffer_head *dest = dest_bi->bi_bh; struct item_head *ih; RFALSE(cpy_bytes == -1, "vs-10170: bytes == - 1 means: do not split item"); if (last_first == FIRST_TO_LAST) { /* * if ( if item in position item_num in buffer SOURCE * is directory item ) */ ih = item_head(src, item_num); if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, FIRST_TO_LAST, item_num, 0, cpy_bytes); else { struct item_head n_ih; /* * copy part of the body of the item number 'item_num' * of SOURCE to the end of the DEST part defined by * 'cpy_bytes'; create new item header; change old * item_header (????); n_ih = new item_header; */ memcpy(&n_ih, ih, IH_SIZE); put_ih_item_len(&n_ih, cpy_bytes); if (is_indirect_le_ih(ih)) { RFALSE(cpy_bytes == ih_item_len(ih) && get_ih_free_space(ih), "vs-10180: when whole indirect item is bottle to left neighbor, it must have free_space==0 (not %lu)", (long unsigned)get_ih_free_space(ih)); set_ih_free_space(&n_ih, 0); } RFALSE(op_is_left_mergeable(&ih->ih_key, src->b_size), "vs-10190: bad mergeability of item %h", ih); n_ih.ih_version = ih->ih_version; /* JDM Endian safe, both le */ leaf_insert_into_buf(dest_bi, B_NR_ITEMS(dest), &n_ih, item_body(src, item_num), 0); } } else { /* * if ( if item in position item_num in buffer * SOURCE is directory item ) */ ih = item_head(src, item_num); if (is_direntry_le_ih(ih)) leaf_copy_dir_entries(dest_bi, src, LAST_TO_FIRST, item_num, ih_entry_count(ih) - cpy_bytes, cpy_bytes); else { struct item_head n_ih; /* * copy part of the body of the item number 'item_num' * of SOURCE to the begin of the DEST part defined by * 'cpy_bytes'; create new item header; * n_ih = new item_header; */ memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE); /* Endian safe, both le */ n_ih.ih_version = ih->ih_version; if (is_direct_le_ih(ih)) { set_le_ih_k_offset(&n_ih, le_ih_k_offset(ih) + ih_item_len(ih) - cpy_bytes); set_le_ih_k_type(&n_ih, TYPE_DIRECT); set_ih_free_space(&n_ih, MAX_US_INT); } else { /* indirect item */ RFALSE(!cpy_bytes && get_ih_free_space(ih), "vs-10200: ih->ih_free_space must be 0 when indirect item will be appended"); set_le_ih_k_offset(&n_ih, le_ih_k_offset(ih) + (ih_item_len(ih) - cpy_bytes) / UNFM_P_SIZE * dest->b_size); set_le_ih_k_type(&n_ih, TYPE_INDIRECT); set_ih_free_space(&n_ih, get_ih_free_space(ih)); } /* set item length */ put_ih_item_len(&n_ih, cpy_bytes); /* Endian safe, both le */ n_ih.ih_version = ih->ih_version; leaf_insert_into_buf(dest_bi, 0, &n_ih, item_body(src, item_num) + ih_item_len(ih) - cpy_bytes, 0); } } } /* * If cpy_bytes equals minus one than copy cpy_num whole items from SOURCE * to DEST. If cpy_bytes not equal to minus one than copy cpy_num-1 whole * items from SOURCE to DEST. From last item copy cpy_num bytes for regular * item and cpy_num directory entries for directory item. */ static int leaf_copy_items(struct buffer_info *dest_bi, struct buffer_head *src, int last_first, int cpy_num, int cpy_bytes) { struct buffer_head *dest; int pos, i, src_nr_item, bytes; dest = dest_bi->bi_bh; RFALSE(!dest || !src, "vs-10210: !dest || !src"); RFALSE(last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST, "vs-10220:last_first != FIRST_TO_LAST && last_first != LAST_TO_FIRST"); RFALSE(B_NR_ITEMS(src) < cpy_num, "vs-10230: No enough items: %d, req. %d", B_NR_ITEMS(src), cpy_num); RFALSE(cpy_num < 0, "vs-10240: cpy_num < 0 (%d)", cpy_num); if (cpy_num == 0) return 0; if (last_first == FIRST_TO_LAST) { /* copy items to left */ pos = 0; if (cpy_num == 1) bytes = cpy_bytes; else bytes = -1; /* * copy the first item or it part or nothing to the end of * the DEST (i = leaf_copy_boundary_item(DEST,SOURCE,0,bytes)) */ i = leaf_copy_boundary_item(dest_bi, src, FIRST_TO_LAST, bytes); cpy_num -= i; if (cpy_num == 0) return i; pos += i; if (cpy_bytes == -1) /* * copy first cpy_num items starting from position * 'pos' of SOURCE to end of DEST */ leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, pos, cpy_num); else { /* * copy first cpy_num-1 items starting from position * 'pos-1' of the SOURCE to the end of the DEST */ leaf_copy_items_entirely(dest_bi, src, FIRST_TO_LAST, pos, cpy_num - 1); /* * copy part of the item which number is * cpy_num+pos-1 to the end of the DEST */ leaf_item_bottle(dest_bi, src, FIRST_TO_LAST, cpy_num + pos - 1, cpy_bytes); } } else { /* copy items to right */ src_nr_item = B_NR_ITEMS(src); if (cpy_num == 1) bytes = cpy_bytes; else bytes = -1; /* * copy the last item or it part or nothing to the * begin of the DEST * (i = leaf_copy_boundary_item(DEST,SOURCE,1,bytes)); */ i = leaf_copy_boundary_item(dest_bi, src, LAST_TO_FIRST, bytes); cpy_num -= i; if (cpy_num == 0) return i; pos = src_nr_item - cpy_num - i; if (cpy_bytes == -1) { /* * starting from position 'pos' copy last cpy_num * items of SOURCE to begin of DEST */ leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, pos, cpy_num); } else { /* * copy last cpy_num-1 items starting from position * 'pos+1' of the SOURCE to the begin of the DEST; */ leaf_copy_items_entirely(dest_bi, src, LAST_TO_FIRST, pos + 1, cpy_num - 1); /* * copy part of the item which number is pos to * the begin of the DEST */ leaf_item_bottle(dest_bi, src, LAST_TO_FIRST, pos, cpy_bytes); } } return i; } /* * there are types of coping: from S[0] to L[0], from S[0] to R[0], * from R[0] to L[0]. for each of these we have to define parent and * positions of destination and source buffers */ static void leaf_define_dest_src_infos(int shift_mode, struct tree_balance *tb, struct buffer_info *dest_bi, struct buffer_info *src_bi, int *first_last, struct buffer_head *Snew) { memset(dest_bi, 0, sizeof(struct buffer_info)); memset(src_bi, 0, sizeof(struct buffer_info)); /* define dest, src, dest parent, dest position */ switch (shift_mode) { case LEAF_FROM_S_TO_L: /* it is used in leaf_shift_left */ src_bi->tb = tb; src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); /* src->b_item_order */ src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); dest_bi->tb = tb; dest_bi->bi_bh = tb->L[0]; dest_bi->bi_parent = tb->FL[0]; dest_bi->bi_position = get_left_neighbor_position(tb, 0); *first_last = FIRST_TO_LAST; break; case LEAF_FROM_S_TO_R: /* it is used in leaf_shift_right */ src_bi->tb = tb; src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); dest_bi->tb = tb; dest_bi->bi_bh = tb->R[0]; dest_bi->bi_parent = tb->FR[0]; dest_bi->bi_position = get_right_neighbor_position(tb, 0); *first_last = LAST_TO_FIRST; break; case LEAF_FROM_R_TO_L: /* it is used in balance_leaf_when_delete */ src_bi->tb = tb; src_bi->bi_bh = tb->R[0]; src_bi->bi_parent = tb->FR[0]; src_bi->bi_position = get_right_neighbor_position(tb, 0); dest_bi->tb = tb; dest_bi->bi_bh = tb->L[0]; dest_bi->bi_parent = tb->FL[0]; dest_bi->bi_position = get_left_neighbor_position(tb, 0); *first_last = FIRST_TO_LAST; break; case LEAF_FROM_L_TO_R: /* it is used in balance_leaf_when_delete */ src_bi->tb = tb; src_bi->bi_bh = tb->L[0]; src_bi->bi_parent = tb->FL[0]; src_bi->bi_position = get_left_neighbor_position(tb, 0); dest_bi->tb = tb; dest_bi->bi_bh = tb->R[0]; dest_bi->bi_parent = tb->FR[0]; dest_bi->bi_position = get_right_neighbor_position(tb, 0); *first_last = LAST_TO_FIRST; break; case LEAF_FROM_S_TO_SNEW: src_bi->tb = tb; src_bi->bi_bh = PATH_PLAST_BUFFER(tb->tb_path); src_bi->bi_parent = PATH_H_PPARENT(tb->tb_path, 0); src_bi->bi_position = PATH_H_B_ITEM_ORDER(tb->tb_path, 0); dest_bi->tb = tb; dest_bi->bi_bh = Snew; dest_bi->bi_parent = NULL; dest_bi->bi_position = 0; *first_last = LAST_TO_FIRST; break; default: reiserfs_panic(sb_from_bi(src_bi), "vs-10250", "shift type is unknown (%d)", shift_mode); } RFALSE(!src_bi->bi_bh || !dest_bi->bi_bh, "vs-10260: mode==%d, source (%p) or dest (%p) buffer is initialized incorrectly", shift_mode, src_bi->bi_bh, dest_bi->bi_bh); } /* * copy mov_num items and mov_bytes of the (mov_num-1)th item to * neighbor. Delete them from source */ int leaf_move_items(int shift_mode, struct tree_balance *tb, int mov_num, int mov_bytes, struct buffer_head *Snew) { int ret_value; struct buffer_info dest_bi, src_bi; int first_last; leaf_define_dest_src_infos(shift_mode, tb, &dest_bi, &src_bi, &first_last, Snew); ret_value = leaf_copy_items(&dest_bi, src_bi.bi_bh, first_last, mov_num, mov_bytes); leaf_delete_items(&src_bi, first_last, (first_last == FIRST_TO_LAST) ? 0 : (B_NR_ITEMS(src_bi.bi_bh) - mov_num), mov_num, mov_bytes); return ret_value; } /* * Shift shift_num items (and shift_bytes of last shifted item if * shift_bytes != -1) from S[0] to L[0] and replace the delimiting key */ int leaf_shift_left(struct tree_balance *tb, int shift_num, int shift_bytes) { struct buffer_head *S0 = PATH_PLAST_BUFFER(tb->tb_path); int i; /* * move shift_num (and shift_bytes bytes) items from S[0] * to left neighbor L[0] */ i = leaf_move_items(LEAF_FROM_S_TO_L, tb, shift_num, shift_bytes, NULL); if (shift_num) { /* number of items in S[0] == 0 */ if (B_NR_ITEMS(S0) == 0) { RFALSE(shift_bytes != -1, "vs-10270: S0 is empty now, but shift_bytes != -1 (%d)", shift_bytes); #ifdef CONFIG_REISERFS_CHECK if (tb->tb_mode == M_PASTE || tb->tb_mode == M_INSERT) { print_cur_tb("vs-10275"); reiserfs_panic(tb->tb_sb, "vs-10275", "balance condition corrupted " "(%c)", tb->tb_mode); } #endif if (PATH_H_POSITION(tb->tb_path, 1) == 0) replace_key(tb, tb->CFL[0], tb->lkey[0], PATH_H_PPARENT(tb->tb_path, 0), 0); } else { /* replace lkey in CFL[0] by 0-th key from S[0]; */ replace_key(tb, tb->CFL[0], tb->lkey[0], S0, 0); RFALSE((shift_bytes != -1 && !(is_direntry_le_ih(item_head(S0, 0)) && !ih_entry_count(item_head(S0, 0)))) && (!op_is_left_mergeable (leaf_key(S0, 0), S0->b_size)), "vs-10280: item must be mergeable"); } } return i; } /* CLEANING STOPPED HERE */ /* * Shift shift_num (shift_bytes) items from S[0] to the right neighbor, * and replace the delimiting key */ int leaf_shift_right(struct tree_balance *tb, int shift_num, int shift_bytes) { int ret_value; /* * move shift_num (and shift_bytes) items from S[0] to * right neighbor R[0] */ ret_value = leaf_move_items(LEAF_FROM_S_TO_R, tb, shift_num, shift_bytes, NULL); /* replace rkey in CFR[0] by the 0-th key from R[0] */ if (shift_num) { replace_key(tb, tb->CFR[0], tb->rkey[0], tb->R[0], 0); } return ret_value; } static void leaf_delete_items_entirely(struct buffer_info *bi, int first, int del_num); /* * If del_bytes == -1, starting from position 'first' delete del_num * items in whole in buffer CUR. * If not. * If last_first == 0. Starting from position 'first' delete del_num-1 * items in whole. Delete part of body of the first item. Part defined by * del_bytes. Don't delete first item header * If last_first == 1. Starting from position 'first+1' delete del_num-1 * items in whole. Delete part of body of the last item . Part defined by * del_bytes. Don't delete last item header. */ void leaf_delete_items(struct buffer_info *cur_bi, int last_first, int first, int del_num, int del_bytes) { struct buffer_head *bh; int item_amount = B_NR_ITEMS(bh = cur_bi->bi_bh); RFALSE(!bh, "10155: bh is not defined"); RFALSE(del_num < 0, "10160: del_num can not be < 0. del_num==%d", del_num); RFALSE(first < 0 || first + del_num > item_amount, "10165: invalid number of first item to be deleted (%d) or " "no so much items (%d) to delete (only %d)", first, first + del_num, item_amount); if (del_num == 0) return; if (first == 0 && del_num == item_amount && del_bytes == -1) { make_empty_node(cur_bi); do_balance_mark_leaf_dirty(cur_bi->tb, bh, 0); return; } if (del_bytes == -1) /* delete del_num items beginning from item in position first */ leaf_delete_items_entirely(cur_bi, first, del_num); else { if (last_first == FIRST_TO_LAST) { /* * delete del_num-1 items beginning from * item in position first */ leaf_delete_items_entirely(cur_bi, first, del_num - 1); /* * delete the part of the first item of the bh * do not delete item header */ leaf_cut_from_buffer(cur_bi, 0, 0, del_bytes); } else { struct item_head *ih; int len; /* * delete del_num-1 items beginning from * item in position first+1 */ leaf_delete_items_entirely(cur_bi, first + 1, del_num - 1); ih = item_head(bh, B_NR_ITEMS(bh) - 1); if (is_direntry_le_ih(ih)) /* the last item is directory */ /* * len = numbers of directory entries * in this item */ len = ih_entry_count(ih); else /* len = body len of item */ len = ih_item_len(ih); /* * delete the part of the last item of the bh * do not delete item header */ leaf_cut_from_buffer(cur_bi, B_NR_ITEMS(bh) - 1, len - del_bytes, del_bytes); } } } /* insert item into the leaf node in position before */ void leaf_insert_into_buf(struct buffer_info *bi, int before, struct item_head * const inserted_item_ih, const char * const inserted_item_body, int zeros_number) { struct buffer_head *bh = bi->bi_bh; int nr, free_space; struct block_head *blkh; struct item_head *ih; int i; int last_loc, unmoved_loc; char *to; blkh = B_BLK_HEAD(bh); nr = blkh_nr_item(blkh); free_space = blkh_free_space(blkh); /* check free space */ RFALSE(free_space < ih_item_len(inserted_item_ih) + IH_SIZE, "vs-10170: not enough free space in block %z, new item %h", bh, inserted_item_ih); RFALSE(zeros_number > ih_item_len(inserted_item_ih), "vs-10172: zero number == %d, item length == %d", zeros_number, ih_item_len(inserted_item_ih)); /* get item new item must be inserted before */ ih = item_head(bh, before); /* prepare space for the body of new item */ last_loc = nr ? ih_location(&ih[nr - before - 1]) : bh->b_size; unmoved_loc = before ? ih_location(ih - 1) : bh->b_size; memmove(bh->b_data + last_loc - ih_item_len(inserted_item_ih), bh->b_data + last_loc, unmoved_loc - last_loc); to = bh->b_data + unmoved_loc - ih_item_len(inserted_item_ih); memset(to, 0, zeros_number); to += zeros_number; /* copy body to prepared space */ if (inserted_item_body) memmove(to, inserted_item_body, ih_item_len(inserted_item_ih) - zeros_number); else memset(to, '\0', ih_item_len(inserted_item_ih) - zeros_number); /* insert item header */ memmove(ih + 1, ih, IH_SIZE * (nr - before)); memmove(ih, inserted_item_ih, IH_SIZE); /* change locations */ for (i = before; i < nr + 1; i++) { unmoved_loc -= ih_item_len(&ih[i - before]); put_ih_location(&ih[i - before], unmoved_loc); } /* sizes, free space, item number */ set_blkh_nr_item(blkh, blkh_nr_item(blkh) + 1); set_blkh_free_space(blkh, free_space - (IH_SIZE + ih_item_len(inserted_item_ih))); do_balance_mark_leaf_dirty(bi->tb, bh, 1); if (bi->bi_parent) { struct disk_child *t_dc; t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); put_dc_size(t_dc, dc_size(t_dc) + (IH_SIZE + ih_item_len(inserted_item_ih))); do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); } } /* * paste paste_size bytes to affected_item_num-th item. * When item is a directory, this only prepare space for new entries */ void leaf_paste_in_buffer(struct buffer_info *bi, int affected_item_num, int pos_in_item, int paste_size, const char *body, int zeros_number) { struct buffer_head *bh = bi->bi_bh; int nr, free_space; struct block_head *blkh; struct item_head *ih; int i; int last_loc, unmoved_loc; blkh = B_BLK_HEAD(bh); nr = blkh_nr_item(blkh); free_space = blkh_free_space(blkh); /* check free space */ RFALSE(free_space < paste_size, "vs-10175: not enough free space: needed %d, available %d", paste_size, free_space); #ifdef CONFIG_REISERFS_CHECK if (zeros_number > paste_size) { struct super_block *sb = NULL; if (bi && bi->tb) sb = bi->tb->tb_sb; print_cur_tb("10177"); reiserfs_panic(sb, "vs-10177", "zeros_number == %d, paste_size == %d", zeros_number, paste_size); } #endif /* CONFIG_REISERFS_CHECK */ /* item to be appended */ ih = item_head(bh, affected_item_num); last_loc = ih_location(&ih[nr - affected_item_num - 1]); unmoved_loc = affected_item_num ? ih_location(ih - 1) : bh->b_size; /* prepare space */ memmove(bh->b_data + last_loc - paste_size, bh->b_data + last_loc, unmoved_loc - last_loc); /* change locations */ for (i = affected_item_num; i < nr; i++) put_ih_location(&ih[i - affected_item_num], ih_location(&ih[i - affected_item_num]) - paste_size); if (body) { if (!is_direntry_le_ih(ih)) { if (!pos_in_item) { /* shift data to right */ memmove(bh->b_data + ih_location(ih) + paste_size, bh->b_data + ih_location(ih), ih_item_len(ih)); /* paste data in the head of item */ memset(bh->b_data + ih_location(ih), 0, zeros_number); memcpy(bh->b_data + ih_location(ih) + zeros_number, body, paste_size - zeros_number); } else { memset(bh->b_data + unmoved_loc - paste_size, 0, zeros_number); memcpy(bh->b_data + unmoved_loc - paste_size + zeros_number, body, paste_size - zeros_number); } } } else memset(bh->b_data + unmoved_loc - paste_size, '\0', paste_size); put_ih_item_len(ih, ih_item_len(ih) + paste_size); /* change free space */ set_blkh_free_space(blkh, free_space - paste_size); do_balance_mark_leaf_dirty(bi->tb, bh, 0); if (bi->bi_parent) { struct disk_child *t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); put_dc_size(t_dc, dc_size(t_dc) + paste_size); do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); } } /* * cuts DEL_COUNT entries beginning from FROM-th entry. Directory item * does not have free space, so it moves DEHs and remaining records as * necessary. Return value is size of removed part of directory item * in bytes. */ static int leaf_cut_entries(struct buffer_head *bh, struct item_head *ih, int from, int del_count) { char *item; struct reiserfs_de_head *deh; int prev_record_offset; /* offset of record, that is (from-1)th */ char *prev_record; /* */ int cut_records_len; /* length of all removed records */ int i; /* * make sure that item is directory and there are enough entries to * remove */ RFALSE(!is_direntry_le_ih(ih), "10180: item is not directory item"); RFALSE(ih_entry_count(ih) < from + del_count, "10185: item contains not enough entries: entry_count = %d, from = %d, to delete = %d", ih_entry_count(ih), from, del_count); if (del_count == 0) return 0; /* first byte of item */ item = bh->b_data + ih_location(ih); /* entry head array */ deh = B_I_DEH(bh, ih); /* * first byte of remaining entries, those are BEFORE cut entries * (prev_record) and length of all removed records (cut_records_len) */ prev_record_offset = (from ? deh_location(&deh[from - 1]) : ih_item_len(ih)); cut_records_len = prev_record_offset /*from_record */ - deh_location(&deh[from + del_count - 1]); prev_record = item + prev_record_offset; /* adjust locations of remaining entries */ for (i = ih_entry_count(ih) - 1; i > from + del_count - 1; i--) put_deh_location(&deh[i], deh_location(&deh[i]) - (DEH_SIZE * del_count)); for (i = 0; i < from; i++) put_deh_location(&deh[i], deh_location(&deh[i]) - (DEH_SIZE * del_count + cut_records_len)); put_ih_entry_count(ih, ih_entry_count(ih) - del_count); /* shift entry head array and entries those are AFTER removed entries */ memmove((char *)(deh + from), deh + from + del_count, prev_record - cut_records_len - (char *)(deh + from + del_count)); /* shift records, those are BEFORE removed entries */ memmove(prev_record - cut_records_len - DEH_SIZE * del_count, prev_record, item + ih_item_len(ih) - prev_record); return DEH_SIZE * del_count + cut_records_len; } /* * when cut item is part of regular file * pos_in_item - first byte that must be cut * cut_size - number of bytes to be cut beginning from pos_in_item * * when cut item is part of directory * pos_in_item - number of first deleted entry * cut_size - count of deleted entries */ void leaf_cut_from_buffer(struct buffer_info *bi, int cut_item_num, int pos_in_item, int cut_size) { int nr; struct buffer_head *bh = bi->bi_bh; struct block_head *blkh; struct item_head *ih; int last_loc, unmoved_loc; int i; blkh = B_BLK_HEAD(bh); nr = blkh_nr_item(blkh); /* item head of truncated item */ ih = item_head(bh, cut_item_num); if (is_direntry_le_ih(ih)) { /* first cut entry () */ cut_size = leaf_cut_entries(bh, ih, pos_in_item, cut_size); if (pos_in_item == 0) { /* change key */ RFALSE(cut_item_num, "when 0-th enrty of item is cut, that item must be first in the node, not %d-th", cut_item_num); /* change item key by key of first entry in the item */ set_le_ih_k_offset(ih, deh_offset(B_I_DEH(bh, ih))); } } else { /* item is direct or indirect */ RFALSE(is_statdata_le_ih(ih), "10195: item is stat data"); RFALSE(pos_in_item && pos_in_item + cut_size != ih_item_len(ih), "10200: invalid offset (%lu) or trunc_size (%lu) or ih_item_len (%lu)", (long unsigned)pos_in_item, (long unsigned)cut_size, (long unsigned)ih_item_len(ih)); /* shift item body to left if cut is from the head of item */ if (pos_in_item == 0) { memmove(bh->b_data + ih_location(ih), bh->b_data + ih_location(ih) + cut_size, ih_item_len(ih) - cut_size); /* change key of item */ if (is_direct_le_ih(ih)) set_le_ih_k_offset(ih, le_ih_k_offset(ih) + cut_size); else { set_le_ih_k_offset(ih, le_ih_k_offset(ih) + (cut_size / UNFM_P_SIZE) * bh->b_size); RFALSE(ih_item_len(ih) == cut_size && get_ih_free_space(ih), "10205: invalid ih_free_space (%h)", ih); } } } /* location of the last item */ last_loc = ih_location(&ih[nr - cut_item_num - 1]); /* location of the item, which is remaining at the same place */ unmoved_loc = cut_item_num ? ih_location(ih - 1) : bh->b_size; /* shift */ memmove(bh->b_data + last_loc + cut_size, bh->b_data + last_loc, unmoved_loc - last_loc - cut_size); /* change item length */ put_ih_item_len(ih, ih_item_len(ih) - cut_size); if (is_indirect_le_ih(ih)) { if (pos_in_item) set_ih_free_space(ih, 0); } /* change locations */ for (i = cut_item_num; i < nr; i++) put_ih_location(&ih[i - cut_item_num], ih_location(&ih[i - cut_item_num]) + cut_size); /* size, free space */ set_blkh_free_space(blkh, blkh_free_space(blkh) + cut_size); do_balance_mark_leaf_dirty(bi->tb, bh, 0); if (bi->bi_parent) { struct disk_child *t_dc; t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); put_dc_size(t_dc, dc_size(t_dc) - cut_size); do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); } } /* delete del_num items from buffer starting from the first'th item */ static void leaf_delete_items_entirely(struct buffer_info *bi, int first, int del_num) { struct buffer_head *bh = bi->bi_bh; int nr; int i, j; int last_loc, last_removed_loc; struct block_head *blkh; struct item_head *ih; RFALSE(bh == NULL, "10210: buffer is 0"); RFALSE(del_num < 0, "10215: del_num less than 0 (%d)", del_num); if (del_num == 0) return; blkh = B_BLK_HEAD(bh); nr = blkh_nr_item(blkh); RFALSE(first < 0 || first + del_num > nr, "10220: first=%d, number=%d, there is %d items", first, del_num, nr); if (first == 0 && del_num == nr) { /* this does not work */ make_empty_node(bi); do_balance_mark_leaf_dirty(bi->tb, bh, 0); return; } ih = item_head(bh, first); /* location of unmovable item */ j = (first == 0) ? bh->b_size : ih_location(ih - 1); /* delete items */ last_loc = ih_location(&ih[nr - 1 - first]); last_removed_loc = ih_location(&ih[del_num - 1]); memmove(bh->b_data + last_loc + j - last_removed_loc, bh->b_data + last_loc, last_removed_loc - last_loc); /* delete item headers */ memmove(ih, ih + del_num, (nr - first - del_num) * IH_SIZE); /* change item location */ for (i = first; i < nr - del_num; i++) put_ih_location(&ih[i - first], ih_location(&ih[i - first]) + (j - last_removed_loc)); /* sizes, item number */ set_blkh_nr_item(blkh, blkh_nr_item(blkh) - del_num); set_blkh_free_space(blkh, blkh_free_space(blkh) + (j - last_removed_loc + IH_SIZE * del_num)); do_balance_mark_leaf_dirty(bi->tb, bh, 0); if (bi->bi_parent) { struct disk_child *t_dc = B_N_CHILD(bi->bi_parent, bi->bi_position); put_dc_size(t_dc, dc_size(t_dc) - (j - last_removed_loc + IH_SIZE * del_num)); do_balance_mark_internal_dirty(bi->tb, bi->bi_parent, 0); } } /* * paste new_entry_count entries (new_dehs, records) into position * before to item_num-th item */ void leaf_paste_entries(struct buffer_info *bi, int item_num, int before, int new_entry_count, struct reiserfs_de_head *new_dehs, const char *records, int paste_size) { struct item_head *ih; char *item; struct reiserfs_de_head *deh; char *insert_point; int i; struct buffer_head *bh = bi->bi_bh; if (new_entry_count == 0) return; ih = item_head(bh, item_num); /* * make sure, that item is directory, and there are enough * records in it */ RFALSE(!is_direntry_le_ih(ih), "10225: item is not directory item"); RFALSE(ih_entry_count(ih) < before, "10230: there are no entry we paste entries before. entry_count = %d, before = %d", ih_entry_count(ih), before); /* first byte of dest item */ item = bh->b_data + ih_location(ih); /* entry head array */ deh = B_I_DEH(bh, ih); /* new records will be pasted at this point */ insert_point = item + (before ? deh_location(&deh[before - 1]) : (ih_item_len(ih) - paste_size)); /* adjust locations of records that will be AFTER new records */ for (i = ih_entry_count(ih) - 1; i >= before; i--) put_deh_location(&deh[i], deh_location(&deh[i]) + (DEH_SIZE * new_entry_count)); /* adjust locations of records that will be BEFORE new records */ for (i = 0; i < before; i++) put_deh_location(&deh[i], deh_location(&deh[i]) + paste_size); put_ih_entry_count(ih, ih_entry_count(ih) + new_entry_count); /* prepare space for pasted records */ memmove(insert_point + paste_size, insert_point, item + (ih_item_len(ih) - paste_size) - insert_point); /* copy new records */ memcpy(insert_point + DEH_SIZE * new_entry_count, records, paste_size - DEH_SIZE * new_entry_count); /* prepare space for new entry heads */ deh += before; memmove((char *)(deh + new_entry_count), deh, insert_point - (char *)deh); /* copy new entry heads */ deh = (struct reiserfs_de_head *)((char *)deh); memcpy(deh, new_dehs, DEH_SIZE * new_entry_count); /* set locations of new records */ for (i = 0; i < new_entry_count; i++) { put_deh_location(&deh[i], deh_location(&deh[i]) + (-deh_location (&new_dehs[new_entry_count - 1]) + insert_point + DEH_SIZE * new_entry_count - item)); } /* change item key if necessary (when we paste before 0-th entry */ if (!before) { set_le_ih_k_offset(ih, deh_offset(new_dehs)); } #ifdef CONFIG_REISERFS_CHECK { int prev, next; /* check record locations */ deh = B_I_DEH(bh, ih); for (i = 0; i < ih_entry_count(ih); i++) { next = (i < ih_entry_count(ih) - 1) ? deh_location(&deh[i + 1]) : 0; prev = (i != 0) ? deh_location(&deh[i - 1]) : 0; if (prev && prev <= deh_location(&deh[i])) reiserfs_error(sb_from_bi(bi), "vs-10240", "directory item (%h) " "corrupted (prev %a, " "cur(%d) %a)", ih, deh + i - 1, i, deh + i); if (next && next >= deh_location(&deh[i])) reiserfs_error(sb_from_bi(bi), "vs-10250", "directory item (%h) " "corrupted (cur(%d) %a, " "next %a)", ih, i, deh + i, deh + i + 1); } } #endif } |
10 10 43 24 16 16 5 11 24 24 10 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 | /* * linux/drivers/video/modedb.c -- Standard video mode database management * * Copyright (C) 1999 Geert Uytterhoeven * * 2001 - Documented with DocBook * - Brad Douglas <brad@neruo.com> * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of this archive for * more details. */ #include <linux/module.h> #include <linux/slab.h> #include <linux/fb.h> #include <linux/kernel.h> #undef DEBUG #define name_matches(v, s, l) \ ((v).name && !strncmp((s), (v).name, (l)) && strlen((v).name) == (l)) #define res_matches(v, x, y) \ ((v).xres == (x) && (v).yres == (y)) #ifdef DEBUG #define DPRINTK(fmt, args...) printk("modedb %s: " fmt, __func__ , ## args) #else #define DPRINTK(fmt, args...) #endif /* * Standard video mode definitions (taken from XFree86) */ static const struct fb_videomode modedb[] = { /* 640x400 @ 70 Hz, 31.5 kHz hsync */ { NULL, 70, 640, 400, 39721, 40, 24, 39, 9, 96, 2, 0, FB_VMODE_NONINTERLACED }, /* 640x480 @ 60 Hz, 31.5 kHz hsync */ { NULL, 60, 640, 480, 39721, 40, 24, 32, 11, 96, 2, 0, FB_VMODE_NONINTERLACED }, /* 800x600 @ 56 Hz, 35.15 kHz hsync */ { NULL, 56, 800, 600, 27777, 128, 24, 22, 1, 72, 2, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 87 Hz interlaced, 35.5 kHz hsync */ { NULL, 87, 1024, 768, 22271, 56, 24, 33, 8, 160, 8, 0, FB_VMODE_INTERLACED }, /* 640x400 @ 85 Hz, 37.86 kHz hsync */ { NULL, 85, 640, 400, 31746, 96, 32, 41, 1, 64, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 640x480 @ 72 Hz, 36.5 kHz hsync */ { NULL, 72, 640, 480, 31746, 144, 40, 30, 8, 40, 3, 0, FB_VMODE_NONINTERLACED }, /* 640x480 @ 75 Hz, 37.50 kHz hsync */ { NULL, 75, 640, 480, 31746, 120, 16, 16, 1, 64, 3, 0, FB_VMODE_NONINTERLACED }, /* 800x600 @ 60 Hz, 37.8 kHz hsync */ { NULL, 60, 800, 600, 25000, 88, 40, 23, 1, 128, 4, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 640x480 @ 85 Hz, 43.27 kHz hsync */ { NULL, 85, 640, 480, 27777, 80, 56, 25, 1, 56, 3, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 89 Hz interlaced, 44 kHz hsync */ { NULL, 89, 1152, 864, 15384, 96, 16, 110, 1, 216, 10, 0, FB_VMODE_INTERLACED }, /* 800x600 @ 72 Hz, 48.0 kHz hsync */ { NULL, 72, 800, 600, 20000, 64, 56, 23, 37, 120, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 60 Hz, 48.4 kHz hsync */ { NULL, 60, 1024, 768, 15384, 168, 8, 29, 3, 144, 6, 0, FB_VMODE_NONINTERLACED }, /* 640x480 @ 100 Hz, 53.01 kHz hsync */ { NULL, 100, 640, 480, 21834, 96, 32, 36, 8, 96, 6, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 60 Hz, 53.5 kHz hsync */ { NULL, 60, 1152, 864, 11123, 208, 64, 16, 4, 256, 8, 0, FB_VMODE_NONINTERLACED }, /* 800x600 @ 85 Hz, 55.84 kHz hsync */ { NULL, 85, 800, 600, 16460, 160, 64, 36, 16, 64, 5, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 70 Hz, 56.5 kHz hsync */ { NULL, 70, 1024, 768, 13333, 144, 24, 29, 3, 136, 6, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 87 Hz interlaced, 51 kHz hsync */ { NULL, 87, 1280, 1024, 12500, 56, 16, 128, 1, 216, 12, 0, FB_VMODE_INTERLACED }, /* 800x600 @ 100 Hz, 64.02 kHz hsync */ { NULL, 100, 800, 600, 14357, 160, 64, 30, 4, 64, 6, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 76 Hz, 62.5 kHz hsync */ { NULL, 76, 1024, 768, 11764, 208, 8, 36, 16, 120, 3, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 70 Hz, 62.4 kHz hsync */ { NULL, 70, 1152, 864, 10869, 106, 56, 20, 1, 160, 10, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 61 Hz, 64.2 kHz hsync */ { NULL, 61, 1280, 1024, 9090, 200, 48, 26, 1, 184, 3, 0, FB_VMODE_NONINTERLACED }, /* 1400x1050 @ 60Hz, 63.9 kHz hsync */ { NULL, 60, 1400, 1050, 9259, 136, 40, 13, 1, 112, 3, 0, FB_VMODE_NONINTERLACED }, /* 1400x1050 @ 75,107 Hz, 82,392 kHz +hsync +vsync*/ { NULL, 75, 1400, 1050, 7190, 120, 56, 23, 10, 112, 13, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1400x1050 @ 60 Hz, ? kHz +hsync +vsync*/ { NULL, 60, 1400, 1050, 9259, 128, 40, 12, 0, 112, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 85 Hz, 70.24 kHz hsync */ { NULL, 85, 1024, 768, 10111, 192, 32, 34, 14, 160, 6, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 78 Hz, 70.8 kHz hsync */ { NULL, 78, 1152, 864, 9090, 228, 88, 32, 0, 84, 12, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 70 Hz, 74.59 kHz hsync */ { NULL, 70, 1280, 1024, 7905, 224, 32, 28, 8, 160, 8, 0, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 60Hz, 75.00 kHz hsync */ { NULL, 60, 1600, 1200, 6172, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 84 Hz, 76.0 kHz hsync */ { NULL, 84, 1152, 864, 7407, 184, 312, 32, 0, 128, 12, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 74 Hz, 78.85 kHz hsync */ { NULL, 74, 1280, 1024, 7407, 256, 32, 34, 3, 144, 3, 0, FB_VMODE_NONINTERLACED }, /* 1024x768 @ 100Hz, 80.21 kHz hsync */ { NULL, 100, 1024, 768, 8658, 192, 32, 21, 3, 192, 10, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 76 Hz, 81.13 kHz hsync */ { NULL, 76, 1280, 1024, 7407, 248, 32, 34, 3, 104, 3, 0, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 70 Hz, 87.50 kHz hsync */ { NULL, 70, 1600, 1200, 5291, 304, 64, 46, 1, 192, 3, 0, FB_VMODE_NONINTERLACED }, /* 1152x864 @ 100 Hz, 89.62 kHz hsync */ { NULL, 100, 1152, 864, 7264, 224, 32, 17, 2, 128, 19, 0, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 85 Hz, 91.15 kHz hsync */ { NULL, 85, 1280, 1024, 6349, 224, 64, 44, 1, 160, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 75 Hz, 93.75 kHz hsync */ { NULL, 75, 1600, 1200, 4938, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1680x1050 @ 60 Hz, 65.191 kHz hsync */ { NULL, 60, 1680, 1050, 6848, 280, 104, 30, 3, 176, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1600x1200 @ 85 Hz, 105.77 kHz hsync */ { NULL, 85, 1600, 1200, 4545, 272, 16, 37, 4, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1280x1024 @ 100 Hz, 107.16 kHz hsync */ { NULL, 100, 1280, 1024, 5502, 256, 32, 26, 7, 128, 15, 0, FB_VMODE_NONINTERLACED }, /* 1800x1440 @ 64Hz, 96.15 kHz hsync */ { NULL, 64, 1800, 1440, 4347, 304, 96, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1800x1440 @ 70Hz, 104.52 kHz hsync */ { NULL, 70, 1800, 1440, 4000, 304, 96, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 512x384 @ 78 Hz, 31.50 kHz hsync */ { NULL, 78, 512, 384, 49603, 48, 16, 16, 1, 64, 3, 0, FB_VMODE_NONINTERLACED }, /* 512x384 @ 85 Hz, 34.38 kHz hsync */ { NULL, 85, 512, 384, 45454, 48, 16, 16, 1, 64, 3, 0, FB_VMODE_NONINTERLACED }, /* 320x200 @ 70 Hz, 31.5 kHz hsync, 8:5 aspect ratio */ { NULL, 70, 320, 200, 79440, 16, 16, 20, 4, 48, 1, 0, FB_VMODE_DOUBLE }, /* 320x240 @ 60 Hz, 31.5 kHz hsync, 4:3 aspect ratio */ { NULL, 60, 320, 240, 79440, 16, 16, 16, 5, 48, 1, 0, FB_VMODE_DOUBLE }, /* 320x240 @ 72 Hz, 36.5 kHz hsync */ { NULL, 72, 320, 240, 63492, 16, 16, 16, 4, 48, 2, 0, FB_VMODE_DOUBLE }, /* 400x300 @ 56 Hz, 35.2 kHz hsync, 4:3 aspect ratio */ { NULL, 56, 400, 300, 55555, 64, 16, 10, 1, 32, 1, 0, FB_VMODE_DOUBLE }, /* 400x300 @ 60 Hz, 37.8 kHz hsync */ { NULL, 60, 400, 300, 50000, 48, 16, 11, 1, 64, 2, 0, FB_VMODE_DOUBLE }, /* 400x300 @ 72 Hz, 48.0 kHz hsync */ { NULL, 72, 400, 300, 40000, 32, 24, 11, 19, 64, 3, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 56 Hz, 35.2 kHz hsync, 8:5 aspect ratio */ { NULL, 56, 480, 300, 46176, 80, 16, 10, 1, 40, 1, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 60 Hz, 37.8 kHz hsync */ { NULL, 60, 480, 300, 41858, 56, 16, 11, 1, 80, 2, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 63 Hz, 39.6 kHz hsync */ { NULL, 63, 480, 300, 40000, 56, 16, 11, 1, 80, 2, 0, FB_VMODE_DOUBLE }, /* 480x300 @ 72 Hz, 48.0 kHz hsync */ { NULL, 72, 480, 300, 33386, 40, 24, 11, 19, 80, 3, 0, FB_VMODE_DOUBLE }, /* 1920x1080 @ 60 Hz, 67.3 kHz hsync */ { NULL, 60, 1920, 1080, 6734, 148, 88, 36, 4, 44, 5, 0, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1920x1200 @ 60 Hz, 74.5 Khz hsync */ { NULL, 60, 1920, 1200, 5177, 128, 336, 1, 38, 208, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1152x768, 60 Hz, PowerBook G4 Titanium I and II */ { NULL, 60, 1152, 768, 14047, 158, 26, 29, 3, 136, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED }, /* 1366x768, 60 Hz, 47.403 kHz hsync, WXGA 16:9 aspect ratio */ { NULL, 60, 1366, 768, 13806, 120, 10, 14, 3, 32, 5, 0, FB_VMODE_NONINTERLACED }, /* 1280x800, 60 Hz, 47.403 kHz hsync, WXGA 16:10 aspect ratio */ { NULL, 60, 1280, 800, 12048, 200, 64, 24, 1, 136, 3, 0, FB_VMODE_NONINTERLACED }, /* 720x576i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ { NULL, 50, 720, 576, 74074, 64, 16, 39, 5, 64, 5, 0, FB_VMODE_INTERLACED }, /* 800x520i @ 50 Hz, 15.625 kHz hsync (PAL RGB) */ { NULL, 50, 800, 520, 58823, 144, 64, 72, 28, 80, 5, 0, FB_VMODE_INTERLACED }, /* 864x480 @ 60 Hz, 35.15 kHz hsync */ { NULL, 60, 864, 480, 27777, 1, 1, 1, 1, 0, 0, 0, FB_VMODE_NONINTERLACED }, }; #ifdef CONFIG_FB_MODE_HELPERS const struct fb_videomode vesa_modes[] = { /* 0 640x350-85 VESA */ { NULL, 85, 640, 350, 31746, 96, 32, 60, 32, 64, 3, FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA}, /* 1 640x400-85 VESA */ { NULL, 85, 640, 400, 31746, 96, 32, 41, 01, 64, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 2 720x400-85 VESA */ { NULL, 85, 721, 400, 28169, 108, 36, 42, 01, 72, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 3 640x480-60 VESA */ { NULL, 60, 640, 480, 39682, 48, 16, 33, 10, 96, 2, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 4 640x480-72 VESA */ { NULL, 72, 640, 480, 31746, 128, 24, 29, 9, 40, 2, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 5 640x480-75 VESA */ { NULL, 75, 640, 480, 31746, 120, 16, 16, 01, 64, 3, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 6 640x480-85 VESA */ { NULL, 85, 640, 480, 27777, 80, 56, 25, 01, 56, 3, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 7 800x600-56 VESA */ { NULL, 56, 800, 600, 27777, 128, 24, 22, 01, 72, 2, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 8 800x600-60 VESA */ { NULL, 60, 800, 600, 25000, 88, 40, 23, 01, 128, 4, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 9 800x600-72 VESA */ { NULL, 72, 800, 600, 20000, 64, 56, 23, 37, 120, 6, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 10 800x600-75 VESA */ { NULL, 75, 800, 600, 20202, 160, 16, 21, 01, 80, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 11 800x600-85 VESA */ { NULL, 85, 800, 600, 17761, 152, 32, 27, 01, 64, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 12 1024x768i-43 VESA */ { NULL, 43, 1024, 768, 22271, 56, 8, 41, 0, 176, 8, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_INTERLACED, FB_MODE_IS_VESA }, /* 13 1024x768-60 VESA */ { NULL, 60, 1024, 768, 15384, 160, 24, 29, 3, 136, 6, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 14 1024x768-70 VESA */ { NULL, 70, 1024, 768, 13333, 144, 24, 29, 3, 136, 6, 0, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 15 1024x768-75 VESA */ { NULL, 75, 1024, 768, 12690, 176, 16, 28, 1, 96, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 16 1024x768-85 VESA */ { NULL, 85, 1024, 768, 10582, 208, 48, 36, 1, 96, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 17 1152x864-75 VESA */ { NULL, 75, 1152, 864, 9259, 256, 64, 32, 1, 128, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 18 1280x960-60 VESA */ { NULL, 60, 1280, 960, 9259, 312, 96, 36, 1, 112, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 19 1280x960-85 VESA */ { NULL, 85, 1280, 960, 6734, 224, 64, 47, 1, 160, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 20 1280x1024-60 VESA */ { NULL, 60, 1280, 1024, 9259, 248, 48, 38, 1, 112, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 21 1280x1024-75 VESA */ { NULL, 75, 1280, 1024, 7407, 248, 16, 38, 1, 144, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 22 1280x1024-85 VESA */ { NULL, 85, 1280, 1024, 6349, 224, 64, 44, 1, 160, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 23 1600x1200-60 VESA */ { NULL, 60, 1600, 1200, 6172, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 24 1600x1200-65 VESA */ { NULL, 65, 1600, 1200, 5698, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 25 1600x1200-70 VESA */ { NULL, 70, 1600, 1200, 5291, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 26 1600x1200-75 VESA */ { NULL, 75, 1600, 1200, 4938, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 27 1600x1200-85 VESA */ { NULL, 85, 1600, 1200, 4357, 304, 64, 46, 1, 192, 3, FB_SYNC_HOR_HIGH_ACT | FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 28 1792x1344-60 VESA */ { NULL, 60, 1792, 1344, 4882, 328, 128, 46, 1, 200, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 29 1792x1344-75 VESA */ { NULL, 75, 1792, 1344, 3831, 352, 96, 69, 1, 216, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 30 1856x1392-60 VESA */ { NULL, 60, 1856, 1392, 4580, 352, 96, 43, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 31 1856x1392-75 VESA */ { NULL, 75, 1856, 1392, 3472, 352, 128, 104, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 32 1920x1440-60 VESA */ { NULL, 60, 1920, 1440, 4273, 344, 128, 56, 1, 200, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 33 1920x1440-75 VESA */ { NULL, 75, 1920, 1440, 3367, 352, 144, 56, 1, 224, 3, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 34 1920x1200-60 RB VESA */ { NULL, 60, 1920, 1200, 6493, 80, 48, 26, 3, 32, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 35 1920x1200-60 VESA */ { NULL, 60, 1920, 1200, 5174, 336, 136, 36, 3, 200, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 36 1920x1200-75 VESA */ { NULL, 75, 1920, 1200, 4077, 344, 136, 46, 3, 208, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 37 1920x1200-85 VESA */ { NULL, 85, 1920, 1200, 3555, 352, 144, 53, 3, 208, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 38 2560x1600-60 RB VESA */ { NULL, 60, 2560, 1600, 3724, 80, 48, 37, 3, 32, 6, FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 39 2560x1600-60 VESA */ { NULL, 60, 2560, 1600, 2869, 472, 192, 49, 3, 280, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 40 2560x1600-75 VESA */ { NULL, 75, 2560, 1600, 2256, 488, 208, 63, 3, 280, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 41 2560x1600-85 VESA */ { NULL, 85, 2560, 1600, 1979, 488, 208, 73, 3, 280, 6, FB_SYNC_VERT_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, /* 42 2560x1600-120 RB VESA */ { NULL, 120, 2560, 1600, 1809, 80, 48, 85, 3, 32, 6, FB_SYNC_HOR_HIGH_ACT, FB_VMODE_NONINTERLACED, FB_MODE_IS_VESA }, }; EXPORT_SYMBOL(vesa_modes); const struct dmt_videomode dmt_modes[DMT_SIZE] = { { 0x01, 0x0000, 0x000000, &vesa_modes[0] }, { 0x02, 0x3119, 0x000000, &vesa_modes[1] }, { 0x03, 0x0000, 0x000000, &vesa_modes[2] }, { 0x04, 0x3140, 0x000000, &vesa_modes[3] }, { 0x05, 0x314c, 0x000000, &vesa_modes[4] }, { 0x06, 0x314f, 0x000000, &vesa_modes[5] }, { 0x07, 0x3159, 0x000000, &vesa_modes[6] }, { 0x08, 0x0000, 0x000000, &vesa_modes[7] }, { 0x09, 0x4540, 0x000000, &vesa_modes[8] }, { 0x0a, 0x454c, 0x000000, &vesa_modes[9] }, { 0x0b, 0x454f, 0x000000, &vesa_modes[10] }, { 0x0c, 0x4559, 0x000000, &vesa_modes[11] }, { 0x0d, 0x0000, 0x000000, NULL }, { 0x0e, 0x0000, 0x000000, NULL }, { 0x0f, 0x0000, 0x000000, &vesa_modes[12] }, { 0x10, 0x6140, 0x000000, &vesa_modes[13] }, { 0x11, 0x614a, 0x000000, &vesa_modes[14] }, { 0x12, 0x614f, 0x000000, &vesa_modes[15] }, { 0x13, 0x6159, 0x000000, &vesa_modes[16] }, { 0x14, 0x0000, 0x000000, NULL }, { 0x15, 0x714f, 0x000000, &vesa_modes[17] }, { 0x16, 0x0000, 0x7f1c21, NULL }, { 0x17, 0x0000, 0x7f1c28, NULL }, { 0x18, 0x0000, 0x7f1c44, NULL }, { 0x19, 0x0000, 0x7f1c62, NULL }, { 0x1a, 0x0000, 0x000000, NULL }, { 0x1b, 0x0000, 0x8f1821, NULL }, { 0x1c, 0x8100, 0x8f1828, NULL }, { 0x1d, 0x810f, 0x8f1844, NULL }, { 0x1e, 0x8119, 0x8f1862, NULL }, { 0x1f, 0x0000, 0x000000, NULL }, { 0x20, 0x8140, 0x000000, &vesa_modes[18] }, { 0x21, 0x8159, 0x000000, &vesa_modes[19] }, { 0x22, 0x0000, 0x000000, NULL }, { 0x23, 0x8180, 0x000000, &vesa_modes[20] }, { 0x24, 0x818f, 0x000000, &vesa_modes[21] }, { 0x25, 0x8199, 0x000000, &vesa_modes[22] }, { 0x26, 0x0000, 0x000000, NULL }, { 0x27, 0x0000, 0x000000, NULL }, { 0x28, 0x0000, 0x000000, NULL }, { 0x29, 0x0000, 0x0c2021, NULL }, { 0x2a, 0x9040, 0x0c2028, NULL }, { 0x2b, 0x904f, 0x0c2044, NULL }, { 0x2c, 0x9059, 0x0c2062, NULL }, { 0x2d, 0x0000, 0x000000, NULL }, { 0x2e, 0x9500, 0xc11821, NULL }, { 0x2f, 0x9500, 0xc11828, NULL }, { 0x30, 0x950f, 0xc11844, NULL }, { 0x31, 0x9519, 0xc11868, NULL }, { 0x32, 0x0000, 0x000000, NULL }, { 0x33, 0xa940, 0x000000, &vesa_modes[23] }, { 0x34, 0xa945, 0x000000, &vesa_modes[24] }, { 0x35, 0xa94a, 0x000000, &vesa_modes[25] }, { 0x36, 0xa94f, 0x000000, &vesa_modes[26] }, { 0x37, 0xa959, 0x000000, &vesa_modes[27] }, { 0x38, 0x0000, 0x000000, NULL }, { 0x39, 0x0000, 0x0c2821, NULL }, { 0x3a, 0xb300, 0x0c2828, NULL }, { 0x3b, 0xb30f, 0x0c2844, NULL }, { 0x3c, 0xb319, 0x0c2868, NULL }, { 0x3d, 0x0000, 0x000000, NULL }, { 0x3e, 0xc140, 0x000000, &vesa_modes[28] }, { 0x3f, 0xc14f, 0x000000, &vesa_modes[29] }, { 0x40, 0x0000, 0x000000, NULL}, { 0x41, 0xc940, 0x000000, &vesa_modes[30] }, { 0x42, 0xc94f, 0x000000, &vesa_modes[31] }, { 0x43, 0x0000, 0x000000, NULL }, { 0x44, 0x0000, 0x572821, &vesa_modes[34] }, { 0x45, 0xd100, 0x572828, &vesa_modes[35] }, { 0x46, 0xd10f, 0x572844, &vesa_modes[36] }, { 0x47, 0xd119, 0x572862, &vesa_modes[37] }, { 0x48, 0x0000, 0x000000, NULL }, { 0x49, 0xd140, 0x000000, &vesa_modes[32] }, { 0x4a, 0xd14f, 0x000000, &vesa_modes[33] }, { 0x4b, 0x0000, 0x000000, NULL }, { 0x4c, 0x0000, 0x1f3821, &vesa_modes[38] }, { 0x4d, 0x0000, 0x1f3828, &vesa_modes[39] }, { 0x4e, 0x0000, 0x1f3844, &vesa_modes[40] }, { 0x4f, 0x0000, 0x1f3862, &vesa_modes[41] }, { 0x50, 0x0000, 0x000000, &vesa_modes[42] }, }; EXPORT_SYMBOL(dmt_modes); #endif /* CONFIG_FB_MODE_HELPERS */ /** * fb_try_mode - test a video mode * @var: frame buffer user defined part of display * @info: frame buffer info structure * @mode: frame buffer video mode structure * @bpp: color depth in bits per pixel * * Tries a video mode to test it's validity for device @info. * * Returns 1 on success. * */ static int fb_try_mode(struct fb_var_screeninfo *var, struct fb_info *info, const struct fb_videomode *mode, unsigned int bpp) { int err = 0; DPRINTK("Trying mode %s %dx%d-%d@%d\n", mode->name ? mode->name : "noname", mode->xres, mode->yres, bpp, mode->refresh); var->xres = mode->xres; var->yres = mode->yres; var->xres_virtual = mode->xres; var->yres_virtual = mode->yres; var->xoffset = 0; var->yoffset = 0; var->bits_per_pixel = bpp; var->activate |= FB_ACTIVATE_TEST; var->pixclock = mode->pixclock; var->left_margin = mode->left_margin; var->right_margin = mode->right_margin; var->upper_margin = mode->upper_margin; var->lower_margin = mode->lower_margin; var->hsync_len = mode->hsync_len; var->vsync_len = mode->vsync_len; var->sync = mode->sync; var->vmode = mode->vmode; if (info->fbops->fb_check_var) err = info->fbops->fb_check_var(var, info); var->activate &= ~FB_ACTIVATE_TEST; return err; } /** * fb_find_mode - finds a valid video mode * @var: frame buffer user defined part of display * @info: frame buffer info structure * @mode_option: string video mode to find * @db: video mode database * @dbsize: size of @db * @default_mode: default video mode to fall back to * @default_bpp: default color depth in bits per pixel * * Finds a suitable video mode, starting with the specified mode * in @mode_option with fallback to @default_mode. If * @default_mode fails, all modes in the video mode database will * be tried. * * Valid mode specifiers for @mode_option:: * * <xres>x<yres>[M][R][-<bpp>][@<refresh>][i][p][m] * * or :: * * <name>[-<bpp>][@<refresh>] * * with <xres>, <yres>, <bpp> and <refresh> decimal numbers and * <name> a string. * * If 'M' is present after yres (and before refresh/bpp if present), * the function will compute the timings using VESA(tm) Coordinated * Video Timings (CVT). If 'R' is present after 'M', will compute with * reduced blanking (for flatpanels). If 'i' or 'p' are present, compute * interlaced or progressive mode. If 'm' is present, add margins equal * to 1.8% of xres rounded down to 8 pixels, and 1.8% of yres. The char * 'i', 'p' and 'm' must be after 'M' and 'R'. Example:: * * 1024x768MR-8@60m - Reduced blank with margins at 60Hz. * * NOTE: The passed struct @var is _not_ cleared! This allows you * to supply values for e.g. the grayscale and accel_flags fields. * * Returns zero for failure, 1 if using specified @mode_option, * 2 if using specified @mode_option with an ignored refresh rate, * 3 if default mode is used, 4 if fall back to any valid mode. */ int fb_find_mode(struct fb_var_screeninfo *var, struct fb_info *info, const char *mode_option, const struct fb_videomode *db, unsigned int dbsize, const struct fb_videomode *default_mode, unsigned int default_bpp) { char *mode_option_buf = NULL; int i; /* Set up defaults */ if (!db) { db = modedb; dbsize = ARRAY_SIZE(modedb); } if (!default_mode) default_mode = &db[0]; if (!default_bpp) default_bpp = 8; /* Did the user specify a video mode? */ if (!mode_option) { fb_get_options(NULL, &mode_option_buf); mode_option = mode_option_buf; } if (mode_option) { const char *name = mode_option; unsigned int namelen = strlen(name); int res_specified = 0, bpp_specified = 0, refresh_specified = 0; unsigned int xres = 0, yres = 0, bpp = default_bpp, refresh = 0; int yres_specified = 0, cvt = 0, rb = 0; int interlace_specified = 0, interlace = 0; int margins = 0; u32 best, diff, tdiff; for (i = namelen-1; i >= 0; i--) { switch (name[i]) { case '@': namelen = i; if (!refresh_specified && !bpp_specified && !yres_specified) { refresh = simple_strtol(&name[i+1], NULL, 10); refresh_specified = 1; if (cvt || rb) cvt = 0; } else goto done; break; case '-': namelen = i; if (!bpp_specified && !yres_specified) { bpp = simple_strtol(&name[i+1], NULL, 10); bpp_specified = 1; if (cvt || rb) cvt = 0; } else goto done; break; case 'x': if (!yres_specified) { yres = simple_strtol(&name[i+1], NULL, 10); yres_specified = 1; } else goto done; break; case '0' ... '9': break; case 'M': if (!yres_specified) cvt = 1; break; case 'R': if (!cvt) rb = 1; break; case 'm': if (!cvt) margins = 1; break; case 'p': if (!cvt) { interlace = 0; interlace_specified = 1; } break; case 'i': if (!cvt) { interlace = 1; interlace_specified = 1; } break; default: goto done; } } if (i < 0 && yres_specified) { xres = simple_strtol(name, NULL, 10); res_specified = 1; } done: kfree(mode_option_buf); if (cvt) { struct fb_videomode cvt_mode; int ret; DPRINTK("CVT mode %dx%d@%dHz%s%s%s\n", xres, yres, (refresh) ? refresh : 60, (rb) ? " reduced blanking" : "", (margins) ? " with margins" : "", (interlace) ? " interlaced" : ""); memset(&cvt_mode, 0, sizeof(cvt_mode)); cvt_mode.xres = xres; cvt_mode.yres = yres; cvt_mode.refresh = (refresh) ? refresh : 60; if (interlace) cvt_mode.vmode |= FB_VMODE_INTERLACED; else cvt_mode.vmode &= ~FB_VMODE_INTERLACED; ret = fb_find_mode_cvt(&cvt_mode, margins, rb); if (!ret && !fb_try_mode(var, info, &cvt_mode, bpp)) { DPRINTK("modedb CVT: CVT mode ok\n"); return 1; } DPRINTK("CVT mode invalid, getting mode from database\n"); } DPRINTK("Trying specified video mode%s %ix%i\n", refresh_specified ? "" : " (ignoring refresh rate)", xres, yres); if (!refresh_specified) { /* * If the caller has provided a custom mode database and * a valid monspecs structure, we look for the mode with * the highest refresh rate. Otherwise we play it safe * it and try to find a mode with a refresh rate closest * to the standard 60 Hz. */ if (db != modedb && info->monspecs.vfmin && info->monspecs.vfmax && info->monspecs.hfmin && info->monspecs.hfmax && info->monspecs.dclkmax) { refresh = 1000; } else { refresh = 60; } } diff = -1; best = -1; for (i = 0; i < dbsize; i++) { if ((name_matches(db[i], name, namelen) || (res_specified && res_matches(db[i], xres, yres))) && !fb_try_mode(var, info, &db[i], bpp)) { const int db_interlace = (db[i].vmode & FB_VMODE_INTERLACED ? 1 : 0); int score = abs(db[i].refresh - refresh); if (interlace_specified) score += abs(db_interlace - interlace); if (!interlace_specified || db_interlace == interlace) if (refresh_specified && db[i].refresh == refresh) return 1; if (score < diff) { diff = score; best = i; } } } if (best != -1) { fb_try_mode(var, info, &db[best], bpp); return (refresh_specified) ? 2 : 1; } diff = 2 * (xres + yres); best = -1; DPRINTK("Trying best-fit modes\n"); for (i = 0; i < dbsize; i++) { DPRINTK("Trying %ix%i\n", db[i].xres, db[i].yres); if (!fb_try_mode(var, info, &db[i], bpp)) { tdiff = abs(db[i].xres - xres) + abs(db[i].yres - yres); /* * Penalize modes with resolutions smaller * than requested. */ if (xres > db[i].xres || yres > db[i].yres) tdiff += xres + yres; if (diff > tdiff) { diff = tdiff; best = i; } } } if (best != -1) { fb_try_mode(var, info, &db[best], bpp); return 5; } } DPRINTK("Trying default video mode\n"); if (!fb_try_mode(var, info, default_mode, default_bpp)) return 3; DPRINTK("Trying all modes\n"); for (i = 0; i < dbsize; i++) if (!fb_try_mode(var, info, &db[i], default_bpp)) return 4; DPRINTK("No valid mode found\n"); return 0; } /** * fb_var_to_videomode - convert fb_var_screeninfo to fb_videomode * @mode: pointer to struct fb_videomode * @var: pointer to struct fb_var_screeninfo */ void fb_var_to_videomode(struct fb_videomode *mode, const struct fb_var_screeninfo *var) { u32 pixclock, hfreq, htotal, vtotal; mode->name = NULL; mode->xres = var->xres; mode->yres = var->yres; mode->pixclock = var->pixclock; mode->hsync_len = var->hsync_len; mode->vsync_len = var->vsync_len; mode->left_margin = var->left_margin; mode->right_margin = var->right_margin; mode->upper_margin = var->upper_margin; mode->lower_margin = var->lower_margin; mode->sync = var->sync; mode->vmode = var->vmode & FB_VMODE_MASK; mode->flag = FB_MODE_IS_FROM_VAR; mode->refresh = 0; if (!var->pixclock) return; pixclock = PICOS2KHZ(var->pixclock) * 1000; htotal = var->xres + var->right_margin + var->hsync_len + var->left_margin; vtotal = var->yres + var->lower_margin + var->vsync_len + var->upper_margin; if (var->vmode & FB_VMODE_INTERLACED) vtotal /= 2; if (var->vmode & FB_VMODE_DOUBLE) vtotal *= 2; if (!htotal || !vtotal) return; hfreq = pixclock/htotal; mode->refresh = hfreq/vtotal; } /** * fb_videomode_to_var - convert fb_videomode to fb_var_screeninfo * @var: pointer to struct fb_var_screeninfo * @mode: pointer to struct fb_videomode */ void fb_videomode_to_var(struct fb_var_screeninfo *var, const struct fb_videomode *mode) { var->xres = mode->xres; var->yres = mode->yres; var->xres_virtual = mode->xres; var->yres_virtual = mode->yres; var->xoffset = 0; var->yoffset = 0; var->pixclock = mode->pixclock; var->left_margin = mode->left_margin; var->right_margin = mode->right_margin; var->upper_margin = mode->upper_margin; var->lower_margin = mode->lower_margin; var->hsync_len = mode->hsync_len; var->vsync_len = mode->vsync_len; var->sync = mode->sync; var->vmode = mode->vmode & FB_VMODE_MASK; } /** * fb_mode_is_equal - compare 2 videomodes * @mode1: first videomode * @mode2: second videomode * * RETURNS: * 1 if equal, 0 if not */ int fb_mode_is_equal(const struct fb_videomode *mode1, const struct fb_videomode *mode2) { return (mode1->xres == mode2->xres && mode1->yres == mode2->yres && mode1->pixclock == mode2->pixclock && mode1->hsync_len == mode2->hsync_len && mode1->vsync_len == mode2->vsync_len && mode1->left_margin == mode2->left_margin && mode1->right_margin == mode2->right_margin && mode1->upper_margin == mode2->upper_margin && mode1->lower_margin == mode2->lower_margin && mode1->sync == mode2->sync && mode1->vmode == mode2->vmode); } /** * fb_find_best_mode - find best matching videomode * @var: pointer to struct fb_var_screeninfo * @head: pointer to struct list_head of modelist * * RETURNS: * struct fb_videomode, NULL if none found * * IMPORTANT: * This function assumes that all modelist entries in * info->modelist are valid. * * NOTES: * Finds best matching videomode which has an equal or greater dimension than * var->xres and var->yres. If more than 1 videomode is found, will return * the videomode with the highest refresh rate */ const struct fb_videomode *fb_find_best_mode(const struct fb_var_screeninfo *var, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *mode, *best = NULL; u32 diff = -1; list_for_each_entry(modelist, head, list) { u32 d; mode = &modelist->mode; if (mode->xres >= var->xres && mode->yres >= var->yres) { d = (mode->xres - var->xres) + (mode->yres - var->yres); if (diff > d) { diff = d; best = mode; } else if (diff == d && best && mode->refresh > best->refresh) best = mode; } } return best; } /** * fb_find_nearest_mode - find closest videomode * * @mode: pointer to struct fb_videomode * @head: pointer to modelist * * Finds best matching videomode, smaller or greater in dimension. * If more than 1 videomode is found, will return the videomode with * the closest refresh rate. */ const struct fb_videomode *fb_find_nearest_mode(const struct fb_videomode *mode, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *cmode, *best = NULL; u32 diff = -1, diff_refresh = -1; list_for_each_entry(modelist, head, list) { u32 d; cmode = &modelist->mode; d = abs(cmode->xres - mode->xres) + abs(cmode->yres - mode->yres); if (diff > d) { diff = d; diff_refresh = abs(cmode->refresh - mode->refresh); best = cmode; } else if (diff == d) { d = abs(cmode->refresh - mode->refresh); if (diff_refresh > d) { diff_refresh = d; best = cmode; } } } return best; } /** * fb_match_mode - find a videomode which exactly matches the timings in var * @var: pointer to struct fb_var_screeninfo * @head: pointer to struct list_head of modelist * * RETURNS: * struct fb_videomode, NULL if none found */ const struct fb_videomode *fb_match_mode(const struct fb_var_screeninfo *var, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *m, mode; fb_var_to_videomode(&mode, var); list_for_each_entry(modelist, head, list) { m = &modelist->mode; if (fb_mode_is_equal(m, &mode)) return m; } return NULL; } /** * fb_add_videomode - adds videomode entry to modelist * @mode: videomode to add * @head: struct list_head of modelist * * NOTES: * Will only add unmatched mode entries */ int fb_add_videomode(const struct fb_videomode *mode, struct list_head *head) { struct fb_modelist *modelist; struct fb_videomode *m; int found = 0; list_for_each_entry(modelist, head, list) { m = &modelist->mode; if (fb_mode_is_equal(m, mode)) { found = 1; break; } } if (!found) { modelist = kmalloc(sizeof(struct fb_modelist), GFP_KERNEL); if (!modelist) return -ENOMEM; modelist->mode = *mode; list_add(&modelist->list, head); } return 0; } /** * fb_delete_videomode - removed videomode entry from modelist * @mode: videomode to remove * @head: struct list_head of modelist * * NOTES: * Will remove all matching mode entries */ void fb_delete_videomode(const struct fb_videomode *mode, struct list_head *head) { struct list_head *pos, *n; struct fb_modelist *modelist; struct fb_videomode *m; list_for_each_safe(pos, n, head) { modelist = list_entry(pos, struct fb_modelist, list); m = &modelist->mode; if (fb_mode_is_equal(m, mode)) { list_del(pos); kfree(pos); } } } /** * fb_destroy_modelist - destroy modelist * @head: struct list_head of modelist */ void fb_destroy_modelist(struct list_head *head) { struct list_head *pos, *n; list_for_each_safe(pos, n, head) { list_del(pos); kfree(pos); } } EXPORT_SYMBOL_GPL(fb_destroy_modelist); /** * fb_videomode_to_modelist - convert mode array to mode list * @modedb: array of struct fb_videomode * @num: number of entries in array * @head: struct list_head of modelist */ void fb_videomode_to_modelist(const struct fb_videomode *modedb, int num, struct list_head *head) { int i; INIT_LIST_HEAD(head); for (i = 0; i < num; i++) { if (fb_add_videomode(&modedb[i], head)) return; } } const struct fb_videomode *fb_find_best_display(const struct fb_monspecs *specs, struct list_head *head) { struct fb_modelist *modelist; const struct fb_videomode *m, *m1 = NULL, *md = NULL, *best = NULL; int first = 0; if (!head->prev || !head->next || list_empty(head)) goto finished; /* get the first detailed mode and the very first mode */ list_for_each_entry(modelist, head, list) { m = &modelist->mode; if (!first) { m1 = m; first = 1; } if (m->flag & FB_MODE_IS_FIRST) { md = m; break; } } /* first detailed timing is preferred */ if (specs->misc & FB_MISC_1ST_DETAIL) { best = md; goto finished; } /* find best mode based on display width and height */ if (specs->max_x && specs->max_y) { struct fb_var_screeninfo var; memset(&var, 0, sizeof(struct fb_var_screeninfo)); var.xres = (specs->max_x * 7200)/254; var.yres = (specs->max_y * 7200)/254; m = fb_find_best_mode(&var, head); if (m) { best = m; goto finished; } } /* use first detailed mode */ if (md) { best = md; goto finished; } /* last resort, use the very first mode */ best = m1; finished: return best; } EXPORT_SYMBOL(fb_find_best_display); EXPORT_SYMBOL(fb_videomode_to_var); EXPORT_SYMBOL(fb_var_to_videomode); EXPORT_SYMBOL(fb_mode_is_equal); EXPORT_SYMBOL(fb_add_videomode); EXPORT_SYMBOL(fb_match_mode); EXPORT_SYMBOL(fb_find_best_mode); EXPORT_SYMBOL(fb_find_nearest_mode); EXPORT_SYMBOL(fb_videomode_to_modelist); EXPORT_SYMBOL(fb_find_mode); EXPORT_SYMBOL(fb_find_mode_cvt); |
13 9 5 5 4 5 5 12 12 3 1 2 7 7 6 6 6 3 3 3 3 17 3 14 6 10 12 1 2 1 1 23 2 24 1 20 5 3 20 3 10 4 10 1 10 41 1 2 1 3 3 3 3 3 3 7 6 1999 1999 1999 851 126 3 3 3 3 3 3 3 44 47 48 12 1 1 1 3 1 5 4 1 1 32 26 26 1 1 1 26 18 1 44 44 2 40 26 26 1 26 26 1 8 4 3 1 1 3 3 3 3 3 23 2 2 2 47 2 45 47 47 46 2 47 1 48 1 1 1 45 1 45 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 | // SPDX-License-Identifier: GPL-2.0 /* net/sched/sch_taprio.c Time Aware Priority Scheduler * * Authors: Vinicius Costa Gomes <vinicius.gomes@intel.com> * */ #include <linux/ethtool.h> #include <linux/ethtool_netlink.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/list.h> #include <linux/errno.h> #include <linux/skbuff.h> #include <linux/math64.h> #include <linux/module.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/time.h> #include <net/gso.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/tcp.h> #define TAPRIO_STAT_NOT_SET (~0ULL) #include "sch_mqprio_lib.h" static LIST_HEAD(taprio_list); static struct static_key_false taprio_have_broken_mqprio; static struct static_key_false taprio_have_working_mqprio; #define TAPRIO_ALL_GATES_OPEN -1 #define TXTIME_ASSIST_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) #define FULL_OFFLOAD_IS_ENABLED(flags) ((flags) & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD) #define TAPRIO_FLAGS_INVALID U32_MAX struct sched_entry { /* Durations between this GCL entry and the GCL entry where the * respective traffic class gate closes */ u64 gate_duration[TC_MAX_QUEUE]; atomic_t budget[TC_MAX_QUEUE]; /* The qdisc makes some effort so that no packet leaves * after this time */ ktime_t gate_close_time[TC_MAX_QUEUE]; struct list_head list; /* Used to calculate when to advance the schedule */ ktime_t end_time; ktime_t next_txtime; int index; u32 gate_mask; u32 interval; u8 command; }; struct sched_gate_list { /* Longest non-zero contiguous gate durations per traffic class, * or 0 if a traffic class gate never opens during the schedule. */ u64 max_open_gate_duration[TC_MAX_QUEUE]; u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ u32 max_sdu[TC_MAX_QUEUE]; /* for dump */ struct rcu_head rcu; struct list_head entries; size_t num_entries; ktime_t cycle_end_time; s64 cycle_time; s64 cycle_time_extension; s64 base_time; }; struct taprio_sched { struct Qdisc **qdiscs; struct Qdisc *root; u32 flags; enum tk_offsets tk_offset; int clockid; bool offloaded; bool detected_mqprio; bool broken_mqprio; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ /* Protects the update side of the RCU protected current_entry */ spinlock_t current_entry_lock; struct sched_entry __rcu *current_entry; struct sched_gate_list __rcu *oper_sched; struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; int cur_txq[TC_MAX_QUEUE]; u32 max_sdu[TC_MAX_QUEUE]; /* save info from the user */ u32 fp[TC_QOPT_MAX_QUEUE]; /* only for dump and offloading */ u32 txtime_delay; }; struct __tc_taprio_qopt_offload { refcount_t users; struct tc_taprio_qopt_offload offload; }; static void taprio_calculate_gate_durations(struct taprio_sched *q, struct sched_gate_list *sched) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *cur; int tc; list_for_each_entry(entry, &sched->entries, list) { u32 gates_still_open = entry->gate_mask; /* For each traffic class, calculate each open gate duration, * starting at this schedule entry and ending at the schedule * entry containing a gate close event for that TC. */ cur = entry; do { if (!gates_still_open) break; for (tc = 0; tc < num_tc; tc++) { if (!(gates_still_open & BIT(tc))) continue; if (cur->gate_mask & BIT(tc)) entry->gate_duration[tc] += cur->interval; else gates_still_open &= ~BIT(tc); } cur = list_next_entry_circular(cur, &sched->entries, list); } while (cur != entry); /* Keep track of the maximum gate duration for each traffic * class, taking care to not confuse a traffic class which is * temporarily closed with one that is always closed. */ for (tc = 0; tc < num_tc; tc++) if (entry->gate_duration[tc] && sched->max_open_gate_duration[tc] < entry->gate_duration[tc]) sched->max_open_gate_duration[tc] = entry->gate_duration[tc]; } } static bool taprio_entry_allows_tx(ktime_t skb_end_time, struct sched_entry *entry, int tc) { return ktime_before(skb_end_time, entry->gate_close_time[tc]); } static ktime_t sched_base_time(const struct sched_gate_list *sched) { if (!sched) return KTIME_MAX; return ns_to_ktime(sched->base_time); } static ktime_t taprio_mono_to_any(const struct taprio_sched *q, ktime_t mono) { /* This pairs with WRITE_ONCE() in taprio_parse_clockid() */ enum tk_offsets tk_offset = READ_ONCE(q->tk_offset); switch (tk_offset) { case TK_OFFS_MAX: return mono; default: return ktime_mono_to_any(mono, tk_offset); } } static ktime_t taprio_get_time(const struct taprio_sched *q) { return taprio_mono_to_any(q, ktime_get()); } static void taprio_free_sched_cb(struct rcu_head *head) { struct sched_gate_list *sched = container_of(head, struct sched_gate_list, rcu); struct sched_entry *entry, *n; list_for_each_entry_safe(entry, n, &sched->entries, list) { list_del(&entry->list); kfree(entry); } kfree(sched); } static void switch_schedules(struct taprio_sched *q, struct sched_gate_list **admin, struct sched_gate_list **oper) { rcu_assign_pointer(q->oper_sched, *admin); rcu_assign_pointer(q->admin_sched, NULL); if (*oper) call_rcu(&(*oper)->rcu, taprio_free_sched_cb); *oper = *admin; *admin = NULL; } /* Get how much time has been already elapsed in the current cycle. */ static s32 get_cycle_time_elapsed(struct sched_gate_list *sched, ktime_t time) { ktime_t time_since_sched_start; s32 time_elapsed; time_since_sched_start = ktime_sub(time, sched->base_time); div_s64_rem(time_since_sched_start, sched->cycle_time, &time_elapsed); return time_elapsed; } static ktime_t get_interval_end_time(struct sched_gate_list *sched, struct sched_gate_list *admin, struct sched_entry *entry, ktime_t intv_start) { s32 cycle_elapsed = get_cycle_time_elapsed(sched, intv_start); ktime_t intv_end, cycle_ext_end, cycle_end; cycle_end = ktime_add_ns(intv_start, sched->cycle_time - cycle_elapsed); intv_end = ktime_add_ns(intv_start, entry->interval); cycle_ext_end = ktime_add(cycle_end, sched->cycle_time_extension); if (ktime_before(intv_end, cycle_end)) return intv_end; else if (admin && admin != sched && ktime_after(admin->base_time, cycle_end) && ktime_before(admin->base_time, cycle_ext_end)) return admin->base_time; else return cycle_end; } static int length_to_duration(struct taprio_sched *q, int len) { return div_u64(len * atomic64_read(&q->picos_per_byte), PSEC_PER_NSEC); } static int duration_to_length(struct taprio_sched *q, u64 duration) { return div_u64(duration * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); } /* Sets sched->max_sdu[] and sched->max_frm_len[] to the minimum between the * q->max_sdu[] requested by the user and the max_sdu dynamically determined by * the maximum open gate durations at the given link speed. */ static void taprio_update_queue_max_sdu(struct taprio_sched *q, struct sched_gate_list *sched, struct qdisc_size_table *stab) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); u32 max_sdu_from_user; u32 max_sdu_dynamic; u32 max_sdu; int tc; for (tc = 0; tc < num_tc; tc++) { max_sdu_from_user = q->max_sdu[tc] ?: U32_MAX; /* TC gate never closes => keep the queueMaxSDU * selected by the user */ if (sched->max_open_gate_duration[tc] == sched->cycle_time) { max_sdu_dynamic = U32_MAX; } else { u32 max_frm_len; max_frm_len = duration_to_length(q, sched->max_open_gate_duration[tc]); /* Compensate for L1 overhead from size table, * but don't let the frame size go negative */ if (stab) { max_frm_len -= stab->szopts.overhead; max_frm_len = max_t(int, max_frm_len, dev->hard_header_len + 1); } max_sdu_dynamic = max_frm_len - dev->hard_header_len; if (max_sdu_dynamic > dev->max_mtu) max_sdu_dynamic = U32_MAX; } max_sdu = min(max_sdu_dynamic, max_sdu_from_user); if (max_sdu != U32_MAX) { sched->max_frm_len[tc] = max_sdu + dev->hard_header_len; sched->max_sdu[tc] = max_sdu; } else { sched->max_frm_len[tc] = U32_MAX; /* never oversized */ sched->max_sdu[tc] = 0; } } } /* Returns the entry corresponding to next available interval. If * validate_interval is set, it only validates whether the timestamp occurs * when the gate corresponding to the skb's traffic class is open. */ static struct sched_entry *find_entry_to_transmit(struct sk_buff *skb, struct Qdisc *sch, struct sched_gate_list *sched, struct sched_gate_list *admin, ktime_t time, ktime_t *interval_start, ktime_t *interval_end, bool validate_interval) { ktime_t curr_intv_start, curr_intv_end, cycle_end, packet_transmit_time; ktime_t earliest_txtime = KTIME_MAX, txtime, cycle, transmit_end_time; struct sched_entry *entry = NULL, *entry_found = NULL; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); bool entry_available = false; s32 cycle_elapsed; int tc, n; tc = netdev_get_prio_tc_map(dev, skb->priority); packet_transmit_time = length_to_duration(q, qdisc_pkt_len(skb)); *interval_start = 0; *interval_end = 0; if (!sched) return NULL; cycle = sched->cycle_time; cycle_elapsed = get_cycle_time_elapsed(sched, time); curr_intv_end = ktime_sub_ns(time, cycle_elapsed); cycle_end = ktime_add_ns(curr_intv_end, cycle); list_for_each_entry(entry, &sched->entries, list) { curr_intv_start = curr_intv_end; curr_intv_end = get_interval_end_time(sched, admin, entry, curr_intv_start); if (ktime_after(curr_intv_start, cycle_end)) break; if (!(entry->gate_mask & BIT(tc)) || packet_transmit_time > entry->interval) continue; txtime = entry->next_txtime; if (ktime_before(txtime, time) || validate_interval) { transmit_end_time = ktime_add_ns(time, packet_transmit_time); if ((ktime_before(curr_intv_start, time) && ktime_before(transmit_end_time, curr_intv_end)) || (ktime_after(curr_intv_start, time) && !validate_interval)) { entry_found = entry; *interval_start = curr_intv_start; *interval_end = curr_intv_end; break; } else if (!entry_available && !validate_interval) { /* Here, we are just trying to find out the * first available interval in the next cycle. */ entry_available = true; entry_found = entry; *interval_start = ktime_add_ns(curr_intv_start, cycle); *interval_end = ktime_add_ns(curr_intv_end, cycle); } } else if (ktime_before(txtime, earliest_txtime) && !entry_available) { earliest_txtime = txtime; entry_found = entry; n = div_s64(ktime_sub(txtime, curr_intv_start), cycle); *interval_start = ktime_add(curr_intv_start, n * cycle); *interval_end = ktime_add(curr_intv_end, n * cycle); } } return entry_found; } static bool is_valid_interval(struct sk_buff *skb, struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t interval_start, interval_end; struct sched_entry *entry; rcu_read_lock(); sched = rcu_dereference(q->oper_sched); admin = rcu_dereference(q->admin_sched); entry = find_entry_to_transmit(skb, sch, sched, admin, skb->tstamp, &interval_start, &interval_end, true); rcu_read_unlock(); return entry; } static bool taprio_flags_valid(u32 flags) { /* Make sure no other flag bits are set. */ if (flags & ~(TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST | TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) return false; /* txtime-assist and full offload are mutually exclusive */ if ((flags & TCA_TAPRIO_ATTR_FLAG_TXTIME_ASSIST) && (flags & TCA_TAPRIO_ATTR_FLAG_FULL_OFFLOAD)) return false; return true; } /* This returns the tstamp value set by TCP in terms of the set clock. */ static ktime_t get_tcp_tstamp(struct taprio_sched *q, struct sk_buff *skb) { unsigned int offset = skb_network_offset(skb); const struct ipv6hdr *ipv6h; const struct iphdr *iph; struct ipv6hdr _ipv6h; ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h) return 0; if (ipv6h->version == 4) { iph = (struct iphdr *)ipv6h; offset += iph->ihl * 4; /* special-case 6in4 tunnelling, as that is a common way to get * v6 connectivity in the home */ if (iph->protocol == IPPROTO_IPV6) { ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h); if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP) return 0; } else if (iph->protocol != IPPROTO_TCP) { return 0; } } else if (ipv6h->version == 6 && ipv6h->nexthdr != IPPROTO_TCP) { return 0; } return taprio_mono_to_any(q, skb->skb_mstamp_ns); } /* There are a few scenarios where we will have to modify the txtime from * what is read from next_txtime in sched_entry. They are: * 1. If txtime is in the past, * a. The gate for the traffic class is currently open and packet can be * transmitted before it closes, schedule the packet right away. * b. If the gate corresponding to the traffic class is going to open later * in the cycle, set the txtime of packet to the interval start. * 2. If txtime is in the future, there are packets corresponding to the * current traffic class waiting to be transmitted. So, the following * possibilities exist: * a. We can transmit the packet before the window containing the txtime * closes. * b. The window might close before the transmission can be completed * successfully. So, schedule the packet in the next open window. */ static long get_packet_txtime(struct sk_buff *skb, struct Qdisc *sch) { ktime_t transmit_end_time, interval_end, interval_start, tcp_tstamp; struct taprio_sched *q = qdisc_priv(sch); struct sched_gate_list *sched, *admin; ktime_t minimum_time, now, txtime; int len, packet_transmit_time; struct sched_entry *entry; bool sched_changed; now = taprio_get_time(q); minimum_time = ktime_add_ns(now, q->txtime_delay); tcp_tstamp = get_tcp_tstamp(q, skb); minimum_time = max_t(ktime_t, minimum_time, tcp_tstamp); rcu_read_lock(); admin = rcu_dereference(q->admin_sched); sched = rcu_dereference(q->oper_sched); if (admin && ktime_after(minimum_time, admin->base_time)) switch_schedules(q, &admin, &sched); /* Until the schedule starts, all the queues are open */ if (!sched || ktime_before(minimum_time, sched->base_time)) { txtime = minimum_time; goto done; } len = qdisc_pkt_len(skb); packet_transmit_time = length_to_duration(q, len); do { sched_changed = false; entry = find_entry_to_transmit(skb, sch, sched, admin, minimum_time, &interval_start, &interval_end, false); if (!entry) { txtime = 0; goto done; } txtime = entry->next_txtime; txtime = max_t(ktime_t, txtime, minimum_time); txtime = max_t(ktime_t, txtime, interval_start); if (admin && admin != sched && ktime_after(txtime, admin->base_time)) { sched = admin; sched_changed = true; continue; } transmit_end_time = ktime_add(txtime, packet_transmit_time); minimum_time = transmit_end_time; /* Update the txtime of current entry to the next time it's * interval starts. */ if (ktime_after(transmit_end_time, interval_end)) entry->next_txtime = ktime_add(interval_start, sched->cycle_time); } while (sched_changed || ktime_after(transmit_end_time, interval_end)); entry->next_txtime = transmit_end_time; done: rcu_read_unlock(); return txtime; } /* Devices with full offload are expected to honor this in hardware */ static bool taprio_skb_exceeds_queue_max_sdu(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *sched; int prio = skb->priority; bool exceeds = false; u8 tc; tc = netdev_get_prio_tc_map(dev, prio); rcu_read_lock(); sched = rcu_dereference(q->oper_sched); if (sched && skb->len > sched->max_frm_len[tc]) exceeds = true; rcu_read_unlock(); return exceeds; } static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { if (!is_valid_interval(skb, sch)) return qdisc_drop(skb, sch, to_free); } else if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { skb->tstamp = get_packet_txtime(skb, sch); if (!skb->tstamp) return qdisc_drop(skb, sch, to_free); } qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } static int taprio_enqueue_segmented(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; int ret; segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); if (IS_ERR_OR_NULL(segs)) return qdisc_drop(skb, sch, to_free); skb_list_walk_safe(segs, segs, nskb) { skb_mark_not_on_list(segs); qdisc_skb_cb(segs)->pkt_len = segs->len; slen += segs->len; /* FIXME: we should be segmenting to a smaller size * rather than dropping these */ if (taprio_skb_exceeds_queue_max_sdu(sch, segs)) ret = qdisc_drop(segs, sch, to_free); else ret = taprio_enqueue_one(segs, sch, child, to_free); if (ret != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) qdisc_qstats_drop(sch); } else { numsegs++; } } if (numsegs > 1) qdisc_tree_reduce_backlog(sch, 1 - numsegs, len - slen); consume_skb(skb); return numsegs > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); struct Qdisc *child; int queue; queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; if (unlikely(!child)) return qdisc_drop(skb, sch, to_free); if (taprio_skb_exceeds_queue_max_sdu(sch, skb)) { /* Large packets might not be transmitted when the transmission * duration exceeds any configured interval. Therefore, segment * the skb into smaller chunks. Drivers with full offload are * expected to handle this in hardware. */ if (skb_is_gso(skb)) return taprio_enqueue_segmented(skb, sch, child, to_free); return qdisc_drop(skb, sch, to_free); } return taprio_enqueue_one(skb, sch, child, to_free); } static struct sk_buff *taprio_peek(struct Qdisc *sch) { WARN_ONCE(1, "taprio only supports operating as root qdisc, peek() not implemented"); return NULL; } static void taprio_set_budgets(struct taprio_sched *q, struct sched_gate_list *sched, struct sched_entry *entry) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); int tc, budget; for (tc = 0; tc < num_tc; tc++) { /* Traffic classes which never close have infinite budget */ if (entry->gate_duration[tc] == sched->cycle_time) budget = INT_MAX; else budget = div64_u64((u64)entry->gate_duration[tc] * PSEC_PER_NSEC, atomic64_read(&q->picos_per_byte)); atomic_set(&entry->budget[tc], budget); } } /* When an skb is sent, it consumes from the budget of all traffic classes */ static int taprio_update_budgets(struct sched_entry *entry, size_t len, int tc_consumed, int num_tc) { int tc, budget, new_budget = 0; for (tc = 0; tc < num_tc; tc++) { budget = atomic_read(&entry->budget[tc]); /* Don't consume from infinite budget */ if (budget == INT_MAX) { if (tc == tc_consumed) new_budget = budget; continue; } if (tc == tc_consumed) new_budget = atomic_sub_return(len, &entry->budget[tc]); else atomic_sub(len, &entry->budget[tc]); } return new_budget; } static struct sk_buff *taprio_dequeue_from_txq(struct Qdisc *sch, int txq, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct Qdisc *child = q->qdiscs[txq]; int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; ktime_t guard; int prio; int len; u8 tc; if (unlikely(!child)) return NULL; if (TXTIME_ASSIST_IS_ENABLED(q->flags)) goto skip_peek_checks; skb = child->ops->peek(child); if (!skb) return NULL; prio = skb->priority; tc = netdev_get_prio_tc_map(dev, prio); if (!(gate_mask & BIT(tc))) return NULL; len = qdisc_pkt_len(skb); guard = ktime_add_ns(taprio_get_time(q), length_to_duration(q, len)); /* In the case that there's no gate entry, there's no * guard band ... */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && !taprio_entry_allows_tx(guard, entry, tc)) return NULL; /* ... and no budget. */ if (gate_mask != TAPRIO_ALL_GATES_OPEN && taprio_update_budgets(entry, len, tc, num_tc) < 0) return NULL; skip_peek_checks: skb = child->ops->dequeue(child); if (unlikely(!skb)) return NULL; qdisc_bstats_update(sch, skb); qdisc_qstats_backlog_dec(sch, skb); sch->q.qlen--; return skb; } static void taprio_next_tc_txq(struct net_device *dev, int tc, int *txq) { int offset = dev->tc_to_txq[tc].offset; int count = dev->tc_to_txq[tc].count; (*txq)++; if (*txq == offset + count) *txq = offset; } /* Prioritize higher traffic classes, and select among TXQs belonging to the * same TC using round robin */ static struct sk_buff *taprio_dequeue_tc_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int num_tc = netdev_get_num_tc(dev); struct sk_buff *skb; int tc; for (tc = num_tc - 1; tc >= 0; tc--) { int first_txq = q->cur_txq[tc]; if (!(gate_mask & BIT(tc))) continue; do { skb = taprio_dequeue_from_txq(sch, q->cur_txq[tc], entry, gate_mask); taprio_next_tc_txq(dev, tc, &q->cur_txq[tc]); if (q->cur_txq[tc] >= dev->num_tx_queues) q->cur_txq[tc] = first_txq; if (skb) return skb; } while (q->cur_txq[tc] != first_txq); } return NULL; } /* Broken way of prioritizing smaller TXQ indices and ignoring the traffic * class other than to determine whether the gate is open or not */ static struct sk_buff *taprio_dequeue_txq_priority(struct Qdisc *sch, struct sched_entry *entry, u32 gate_mask) { struct net_device *dev = qdisc_dev(sch); struct sk_buff *skb; int i; for (i = 0; i < dev->num_tx_queues; i++) { skb = taprio_dequeue_from_txq(sch, i, entry, gate_mask); if (skb) return skb; } return NULL; } /* Will not be called in the full offload case, since the TX queues are * attached to the Qdisc created using qdisc_create_dflt() */ static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct sk_buff *skb = NULL; struct sched_entry *entry; u32 gate_mask; rcu_read_lock(); entry = rcu_dereference(q->current_entry); /* if there's no entry, it means that the schedule didn't * start yet, so force all gates to be open, this is in * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 * "AdminGateStates" */ gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; if (!gate_mask) goto done; if (static_branch_unlikely(&taprio_have_broken_mqprio) && !static_branch_likely(&taprio_have_working_mqprio)) { /* Single NIC kind which is broken */ skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); } else if (static_branch_likely(&taprio_have_working_mqprio) && !static_branch_unlikely(&taprio_have_broken_mqprio)) { /* Single NIC kind which prioritizes properly */ skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } else { /* Mixed NIC kinds present in system, need dynamic testing */ if (q->broken_mqprio) skb = taprio_dequeue_txq_priority(sch, entry, gate_mask); else skb = taprio_dequeue_tc_priority(sch, entry, gate_mask); } done: rcu_read_unlock(); return skb; } static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { if (list_is_last(&entry->list, &oper->entries)) return true; if (ktime_compare(entry->end_time, oper->cycle_end_time) == 0) return true; return false; } static bool should_change_schedules(const struct sched_gate_list *admin, const struct sched_gate_list *oper, ktime_t end_time) { ktime_t next_base_time, extension_time; if (!admin) return false; next_base_time = sched_base_time(admin); /* This is the simple case, the end_time would fall after * the next schedule base_time. */ if (ktime_compare(next_base_time, end_time) <= 0) return true; /* This is the cycle_time_extension case, if the end_time * plus the amount that can be extended would fall after the * next schedule base_time, we can extend the current schedule * for that amount. */ extension_time = ktime_add_ns(end_time, oper->cycle_time_extension); /* FIXME: the IEEE 802.1Q-2018 Specification isn't clear about * how precisely the extension should be made. So after * conformance testing, this logic may change. */ if (ktime_compare(next_base_time, extension_time) <= 0) return true; return false; } static enum hrtimer_restart advance_sched(struct hrtimer *timer) { struct taprio_sched *q = container_of(timer, struct taprio_sched, advance_timer); struct net_device *dev = qdisc_dev(q->root); struct sched_gate_list *oper, *admin; int num_tc = netdev_get_num_tc(dev); struct sched_entry *entry, *next; struct Qdisc *sch = q->root; ktime_t end_time; int tc; spin_lock(&q->current_entry_lock); entry = rcu_dereference_protected(q->current_entry, lockdep_is_held(&q->current_entry_lock)); oper = rcu_dereference_protected(q->oper_sched, lockdep_is_held(&q->current_entry_lock)); admin = rcu_dereference_protected(q->admin_sched, lockdep_is_held(&q->current_entry_lock)); if (!oper) switch_schedules(q, &admin, &oper); /* This can happen in two cases: 1. this is the very first run * of this function (i.e. we weren't running any schedule * previously); 2. The previous schedule just ended. The first * entry of all schedules are pre-calculated during the * schedule initialization. */ if (unlikely(!entry || entry->end_time == oper->base_time)) { next = list_first_entry(&oper->entries, struct sched_entry, list); end_time = next->end_time; goto first_run; } if (should_restart_cycle(oper, entry)) { next = list_first_entry(&oper->entries, struct sched_entry, list); oper->cycle_end_time = ktime_add_ns(oper->cycle_end_time, oper->cycle_time); } else { next = list_next_entry(entry, list); } end_time = ktime_add_ns(entry->end_time, next->interval); end_time = min_t(ktime_t, end_time, oper->cycle_end_time); for (tc = 0; tc < num_tc; tc++) { if (next->gate_duration[tc] == oper->cycle_time) next->gate_close_time[tc] = KTIME_MAX; else next->gate_close_time[tc] = ktime_add_ns(entry->end_time, next->gate_duration[tc]); } if (should_change_schedules(admin, oper, end_time)) { /* Set things so the next time this runs, the new * schedule runs. */ end_time = sched_base_time(admin); switch_schedules(q, &admin, &oper); } next->end_time = end_time; taprio_set_budgets(q, oper, next); first_run: rcu_assign_pointer(q->current_entry, next); spin_unlock(&q->current_entry_lock); hrtimer_set_expires(&q->advance_timer, end_time); rcu_read_lock(); __netif_schedule(sch); rcu_read_unlock(); return HRTIMER_RESTART; } static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { [TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, [TCA_TAPRIO_TC_ENTRY_FP] = NLA_POLICY_RANGE(NLA_U32, TC_FP_EXPRESS, TC_FP_PREEMPTIBLE), }; static const struct netlink_range_validation_signed taprio_cycle_time_range = { .min = 0, .max = INT_MAX, }; static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) }, [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME] = NLA_POLICY_FULL_RANGE_SIGNED(NLA_S64, &taprio_cycle_time_range), [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, struct sched_entry *entry, struct netlink_ext_ack *extack) { int min_duration = length_to_duration(q, ETH_ZLEN); u32 interval = 0; if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) entry->command = nla_get_u8( tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) entry->gate_mask = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) interval = nla_get_u32( tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); /* The interval should allow at least the minimum ethernet * frame to go out. */ if (interval < min_duration) { NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); return -EINVAL; } entry->interval = interval; return 0; } static int parse_sched_entry(struct taprio_sched *q, struct nlattr *n, struct sched_entry *entry, int index, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; int err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, entry_policy, NULL); if (err < 0) { NL_SET_ERR_MSG(extack, "Could not parse nested entry"); return -EINVAL; } entry->index = index; return fill_sched_entry(q, tb, entry, extack); } static int parse_sched_list(struct taprio_sched *q, struct nlattr *list, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { struct nlattr *n; int err, rem; int i = 0; if (!list) return -EINVAL; nla_for_each_nested(n, list, rem) { struct sched_entry *entry; if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); continue; } entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) { NL_SET_ERR_MSG(extack, "Not enough memory for entry"); return -ENOMEM; } err = parse_sched_entry(q, n, entry, i, extack); if (err < 0) { kfree(entry); return err; } list_add_tail(&entry->list, &sched->entries); i++; } sched->num_entries = i; return i; } static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, struct sched_gate_list *new, struct netlink_ext_ack *extack) { int err = 0; if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) { NL_SET_ERR_MSG(extack, "Adding a single entry is not supported"); return -ENOTSUPP; } if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) new->base_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]) new->cycle_time_extension = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION]); if (tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]) new->cycle_time = nla_get_s64(tb[TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME]); if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) err = parse_sched_list(q, tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], new, extack); if (err < 0) return err; if (!new->cycle_time) { struct sched_entry *entry; ktime_t cycle = 0; list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); if (!cycle) { NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); return -EINVAL; } if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; } new->cycle_time = cycle; } taprio_calculate_gate_durations(q, new); return 0; } static int taprio_parse_mqprio_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt, struct netlink_ext_ack *extack, u32 taprio_flags) { bool allow_overlapping_txqs = TXTIME_ASSIST_IS_ENABLED(taprio_flags); if (!qopt && !dev->num_tc) { NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); return -EINVAL; } /* If num_tc is already set, it means that the user already * configured the mqprio part */ if (dev->num_tc) return 0; /* taprio imposes that traffic classes map 1:n to tx queues */ if (qopt->num_tc > dev->num_tx_queues) { NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); return -EINVAL; } /* For some reason, in txtime-assist mode, we allow TXQ ranges for * different TCs to overlap, and just validate the TXQ ranges. */ return mqprio_validate_qopt(dev, qopt, true, allow_overlapping_txqs, extack); } static int taprio_get_start_time(struct Qdisc *sch, struct sched_gate_list *sched, ktime_t *start) { struct taprio_sched *q = qdisc_priv(sch); ktime_t now, base, cycle; s64 n; base = sched_base_time(sched); now = taprio_get_time(q); if (ktime_after(base, now)) { *start = base; return 0; } cycle = sched->cycle_time; /* The qdisc is expected to have at least one sched_entry. Moreover, * any entry must have 'interval' > 0. Thus if the cycle time is zero, * something went really wrong. In that case, we should warn about this * inconsistent state and return error. */ if (WARN_ON(!cycle)) return -EFAULT; /* Schedule the start time for the beginning of the next * cycle. */ n = div64_s64(ktime_sub_ns(now, base), cycle); *start = ktime_add_ns(base, (n + 1) * cycle); return 0; } static void setup_first_end_time(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct net_device *dev = qdisc_dev(q->root); int num_tc = netdev_get_num_tc(dev); struct sched_entry *first; ktime_t cycle; int tc; first = list_first_entry(&sched->entries, struct sched_entry, list); cycle = sched->cycle_time; /* FIXME: find a better place to do this */ sched->cycle_end_time = ktime_add_ns(base, cycle); first->end_time = ktime_add_ns(base, first->interval); taprio_set_budgets(q, sched, first); for (tc = 0; tc < num_tc; tc++) { if (first->gate_duration[tc] == sched->cycle_time) first->gate_close_time[tc] = KTIME_MAX; else first->gate_close_time[tc] = ktime_add_ns(base, first->gate_duration[tc]); } rcu_assign_pointer(q->current_entry, NULL); } static void taprio_start_sched(struct Qdisc *sch, ktime_t start, struct sched_gate_list *new) { struct taprio_sched *q = qdisc_priv(sch); ktime_t expires; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) return; expires = hrtimer_get_expires(&q->advance_timer); if (expires == 0) expires = KTIME_MAX; /* If the new schedule starts before the next expiration, we * reprogram it to the earliest one, so we change the admin * schedule to the operational one at the right time. */ start = min_t(ktime_t, start, expires); hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); } static void taprio_set_picos_per_byte(struct net_device *dev, struct taprio_sched *q) { struct ethtool_link_ksettings ecmd; int speed = SPEED_10; int picos_per_byte; int err; err = __ethtool_get_link_ksettings(dev, &ecmd); if (err < 0) goto skip; if (ecmd.base.speed && ecmd.base.speed != SPEED_UNKNOWN) speed = ecmd.base.speed; skip: picos_per_byte = (USEC_PER_SEC * 8) / speed; atomic64_set(&q->picos_per_byte, picos_per_byte); netdev_dbg(dev, "taprio: set %s's picos_per_byte to: %lld, linkspeed: %d\n", dev->name, (long long)atomic64_read(&q->picos_per_byte), ecmd.base.speed); } static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct sched_gate_list *oper, *admin; struct qdisc_size_table *stab; struct taprio_sched *q; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; list_for_each_entry(q, &taprio_list, taprio_list) { if (dev != qdisc_dev(q->root)) continue; taprio_set_picos_per_byte(dev, q); stab = rtnl_dereference(q->root->stab); oper = rtnl_dereference(q->oper_sched); if (oper) taprio_update_queue_max_sdu(q, oper, stab); admin = rtnl_dereference(q->admin_sched); if (admin) taprio_update_queue_max_sdu(q, admin, stab); break; } return NOTIFY_DONE; } static void setup_txtime(struct taprio_sched *q, struct sched_gate_list *sched, ktime_t base) { struct sched_entry *entry; u64 interval = 0; list_for_each_entry(entry, &sched->entries, list) { entry->next_txtime = ktime_add_ns(base, interval); interval += entry->interval; } } static struct tc_taprio_qopt_offload *taprio_offload_alloc(int num_entries) { struct __tc_taprio_qopt_offload *__offload; __offload = kzalloc(struct_size(__offload, offload.entries, num_entries), GFP_KERNEL); if (!__offload) return NULL; refcount_set(&__offload->users, 1); return &__offload->offload; } struct tc_taprio_qopt_offload *taprio_offload_get(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); refcount_inc(&__offload->users); return offload; } EXPORT_SYMBOL_GPL(taprio_offload_get); void taprio_offload_free(struct tc_taprio_qopt_offload *offload) { struct __tc_taprio_qopt_offload *__offload; __offload = container_of(offload, struct __tc_taprio_qopt_offload, offload); if (!refcount_dec_and_test(&__offload->users)) return; kfree(__offload); } EXPORT_SYMBOL_GPL(taprio_offload_free); /* The function will only serve to keep the pointers to the "oper" and "admin" * schedules valid in relation to their base times, so when calling dump() the * users looks at the right schedules. * When using full offload, the admin configuration is promoted to oper at the * base_time in the PHC time domain. But because the system time is not * necessarily in sync with that, we can't just trigger a hrtimer to call * switch_schedules at the right hardware time. * At the moment we call this by hand right away from taprio, but in the future * it will be useful to create a mechanism for drivers to notify taprio of the * offload state (PENDING, ACTIVE, INACTIVE) so it can be visible in dump(). * This is left as TODO. */ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) { u32 i, queue_mask = 0; for (i = 0; i < dev->num_tc; i++) { u32 offset, count; if (!(tc_mask & BIT(i))) continue; offset = dev->tc_to_txq[i].offset; count = dev->tc_to_txq[i].count; queue_mask |= GENMASK(offset + count - 1, offset); } return queue_mask; } static void taprio_sched_to_offload(struct net_device *dev, struct sched_gate_list *sched, struct tc_taprio_qopt_offload *offload, const struct tc_taprio_caps *caps) { struct sched_entry *entry; int i = 0; offload->base_time = sched->base_time; offload->cycle_time = sched->cycle_time; offload->cycle_time_extension = sched->cycle_time_extension; list_for_each_entry(entry, &sched->entries, list) { struct tc_taprio_sched_entry *e = &offload->entries[i]; e->command = entry->command; e->interval = entry->interval; if (caps->gate_mask_per_txq) e->gate_mask = tc_map_to_queue_mask(dev, entry->gate_mask); else e->gate_mask = entry->gate_mask; i++; } offload->num_entries = i; } static void taprio_detect_broken_mqprio(struct taprio_sched *q) { struct net_device *dev = qdisc_dev(q->root); struct tc_taprio_caps caps; qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); q->broken_mqprio = caps.broken_mqprio; if (q->broken_mqprio) static_branch_inc(&taprio_have_broken_mqprio); else static_branch_inc(&taprio_have_working_mqprio); q->detected_mqprio = true; } static void taprio_cleanup_broken_mqprio(struct taprio_sched *q) { if (!q->detected_mqprio) return; if (q->broken_mqprio) static_branch_dec(&taprio_have_broken_mqprio); else static_branch_dec(&taprio_have_working_mqprio); } static int taprio_enable_offload(struct net_device *dev, struct taprio_sched *q, struct sched_gate_list *sched, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; struct tc_taprio_caps caps; int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, "Device does not support taprio offload"); return -EOPNOTSUPP; } qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, &caps, sizeof(caps)); if (!caps.supports_queue_max_sdu) { for (tc = 0; tc < TC_MAX_QUEUE; tc++) { if (q->max_sdu[tc]) { NL_SET_ERR_MSG_MOD(extack, "Device does not handle queueMaxSDU"); return -EOPNOTSUPP; } } } offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory for enabling offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_REPLACE; offload->extack = extack; mqprio_qopt_reconstruct(dev, &offload->mqprio.qopt); offload->mqprio.extack = extack; taprio_sched_to_offload(dev, sched, offload, &caps); mqprio_fp_to_offload(q->fp, &offload->mqprio); for (tc = 0; tc < TC_MAX_QUEUE; tc++) offload->max_sdu[tc] = q->max_sdu[tc]; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG_WEAK(extack, "Device failed to setup taprio offload"); goto done; } q->offloaded = true; done: /* The offload structure may linger around via a reference taken by the * device driver, so clear up the netlink extack pointer so that the * driver isn't tempted to dereference data which stopped being valid */ offload->extack = NULL; offload->mqprio.extack = NULL; taprio_offload_free(offload); return err; } static int taprio_disable_offload(struct net_device *dev, struct taprio_sched *q, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; int err; if (!q->offloaded) return 0; offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, "Not enough memory to disable offload mode"); return -ENOMEM; } offload->cmd = TAPRIO_CMD_DESTROY; err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, "Device failed to disable offload"); goto out; } q->offloaded = false; out: taprio_offload_free(offload); return err; } /* If full offload is enabled, the only possible clockid is the net device's * PHC. For that reason, specifying a clockid through netlink is incorrect. * For txtime-assist, it is implicitly assumed that the device's PHC is kept * in sync with the specified clockid via a user space daemon such as phc2sys. * For both software taprio and txtime-assist, the clockid is used for the * hrtimer that advances the schedule and hence mandatory. */ static int taprio_parse_clockid(struct Qdisc *sch, struct nlattr **tb, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int err = -EINVAL; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_ts_info info = { .cmd = ETHTOOL_GET_TS_INFO, .phc_index = -1, }; if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { NL_SET_ERR_MSG(extack, "The 'clockid' cannot be specified for full offload"); goto out; } if (ops && ops->get_ts_info) err = ops->get_ts_info(dev, &info); if (err || info.phc_index < 0) { NL_SET_ERR_MSG(extack, "Device does not have a PTP clock"); err = -ENOTSUPP; goto out; } } else if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { int clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); enum tk_offsets tk_offset; /* We only support static clockids and we don't allow * for it to be modified after the first init. */ if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) { NL_SET_ERR_MSG(extack, "Changing the 'clockid' of a running schedule is not supported"); err = -ENOTSUPP; goto out; } switch (clockid) { case CLOCK_REALTIME: tk_offset = TK_OFFS_REAL; break; case CLOCK_MONOTONIC: tk_offset = TK_OFFS_MAX; break; case CLOCK_BOOTTIME: tk_offset = TK_OFFS_BOOT; break; case CLOCK_TAI: tk_offset = TK_OFFS_TAI; break; default: NL_SET_ERR_MSG(extack, "Invalid 'clockid'"); err = -EINVAL; goto out; } /* This pairs with READ_ONCE() in taprio_mono_to_any */ WRITE_ONCE(q->tk_offset, tk_offset); q->clockid = clockid; } else { NL_SET_ERR_MSG(extack, "Specifying a 'clockid' is mandatory"); goto out; } /* Everything went ok, return success. */ err = 0; out: return err; } static int taprio_parse_tc_entry(struct Qdisc *sch, struct nlattr *opt, u32 max_sdu[TC_QOPT_MAX_QUEUE], u32 fp[TC_QOPT_MAX_QUEUE], unsigned long *seen_tcs, struct netlink_ext_ack *extack) { struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; struct net_device *dev = qdisc_dev(sch); int err, tc; u32 val; err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, taprio_tc_policy, extack); if (err < 0) return err; if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); return -EINVAL; } tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); if (tc >= TC_QOPT_MAX_QUEUE) { NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); return -ERANGE; } if (*seen_tcs & BIT(tc)) { NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); return -EINVAL; } *seen_tcs |= BIT(tc); if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) { val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); if (val > dev->max_mtu) { NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); return -ERANGE; } max_sdu[tc] = val; } if (tb[TCA_TAPRIO_TC_ENTRY_FP]) fp[tc] = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_FP]); return 0; } static int taprio_parse_tc_entries(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); u32 max_sdu[TC_QOPT_MAX_QUEUE]; bool have_preemption = false; unsigned long seen_tcs = 0; u32 fp[TC_QOPT_MAX_QUEUE]; struct nlattr *n; int tc, rem; int err = 0; for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { max_sdu[tc] = q->max_sdu[tc]; fp[tc] = q->fp[tc]; } nla_for_each_nested(n, opt, rem) { if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) continue; err = taprio_parse_tc_entry(sch, n, max_sdu, fp, &seen_tcs, extack); if (err) return err; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { q->max_sdu[tc] = max_sdu[tc]; q->fp[tc] = fp[tc]; if (fp[tc] != TC_FP_EXPRESS) have_preemption = true; } if (have_preemption) { if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG(extack, "Preemption only supported with full offload"); return -EOPNOTSUPP; } if (!ethtool_dev_mm_supported(dev)) { NL_SET_ERR_MSG(extack, "Device does not support preemption"); return -EOPNOTSUPP; } } return err; } static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { int i; if (!mqprio || mqprio->num_tc != dev->num_tc) return -1; for (i = 0; i < mqprio->num_tc; i++) if (dev->tc_to_txq[i].count != mqprio->count[i] || dev->tc_to_txq[i].offset != mqprio->offset[i]) return -1; for (i = 0; i <= TC_BITMASK; i++) if (dev->prio_tc_map[i] != mqprio->prio_tc_map[i]) return -1; return 0; } /* The semantics of the 'flags' argument in relation to 'change()' * requests, are interpreted following two rules (which are applied in * this order): (1) an omitted 'flags' argument is interpreted as * zero; (2) the 'flags' of a "running" taprio instance cannot be * changed. */ static int taprio_new_flags(const struct nlattr *attr, u32 old, struct netlink_ext_ack *extack) { u32 new = 0; if (attr) new = nla_get_u32(attr); if (old != TAPRIO_FLAGS_INVALID && old != new) { NL_SET_ERR_MSG_MOD(extack, "Changing 'flags' of a running schedule is not supported"); return -EOPNOTSUPP; } if (!taprio_flags_valid(new)) { NL_SET_ERR_MSG_MOD(extack, "Specified 'flags' are not valid"); return -EINVAL; } return new; } static int taprio_change(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct qdisc_size_table *stab = rtnl_dereference(sch->stab); struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; struct sched_gate_list *oper, *admin, *new_admin; struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct tc_mqprio_qopt *mqprio = NULL; unsigned long flags; ktime_t start; int i, err; err = nla_parse_nested_deprecated(tb, TCA_TAPRIO_ATTR_MAX, opt, taprio_policy, extack); if (err < 0) return err; if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); err = taprio_new_flags(tb[TCA_TAPRIO_ATTR_FLAGS], q->flags, extack); if (err < 0) return err; q->flags = err; err = taprio_parse_mqprio_opt(dev, mqprio, extack, q->flags); if (err < 0) return err; err = taprio_parse_tc_entries(sch, opt, extack); if (err) return err; new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); return -ENOMEM; } INIT_LIST_HEAD(&new_admin->entries); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) mqprio = NULL; if (mqprio && (oper || admin)) { NL_SET_ERR_MSG(extack, "Changing the traffic mapping of a running schedule is not supported"); err = -ENOTSUPP; goto free_sched; } if (mqprio) { err = netdev_set_num_tc(dev, mqprio->num_tc); if (err) goto free_sched; for (i = 0; i < mqprio->num_tc; i++) { netdev_set_tc_queue(dev, i, mqprio->count[i], mqprio->offset[i]); q->cur_txq[i] = mqprio->offset[i]; } /* Always use supplied priority mappings */ for (i = 0; i <= TC_BITMASK; i++) netdev_set_prio_tc_map(dev, i, mqprio->prio_tc_map[i]); } err = parse_taprio_schedule(q, tb, new_admin, extack); if (err < 0) goto free_sched; if (new_admin->num_entries == 0) { NL_SET_ERR_MSG(extack, "There should be at least one entry in the schedule"); err = -EINVAL; goto free_sched; } err = taprio_parse_clockid(sch, tb, extack); if (err < 0) goto free_sched; taprio_set_picos_per_byte(dev, q); taprio_update_queue_max_sdu(q, new_admin, stab); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) err = taprio_enable_offload(dev, q, new_admin, extack); else err = taprio_disable_offload(dev, q, extack); if (err) goto free_sched; /* Protects against enqueue()/dequeue() */ spin_lock_bh(qdisc_lock(sch)); if (tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]) { if (!TXTIME_ASSIST_IS_ENABLED(q->flags)) { NL_SET_ERR_MSG_MOD(extack, "txtime-delay can only be set when txtime-assist mode is enabled"); err = -EINVAL; goto unlock; } q->txtime_delay = nla_get_u32(tb[TCA_TAPRIO_ATTR_TXTIME_DELAY]); } if (!TXTIME_ASSIST_IS_ENABLED(q->flags) && !FULL_OFFLOAD_IS_ENABLED(q->flags) && !hrtimer_active(&q->advance_timer)) { hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; } err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); goto unlock; } setup_txtime(q, new_admin, start); if (TXTIME_ASSIST_IS_ENABLED(q->flags)) { if (!oper) { rcu_assign_pointer(q->oper_sched, new_admin); err = 0; new_admin = NULL; goto unlock; } rcu_assign_pointer(q->admin_sched, new_admin); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); } else { setup_first_end_time(q, new_admin, start); /* Protects against advance_sched() */ spin_lock_irqsave(&q->current_entry_lock, flags); taprio_start_sched(sch, start, new_admin); rcu_assign_pointer(q->admin_sched, new_admin); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); spin_unlock_irqrestore(&q->current_entry_lock, flags); if (FULL_OFFLOAD_IS_ENABLED(q->flags)) taprio_offload_config_changed(q); } new_admin = NULL; err = 0; if (!stab) NL_SET_ERR_MSG_MOD(extack, "Size table not specified, frame length estimations may be inaccurate"); unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: if (new_admin) call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } static void taprio_reset(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i; hrtimer_cancel(&q->advance_timer); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) if (q->qdiscs[i]) qdisc_reset(q->qdiscs[i]); } } static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; unsigned int i; list_del(&q->taprio_list); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. */ hrtimer_cancel(&q->advance_timer); qdisc_synchronize(sch); taprio_disable_offload(dev, q, NULL); if (q->qdiscs) { for (i = 0; i < dev->num_tx_queues; i++) qdisc_put(q->qdiscs[i]); kfree(q->qdiscs); } q->qdiscs = NULL; netdev_reset_tc(dev); oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); if (oper) call_rcu(&oper->rcu, taprio_free_sched_cb); if (admin) call_rcu(&admin->rcu, taprio_free_sched_cb); taprio_cleanup_broken_mqprio(q); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); int i, tc; spin_lock_init(&q->current_entry_lock); hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; q->root = sch; /* We only support static clockids. Use an invalid value as default * and get the valid one on taprio_change(). */ q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; list_add(&q->taprio_list, &taprio_list); if (sch->parent != TC_H_ROOT) { NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; } if (!netif_is_multiqueue(dev)) { NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; } q->qdiscs = kcalloc(dev->num_tx_queues, sizeof(q->qdiscs[0]), GFP_KERNEL); if (!q->qdiscs) return -ENOMEM; if (!opt) return -EINVAL; for (i = 0; i < dev->num_tx_queues; i++) { struct netdev_queue *dev_queue; struct Qdisc *qdisc; dev_queue = netdev_get_tx_queue(dev, i); qdisc = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(i + 1)), extack); if (!qdisc) return -ENOMEM; if (i < dev->real_num_tx_queues) qdisc_hash_add(qdisc, false); q->qdiscs[i] = qdisc; } for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) q->fp[tc] = TC_FP_EXPRESS; taprio_detect_broken_mqprio(q); return taprio_change(sch, opt, extack); } static void taprio_attach(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx; /* Attach underlying qdisc */ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, ntx); struct Qdisc *old, *dev_queue_qdisc; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { struct Qdisc *qdisc = q->qdiscs[ntx]; /* In offload mode, the root taprio qdisc is bypassed * and the netdev TX queues see the children directly */ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; dev_queue_qdisc = qdisc; } else { /* In software mode, attach the root taprio qdisc * to all netdev TX queues, so that dev_qdisc_enqueue() * goes through taprio_enqueue(). */ dev_queue_qdisc = sch; } old = dev_graft_qdisc(dev_queue, dev_queue_qdisc); /* The qdisc's refcount requires to be elevated once * for each netdev TX queue it is grafted onto */ qdisc_refcount_inc(dev_queue_qdisc); if (old) qdisc_put(old); } } static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, unsigned long cl) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return netdev_get_tx_queue(dev, ntx); } static int taprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); if (!dev_queue) return -EINVAL; if (dev->flags & IFF_UP) dev_deactivate(dev); /* In offload mode, the child Qdisc is directly attached to the netdev * TX queue, and thus, we need to keep its refcount elevated in order * to counteract qdisc_graft()'s call to qdisc_put() once per TX queue. * However, save the reference to the new qdisc in the private array in * both software and offload cases, to have an up-to-date reference to * our children. */ *old = q->qdiscs[cl - 1]; if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { WARN_ON_ONCE(dev_graft_qdisc(dev_queue, new) != *old); if (new) qdisc_refcount_inc(new); if (*old) qdisc_put(*old); } q->qdiscs[cl - 1] = new; if (new) new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; if (dev->flags & IFF_UP) dev_activate(dev); return 0; } static int dump_entry(struct sk_buff *msg, const struct sched_entry *entry) { struct nlattr *item; item = nla_nest_start_noflag(msg, TCA_TAPRIO_SCHED_ENTRY); if (!item) return -ENOSPC; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) goto nla_put_failure; if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, entry->gate_mask)) goto nla_put_failure; if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, entry->interval)) goto nla_put_failure; return nla_nest_end(msg, item); nla_put_failure: nla_nest_cancel(msg, item); return -1; } static int dump_schedule(struct sk_buff *msg, const struct sched_gate_list *root) { struct nlattr *entry_list; struct sched_entry *entry; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, root->base_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME, root->cycle_time, TCA_TAPRIO_PAD)) return -1; if (nla_put_s64(msg, TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION, root->cycle_time_extension, TCA_TAPRIO_PAD)) return -1; entry_list = nla_nest_start_noflag(msg, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); if (!entry_list) goto error_nest; list_for_each_entry(entry, &root->entries, list) { if (dump_entry(msg, entry) < 0) goto error_nest; } nla_nest_end(msg, entry_list); return 0; error_nest: nla_nest_cancel(msg, entry_list); return -1; } static int taprio_dump_tc_entries(struct sk_buff *skb, struct taprio_sched *q, struct sched_gate_list *sched) { struct nlattr *n; int tc; for (tc = 0; tc < TC_MAX_QUEUE; tc++) { n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); if (!n) return -EMSGSIZE; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, sched->max_sdu[tc])) goto nla_put_failure; if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_FP, q->fp[tc])) goto nla_put_failure; nla_nest_end(skb, n); } return 0; nla_put_failure: nla_nest_cancel(skb, n); return -EMSGSIZE; } static int taprio_put_stat(struct sk_buff *skb, u64 val, u16 attrtype) { if (val == TAPRIO_STAT_NOT_SET) return 0; if (nla_put_u64_64bit(skb, attrtype, val, TCA_TAPRIO_OFFLOAD_STATS_PAD)) return -EMSGSIZE; return 0; } static int taprio_dump_xstats(struct Qdisc *sch, struct gnet_dump *d, struct tc_taprio_qopt_offload *offload, struct tc_taprio_qopt_stats *stats) { struct net_device *dev = qdisc_dev(sch); const struct net_device_ops *ops; struct sk_buff *skb = d->skb; struct nlattr *xstats; int err; ops = qdisc_dev(sch)->netdev_ops; /* FIXME I could use qdisc_offload_dump_helper(), but that messes * with sch->flags depending on whether the device reports taprio * stats, and I'm not sure whether that's a good idea, considering * that stats are optional to the offload itself */ if (!ops->ndo_setup_tc) return 0; memset(stats, 0xff, sizeof(*stats)); err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err == -EOPNOTSUPP) return 0; if (err) return err; xstats = nla_nest_start(skb, TCA_STATS_APP); if (!xstats) goto err; if (taprio_put_stat(skb, stats->window_drops, TCA_TAPRIO_OFFLOAD_STATS_WINDOW_DROPS) || taprio_put_stat(skb, stats->tx_overruns, TCA_TAPRIO_OFFLOAD_STATS_TX_OVERRUNS)) goto err_cancel; nla_nest_end(skb, xstats); return 0; err_cancel: nla_nest_cancel(skb, xstats); err: return -EMSGSIZE; } static int taprio_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_STATS, }; return taprio_dump_xstats(sch, d, &offload, &offload.stats); } static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); struct sched_gate_list *oper, *admin; struct tc_mqprio_qopt opt = { 0 }; struct nlattr *nest, *sched_nest; oper = rtnl_dereference(q->oper_sched); admin = rtnl_dereference(q->admin_sched); mqprio_qopt_reconstruct(dev, &opt); nest = nla_nest_start_noflag(skb, TCA_OPTIONS); if (!nest) goto start_error; if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) goto options_error; if (!FULL_OFFLOAD_IS_ENABLED(q->flags) && nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) goto options_error; if (q->flags && nla_put_u32(skb, TCA_TAPRIO_ATTR_FLAGS, q->flags)) goto options_error; if (q->txtime_delay && nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; if (oper && taprio_dump_tc_entries(skb, q, oper)) goto options_error; if (oper && dump_schedule(skb, oper)) goto options_error; if (!admin) goto done; sched_nest = nla_nest_start_noflag(skb, TCA_TAPRIO_ATTR_ADMIN_SCHED); if (!sched_nest) goto options_error; if (dump_schedule(skb, admin)) goto admin_error; nla_nest_end(skb, sched_nest); done: return nla_nest_end(skb, nest); admin_error: nla_nest_cancel(skb, sched_nest); options_error: nla_nest_cancel(skb, nest); start_error: return -ENOSPC; } static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); unsigned int ntx = cl - 1; if (ntx >= dev->num_tx_queues) return NULL; return q->qdiscs[ntx]; } static unsigned long taprio_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); if (!taprio_queue_get(sch, ntx)) return 0; return ntx; } static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, struct tcmsg *tcm) { struct Qdisc *child = taprio_leaf(sch, cl); tcm->tcm_parent = TC_H_ROOT; tcm->tcm_handle |= TC_H_MIN(cl); tcm->tcm_info = child->handle; return 0; } static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, struct gnet_dump *d) __releases(d->lock) __acquires(d->lock) { struct Qdisc *child = taprio_leaf(sch, cl); struct tc_taprio_qopt_offload offload = { .cmd = TAPRIO_CMD_QUEUE_STATS, .queue_stats = { .queue = cl - 1, }, }; if (gnet_stats_copy_basic(d, NULL, &child->bstats, true) < 0 || qdisc_qstats_copy(d, child) < 0) return -1; return taprio_dump_xstats(sch, d, &offload, &offload.queue_stats.stats); } static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned long ntx; if (arg->stop) return; arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; } } static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, struct tcmsg *tcm) { return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } static const struct Qdisc_class_ops taprio_class_ops = { .graft = taprio_graft, .leaf = taprio_leaf, .find = taprio_find, .walk = taprio_walk, .dump = taprio_dump_class, .dump_stats = taprio_dump_class_stats, .select_queue = taprio_select_queue, }; static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { .cl_ops = &taprio_class_ops, .id = "taprio", .priv_size = sizeof(struct taprio_sched), .init = taprio_init, .change = taprio_change, .destroy = taprio_destroy, .reset = taprio_reset, .attach = taprio_attach, .peek = taprio_peek, .dequeue = taprio_dequeue, .enqueue = taprio_enqueue, .dump = taprio_dump, .dump_stats = taprio_dump_stats, .owner = THIS_MODULE, }; static struct notifier_block taprio_device_notifier = { .notifier_call = taprio_dev_notifier, }; static int __init taprio_module_init(void) { int err = register_netdevice_notifier(&taprio_device_notifier); if (err) return err; return register_qdisc(&taprio_qdisc_ops); } static void __exit taprio_module_exit(void) { unregister_qdisc(&taprio_qdisc_ops); unregister_netdevice_notifier(&taprio_device_notifier); } module_init(taprio_module_init); module_exit(taprio_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Time Aware Priority qdisc"); |
23 47 13 5 5 2 12 12 1 2 7 2 7 2 21 7 11 5 1 11 5 1 9 7 13 2 13 13 13 13 13 3 1 3 3 12 12 12 12 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 | /* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ #include <linux/time.h> #include "reiserfs.h" /* * this contains item handlers for old item types: sd, direct, * indirect, directory */ /* * and where are the comments? how about saying where we can find an * explanation of each item handler method? -Hans */ /* stat data functions */ static int sd_bytes_number(struct item_head *ih, int block_size) { return 0; } static void sd_decrement_key(struct cpu_key *key) { key->on_disk_key.k_objectid--; set_cpu_key_k_type(key, TYPE_ANY); set_cpu_key_k_offset(key, (loff_t)(~0ULL >> 1)); } static int sd_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { return 0; } static void sd_print_item(struct item_head *ih, char *item) { printk("\tmode | size | nlinks | first direct | mtime\n"); if (stat_data_v1(ih)) { struct stat_data_v1 *sd = (struct stat_data_v1 *)item; printk("\t0%-6o | %6u | %2u | %d | %u\n", sd_v1_mode(sd), sd_v1_size(sd), sd_v1_nlink(sd), sd_v1_first_direct_byte(sd), sd_v1_mtime(sd)); } else { struct stat_data *sd = (struct stat_data *)item; printk("\t0%-6o | %6llu | %2u | %d | %u\n", sd_v2_mode(sd), (unsigned long long)sd_v2_size(sd), sd_v2_nlink(sd), sd_v2_rdev(sd), sd_v2_mtime(sd)); } } static void sd_check_item(struct item_head *ih, char *item) { /* unused */ } static int sd_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) { vi->vi_index = TYPE_STAT_DATA; return 0; } static int sd_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { BUG_ON(start_skip || end_skip); return -1; } static int sd_check_right(struct virtual_item *vi, int free) { return -1; } static int sd_part_size(struct virtual_item *vi, int first, int count) { BUG_ON(count); return 0; } static int sd_unit_num(struct virtual_item *vi) { return vi->vi_item_len - IH_SIZE; } static void sd_print_vi(struct virtual_item *vi) { reiserfs_warning(NULL, "reiserfs-16100", "STATDATA, index %d, type 0x%x, %h", vi->vi_index, vi->vi_type, vi->vi_ih); } static struct item_operations stat_data_ops = { .bytes_number = sd_bytes_number, .decrement_key = sd_decrement_key, .is_left_mergeable = sd_is_left_mergeable, .print_item = sd_print_item, .check_item = sd_check_item, .create_vi = sd_create_vi, .check_left = sd_check_left, .check_right = sd_check_right, .part_size = sd_part_size, .unit_num = sd_unit_num, .print_vi = sd_print_vi }; /* direct item functions */ static int direct_bytes_number(struct item_head *ih, int block_size) { return ih_item_len(ih); } /* FIXME: this should probably switch to indirect as well */ static void direct_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); if (cpu_key_k_offset(key) == 0) set_cpu_key_k_type(key, TYPE_STAT_DATA); } static int direct_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { int version = le_key_version(key); return ((le_key_k_offset(version, key) & (bsize - 1)) != 1); } static void direct_print_item(struct item_head *ih, char *item) { int j = 0; /* return; */ printk("\""); while (j < ih_item_len(ih)) printk("%c", item[j++]); printk("\"\n"); } static void direct_check_item(struct item_head *ih, char *item) { /* unused */ } static int direct_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) { vi->vi_index = TYPE_DIRECT; return 0; } static int direct_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { int bytes; bytes = free - free % 8; return bytes ? : -1; } static int direct_check_right(struct virtual_item *vi, int free) { return direct_check_left(vi, free, 0, 0); } static int direct_part_size(struct virtual_item *vi, int first, int count) { return count; } static int direct_unit_num(struct virtual_item *vi) { return vi->vi_item_len - IH_SIZE; } static void direct_print_vi(struct virtual_item *vi) { reiserfs_warning(NULL, "reiserfs-16101", "DIRECT, index %d, type 0x%x, %h", vi->vi_index, vi->vi_type, vi->vi_ih); } static struct item_operations direct_ops = { .bytes_number = direct_bytes_number, .decrement_key = direct_decrement_key, .is_left_mergeable = direct_is_left_mergeable, .print_item = direct_print_item, .check_item = direct_check_item, .create_vi = direct_create_vi, .check_left = direct_check_left, .check_right = direct_check_right, .part_size = direct_part_size, .unit_num = direct_unit_num, .print_vi = direct_print_vi }; /* indirect item functions */ static int indirect_bytes_number(struct item_head *ih, int block_size) { return ih_item_len(ih) / UNFM_P_SIZE * block_size; } /* decrease offset, if it becomes 0, change type to stat data */ static void indirect_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); if (cpu_key_k_offset(key) == 0) set_cpu_key_k_type(key, TYPE_STAT_DATA); } /* if it is not first item of the body, then it is mergeable */ static int indirect_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { int version = le_key_version(key); return (le_key_k_offset(version, key) != 1); } /* printing of indirect item */ static void start_new_sequence(__u32 * start, int *len, __u32 new) { *start = new; *len = 1; } static int sequence_finished(__u32 start, int *len, __u32 new) { if (start == INT_MAX) return 1; if (start == 0 && new == 0) { (*len)++; return 0; } if (start != 0 && (start + *len) == new) { (*len)++; return 0; } return 1; } static void print_sequence(__u32 start, int len) { if (start == INT_MAX) return; if (len == 1) printk(" %d", start); else printk(" %d(%d)", start, len); } static void indirect_print_item(struct item_head *ih, char *item) { int j; __le32 *unp; __u32 prev = INT_MAX; int num = 0; unp = (__le32 *) item; if (ih_item_len(ih) % UNFM_P_SIZE) reiserfs_warning(NULL, "reiserfs-16102", "invalid item len"); printk("%d pointers\n[ ", (int)I_UNFM_NUM(ih)); for (j = 0; j < I_UNFM_NUM(ih); j++) { if (sequence_finished(prev, &num, get_block_num(unp, j))) { print_sequence(prev, num); start_new_sequence(&prev, &num, get_block_num(unp, j)); } } print_sequence(prev, num); printk("]\n"); } static void indirect_check_item(struct item_head *ih, char *item) { /* unused */ } static int indirect_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) { vi->vi_index = TYPE_INDIRECT; return 0; } static int indirect_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { int bytes; bytes = free - free % UNFM_P_SIZE; return bytes ? : -1; } static int indirect_check_right(struct virtual_item *vi, int free) { return indirect_check_left(vi, free, 0, 0); } /* * return size in bytes of 'units' units. If first == 0 - calculate * from the head (left), otherwise - from tail (right) */ static int indirect_part_size(struct virtual_item *vi, int first, int units) { /* unit of indirect item is byte (yet) */ return units; } static int indirect_unit_num(struct virtual_item *vi) { /* unit of indirect item is byte (yet) */ return vi->vi_item_len - IH_SIZE; } static void indirect_print_vi(struct virtual_item *vi) { reiserfs_warning(NULL, "reiserfs-16103", "INDIRECT, index %d, type 0x%x, %h", vi->vi_index, vi->vi_type, vi->vi_ih); } static struct item_operations indirect_ops = { .bytes_number = indirect_bytes_number, .decrement_key = indirect_decrement_key, .is_left_mergeable = indirect_is_left_mergeable, .print_item = indirect_print_item, .check_item = indirect_check_item, .create_vi = indirect_create_vi, .check_left = indirect_check_left, .check_right = indirect_check_right, .part_size = indirect_part_size, .unit_num = indirect_unit_num, .print_vi = indirect_print_vi }; /* direntry functions */ static int direntry_bytes_number(struct item_head *ih, int block_size) { reiserfs_warning(NULL, "vs-16090", "bytes number is asked for direntry"); return 0; } static void direntry_decrement_key(struct cpu_key *key) { cpu_key_k_offset_dec(key); if (cpu_key_k_offset(key) == 0) set_cpu_key_k_type(key, TYPE_STAT_DATA); } static int direntry_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { if (le32_to_cpu(key->u.k_offset_v1.k_offset) == DOT_OFFSET) return 0; return 1; } static void direntry_print_item(struct item_head *ih, char *item) { int i; int namelen; struct reiserfs_de_head *deh; char *name; static char namebuf[80]; printk("\n # %-15s%-30s%-15s%-15s%-15s\n", "Name", "Key of pointed object", "Hash", "Gen number", "Status"); deh = (struct reiserfs_de_head *)item; for (i = 0; i < ih_entry_count(ih); i++, deh++) { namelen = (i ? (deh_location(deh - 1)) : ih_item_len(ih)) - deh_location(deh); name = item + deh_location(deh); if (name[namelen - 1] == 0) namelen = strlen(name); namebuf[0] = '"'; if (namelen > sizeof(namebuf) - 3) { strncpy(namebuf + 1, name, sizeof(namebuf) - 3); namebuf[sizeof(namebuf) - 2] = '"'; namebuf[sizeof(namebuf) - 1] = 0; } else { memcpy(namebuf + 1, name, namelen); namebuf[namelen + 1] = '"'; namebuf[namelen + 2] = 0; } printk("%d: %-15s%-15d%-15d%-15lld%-15lld(%s)\n", i, namebuf, deh_dir_id(deh), deh_objectid(deh), GET_HASH_VALUE(deh_offset(deh)), GET_GENERATION_NUMBER((deh_offset(deh))), (de_hidden(deh)) ? "HIDDEN" : "VISIBLE"); } } static void direntry_check_item(struct item_head *ih, char *item) { int i; struct reiserfs_de_head *deh; /* unused */ deh = (struct reiserfs_de_head *)item; for (i = 0; i < ih_entry_count(ih); i++, deh++) { ; } } #define DIRENTRY_VI_FIRST_DIRENTRY_ITEM 1 /* * function returns old entry number in directory item in real node * using new entry number in virtual item in virtual node */ static inline int old_entry_num(int is_affected, int virtual_entry_num, int pos_in_item, int mode) { if (mode == M_INSERT || mode == M_DELETE) return virtual_entry_num; if (!is_affected) /* cut or paste is applied to another item */ return virtual_entry_num; if (virtual_entry_num < pos_in_item) return virtual_entry_num; if (mode == M_CUT) return virtual_entry_num + 1; RFALSE(mode != M_PASTE || virtual_entry_num == 0, "vs-8015: old_entry_num: mode must be M_PASTE (mode = \'%c\'", mode); return virtual_entry_num - 1; } /* * Create an array of sizes of directory entries for virtual * item. Return space used by an item. FIXME: no control over * consuming of space used by this item handler */ static int direntry_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) { struct direntry_uarea *dir_u = vi->vi_uarea; int i, j; int size = sizeof(struct direntry_uarea); struct reiserfs_de_head *deh; vi->vi_index = TYPE_DIRENTRY; BUG_ON(!(vi->vi_ih) || !vi->vi_item); dir_u->flags = 0; if (le_ih_k_offset(vi->vi_ih) == DOT_OFFSET) dir_u->flags |= DIRENTRY_VI_FIRST_DIRENTRY_ITEM; deh = (struct reiserfs_de_head *)(vi->vi_item); /* virtual directory item have this amount of entry after */ dir_u->entry_count = ih_entry_count(vi->vi_ih) + ((is_affected) ? ((vn->vn_mode == M_CUT) ? -1 : (vn->vn_mode == M_PASTE ? 1 : 0)) : 0); for (i = 0; i < dir_u->entry_count; i++) { j = old_entry_num(is_affected, i, vn->vn_pos_in_item, vn->vn_mode); dir_u->entry_sizes[i] = (j ? deh_location(&deh[j - 1]) : ih_item_len(vi->vi_ih)) - deh_location(&deh[j]) + DEH_SIZE; } size += (dir_u->entry_count * sizeof(short)); /* set size of pasted entry */ if (is_affected && vn->vn_mode == M_PASTE) dir_u->entry_sizes[vn->vn_pos_in_item] = insert_size; #ifdef CONFIG_REISERFS_CHECK /* compare total size of entries with item length */ { int k, l; l = 0; for (k = 0; k < dir_u->entry_count; k++) l += dir_u->entry_sizes[k]; if (l + IH_SIZE != vi->vi_item_len + ((is_affected && (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT)) ? insert_size : 0)) { reiserfs_panic(NULL, "vs-8025", "(mode==%c, " "insert_size==%d), invalid length of " "directory item", vn->vn_mode, insert_size); } } #endif return size; } /* * return number of entries which may fit into specified amount of * free space, or -1 if free space is not enough even for 1 entry */ static int direntry_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { int i; int entries = 0; struct direntry_uarea *dir_u = vi->vi_uarea; for (i = start_skip; i < dir_u->entry_count - end_skip; i++) { /* i-th entry doesn't fit into the remaining free space */ if (dir_u->entry_sizes[i] > free) break; free -= dir_u->entry_sizes[i]; entries++; } if (entries == dir_u->entry_count) { reiserfs_panic(NULL, "item_ops-1", "free space %d, entry_count %d", free, dir_u->entry_count); } /* "." and ".." can not be separated from each other */ if (start_skip == 0 && (dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries < 2) entries = 0; return entries ? : -1; } static int direntry_check_right(struct virtual_item *vi, int free) { int i; int entries = 0; struct direntry_uarea *dir_u = vi->vi_uarea; for (i = dir_u->entry_count - 1; i >= 0; i--) { /* i-th entry doesn't fit into the remaining free space */ if (dir_u->entry_sizes[i] > free) break; free -= dir_u->entry_sizes[i]; entries++; } BUG_ON(entries == dir_u->entry_count); /* "." and ".." can not be separated from each other */ if ((dir_u->flags & DIRENTRY_VI_FIRST_DIRENTRY_ITEM) && entries > dir_u->entry_count - 2) entries = dir_u->entry_count - 2; return entries ? : -1; } /* sum of entry sizes between from-th and to-th entries including both edges */ static int direntry_part_size(struct virtual_item *vi, int first, int count) { int i, retval; int from, to; struct direntry_uarea *dir_u = vi->vi_uarea; retval = 0; if (first == 0) from = 0; else from = dir_u->entry_count - count; to = from + count - 1; for (i = from; i <= to; i++) retval += dir_u->entry_sizes[i]; return retval; } static int direntry_unit_num(struct virtual_item *vi) { struct direntry_uarea *dir_u = vi->vi_uarea; return dir_u->entry_count; } static void direntry_print_vi(struct virtual_item *vi) { int i; struct direntry_uarea *dir_u = vi->vi_uarea; reiserfs_warning(NULL, "reiserfs-16104", "DIRENTRY, index %d, type 0x%x, %h, flags 0x%x", vi->vi_index, vi->vi_type, vi->vi_ih, dir_u->flags); printk("%d entries: ", dir_u->entry_count); for (i = 0; i < dir_u->entry_count; i++) printk("%d ", dir_u->entry_sizes[i]); printk("\n"); } static struct item_operations direntry_ops = { .bytes_number = direntry_bytes_number, .decrement_key = direntry_decrement_key, .is_left_mergeable = direntry_is_left_mergeable, .print_item = direntry_print_item, .check_item = direntry_check_item, .create_vi = direntry_create_vi, .check_left = direntry_check_left, .check_right = direntry_check_right, .part_size = direntry_part_size, .unit_num = direntry_unit_num, .print_vi = direntry_print_vi }; /* Error catching functions to catch errors caused by incorrect item types. */ static int errcatch_bytes_number(struct item_head *ih, int block_size) { reiserfs_warning(NULL, "green-16001", "Invalid item type observed, run fsck ASAP"); return 0; } static void errcatch_decrement_key(struct cpu_key *key) { reiserfs_warning(NULL, "green-16002", "Invalid item type observed, run fsck ASAP"); } static int errcatch_is_left_mergeable(struct reiserfs_key *key, unsigned long bsize) { reiserfs_warning(NULL, "green-16003", "Invalid item type observed, run fsck ASAP"); return 0; } static void errcatch_print_item(struct item_head *ih, char *item) { reiserfs_warning(NULL, "green-16004", "Invalid item type observed, run fsck ASAP"); } static void errcatch_check_item(struct item_head *ih, char *item) { reiserfs_warning(NULL, "green-16005", "Invalid item type observed, run fsck ASAP"); } static int errcatch_create_vi(struct virtual_node *vn, struct virtual_item *vi, int is_affected, int insert_size) { reiserfs_warning(NULL, "green-16006", "Invalid item type observed, run fsck ASAP"); /* * We might return -1 here as well, but it won't help as * create_virtual_node() from where this operation is called * from is of return type void. */ return 0; } static int errcatch_check_left(struct virtual_item *vi, int free, int start_skip, int end_skip) { reiserfs_warning(NULL, "green-16007", "Invalid item type observed, run fsck ASAP"); return -1; } static int errcatch_check_right(struct virtual_item *vi, int free) { reiserfs_warning(NULL, "green-16008", "Invalid item type observed, run fsck ASAP"); return -1; } static int errcatch_part_size(struct virtual_item *vi, int first, int count) { reiserfs_warning(NULL, "green-16009", "Invalid item type observed, run fsck ASAP"); return 0; } static int errcatch_unit_num(struct virtual_item *vi) { reiserfs_warning(NULL, "green-16010", "Invalid item type observed, run fsck ASAP"); return 0; } static void errcatch_print_vi(struct virtual_item *vi) { reiserfs_warning(NULL, "green-16011", "Invalid item type observed, run fsck ASAP"); } static struct item_operations errcatch_ops = { .bytes_number = errcatch_bytes_number, .decrement_key = errcatch_decrement_key, .is_left_mergeable = errcatch_is_left_mergeable, .print_item = errcatch_print_item, .check_item = errcatch_check_item, .create_vi = errcatch_create_vi, .check_left = errcatch_check_left, .check_right = errcatch_check_right, .part_size = errcatch_part_size, .unit_num = errcatch_unit_num, .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) #error Item types must use disk-format assigned values. #endif struct item_operations *item_ops[TYPE_ANY + 1] = { &stat_data_ops, &indirect_ops, &direct_ops, &direntry_ops, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, &errcatch_ops /* This is to catch errors with invalid type (15th entry for TYPE_ANY) */ }; |
17 2 11 12 4 10 5 8 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * sm3_base.h - core logic for SM3 implementations * * Copyright (C) 2017 ARM Limited or its affiliates. * Written by Gilad Ben-Yossef <gilad@benyossef.com> */ #ifndef _CRYPTO_SM3_BASE_H #define _CRYPTO_SM3_BASE_H #include <crypto/internal/hash.h> #include <crypto/sm3.h> #include <linux/crypto.h> #include <linux/module.h> #include <linux/string.h> #include <asm/unaligned.h> typedef void (sm3_block_fn)(struct sm3_state *sst, u8 const *src, int blocks); static inline int sm3_base_init(struct shash_desc *desc) { struct sm3_state *sctx = shash_desc_ctx(desc); sctx->state[0] = SM3_IVA; sctx->state[1] = SM3_IVB; sctx->state[2] = SM3_IVC; sctx->state[3] = SM3_IVD; sctx->state[4] = SM3_IVE; sctx->state[5] = SM3_IVF; sctx->state[6] = SM3_IVG; sctx->state[7] = SM3_IVH; sctx->count = 0; return 0; } static inline int sm3_base_do_update(struct shash_desc *desc, const u8 *data, unsigned int len, sm3_block_fn *block_fn) { struct sm3_state *sctx = shash_desc_ctx(desc); unsigned int partial = sctx->count % SM3_BLOCK_SIZE; sctx->count += len; if (unlikely((partial + len) >= SM3_BLOCK_SIZE)) { int blocks; if (partial) { int p = SM3_BLOCK_SIZE - partial; memcpy(sctx->buffer + partial, data, p); data += p; len -= p; block_fn(sctx, sctx->buffer, 1); } blocks = len / SM3_BLOCK_SIZE; len %= SM3_BLOCK_SIZE; if (blocks) { block_fn(sctx, data, blocks); data += blocks * SM3_BLOCK_SIZE; } partial = 0; } if (len) memcpy(sctx->buffer + partial, data, len); return 0; } static inline int sm3_base_do_finalize(struct shash_desc *desc, sm3_block_fn *block_fn) { const int bit_offset = SM3_BLOCK_SIZE - sizeof(__be64); struct sm3_state *sctx = shash_desc_ctx(desc); __be64 *bits = (__be64 *)(sctx->buffer + bit_offset); unsigned int partial = sctx->count % SM3_BLOCK_SIZE; sctx->buffer[partial++] = 0x80; if (partial > bit_offset) { memset(sctx->buffer + partial, 0x0, SM3_BLOCK_SIZE - partial); partial = 0; block_fn(sctx, sctx->buffer, 1); } memset(sctx->buffer + partial, 0x0, bit_offset - partial); *bits = cpu_to_be64(sctx->count << 3); block_fn(sctx, sctx->buffer, 1); return 0; } static inline int sm3_base_finish(struct shash_desc *desc, u8 *out) { struct sm3_state *sctx = shash_desc_ctx(desc); __be32 *digest = (__be32 *)out; int i; for (i = 0; i < SM3_DIGEST_SIZE / sizeof(__be32); i++) put_unaligned_be32(sctx->state[i], digest++); memzero_explicit(sctx, sizeof(*sctx)); return 0; } #endif /* _CRYPTO_SM3_BASE_H */ |
133 3 128 8 2 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Media Controller ancillary functions * * Copyright (c) 2016 Mauro Carvalho Chehab <mchehab@kernel.org> * Copyright (C) 2016 Shuah Khan <shuahkh@osg.samsung.com> * Copyright (C) 2006-2010 Nokia Corporation * Copyright (c) 2016 Intel Corporation. */ #include <linux/module.h> #include <linux/pci.h> #include <linux/usb.h> #include <media/media-device.h> #include <media/media-entity.h> #include <media/v4l2-fh.h> #include <media/v4l2-mc.h> #include <media/v4l2-subdev.h> #include <media/videobuf2-core.h> int v4l2_mc_create_media_graph(struct media_device *mdev) { struct media_entity *entity; struct media_entity *if_vid = NULL, *if_aud = NULL; struct media_entity *tuner = NULL, *decoder = NULL; struct media_entity *io_v4l = NULL, *io_vbi = NULL, *io_swradio = NULL; bool is_webcam = false; u32 flags; int ret, pad_sink, pad_source; if (!mdev) return 0; media_device_for_each_entity(entity, mdev) { switch (entity->function) { case MEDIA_ENT_F_IF_VID_DECODER: if_vid = entity; break; case MEDIA_ENT_F_IF_AUD_DECODER: if_aud = entity; break; case MEDIA_ENT_F_TUNER: tuner = entity; break; case MEDIA_ENT_F_ATV_DECODER: decoder = entity; break; case MEDIA_ENT_F_IO_V4L: io_v4l = entity; break; case MEDIA_ENT_F_IO_VBI: io_vbi = entity; break; case MEDIA_ENT_F_IO_SWRADIO: io_swradio = entity; break; case MEDIA_ENT_F_CAM_SENSOR: is_webcam = true; break; } } /* It should have at least one I/O entity */ if (!io_v4l && !io_vbi && !io_swradio) { dev_warn(mdev->dev, "Didn't find any I/O entity\n"); return -EINVAL; } /* * Here, webcams are modelled on a very simple way: the sensor is * connected directly to the I/O entity. All dirty details, like * scaler and crop HW are hidden. While such mapping is not enough * for mc-centric hardware, it is enough for v4l2 interface centric * PC-consumer's hardware. */ if (is_webcam) { if (!io_v4l) { dev_warn(mdev->dev, "Didn't find a MEDIA_ENT_F_IO_V4L\n"); return -EINVAL; } media_device_for_each_entity(entity, mdev) { if (entity->function != MEDIA_ENT_F_CAM_SENSOR) continue; ret = media_create_pad_link(entity, 0, io_v4l, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "Failed to create a sensor link\n"); return ret; } } if (!decoder) return 0; } /* The device isn't a webcam. So, it should have a decoder */ if (!decoder) { dev_warn(mdev->dev, "Decoder not found\n"); return -EINVAL; } /* Link the tuner and IF video output pads */ if (tuner) { if (if_vid) { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(if_vid, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "Couldn't get tuner and/or PLL pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, if_vid, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "Couldn't create tuner->PLL link)\n"); return ret; } pad_source = media_get_pad_index(if_vid, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "get decoder and/or PLL pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(if_vid, pad_source, decoder, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link PLL to decoder\n"); return ret; } } else { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_ANALOG); pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner and/or decoder pad(s): (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, decoder, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) return ret; } if (if_aud) { pad_source = media_get_pad_index(tuner, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_AUDIO); pad_sink = media_get_pad_index(if_aud, MEDIA_PAD_FL_SINK, PAD_SIGNAL_AUDIO); if (pad_source < 0 || pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner and/or decoder pad(s) for audio: (%d, %d)\n", pad_source, pad_sink); return -EINVAL; } ret = media_create_pad_link(tuner, pad_source, if_aud, pad_sink, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link tuner->audio PLL\n"); return ret; } } else { if_aud = tuner; } } /* Create demod to V4L, VBI and SDR radio links */ if (io_v4l) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for V4L I/O\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_v4l, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to V4L I/O\n"); return ret; } } if (io_swradio) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for SDR\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_swradio, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to SDR\n"); return ret; } } if (io_vbi) { pad_source = media_get_pad_index(decoder, MEDIA_PAD_FL_SOURCE, PAD_SIGNAL_DV); if (pad_source < 0) { dev_warn(mdev->dev, "couldn't get decoder output pad for VBI\n"); return -EINVAL; } ret = media_create_pad_link(decoder, pad_source, io_vbi, 0, MEDIA_LNK_FL_ENABLED); if (ret) { dev_warn(mdev->dev, "couldn't link decoder output to VBI\n"); return ret; } } /* Create links for the media connectors */ flags = MEDIA_LNK_FL_ENABLED; media_device_for_each_entity(entity, mdev) { switch (entity->function) { case MEDIA_ENT_F_CONN_RF: if (!tuner) continue; pad_sink = media_get_pad_index(tuner, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_sink < 0) { dev_warn(mdev->dev, "couldn't get tuner analog pad sink\n"); return -EINVAL; } ret = media_create_pad_link(entity, 0, tuner, pad_sink, flags); break; case MEDIA_ENT_F_CONN_SVIDEO: case MEDIA_ENT_F_CONN_COMPOSITE: pad_sink = media_get_pad_index(decoder, MEDIA_PAD_FL_SINK, PAD_SIGNAL_ANALOG); if (pad_sink < 0) { dev_warn(mdev->dev, "couldn't get decoder analog pad sink\n"); return -EINVAL; } ret = media_create_pad_link(entity, 0, decoder, pad_sink, flags); break; default: continue; } if (ret) return ret; flags = 0; } return 0; } EXPORT_SYMBOL_GPL(v4l2_mc_create_media_graph); int v4l_enable_media_source(struct video_device *vdev) { struct media_device *mdev = vdev->entity.graph_obj.mdev; int ret = 0, err; if (!mdev) return 0; mutex_lock(&mdev->graph_mutex); if (!mdev->enable_source) goto end; err = mdev->enable_source(&vdev->entity, &vdev->pipe); if (err) ret = -EBUSY; end: mutex_unlock(&mdev->graph_mutex); return ret; } EXPORT_SYMBOL_GPL(v4l_enable_media_source); void v4l_disable_media_source(struct video_device *vdev) { struct media_device *mdev = vdev->entity.graph_obj.mdev; if (mdev) { mutex_lock(&mdev->graph_mutex); if (mdev->disable_source) mdev->disable_source(&vdev->entity); mutex_unlock(&mdev->graph_mutex); } } EXPORT_SYMBOL_GPL(v4l_disable_media_source); int v4l_vb2q_enable_media_source(struct vb2_queue *q) { struct v4l2_fh *fh = q->owner; if (fh && fh->vdev) return v4l_enable_media_source(fh->vdev); return 0; } EXPORT_SYMBOL_GPL(v4l_vb2q_enable_media_source); int v4l2_create_fwnode_links_to_pad(struct v4l2_subdev *src_sd, struct media_pad *sink, u32 flags) { struct fwnode_handle *endpoint; if (!(sink->flags & MEDIA_PAD_FL_SINK)) return -EINVAL; fwnode_graph_for_each_endpoint(dev_fwnode(src_sd->dev), endpoint) { struct fwnode_handle *remote_ep; int src_idx, sink_idx, ret; struct media_pad *src; src_idx = media_entity_get_fwnode_pad(&src_sd->entity, endpoint, MEDIA_PAD_FL_SOURCE); if (src_idx < 0) continue; remote_ep = fwnode_graph_get_remote_endpoint(endpoint); if (!remote_ep) continue; /* * ask the sink to verify it owns the remote endpoint, * and translate to a sink pad. */ sink_idx = media_entity_get_fwnode_pad(sink->entity, remote_ep, MEDIA_PAD_FL_SINK); fwnode_handle_put(remote_ep); if (sink_idx < 0 || sink_idx != sink->index) continue; /* * the source endpoint corresponds to one of its source pads, * the source endpoint connects to an endpoint at the sink * entity, and the sink endpoint corresponds to the sink * pad requested, so we have found an endpoint connection * that works, create the media link for it. */ src = &src_sd->entity.pads[src_idx]; /* skip if link already exists */ if (media_entity_find_link(src, sink)) continue; dev_dbg(src_sd->dev, "creating link %s:%d -> %s:%d\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx); ret = media_create_pad_link(&src_sd->entity, src_idx, sink->entity, sink_idx, flags); if (ret) { dev_err(src_sd->dev, "link %s:%d -> %s:%d failed with %d\n", src_sd->entity.name, src_idx, sink->entity->name, sink_idx, ret); fwnode_handle_put(endpoint); return ret; } } return 0; } EXPORT_SYMBOL_GPL(v4l2_create_fwnode_links_to_pad); int v4l2_create_fwnode_links(struct v4l2_subdev *src_sd, struct v4l2_subdev *sink_sd) { unsigned int i; for (i = 0; i < sink_sd->entity.num_pads; i++) { struct media_pad *pad = &sink_sd->entity.pads[i]; int ret; if (!(pad->flags & MEDIA_PAD_FL_SINK)) continue; ret = v4l2_create_fwnode_links_to_pad(src_sd, pad, 0); if (ret) return ret; } return 0; } EXPORT_SYMBOL_GPL(v4l2_create_fwnode_links); /* ----------------------------------------------------------------------------- * Pipeline power management * * Entities must be powered up when part of a pipeline that contains at least * one open video device node. * * To achieve this use the entity use_count field to track the number of users. * For entities corresponding to video device nodes the use_count field stores * the users count of the node. For entities corresponding to subdevs the * use_count field stores the total number of users of all video device nodes * in the pipeline. * * The v4l2_pipeline_pm_{get, put}() functions must be called in the open() and * close() handlers of video device nodes. It increments or decrements the use * count of all subdev entities in the pipeline. * * To react to link management on powered pipelines, the link setup notification * callback updates the use count of all entities in the source and sink sides * of the link. */ /* * pipeline_pm_use_count - Count the number of users of a pipeline * @entity: The entity * * Return the total number of users of all video device nodes in the pipeline. */ static int pipeline_pm_use_count(struct media_entity *entity, struct media_graph *graph) { int use = 0; media_graph_walk_start(graph, entity); while ((entity = media_graph_walk_next(graph))) { if (is_media_entity_v4l2_video_device(entity)) use += entity->use_count; } return use; } /* * pipeline_pm_power_one - Apply power change to an entity * @entity: The entity * @change: Use count change * * Change the entity use count by @change. If the entity is a subdev update its * power state by calling the core::s_power operation when the use count goes * from 0 to != 0 or from != 0 to 0. * * Return 0 on success or a negative error code on failure. */ static int pipeline_pm_power_one(struct media_entity *entity, int change) { struct v4l2_subdev *subdev; int ret; subdev = is_media_entity_v4l2_subdev(entity) ? media_entity_to_v4l2_subdev(entity) : NULL; if (entity->use_count == 0 && change > 0 && subdev != NULL) { ret = v4l2_subdev_call(subdev, core, s_power, 1); if (ret < 0 && ret != -ENOIOCTLCMD) return ret; } entity->use_count += change; WARN_ON(entity->use_count < 0); if (entity->use_count == 0 && change < 0 && subdev != NULL) v4l2_subdev_call(subdev, core, s_power, 0); return 0; } /* * pipeline_pm_power - Apply power change to all entities in a pipeline * @entity: The entity * @change: Use count change * * Walk the pipeline to update the use count and the power state of all non-node * entities. * * Return 0 on success or a negative error code on failure. */ static int pipeline_pm_power(struct media_entity *entity, int change, struct media_graph *graph) { struct media_entity *first = entity; int ret = 0; if (!change) return 0; media_graph_walk_start(graph, entity); while (!ret && (entity = media_graph_walk_next(graph))) if (is_media_entity_v4l2_subdev(entity)) ret = pipeline_pm_power_one(entity, change); if (!ret) return ret; media_graph_walk_start(graph, first); while ((first = media_graph_walk_next(graph)) && first != entity) if (is_media_entity_v4l2_subdev(first)) pipeline_pm_power_one(first, -change); return ret; } static int v4l2_pipeline_pm_use(struct media_entity *entity, unsigned int use) { struct media_device *mdev = entity->graph_obj.mdev; int change = use ? 1 : -1; int ret; mutex_lock(&mdev->graph_mutex); /* Apply use count to node. */ entity->use_count += change; WARN_ON(entity->use_count < 0); /* Apply power change to connected non-nodes. */ ret = pipeline_pm_power(entity, change, &mdev->pm_count_walk); if (ret < 0) entity->use_count -= change; mutex_unlock(&mdev->graph_mutex); return ret; } int v4l2_pipeline_pm_get(struct media_entity *entity) { return v4l2_pipeline_pm_use(entity, 1); } EXPORT_SYMBOL_GPL(v4l2_pipeline_pm_get); void v4l2_pipeline_pm_put(struct media_entity *entity) { /* Powering off entities shouldn't fail. */ WARN_ON(v4l2_pipeline_pm_use(entity, 0)); } EXPORT_SYMBOL_GPL(v4l2_pipeline_pm_put); int v4l2_pipeline_link_notify(struct media_link *link, u32 flags, unsigned int notification) { struct media_graph *graph = &link->graph_obj.mdev->pm_count_walk; struct media_entity *source = link->source->entity; struct media_entity *sink = link->sink->entity; int source_use; int sink_use; int ret = 0; source_use = pipeline_pm_use_count(source, graph); sink_use = pipeline_pm_use_count(sink, graph); if (notification == MEDIA_DEV_NOTIFY_POST_LINK_CH && !(flags & MEDIA_LNK_FL_ENABLED)) { /* Powering off entities is assumed to never fail. */ pipeline_pm_power(source, -sink_use, graph); pipeline_pm_power(sink, -source_use, graph); return 0; } if (notification == MEDIA_DEV_NOTIFY_PRE_LINK_CH && (flags & MEDIA_LNK_FL_ENABLED)) { ret = pipeline_pm_power(source, sink_use, graph); if (ret < 0) return ret; ret = pipeline_pm_power(sink, source_use, graph); if (ret < 0) pipeline_pm_power(source, -sink_use, graph); } return ret; } EXPORT_SYMBOL_GPL(v4l2_pipeline_link_notify); |
131 9 103 3 3 2 1 1 1 44 44 10 10 10 2 8 10 10 11 1 8 1 10 6 6 6 44 43 2 42 2 42 5 4 1 46 42 46 12 12 12 41 1 14 2 45 46 41 12 46 4 46 12 2 2 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ALSA sequencer Ports * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> * Jaroslav Kysela <perex@perex.cz> */ #include <sound/core.h> #include <linux/slab.h> #include <linux/module.h> #include "seq_system.h" #include "seq_ports.h" #include "seq_clientmgr.h" /* registration of client ports */ /* NOTE: the current implementation of the port structure as a linked list is not optimal for clients that have many ports. For sending messages to all subscribers of a port we first need to find the address of the port structure, which means we have to traverse the list. A direct access table (array) would be better, but big preallocated arrays waste memory. Possible actions: 1) leave it this way, a client does normaly does not have more than a few ports 2) replace the linked list of ports by a array of pointers which is dynamicly kmalloced. When a port is added or deleted we can simply allocate a new array, copy the corresponding pointers, and delete the old one. We then only need a pointer to this array, and an integer that tells us how much elements are in array. */ /* return pointer to port structure - port is locked if found */ struct snd_seq_client_port *snd_seq_port_use_ptr(struct snd_seq_client *client, int num) { struct snd_seq_client_port *port; if (client == NULL) return NULL; read_lock(&client->ports_lock); list_for_each_entry(port, &client->ports_list_head, list) { if (port->addr.port == num) { if (port->closing) break; /* deleting now */ snd_use_lock_use(&port->use_lock); read_unlock(&client->ports_lock); return port; } } read_unlock(&client->ports_lock); return NULL; /* not found */ } /* search for the next port - port is locked if found */ struct snd_seq_client_port *snd_seq_port_query_nearest(struct snd_seq_client *client, struct snd_seq_port_info *pinfo) { int num; struct snd_seq_client_port *port, *found; bool check_inactive = (pinfo->capability & SNDRV_SEQ_PORT_CAP_INACTIVE); num = pinfo->addr.port; found = NULL; read_lock(&client->ports_lock); list_for_each_entry(port, &client->ports_list_head, list) { if ((port->capability & SNDRV_SEQ_PORT_CAP_INACTIVE) && !check_inactive) continue; /* skip inactive ports */ if (port->addr.port < num) continue; if (port->addr.port == num) { found = port; break; } if (found == NULL || port->addr.port < found->addr.port) found = port; } if (found) { if (found->closing) found = NULL; else snd_use_lock_use(&found->use_lock); } read_unlock(&client->ports_lock); return found; } /* initialize snd_seq_port_subs_info */ static void port_subs_info_init(struct snd_seq_port_subs_info *grp) { INIT_LIST_HEAD(&grp->list_head); grp->count = 0; grp->exclusive = 0; rwlock_init(&grp->list_lock); init_rwsem(&grp->list_mutex); grp->open = NULL; grp->close = NULL; } /* create a port, port number or a negative error code is returned * the caller needs to unref the port via snd_seq_port_unlock() appropriately */ int snd_seq_create_port(struct snd_seq_client *client, int port, struct snd_seq_client_port **port_ret) { struct snd_seq_client_port *new_port, *p; int num; *port_ret = NULL; /* sanity check */ if (snd_BUG_ON(!client)) return -EINVAL; if (client->num_ports >= SNDRV_SEQ_MAX_PORTS) { pr_warn("ALSA: seq: too many ports for client %d\n", client->number); return -EINVAL; } /* create a new port */ new_port = kzalloc(sizeof(*new_port), GFP_KERNEL); if (!new_port) return -ENOMEM; /* failure, out of memory */ /* init port data */ new_port->addr.client = client->number; new_port->addr.port = -1; new_port->owner = THIS_MODULE; snd_use_lock_init(&new_port->use_lock); port_subs_info_init(&new_port->c_src); port_subs_info_init(&new_port->c_dest); snd_use_lock_use(&new_port->use_lock); num = max(port, 0); mutex_lock(&client->ports_mutex); write_lock_irq(&client->ports_lock); list_for_each_entry(p, &client->ports_list_head, list) { if (p->addr.port == port) { kfree(new_port); num = -EBUSY; goto unlock; } if (p->addr.port > num) break; if (port < 0) /* auto-probe mode */ num = p->addr.port + 1; } /* insert the new port */ list_add_tail(&new_port->list, &p->list); client->num_ports++; new_port->addr.port = num; /* store the port number in the port */ sprintf(new_port->name, "port-%d", num); *port_ret = new_port; unlock: write_unlock_irq(&client->ports_lock); mutex_unlock(&client->ports_mutex); return num; } /* */ static int subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack); static int unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack); static struct snd_seq_client_port *get_client_port(struct snd_seq_addr *addr, struct snd_seq_client **cp) { struct snd_seq_client_port *p; *cp = snd_seq_client_use_ptr(addr->client); if (*cp) { p = snd_seq_port_use_ptr(*cp, addr->port); if (! p) { snd_seq_client_unlock(*cp); *cp = NULL; } return p; } return NULL; } static void delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack); static inline struct snd_seq_subscribers * get_subscriber(struct list_head *p, bool is_src) { if (is_src) return list_entry(p, struct snd_seq_subscribers, src_list); else return list_entry(p, struct snd_seq_subscribers, dest_list); } /* * remove all subscribers on the list * this is called from port_delete, for each src and dest list. */ static void clear_subscriber_list(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, int is_src) { struct list_head *p, *n; list_for_each_safe(p, n, &grp->list_head) { struct snd_seq_subscribers *subs; struct snd_seq_client *c; struct snd_seq_client_port *aport; subs = get_subscriber(p, is_src); if (is_src) aport = get_client_port(&subs->info.dest, &c); else aport = get_client_port(&subs->info.sender, &c); delete_and_unsubscribe_port(client, port, subs, is_src, false); if (!aport) { /* looks like the connected port is being deleted. * we decrease the counter, and when both ports are deleted * remove the subscriber info */ if (atomic_dec_and_test(&subs->ref_count)) kfree(subs); continue; } /* ok we got the connected port */ delete_and_unsubscribe_port(c, aport, subs, !is_src, true); kfree(subs); snd_seq_port_unlock(aport); snd_seq_client_unlock(c); } } /* delete port data */ static int port_delete(struct snd_seq_client *client, struct snd_seq_client_port *port) { /* set closing flag and wait for all port access are gone */ port->closing = 1; snd_use_lock_sync(&port->use_lock); /* clear subscribers info */ clear_subscriber_list(client, port, &port->c_src, true); clear_subscriber_list(client, port, &port->c_dest, false); if (port->private_free) port->private_free(port->private_data); snd_BUG_ON(port->c_src.count != 0); snd_BUG_ON(port->c_dest.count != 0); kfree(port); return 0; } /* delete a port with the given port id */ int snd_seq_delete_port(struct snd_seq_client *client, int port) { struct snd_seq_client_port *found = NULL, *p; mutex_lock(&client->ports_mutex); write_lock_irq(&client->ports_lock); list_for_each_entry(p, &client->ports_list_head, list) { if (p->addr.port == port) { /* ok found. delete from the list at first */ list_del(&p->list); client->num_ports--; found = p; break; } } write_unlock_irq(&client->ports_lock); mutex_unlock(&client->ports_mutex); if (found) return port_delete(client, found); else return -ENOENT; } /* delete the all ports belonging to the given client */ int snd_seq_delete_all_ports(struct snd_seq_client *client) { struct list_head deleted_list; struct snd_seq_client_port *port, *tmp; /* move the port list to deleted_list, and * clear the port list in the client data. */ mutex_lock(&client->ports_mutex); write_lock_irq(&client->ports_lock); if (! list_empty(&client->ports_list_head)) { list_add(&deleted_list, &client->ports_list_head); list_del_init(&client->ports_list_head); } else { INIT_LIST_HEAD(&deleted_list); } client->num_ports = 0; write_unlock_irq(&client->ports_lock); /* remove each port in deleted_list */ list_for_each_entry_safe(port, tmp, &deleted_list, list) { list_del(&port->list); snd_seq_system_client_ev_port_exit(port->addr.client, port->addr.port); port_delete(client, port); } mutex_unlock(&client->ports_mutex); return 0; } /* set port info fields */ int snd_seq_set_port_info(struct snd_seq_client_port * port, struct snd_seq_port_info * info) { if (snd_BUG_ON(!port || !info)) return -EINVAL; /* set port name */ if (info->name[0]) strscpy(port->name, info->name, sizeof(port->name)); /* set capabilities */ port->capability = info->capability; /* get port type */ port->type = info->type; /* information about supported channels/voices */ port->midi_channels = info->midi_channels; port->midi_voices = info->midi_voices; port->synth_voices = info->synth_voices; /* timestamping */ port->timestamping = (info->flags & SNDRV_SEQ_PORT_FLG_TIMESTAMP) ? 1 : 0; port->time_real = (info->flags & SNDRV_SEQ_PORT_FLG_TIME_REAL) ? 1 : 0; port->time_queue = info->time_queue; /* UMP direction and group */ port->direction = info->direction; port->ump_group = info->ump_group; if (port->ump_group > SNDRV_UMP_MAX_GROUPS) port->ump_group = 0; /* fill default port direction */ if (!port->direction) { if (info->capability & SNDRV_SEQ_PORT_CAP_READ) port->direction |= SNDRV_SEQ_PORT_DIR_INPUT; if (info->capability & SNDRV_SEQ_PORT_CAP_WRITE) port->direction |= SNDRV_SEQ_PORT_DIR_OUTPUT; } return 0; } /* get port info fields */ int snd_seq_get_port_info(struct snd_seq_client_port * port, struct snd_seq_port_info * info) { if (snd_BUG_ON(!port || !info)) return -EINVAL; /* get port name */ strscpy(info->name, port->name, sizeof(info->name)); /* get capabilities */ info->capability = port->capability; /* get port type */ info->type = port->type; /* information about supported channels/voices */ info->midi_channels = port->midi_channels; info->midi_voices = port->midi_voices; info->synth_voices = port->synth_voices; /* get subscriber counts */ info->read_use = port->c_src.count; info->write_use = port->c_dest.count; /* timestamping */ info->flags = 0; if (port->timestamping) { info->flags |= SNDRV_SEQ_PORT_FLG_TIMESTAMP; if (port->time_real) info->flags |= SNDRV_SEQ_PORT_FLG_TIME_REAL; info->time_queue = port->time_queue; } /* UMP direction and group */ info->direction = port->direction; info->ump_group = port->ump_group; return 0; } /* * call callback functions (if any): * the callbacks are invoked only when the first (for connection) or * the last subscription (for disconnection) is done. Second or later * subscription results in increment of counter, but no callback is * invoked. * This feature is useful if these callbacks are associated with * initialization or termination of devices (see seq_midi.c). */ static int subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack) { int err = 0; if (!try_module_get(port->owner)) return -EFAULT; grp->count++; if (grp->open && grp->count == 1) { err = grp->open(port->private_data, info); if (err < 0) { module_put(port->owner); grp->count--; } } if (err >= 0 && send_ack && client->type == USER_CLIENT) snd_seq_client_notify_subscription(port->addr.client, port->addr.port, info, SNDRV_SEQ_EVENT_PORT_SUBSCRIBED); return err; } static int unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_port_subs_info *grp, struct snd_seq_port_subscribe *info, int send_ack) { int err = 0; if (! grp->count) return -EINVAL; grp->count--; if (grp->close && grp->count == 0) err = grp->close(port->private_data, info); if (send_ack && client->type == USER_CLIENT) snd_seq_client_notify_subscription(port->addr.client, port->addr.port, info, SNDRV_SEQ_EVENT_PORT_UNSUBSCRIBED); module_put(port->owner); return err; } /* check if both addresses are identical */ static inline int addr_match(struct snd_seq_addr *r, struct snd_seq_addr *s) { return (r->client == s->client) && (r->port == s->port); } /* check the two subscribe info match */ /* if flags is zero, checks only sender and destination addresses */ static int match_subs_info(struct snd_seq_port_subscribe *r, struct snd_seq_port_subscribe *s) { if (addr_match(&r->sender, &s->sender) && addr_match(&r->dest, &s->dest)) { if (r->flags && r->flags == s->flags) return r->queue == s->queue; else if (! r->flags) return 1; } return 0; } static int check_and_subscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool exclusive, bool ack) { struct snd_seq_port_subs_info *grp; struct list_head *p; struct snd_seq_subscribers *s; int err; grp = is_src ? &port->c_src : &port->c_dest; err = -EBUSY; down_write(&grp->list_mutex); if (exclusive) { if (!list_empty(&grp->list_head)) goto __error; } else { if (grp->exclusive) goto __error; /* check whether already exists */ list_for_each(p, &grp->list_head) { s = get_subscriber(p, is_src); if (match_subs_info(&subs->info, &s->info)) goto __error; } } err = subscribe_port(client, port, grp, &subs->info, ack); if (err < 0) { grp->exclusive = 0; goto __error; } /* add to list */ write_lock_irq(&grp->list_lock); if (is_src) list_add_tail(&subs->src_list, &grp->list_head); else list_add_tail(&subs->dest_list, &grp->list_head); grp->exclusive = exclusive; atomic_inc(&subs->ref_count); write_unlock_irq(&grp->list_lock); err = 0; __error: up_write(&grp->list_mutex); return err; } /* called with grp->list_mutex held */ static void __delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack) { struct snd_seq_port_subs_info *grp; struct list_head *list; bool empty; grp = is_src ? &port->c_src : &port->c_dest; list = is_src ? &subs->src_list : &subs->dest_list; write_lock_irq(&grp->list_lock); empty = list_empty(list); if (!empty) list_del_init(list); grp->exclusive = 0; write_unlock_irq(&grp->list_lock); if (!empty) unsubscribe_port(client, port, grp, &subs->info, ack); } static void delete_and_unsubscribe_port(struct snd_seq_client *client, struct snd_seq_client_port *port, struct snd_seq_subscribers *subs, bool is_src, bool ack) { struct snd_seq_port_subs_info *grp; grp = is_src ? &port->c_src : &port->c_dest; down_write(&grp->list_mutex); __delete_and_unsubscribe_port(client, port, subs, is_src, ack); up_write(&grp->list_mutex); } /* connect two ports */ int snd_seq_port_connect(struct snd_seq_client *connector, struct snd_seq_client *src_client, struct snd_seq_client_port *src_port, struct snd_seq_client *dest_client, struct snd_seq_client_port *dest_port, struct snd_seq_port_subscribe *info) { struct snd_seq_subscribers *subs; bool exclusive; int err; subs = kzalloc(sizeof(*subs), GFP_KERNEL); if (!subs) return -ENOMEM; subs->info = *info; atomic_set(&subs->ref_count, 0); INIT_LIST_HEAD(&subs->src_list); INIT_LIST_HEAD(&subs->dest_list); exclusive = !!(info->flags & SNDRV_SEQ_PORT_SUBS_EXCLUSIVE); err = check_and_subscribe_port(src_client, src_port, subs, true, exclusive, connector->number != src_client->number); if (err < 0) goto error; err = check_and_subscribe_port(dest_client, dest_port, subs, false, exclusive, connector->number != dest_client->number); if (err < 0) goto error_dest; return 0; error_dest: delete_and_unsubscribe_port(src_client, src_port, subs, true, connector->number != src_client->number); error: kfree(subs); return err; } /* remove the connection */ int snd_seq_port_disconnect(struct snd_seq_client *connector, struct snd_seq_client *src_client, struct snd_seq_client_port *src_port, struct snd_seq_client *dest_client, struct snd_seq_client_port *dest_port, struct snd_seq_port_subscribe *info) { struct snd_seq_port_subs_info *dest = &dest_port->c_dest; struct snd_seq_subscribers *subs; int err = -ENOENT; /* always start from deleting the dest port for avoiding concurrent * deletions */ down_write(&dest->list_mutex); /* look for the connection */ list_for_each_entry(subs, &dest->list_head, dest_list) { if (match_subs_info(info, &subs->info)) { __delete_and_unsubscribe_port(dest_client, dest_port, subs, false, connector->number != dest_client->number); err = 0; break; } } up_write(&dest->list_mutex); if (err < 0) return err; delete_and_unsubscribe_port(src_client, src_port, subs, true, connector->number != src_client->number); kfree(subs); return 0; } /* get matched subscriber */ int snd_seq_port_get_subscription(struct snd_seq_port_subs_info *src_grp, struct snd_seq_addr *dest_addr, struct snd_seq_port_subscribe *subs) { struct snd_seq_subscribers *s; int err = -ENOENT; down_read(&src_grp->list_mutex); list_for_each_entry(s, &src_grp->list_head, src_list) { if (addr_match(dest_addr, &s->info.dest)) { *subs = s->info; err = 0; break; } } up_read(&src_grp->list_mutex); return err; } /* * Attach a device driver that wants to receive events from the * sequencer. Returns the new port number on success. * A driver that wants to receive the events converted to midi, will * use snd_seq_midisynth_register_port(). */ /* exported */ int snd_seq_event_port_attach(int client, struct snd_seq_port_callback *pcbp, int cap, int type, int midi_channels, int midi_voices, char *portname) { struct snd_seq_port_info portinfo; int ret; /* Set up the port */ memset(&portinfo, 0, sizeof(portinfo)); portinfo.addr.client = client; strscpy(portinfo.name, portname ? portname : "Unnamed port", sizeof(portinfo.name)); portinfo.capability = cap; portinfo.type = type; portinfo.kernel = pcbp; portinfo.midi_channels = midi_channels; portinfo.midi_voices = midi_voices; /* Create it */ ret = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_CREATE_PORT, &portinfo); if (ret >= 0) ret = portinfo.addr.port; return ret; } EXPORT_SYMBOL(snd_seq_event_port_attach); /* * Detach the driver from a port. */ /* exported */ int snd_seq_event_port_detach(int client, int port) { struct snd_seq_port_info portinfo; int err; memset(&portinfo, 0, sizeof(portinfo)); portinfo.addr.client = client; portinfo.addr.port = port; err = snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_DELETE_PORT, &portinfo); return err; } EXPORT_SYMBOL(snd_seq_event_port_detach); |
173 172 173 173 173 168 61 169 169 169 4 99 99 99 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> #include <linux/sched.h> #include <linux/sched/task.h> #include <linux/sched/signal.h> #include "cgroup-internal.h" #include <trace/events/cgroup.h> /* * Propagate the cgroup frozen state upwards by the cgroup tree. */ static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { int desc = 1; /* * If the new state is frozen, some freezing ancestor cgroups may change * their state too, depending on if all their descendants are frozen. * * Otherwise, all ancestor cgroups are forced into the non-frozen state. */ while ((cgrp = cgroup_parent(cgrp))) { if (frozen) { cgrp->freezer.nr_frozen_descendants += desc; if (!test_bit(CGRP_FROZEN, &cgrp->flags) && test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_descendants == cgrp->nr_descendants) { set_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); desc++; } } else { cgrp->freezer.nr_frozen_descendants -= desc; if (test_bit(CGRP_FROZEN, &cgrp->flags)) { clear_bit(CGRP_FROZEN, &cgrp->flags); cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); desc++; } } } } /* * Revisit the cgroup frozen state. * Checks if the cgroup is really frozen and perform all state transitions. */ void cgroup_update_frozen(struct cgroup *cgrp) { bool frozen; lockdep_assert_held(&css_set_lock); /* * If the cgroup has to be frozen (CGRP_FREEZE bit set), * and all tasks are frozen and/or stopped, let's consider * the cgroup frozen. Otherwise it's not frozen. */ frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); if (frozen) { /* Already there? */ if (test_bit(CGRP_FROZEN, &cgrp->flags)) return; set_bit(CGRP_FROZEN, &cgrp->flags); } else { /* Already there? */ if (!test_bit(CGRP_FROZEN, &cgrp->flags)) return; clear_bit(CGRP_FROZEN, &cgrp->flags); } cgroup_file_notify(&cgrp->events_file); TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); /* Update the state of ancestor cgroups. */ cgroup_propagate_frozen(cgrp, frozen); } /* * Increment cgroup's nr_frozen_tasks. */ static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks++; } /* * Decrement cgroup's nr_frozen_tasks. */ static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { cgrp->freezer.nr_frozen_tasks--; WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } /* * Enter frozen/stopped state, if not yet there. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. */ void cgroup_enter_frozen(void) { struct cgroup *cgrp; if (current->frozen) return; spin_lock_irq(&css_set_lock); current->frozen = true; cgrp = task_dfl_cgroup(current); cgroup_inc_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Conditionally leave frozen/stopped state. Update cgroup's counters, * and revisit the state of the cgroup, if necessary. * * If always_leave is not set, and the cgroup is freezing, * we're racing with the cgroup freezing. In this case, we don't * drop the frozen counter to avoid a transient switch to * the unfrozen state. */ void cgroup_leave_frozen(bool always_leave) { struct cgroup *cgrp; spin_lock_irq(&css_set_lock); cgrp = task_dfl_cgroup(current); if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { cgroup_dec_frozen_cnt(cgrp); cgroup_update_frozen(cgrp); WARN_ON_ONCE(!current->frozen); current->frozen = false; } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { spin_lock(¤t->sighand->siglock); current->jobctl |= JOBCTL_TRAP_FREEZE; set_thread_flag(TIF_SIGPENDING); spin_unlock(¤t->sighand->siglock); } spin_unlock_irq(&css_set_lock); } /* * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE * jobctl bit. */ static void cgroup_freeze_task(struct task_struct *task, bool freeze) { unsigned long flags; /* If the task is about to die, don't bother with freezing it. */ if (!lock_task_sighand(task, &flags)) return; if (freeze) { task->jobctl |= JOBCTL_TRAP_FREEZE; signal_wake_up(task, false); } else { task->jobctl &= ~JOBCTL_TRAP_FREEZE; wake_up_process(task); } unlock_task_sighand(task, &flags); } /* * Freeze or unfreeze all tasks in the given cgroup. */ static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) { struct css_task_iter it; struct task_struct *task; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); if (freeze) set_bit(CGRP_FREEZE, &cgrp->flags); else clear_bit(CGRP_FREEZE, &cgrp->flags); spin_unlock_irq(&css_set_lock); if (freeze) TRACE_CGROUP_PATH(freeze, cgrp); else TRACE_CGROUP_PATH(unfreeze, cgrp); css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { /* * Ignore kernel threads here. Freezing cgroups containing * kthreads isn't supported. */ if (task->flags & PF_KTHREAD) continue; cgroup_freeze_task(task, freeze); } css_task_iter_end(&it); /* * Cgroup state should be revisited here to cover empty leaf cgroups * and cgroups which descendants are already in the desired state. */ spin_lock_irq(&css_set_lock); if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) cgroup_update_frozen(cgrp); spin_unlock_irq(&css_set_lock); } /* * Adjust the task state (freeze or unfreeze) and revisit the state of * source and destination cgroups. */ void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst) { lockdep_assert_held(&css_set_lock); /* * Kernel threads are not supposed to be frozen at all. */ if (task->flags & PF_KTHREAD) return; /* * It's not necessary to do changes if both of the src and dst cgroups * are not freezing and task is not frozen. */ if (!test_bit(CGRP_FREEZE, &src->flags) && !test_bit(CGRP_FREEZE, &dst->flags) && !task->frozen) return; /* * Adjust counters of freezing and frozen tasks. * Note, that if the task is frozen, but the destination cgroup is not * frozen, we bump both counters to keep them balanced. */ if (task->frozen) { cgroup_inc_frozen_cnt(dst); cgroup_dec_frozen_cnt(src); } cgroup_update_frozen(dst); cgroup_update_frozen(src); /* * Force the task to the desired state. */ cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } void cgroup_freeze(struct cgroup *cgrp, bool freeze) { struct cgroup_subsys_state *css; struct cgroup *dsct; bool applied = false; lockdep_assert_held(&cgroup_mutex); /* * Nothing changed? Just exit. */ if (cgrp->freezer.freeze == freeze) return; cgrp->freezer.freeze = freeze; /* * Propagate changes downwards the cgroup tree. */ css_for_each_descendant_pre(css, &cgrp->self) { dsct = css->cgroup; if (cgroup_is_dead(dsct)) continue; if (freeze) { dsct->freezer.e_freeze++; /* * Already frozen because of ancestor's settings? */ if (dsct->freezer.e_freeze > 1) continue; } else { dsct->freezer.e_freeze--; /* * Still frozen because of ancestor's settings? */ if (dsct->freezer.e_freeze > 0) continue; WARN_ON_ONCE(dsct->freezer.e_freeze < 0); } /* * Do change actual state: freeze or unfreeze. */ cgroup_do_freeze(dsct, freeze); applied = true; } /* * Even if the actual state hasn't changed, let's notify a user. * The state can be enforced by an ancestor cgroup: the cgroup * can already be in the desired state or it can be locked in the * opposite state, so that the transition will never happen. * In both cases it's better to notify a user, that there is * nothing to wait for. */ if (!applied) { TRACE_CGROUP_PATH(notify_frozen, cgrp, test_bit(CGRP_FROZEN, &cgrp->flags)); cgroup_file_notify(&cgrp->events_file); } } |
111 110 103 36 36 35 35 1 2 2 1 1 2 2 1 64 54 54 25 2 2 2 2 2 2 54 53 42 13 2 117 117 76 76 31 22 113 75 1 6 111 113 99 1 86 111 105 110 2 48 2 17 2 15 58 71 1 1 2 68 68 2 2 2 2 57 126 2 55 55 105 97 97 97 97 97 88 9 9 1 88 1 89 9 89 9 89 114 63 114 114 7 103 9 3 3 3 53 42 42 42 42 42 42 42 42 42 2 2 2 2 12 12 2 2 2 2 2 40 40 40 40 40 49 29 25 29 4 21 12 40 17 10 8 8 10 10 16 1 15 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 | // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/kernel.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" static const struct INDEX_NAMES { const __le16 *name; u8 name_len; } s_index_names[INDEX_MUTEX_TOTAL] = { { I30_NAME, ARRAY_SIZE(I30_NAME) }, { SII_NAME, ARRAY_SIZE(SII_NAME) }, { SDH_NAME, ARRAY_SIZE(SDH_NAME) }, { SO_NAME, ARRAY_SIZE(SO_NAME) }, { SQ_NAME, ARRAY_SIZE(SQ_NAME) }, { SR_NAME, ARRAY_SIZE(SR_NAME) }, }; /* * cmp_fnames - Compare two names in index. * * if l1 != 0 * Both names are little endian on-disk ATTR_FILE_NAME structs. * else * key1 - cpu_str, key2 - ATTR_FILE_NAME */ static int cmp_fnames(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const struct ATTR_FILE_NAME *f2 = key2; const struct ntfs_sb_info *sbi = data; const struct ATTR_FILE_NAME *f1; u16 fsize2; bool both_case; if (l2 <= offsetof(struct ATTR_FILE_NAME, name)) return -1; fsize2 = fname_full_size(f2); if (l2 < fsize2) return -1; both_case = f2->type != FILE_NAME_DOS && !sbi->options->nocase; if (!l1) { const struct le_str *s2 = (struct le_str *)&f2->name_len; /* * If names are equal (case insensitive) * try to compare it case sensitive. */ return ntfs_cmp_names_cpu(key1, s2, sbi->upcase, both_case); } f1 = key1; return ntfs_cmp_names(f1->name, f1->name_len, f2->name, f2->name_len, sbi->upcase, both_case); } /* * cmp_uint - $SII of $Secure and $Q of Quota */ static int cmp_uint(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const u32 *k1 = key1; const u32 *k2 = key2; if (l2 < sizeof(u32)) return -1; if (*k1 < *k2) return -1; if (*k1 > *k2) return 1; return 0; } /* * cmp_sdh - $SDH of $Secure */ static int cmp_sdh(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const struct SECURITY_KEY *k1 = key1; const struct SECURITY_KEY *k2 = key2; u32 t1, t2; if (l2 < sizeof(struct SECURITY_KEY)) return -1; t1 = le32_to_cpu(k1->hash); t2 = le32_to_cpu(k2->hash); /* First value is a hash value itself. */ if (t1 < t2) return -1; if (t1 > t2) return 1; /* Second value is security Id. */ if (data) { t1 = le32_to_cpu(k1->sec_id); t2 = le32_to_cpu(k2->sec_id); if (t1 < t2) return -1; if (t1 > t2) return 1; } return 0; } /* * cmp_uints - $O of ObjId and "$R" for Reparse. */ static int cmp_uints(const void *key1, size_t l1, const void *key2, size_t l2, const void *data) { const __le32 *k1 = key1; const __le32 *k2 = key2; size_t count; if ((size_t)data == 1) { /* * ni_delete_all -> ntfs_remove_reparse -> * delete all with this reference. * k1, k2 - pointers to REPARSE_KEY */ k1 += 1; // Skip REPARSE_KEY.ReparseTag k2 += 1; // Skip REPARSE_KEY.ReparseTag if (l2 <= sizeof(int)) return -1; l2 -= sizeof(int); if (l1 <= sizeof(int)) return 1; l1 -= sizeof(int); } if (l2 < sizeof(int)) return -1; for (count = min(l1, l2) >> 2; count > 0; --count, ++k1, ++k2) { u32 t1 = le32_to_cpu(*k1); u32 t2 = le32_to_cpu(*k2); if (t1 > t2) return 1; if (t1 < t2) return -1; } if (l1 > l2) return 1; if (l1 < l2) return -1; return 0; } static inline NTFS_CMP_FUNC get_cmp_func(const struct INDEX_ROOT *root) { switch (root->type) { case ATTR_NAME: if (root->rule == NTFS_COLLATION_TYPE_FILENAME) return &cmp_fnames; break; case ATTR_ZERO: switch (root->rule) { case NTFS_COLLATION_TYPE_UINT: return &cmp_uint; case NTFS_COLLATION_TYPE_SECURITY_HASH: return &cmp_sdh; case NTFS_COLLATION_TYPE_UINTS: return &cmp_uints; default: break; } break; default: break; } return NULL; } struct bmp_buf { struct ATTRIB *b; struct mft_inode *mi; struct buffer_head *bh; ulong *buf; size_t bit; u32 nbits; u64 new_valid; }; static int bmp_buf_get(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit, struct bmp_buf *bbuf) { struct ATTRIB *b; size_t data_size, valid_size, vbo, off = bit >> 3; struct ntfs_sb_info *sbi = ni->mi.sbi; CLST vcn = off >> sbi->cluster_bits; struct ATTR_LIST_ENTRY *le = NULL; struct buffer_head *bh; struct super_block *sb; u32 blocksize; const struct INDEX_NAMES *in = &s_index_names[indx->type]; bbuf->bh = NULL; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, &vcn, &bbuf->mi); bbuf->b = b; if (!b) return -EINVAL; if (!b->non_res) { data_size = le32_to_cpu(b->res.data_size); if (off >= data_size) return -EINVAL; bbuf->buf = (ulong *)resident_data(b); bbuf->bit = 0; bbuf->nbits = data_size * 8; return 0; } data_size = le64_to_cpu(b->nres.data_size); if (WARN_ON(off >= data_size)) { /* Looks like filesystem error. */ return -EINVAL; } valid_size = le64_to_cpu(b->nres.valid_size); bh = ntfs_bread_run(sbi, &indx->bitmap_run, off); if (!bh) return -EIO; if (IS_ERR(bh)) return PTR_ERR(bh); bbuf->bh = bh; if (buffer_locked(bh)) __wait_on_buffer(bh); lock_buffer(bh); sb = sbi->sb; blocksize = sb->s_blocksize; vbo = off & ~(size_t)sbi->block_mask; bbuf->new_valid = vbo + blocksize; if (bbuf->new_valid <= valid_size) bbuf->new_valid = 0; else if (bbuf->new_valid > data_size) bbuf->new_valid = data_size; if (vbo >= valid_size) { memset(bh->b_data, 0, blocksize); } else if (vbo + blocksize > valid_size) { u32 voff = valid_size & sbi->block_mask; memset(bh->b_data + voff, 0, blocksize - voff); } bbuf->buf = (ulong *)bh->b_data; bbuf->bit = 8 * (off & ~(size_t)sbi->block_mask); bbuf->nbits = 8 * blocksize; return 0; } static void bmp_buf_put(struct bmp_buf *bbuf, bool dirty) { struct buffer_head *bh = bbuf->bh; struct ATTRIB *b = bbuf->b; if (!bh) { if (b && !b->non_res && dirty) bbuf->mi->dirty = true; return; } if (!dirty) goto out; if (bbuf->new_valid) { b->nres.valid_size = cpu_to_le64(bbuf->new_valid); bbuf->mi->dirty = true; } set_buffer_uptodate(bh); mark_buffer_dirty(bh); out: unlock_buffer(bh); put_bh(bh); } /* * indx_mark_used - Mark the bit @bit as used. */ static int indx_mark_used(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit) { int err; struct bmp_buf bbuf; err = bmp_buf_get(indx, ni, bit, &bbuf); if (err) return err; __set_bit_le(bit - bbuf.bit, bbuf.buf); bmp_buf_put(&bbuf, true); return 0; } /* * indx_mark_free - Mark the bit @bit as free. */ static int indx_mark_free(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit) { int err; struct bmp_buf bbuf; err = bmp_buf_get(indx, ni, bit, &bbuf); if (err) return err; __clear_bit_le(bit - bbuf.bit, bbuf.buf); bmp_buf_put(&bbuf, true); return 0; } /* * scan_nres_bitmap * * If ntfs_readdir calls this function (indx_used_bit -> scan_nres_bitmap), * inode is shared locked and no ni_lock. * Use rw_semaphore for read/write access to bitmap_run. */ static int scan_nres_bitmap(struct ntfs_inode *ni, struct ATTRIB *bitmap, struct ntfs_index *indx, size_t from, bool (*fn)(const ulong *buf, u32 bit, u32 bits, size_t *ret), size_t *ret) { struct ntfs_sb_info *sbi = ni->mi.sbi; struct super_block *sb = sbi->sb; struct runs_tree *run = &indx->bitmap_run; struct rw_semaphore *lock = &indx->run_lock; u32 nbits = sb->s_blocksize * 8; u32 blocksize = sb->s_blocksize; u64 valid_size = le64_to_cpu(bitmap->nres.valid_size); u64 data_size = le64_to_cpu(bitmap->nres.data_size); sector_t eblock = bytes_to_block(sb, data_size); size_t vbo = from >> 3; sector_t blk = (vbo & sbi->cluster_mask) >> sb->s_blocksize_bits; sector_t vblock = vbo >> sb->s_blocksize_bits; sector_t blen, block; CLST lcn, clen, vcn, vcn_next; size_t idx; struct buffer_head *bh; bool ok; *ret = MINUS_ONE_T; if (vblock >= eblock) return 0; from &= nbits - 1; vcn = vbo >> sbi->cluster_bits; down_read(lock); ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); up_read(lock); next_run: if (!ok) { int err; const struct INDEX_NAMES *name = &s_index_names[indx->type]; down_write(lock); err = attr_load_runs_vcn(ni, ATTR_BITMAP, name->name, name->name_len, run, vcn); up_write(lock); if (err) return err; down_read(lock); ok = run_lookup_entry(run, vcn, &lcn, &clen, &idx); up_read(lock); if (!ok) return -EINVAL; } blen = (sector_t)clen * sbi->blocks_per_cluster; block = (sector_t)lcn * sbi->blocks_per_cluster; for (; blk < blen; blk++, from = 0) { bh = ntfs_bread(sb, block + blk); if (!bh) return -EIO; vbo = (u64)vblock << sb->s_blocksize_bits; if (vbo >= valid_size) { memset(bh->b_data, 0, blocksize); } else if (vbo + blocksize > valid_size) { u32 voff = valid_size & sbi->block_mask; memset(bh->b_data + voff, 0, blocksize - voff); } if (vbo + blocksize > data_size) nbits = 8 * (data_size - vbo); ok = nbits > from ? (*fn)((ulong *)bh->b_data, from, nbits, ret) : false; put_bh(bh); if (ok) { *ret += 8 * vbo; return 0; } if (++vblock >= eblock) { *ret = MINUS_ONE_T; return 0; } } blk = 0; vcn_next = vcn + clen; down_read(lock); ok = run_get_entry(run, ++idx, &vcn, &lcn, &clen) && vcn == vcn_next; if (!ok) vcn = vcn_next; up_read(lock); goto next_run; } static bool scan_for_free(const ulong *buf, u32 bit, u32 bits, size_t *ret) { size_t pos = find_next_zero_bit_le(buf, bits, bit); if (pos >= bits) return false; *ret = pos; return true; } /* * indx_find_free - Look for free bit. * * Return: -1 if no free bits. */ static int indx_find_free(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit, struct ATTRIB **bitmap) { struct ATTRIB *b; struct ATTR_LIST_ENTRY *le = NULL; const struct INDEX_NAMES *in = &s_index_names[indx->type]; int err; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, NULL, NULL); if (!b) return -ENOENT; *bitmap = b; *bit = MINUS_ONE_T; if (!b->non_res) { u32 nbits = 8 * le32_to_cpu(b->res.data_size); size_t pos = find_next_zero_bit_le(resident_data(b), nbits, 0); if (pos < nbits) *bit = pos; } else { err = scan_nres_bitmap(ni, b, indx, 0, &scan_for_free, bit); if (err) return err; } return 0; } static bool scan_for_used(const ulong *buf, u32 bit, u32 bits, size_t *ret) { size_t pos = find_next_bit_le(buf, bits, bit); if (pos >= bits) return false; *ret = pos; return true; } /* * indx_used_bit - Look for used bit. * * Return: MINUS_ONE_T if no used bits. */ int indx_used_bit(struct ntfs_index *indx, struct ntfs_inode *ni, size_t *bit) { struct ATTRIB *b; struct ATTR_LIST_ENTRY *le = NULL; size_t from = *bit; const struct INDEX_NAMES *in = &s_index_names[indx->type]; int err; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, NULL, NULL); if (!b) return -ENOENT; *bit = MINUS_ONE_T; if (!b->non_res) { u32 nbits = le32_to_cpu(b->res.data_size) * 8; size_t pos = find_next_bit_le(resident_data(b), nbits, from); if (pos < nbits) *bit = pos; } else { err = scan_nres_bitmap(ni, b, indx, from, &scan_for_used, bit); if (err) return err; } return 0; } /* * hdr_find_split * * Find a point at which the index allocation buffer would like to be split. * NOTE: This function should never return 'END' entry NULL returns on error. */ static const struct NTFS_DE *hdr_find_split(const struct INDEX_HDR *hdr) { size_t o; const struct NTFS_DE *e = hdr_first_de(hdr); u32 used_2 = le32_to_cpu(hdr->used) >> 1; u16 esize; if (!e || de_is_last(e)) return NULL; esize = le16_to_cpu(e->size); for (o = le32_to_cpu(hdr->de_off) + esize; o < used_2; o += esize) { const struct NTFS_DE *p = e; e = Add2Ptr(hdr, o); /* We must not return END entry. */ if (de_is_last(e)) return p; esize = le16_to_cpu(e->size); } return e; } /* * hdr_insert_head - Insert some entries at the beginning of the buffer. * * It is used to insert entries into a newly-created buffer. */ static const struct NTFS_DE *hdr_insert_head(struct INDEX_HDR *hdr, const void *ins, u32 ins_bytes) { u32 to_move; struct NTFS_DE *e = hdr_first_de(hdr); u32 used = le32_to_cpu(hdr->used); if (!e) return NULL; /* Now we just make room for the inserted entries and jam it in. */ to_move = used - le32_to_cpu(hdr->de_off); memmove(Add2Ptr(e, ins_bytes), e, to_move); memcpy(e, ins, ins_bytes); hdr->used = cpu_to_le32(used + ins_bytes); return e; } /* * index_hdr_check * * return true if INDEX_HDR is valid */ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) { u32 end = le32_to_cpu(hdr->used); u32 tot = le32_to_cpu(hdr->total); u32 off = le32_to_cpu(hdr->de_off); if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || off + sizeof(struct NTFS_DE) > end) { /* incorrect index buffer. */ return false; } return true; } /* * index_buf_check * * return true if INDEX_BUFFER seems is valid */ static bool index_buf_check(const struct INDEX_BUFFER *ib, u32 bytes, const CLST *vbn) { const struct NTFS_RECORD_HEADER *rhdr = &ib->rhdr; u16 fo = le16_to_cpu(rhdr->fix_off); u16 fn = le16_to_cpu(rhdr->fix_num); if (bytes <= offsetof(struct INDEX_BUFFER, ihdr) || rhdr->sign != NTFS_INDX_SIGNATURE || fo < sizeof(struct INDEX_BUFFER) /* Check index buffer vbn. */ || (vbn && *vbn != le64_to_cpu(ib->vbn)) || (fo % sizeof(short)) || fo + fn * sizeof(short) >= bytes || fn != ((bytes >> SECTOR_SHIFT) + 1)) { /* incorrect index buffer. */ return false; } return index_hdr_check(&ib->ihdr, bytes - offsetof(struct INDEX_BUFFER, ihdr)); } void fnd_clear(struct ntfs_fnd *fnd) { int i; for (i = fnd->level - 1; i >= 0; i--) { struct indx_node *n = fnd->nodes[i]; if (!n) continue; put_indx_node(n); fnd->nodes[i] = NULL; } fnd->level = 0; fnd->root_de = NULL; } static int fnd_push(struct ntfs_fnd *fnd, struct indx_node *n, struct NTFS_DE *e) { int i = fnd->level; if (i < 0 || i >= ARRAY_SIZE(fnd->nodes)) return -EINVAL; fnd->nodes[i] = n; fnd->de[i] = e; fnd->level += 1; return 0; } static struct indx_node *fnd_pop(struct ntfs_fnd *fnd) { struct indx_node *n; int i = fnd->level; i -= 1; n = fnd->nodes[i]; fnd->nodes[i] = NULL; fnd->level = i; return n; } static bool fnd_is_empty(struct ntfs_fnd *fnd) { if (!fnd->level) return !fnd->root_de; return !fnd->de[fnd->level - 1]; } /* * hdr_find_e - Locate an entry the index buffer. * * If no matching entry is found, it returns the first entry which is greater * than the desired entry If the search key is greater than all the entries the * buffer, it returns the 'end' entry. This function does a binary search of the * current index buffer, for the first entry that is <= to the search value. * * Return: NULL if error. */ static struct NTFS_DE *hdr_find_e(const struct ntfs_index *indx, const struct INDEX_HDR *hdr, const void *key, size_t key_len, const void *ctx, int *diff) { struct NTFS_DE *e, *found = NULL; NTFS_CMP_FUNC cmp = indx->cmp; int min_idx = 0, mid_idx, max_idx = 0; int diff2; int table_size = 8; u32 e_size, e_key_len; u32 end = le32_to_cpu(hdr->used); u32 off = le32_to_cpu(hdr->de_off); u32 total = le32_to_cpu(hdr->total); u16 offs[128]; if (unlikely(!cmp)) return NULL; fill_table: if (end > total) return NULL; if (off + sizeof(struct NTFS_DE) > end) return NULL; e = Add2Ptr(hdr, off); e_size = le16_to_cpu(e->size); if (e_size < sizeof(struct NTFS_DE) || off + e_size > end) return NULL; if (!de_is_last(e)) { offs[max_idx] = off; off += e_size; max_idx++; if (max_idx < table_size) goto fill_table; max_idx--; } binary_search: e_key_len = le16_to_cpu(e->key_size); diff2 = (*cmp)(key, key_len, e + 1, e_key_len, ctx); if (diff2 > 0) { if (found) { min_idx = mid_idx + 1; } else { if (de_is_last(e)) return NULL; max_idx = 0; table_size = min(table_size * 2, (int)ARRAY_SIZE(offs)); goto fill_table; } } else if (diff2 < 0) { if (found) max_idx = mid_idx - 1; else max_idx--; found = e; } else { *diff = 0; return e; } if (min_idx > max_idx) { *diff = -1; return found; } mid_idx = (min_idx + max_idx) >> 1; e = Add2Ptr(hdr, offs[mid_idx]); goto binary_search; } /* * hdr_insert_de - Insert an index entry into the buffer. * * 'before' should be a pointer previously returned from hdr_find_e. */ static struct NTFS_DE *hdr_insert_de(const struct ntfs_index *indx, struct INDEX_HDR *hdr, const struct NTFS_DE *de, struct NTFS_DE *before, const void *ctx) { int diff; size_t off = PtrOffset(hdr, before); u32 used = le32_to_cpu(hdr->used); u32 total = le32_to_cpu(hdr->total); u16 de_size = le16_to_cpu(de->size); /* First, check to see if there's enough room. */ if (used + de_size > total) return NULL; /* We know there's enough space, so we know we'll succeed. */ if (before) { /* Check that before is inside Index. */ if (off >= used || off < le32_to_cpu(hdr->de_off) || off + le16_to_cpu(before->size) > total) { return NULL; } goto ok; } /* No insert point is applied. Get it manually. */ before = hdr_find_e(indx, hdr, de + 1, le16_to_cpu(de->key_size), ctx, &diff); if (!before) return NULL; off = PtrOffset(hdr, before); ok: /* Now we just make room for the entry and jam it in. */ memmove(Add2Ptr(before, de_size), before, used - off); hdr->used = cpu_to_le32(used + de_size); memcpy(before, de, de_size); return before; } /* * hdr_delete_de - Remove an entry from the index buffer. */ static inline struct NTFS_DE *hdr_delete_de(struct INDEX_HDR *hdr, struct NTFS_DE *re) { u32 used = le32_to_cpu(hdr->used); u16 esize = le16_to_cpu(re->size); u32 off = PtrOffset(hdr, re); int bytes = used - (off + esize); /* check INDEX_HDR valid before using INDEX_HDR */ if (!check_index_header(hdr, le32_to_cpu(hdr->total))) return NULL; if (off >= used || esize < sizeof(struct NTFS_DE) || bytes < sizeof(struct NTFS_DE)) return NULL; hdr->used = cpu_to_le32(used - esize); memmove(re, Add2Ptr(re, esize), bytes); return re; } void indx_clear(struct ntfs_index *indx) { run_close(&indx->alloc_run); run_close(&indx->bitmap_run); } int indx_init(struct ntfs_index *indx, struct ntfs_sb_info *sbi, const struct ATTRIB *attr, enum index_mutex_classed type) { u32 t32; const struct INDEX_ROOT *root = resident_data(attr); t32 = le32_to_cpu(attr->res.data_size); if (t32 <= offsetof(struct INDEX_ROOT, ihdr) || !index_hdr_check(&root->ihdr, t32 - offsetof(struct INDEX_ROOT, ihdr))) { goto out; } /* Check root fields. */ if (!root->index_block_clst) goto out; indx->type = type; indx->idx2vbn_bits = __ffs(root->index_block_clst); t32 = le32_to_cpu(root->index_block_size); indx->index_bits = blksize_bits(t32); /* Check index record size. */ if (t32 < sbi->cluster_size) { /* Index record is smaller than a cluster, use 512 blocks. */ if (t32 != root->index_block_clst * SECTOR_SIZE) goto out; /* Check alignment to a cluster. */ if ((sbi->cluster_size >> SECTOR_SHIFT) & (root->index_block_clst - 1)) { goto out; } indx->vbn2vbo_bits = SECTOR_SHIFT; } else { /* Index record must be a multiple of cluster size. */ if (t32 != root->index_block_clst << sbi->cluster_bits) goto out; indx->vbn2vbo_bits = sbi->cluster_bits; } init_rwsem(&indx->run_lock); indx->cmp = get_cmp_func(root); if (!indx->cmp) goto out; return 0; out: ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); return -EINVAL; } static struct indx_node *indx_new(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, const __le64 *sub_vbn) { int err; struct NTFS_DE *e; struct indx_node *r; struct INDEX_HDR *hdr; struct INDEX_BUFFER *index; u64 vbo = (u64)vbn << indx->vbn2vbo_bits; u32 bytes = 1u << indx->index_bits; u16 fn; u32 eo; r = kzalloc(sizeof(struct indx_node), GFP_NOFS); if (!r) return ERR_PTR(-ENOMEM); index = kzalloc(bytes, GFP_NOFS); if (!index) { kfree(r); return ERR_PTR(-ENOMEM); } err = ntfs_get_bh(ni->mi.sbi, &indx->alloc_run, vbo, bytes, &r->nb); if (err) { kfree(index); kfree(r); return ERR_PTR(err); } /* Create header. */ index->rhdr.sign = NTFS_INDX_SIGNATURE; index->rhdr.fix_off = cpu_to_le16(sizeof(struct INDEX_BUFFER)); // 0x28 fn = (bytes >> SECTOR_SHIFT) + 1; // 9 index->rhdr.fix_num = cpu_to_le16(fn); index->vbn = cpu_to_le64(vbn); hdr = &index->ihdr; eo = ALIGN(sizeof(struct INDEX_BUFFER) + fn * sizeof(short), 8); hdr->de_off = cpu_to_le32(eo); e = Add2Ptr(hdr, eo); if (sub_vbn) { e->flags = NTFS_IE_LAST | NTFS_IE_HAS_SUBNODES; e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE) + sizeof(u64)); de_set_vbn_le(e, *sub_vbn); hdr->flags = 1; } else { e->size = cpu_to_le16(sizeof(struct NTFS_DE)); hdr->used = cpu_to_le32(eo + sizeof(struct NTFS_DE)); e->flags = NTFS_IE_LAST; } hdr->total = cpu_to_le32(bytes - offsetof(struct INDEX_BUFFER, ihdr)); r->index = index; return r; } struct INDEX_ROOT *indx_get_root(struct ntfs_index *indx, struct ntfs_inode *ni, struct ATTRIB **attr, struct mft_inode **mi) { struct ATTR_LIST_ENTRY *le = NULL; struct ATTRIB *a; const struct INDEX_NAMES *in = &s_index_names[indx->type]; struct INDEX_ROOT *root; a = ni_find_attr(ni, NULL, &le, ATTR_ROOT, in->name, in->name_len, NULL, mi); if (!a) return NULL; if (attr) *attr = a; root = resident_data_ex(a, sizeof(struct INDEX_ROOT)); /* length check */ if (root && offsetof(struct INDEX_ROOT, ihdr) + le32_to_cpu(root->ihdr.used) > le32_to_cpu(a->res.data_size)) { return NULL; } return root; } static int indx_write(struct ntfs_index *indx, struct ntfs_inode *ni, struct indx_node *node, int sync) { struct INDEX_BUFFER *ib = node->index; return ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &node->nb, sync); } /* * indx_read * * If ntfs_readdir calls this function * inode is shared locked and no ni_lock. * Use rw_semaphore for read/write access to alloc_run. */ int indx_read(struct ntfs_index *indx, struct ntfs_inode *ni, CLST vbn, struct indx_node **node) { int err; struct INDEX_BUFFER *ib; struct runs_tree *run = &indx->alloc_run; struct rw_semaphore *lock = &indx->run_lock; u64 vbo = (u64)vbn << indx->vbn2vbo_bits; u32 bytes = 1u << indx->index_bits; struct indx_node *in = *node; const struct INDEX_NAMES *name; if (!in) { in = kzalloc(sizeof(struct indx_node), GFP_NOFS); if (!in) return -ENOMEM; } else { nb_put(&in->nb); } ib = in->index; if (!ib) { ib = kmalloc(bytes, GFP_NOFS); if (!ib) { err = -ENOMEM; goto out; } } down_read(lock); err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); up_read(lock); if (!err) goto ok; if (err == -E_NTFS_FIXUP) goto ok; if (err != -ENOENT) goto out; name = &s_index_names[indx->type]; down_write(lock); err = attr_load_runs_range(ni, ATTR_ALLOC, name->name, name->name_len, run, vbo, vbo + bytes); up_write(lock); if (err) goto out; down_read(lock); err = ntfs_read_bh(ni->mi.sbi, run, vbo, &ib->rhdr, bytes, &in->nb); up_read(lock); if (err == -E_NTFS_FIXUP) goto ok; if (err) goto out; ok: if (!index_buf_check(ib, bytes, &vbn)) { ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); err = -EINVAL; goto out; } if (err == -E_NTFS_FIXUP) { ntfs_write_bh(ni->mi.sbi, &ib->rhdr, &in->nb, 0); err = 0; } /* check for index header length */ if (offsetof(struct INDEX_BUFFER, ihdr) + le32_to_cpu(ib->ihdr.used) > bytes) { err = -EINVAL; goto out; } in->index = ib; *node = in; out: if (err == -E_NTFS_CORRUPT) { ntfs_inode_err(&ni->vfs_inode, "directory corrupted"); ntfs_set_state(ni->mi.sbi, NTFS_DIRTY_ERROR); err = -EINVAL; } if (ib != in->index) kfree(ib); if (*node != in) { nb_put(&in->nb); kfree(in); } return err; } /* * indx_find - Scan NTFS directory for given entry. */ int indx_find(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, const void *key, size_t key_len, const void *ctx, int *diff, struct NTFS_DE **entry, struct ntfs_fnd *fnd) { int err; struct NTFS_DE *e; struct indx_node *node; if (!root) root = indx_get_root(&ni->dir, ni, NULL, NULL); if (!root) { /* Should not happen. */ return -EINVAL; } /* Check cache. */ e = fnd->level ? fnd->de[fnd->level - 1] : fnd->root_de; if (e && !de_is_last(e) && !(*indx->cmp)(key, key_len, e + 1, le16_to_cpu(e->key_size), ctx)) { *entry = e; *diff = 0; return 0; } /* Soft finder reset. */ fnd_clear(fnd); /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &root->ihdr, key, key_len, ctx, diff); if (!e) return -EINVAL; fnd->root_de = e; for (;;) { node = NULL; if (*diff >= 0 || !de_has_vcn_ex(e)) break; /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &node); if (err) { /* io error? */ return err; } /* Lookup entry that is <= to the search value. */ e = hdr_find_e(indx, &node->index->ihdr, key, key_len, ctx, diff); if (!e) { put_indx_node(node); return -EINVAL; } fnd_push(fnd, node, e); } *entry = e; return 0; } int indx_find_sort(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, struct ntfs_fnd *fnd) { int err; struct indx_node *n = NULL; struct NTFS_DE *e; size_t iter = 0; int level = fnd->level; if (!*entry) { /* Start find. */ e = hdr_first_de(&root->ihdr); if (!e) return 0; fnd_clear(fnd); fnd->root_de = e; } else if (!level) { if (de_is_last(fnd->root_de)) { *entry = NULL; return 0; } e = hdr_next_de(&root->ihdr, fnd->root_de); if (!e) return -EINVAL; fnd->root_de = e; } else { n = fnd->nodes[level - 1]; e = fnd->de[level - 1]; if (de_is_last(e)) goto pop_level; e = hdr_next_de(&n->index->ihdr, e); if (!e) return -EINVAL; fnd->de[level - 1] = e; } /* Just to avoid tree cycle. */ next_iter: if (iter++ >= 1000) return -EINVAL; while (de_has_vcn_ex(e)) { if (le16_to_cpu(e->size) < sizeof(struct NTFS_DE) + sizeof(u64)) { if (n) { fnd_pop(fnd); kfree(n); } return -EINVAL; } /* Read next level. */ err = indx_read(indx, ni, de_get_vbn(e), &n); if (err) return err; /* Try next level. */ e = hdr_first_de(&n->index->ihdr); if (!e) { kfree(n); return -EINVAL; } fnd_push(fnd, n, e); } if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { *entry = e; return 0; } pop_level: for (;;) { if (!de_is_last(e)) goto next_iter; /* Pop one level. */ if (n) { fnd_pop(fnd); kfree(n); } level = fnd->level; if (level) { n = fnd->nodes[level - 1]; e = fnd->de[level - 1]; } else if (fnd->root_de) { n = NULL; e = fnd->root_de; fnd->root_de = NULL; } else { *entry = NULL; return 0; } if (le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) { *entry = e; if (!fnd->root_de) fnd->root_de = e; return 0; } } } int indx_find_raw(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, struct NTFS_DE **entry, size_t *off, struct ntfs_fnd *fnd) { int err; struct indx_node *n = NULL; struct NTFS_DE *e = NULL; struct NTFS_DE *e2; size_t bit; CLST next_used_vbn; CLST next_vbn; u32 record_size = ni->mi.sbi->record_size; /* Use non sorted algorithm. */ if (!*entry) { /* This is the first call. */ e = hdr_first_de(&root->ihdr); if (!e) return 0; fnd_clear(fnd); fnd->root_de = e; /* The first call with setup of initial element. */ if (*off >= record_size) { next_vbn = (((*off - record_size) >> indx->index_bits)) << indx->idx2vbn_bits; /* Jump inside cycle 'for'. */ goto next; } /* Start enumeration from root. */ *off = 0; } else if (!fnd->root_de) return -EINVAL; for (;;) { /* Check if current entry can be used. */ if (e && le16_to_cpu(e->size) > sizeof(struct NTFS_DE)) goto ok; if (!fnd->level) { /* Continue to enumerate root. */ if (!de_is_last(fnd->root_de)) { e = hdr_next_de(&root->ihdr, fnd->root_de); if (!e) return -EINVAL; fnd->root_de = e; continue; } /* Start to enumerate indexes from 0. */ next_vbn = 0; } else { /* Continue to enumerate indexes. */ e2 = fnd->de[fnd->level - 1]; n = fnd->nodes[fnd->level - 1]; if (!de_is_last(e2)) { e = hdr_next_de(&n->index->ihdr, e2); if (!e) return -EINVAL; fnd->de[fnd->level - 1] = e; continue; } /* Continue with next index. */ next_vbn = le64_to_cpu(n->index->vbn) + root->index_block_clst; } next: /* Release current index. */ if (n) { fnd_pop(fnd); put_indx_node(n); n = NULL; } /* Skip all free indexes. */ bit = next_vbn >> indx->idx2vbn_bits; err = indx_used_bit(indx, ni, &bit); if (err == -ENOENT || bit == MINUS_ONE_T) { /* No used indexes. */ *entry = NULL; return 0; } next_used_vbn = bit << indx->idx2vbn_bits; /* Read buffer into memory. */ err = indx_read(indx, ni, next_used_vbn, &n); if (err) return err; e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); if (!e) return -EINVAL; } ok: /* Return offset to restore enumerator if necessary. */ if (!n) { /* 'e' points in root, */ *off = PtrOffset(&root->ihdr, e); } else { /* 'e' points in index, */ *off = (le64_to_cpu(n->index->vbn) << indx->vbn2vbo_bits) + record_size + PtrOffset(&n->index->ihdr, e); } *entry = e; return 0; } /* * indx_create_allocate - Create "Allocation + Bitmap" attributes. */ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { int err; struct ntfs_sb_info *sbi = ni->mi.sbi; struct ATTRIB *bitmap; struct ATTRIB *alloc; u32 data_size = 1u << indx->index_bits; u32 alloc_size = ntfs_up_cluster(sbi, data_size); CLST len = alloc_size >> sbi->cluster_bits; const struct INDEX_NAMES *in = &s_index_names[indx->type]; CLST alen; struct runs_tree run; run_init(&run); err = attr_allocate_clusters(sbi, &run, 0, 0, len, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out; err = ni_insert_nonresident(ni, ATTR_ALLOC, in->name, in->name_len, &run, 0, len, 0, &alloc, NULL, NULL); if (err) goto out1; alloc->nres.valid_size = alloc->nres.data_size = cpu_to_le64(data_size); err = ni_insert_resident(ni, bitmap_size(1), ATTR_BITMAP, in->name, in->name_len, &bitmap, NULL, NULL); if (err) goto out2; if (in->name == I30_NAME) { ni->vfs_inode.i_size = data_size; inode_set_bytes(&ni->vfs_inode, alloc_size); } memcpy(&indx->alloc_run, &run, sizeof(run)); *vbn = 0; return 0; out2: mi_remove_attr(NULL, &ni->mi, alloc); out1: run_deallocate(sbi, &run, false); out: return err; } /* * indx_add_allocate - Add clusters to index. */ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, CLST *vbn) { int err; size_t bit; u64 data_size; u64 bmp_size, bmp_size_v; struct ATTRIB *bmp, *alloc; struct mft_inode *mi; const struct INDEX_NAMES *in = &s_index_names[indx->type]; err = indx_find_free(indx, ni, &bit, &bmp); if (err) goto out1; if (bit != MINUS_ONE_T) { bmp = NULL; } else { if (bmp->non_res) { bmp_size = le64_to_cpu(bmp->nres.data_size); bmp_size_v = le64_to_cpu(bmp->nres.valid_size); } else { bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); } bit = bmp_size << 3; } data_size = (u64)(bit + 1) << indx->index_bits; if (bmp) { /* Increase bitmap. */ err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, bitmap_size(bit + 1), NULL, true, NULL); if (err) goto out1; } alloc = ni_find_attr(ni, NULL, NULL, ATTR_ALLOC, in->name, in->name_len, NULL, &mi); if (!alloc) { err = -EINVAL; if (bmp) goto out2; goto out1; } /* Increase allocation. */ err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, data_size, &data_size, true, NULL); if (err) { if (bmp) goto out2; goto out1; } if (in->name == I30_NAME) ni->vfs_inode.i_size = data_size; *vbn = bit << indx->idx2vbn_bits; return 0; out2: /* Ops. No space? */ attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, bmp_size, &bmp_size_v, false, NULL); out1: return err; } /* * indx_insert_into_root - Attempt to insert an entry into the index root. * * @undo - True if we undoing previous remove. * If necessary, it will twiddle the index b-tree. */ static int indx_insert_into_root(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *new_de, struct NTFS_DE *root_de, const void *ctx, struct ntfs_fnd *fnd, bool undo) { int err = 0; struct NTFS_DE *e, *e0, *re; struct mft_inode *mi; struct ATTRIB *attr; struct INDEX_HDR *hdr; struct indx_node *n; CLST new_vbn; __le64 *sub_vbn, t_vbn; u16 new_de_size; u32 hdr_used, hdr_total, asize, to_move; u32 root_size, new_root_size; struct ntfs_sb_info *sbi; int ds_root; struct INDEX_ROOT *root, *a_root; /* Get the record this root placed in. */ root = indx_get_root(indx, ni, &attr, &mi); if (!root) return -EINVAL; /* * Try easy case: * hdr_insert_de will succeed if there's * room the root for the new entry. */ hdr = &root->ihdr; sbi = ni->mi.sbi; new_de_size = le16_to_cpu(new_de->size); hdr_used = le32_to_cpu(hdr->used); hdr_total = le32_to_cpu(hdr->total); asize = le32_to_cpu(attr->size); root_size = le32_to_cpu(attr->res.data_size); ds_root = new_de_size + hdr_used - hdr_total; /* If 'undo' is set then reduce requirements. */ if ((undo || asize + ds_root < sbi->max_bytes_per_attr) && mi_resize_attr(mi, attr, ds_root)) { hdr->total = cpu_to_le32(hdr_total + ds_root); e = hdr_insert_de(indx, hdr, new_de, root_de, ctx); WARN_ON(!e); fnd_clear(fnd); fnd->root_de = e; return 0; } /* Make a copy of root attribute to restore if error. */ a_root = kmemdup(attr, asize, GFP_NOFS); if (!a_root) return -ENOMEM; /* * Copy all the non-end entries from * the index root to the new buffer. */ to_move = 0; e0 = hdr_first_de(hdr); /* Calculate the size to copy. */ for (e = e0;; e = hdr_next_de(hdr, e)) { if (!e) { err = -EINVAL; goto out_free_root; } if (de_is_last(e)) break; to_move += le16_to_cpu(e->size); } if (!to_move) { re = NULL; } else { re = kmemdup(e0, to_move, GFP_NOFS); if (!re) { err = -ENOMEM; goto out_free_root; } } sub_vbn = NULL; if (de_has_vcn(e)) { t_vbn = de_get_vbn_le(e); sub_vbn = &t_vbn; } new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE) + sizeof(u64); ds_root = new_root_size - root_size; if (ds_root > 0 && asize + ds_root > sbi->max_bytes_per_attr) { /* Make root external. */ err = -EOPNOTSUPP; goto out_free_re; } if (ds_root) mi_resize_attr(mi, attr, ds_root); /* Fill first entry (vcn will be set later). */ e = (struct NTFS_DE *)(root + 1); memset(e, 0, sizeof(struct NTFS_DE)); e->size = cpu_to_le16(sizeof(struct NTFS_DE) + sizeof(u64)); e->flags = NTFS_IE_HAS_SUBNODES | NTFS_IE_LAST; hdr->flags = 1; hdr->used = hdr->total = cpu_to_le32(new_root_size - offsetof(struct INDEX_ROOT, ihdr)); fnd->root_de = hdr_first_de(hdr); mi->dirty = true; /* Create alloc and bitmap attributes (if not). */ err = run_is_empty(&indx->alloc_run) ? indx_create_allocate(indx, ni, &new_vbn) : indx_add_allocate(indx, ni, &new_vbn); /* Layout of record may be changed, so rescan root. */ root = indx_get_root(indx, ni, &attr, &mi); if (!root) { /* Bug? */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); err = -EINVAL; goto out_free_re; } if (err) { /* Restore root. */ if (mi_resize_attr(mi, attr, -ds_root)) { memcpy(attr, a_root, asize); } else { /* Bug? */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } goto out_free_re; } e = (struct NTFS_DE *)(root + 1); *(__le64 *)(e + 1) = cpu_to_le64(new_vbn); mi->dirty = true; /* Now we can create/format the new buffer and copy the entries into. */ n = indx_new(indx, ni, new_vbn, sub_vbn); if (IS_ERR(n)) { err = PTR_ERR(n); goto out_free_re; } hdr = &n->index->ihdr; hdr_used = le32_to_cpu(hdr->used); hdr_total = le32_to_cpu(hdr->total); /* Copy root entries into new buffer. */ hdr_insert_head(hdr, re, to_move); /* Update bitmap attribute. */ indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); /* Check if we can insert new entry new index buffer. */ if (hdr_used + new_de_size > hdr_total) { /* * This occurs if MFT record is the same or bigger than index * buffer. Move all root new index and have no space to add * new entry classic case when MFT record is 1K and index * buffer 4K the problem should not occurs. */ kfree(re); indx_write(indx, ni, n, 0); put_indx_node(n); fnd_clear(fnd); err = indx_insert_entry(indx, ni, new_de, ctx, fnd, undo); goto out_free_root; } /* * Now root is a parent for new index buffer. * Insert NewEntry a new buffer. */ e = hdr_insert_de(indx, hdr, new_de, NULL, ctx); if (!e) { err = -EINVAL; goto out_put_n; } fnd_push(fnd, n, e); /* Just write updates index into disk. */ indx_write(indx, ni, n, 0); n = NULL; out_put_n: put_indx_node(n); out_free_re: kfree(re); out_free_root: kfree(a_root); return err; } /* * indx_insert_into_buffer * * Attempt to insert an entry into an Index Allocation Buffer. * If necessary, it will split the buffer. */ static int indx_insert_into_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, struct INDEX_ROOT *root, const struct NTFS_DE *new_de, const void *ctx, int level, struct ntfs_fnd *fnd) { int err; const struct NTFS_DE *sp; struct NTFS_DE *e, *de_t, *up_e; struct indx_node *n2; struct indx_node *n1 = fnd->nodes[level]; struct INDEX_HDR *hdr1 = &n1->index->ihdr; struct INDEX_HDR *hdr2; u32 to_copy, used, used1; CLST new_vbn; __le64 t_vbn, *sub_vbn; u16 sp_size; void *hdr1_saved = NULL; /* Try the most easy case. */ e = fnd->level - 1 == level ? fnd->de[level] : NULL; e = hdr_insert_de(indx, hdr1, new_de, e, ctx); fnd->de[level] = e; if (e) { /* Just write updated index into disk. */ indx_write(indx, ni, n1, 0); return 0; } /* * No space to insert into buffer. Split it. * To split we: * - Save split point ('cause index buffers will be changed) * - Allocate NewBuffer and copy all entries <= sp into new buffer * - Remove all entries (sp including) from TargetBuffer * - Insert NewEntry into left or right buffer (depending on sp <=> * NewEntry) * - Insert sp into parent buffer (or root) * - Make sp a parent for new buffer */ sp = hdr_find_split(hdr1); if (!sp) return -EINVAL; sp_size = le16_to_cpu(sp->size); up_e = kmalloc(sp_size + sizeof(u64), GFP_NOFS); if (!up_e) return -ENOMEM; memcpy(up_e, sp, sp_size); used1 = le32_to_cpu(hdr1->used); hdr1_saved = kmemdup(hdr1, used1, GFP_NOFS); if (!hdr1_saved) { err = -ENOMEM; goto out; } if (!hdr1->flags) { up_e->flags |= NTFS_IE_HAS_SUBNODES; up_e->size = cpu_to_le16(sp_size + sizeof(u64)); sub_vbn = NULL; } else { t_vbn = de_get_vbn_le(up_e); sub_vbn = &t_vbn; } /* Allocate on disk a new index allocation buffer. */ err = indx_add_allocate(indx, ni, &new_vbn); if (err) goto out; /* Allocate and format memory a new index buffer. */ n2 = indx_new(indx, ni, new_vbn, sub_vbn); if (IS_ERR(n2)) { err = PTR_ERR(n2); goto out; } hdr2 = &n2->index->ihdr; /* Make sp a parent for new buffer. */ de_set_vbn(up_e, new_vbn); /* Copy all the entries <= sp into the new buffer. */ de_t = hdr_first_de(hdr1); to_copy = PtrOffset(de_t, sp); hdr_insert_head(hdr2, de_t, to_copy); /* Remove all entries (sp including) from hdr1. */ used = used1 - to_copy - sp_size; memmove(de_t, Add2Ptr(sp, sp_size), used - le32_to_cpu(hdr1->de_off)); hdr1->used = cpu_to_le32(used); /* * Insert new entry into left or right buffer * (depending on sp <=> new_de). */ hdr_insert_de(indx, (*indx->cmp)(new_de + 1, le16_to_cpu(new_de->key_size), up_e + 1, le16_to_cpu(up_e->key_size), ctx) < 0 ? hdr2 : hdr1, new_de, NULL, ctx); indx_mark_used(indx, ni, new_vbn >> indx->idx2vbn_bits); indx_write(indx, ni, n1, 0); indx_write(indx, ni, n2, 0); put_indx_node(n2); /* * We've finished splitting everybody, so we are ready to * insert the promoted entry into the parent. */ if (!level) { /* Insert in root. */ err = indx_insert_into_root(indx, ni, up_e, NULL, ctx, fnd, 0); } else { /* * The target buffer's parent is another index buffer. * TODO: Remove recursion. */ err = indx_insert_into_buffer(indx, ni, root, up_e, ctx, level - 1, fnd); } if (err) { /* * Undo critical operations. */ indx_mark_free(indx, ni, new_vbn >> indx->idx2vbn_bits); memcpy(hdr1, hdr1_saved, used1); indx_write(indx, ni, n1, 0); } out: kfree(up_e); kfree(hdr1_saved); return err; } /* * indx_insert_entry - Insert new entry into index. * * @undo - True if we undoing previous remove. */ int indx_insert_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *new_de, const void *ctx, struct ntfs_fnd *fnd, bool undo) { int err; int diff; struct NTFS_DE *e; struct ntfs_fnd *fnd_a = NULL; struct INDEX_ROOT *root; if (!fnd) { fnd_a = fnd_get(); if (!fnd_a) { err = -ENOMEM; goto out1; } fnd = fnd_a; } root = indx_get_root(indx, ni, NULL, NULL); if (!root) { err = -EINVAL; goto out; } if (fnd_is_empty(fnd)) { /* * Find the spot the tree where we want to * insert the new entry. */ err = indx_find(indx, ni, root, new_de + 1, le16_to_cpu(new_de->key_size), ctx, &diff, &e, fnd); if (err) goto out; if (!diff) { err = -EEXIST; goto out; } } if (!fnd->level) { /* * The root is also a leaf, so we'll insert the * new entry into it. */ err = indx_insert_into_root(indx, ni, new_de, fnd->root_de, ctx, fnd, undo); } else { /* * Found a leaf buffer, so we'll insert the new entry into it. */ err = indx_insert_into_buffer(indx, ni, root, new_de, ctx, fnd->level - 1, fnd); } out: fnd_put(fnd_a); out1: return err; } /* * indx_find_buffer - Locate a buffer from the tree. */ static struct indx_node *indx_find_buffer(struct ntfs_index *indx, struct ntfs_inode *ni, const struct INDEX_ROOT *root, __le64 vbn, struct indx_node *n) { int err; const struct NTFS_DE *e; struct indx_node *r; const struct INDEX_HDR *hdr = n ? &n->index->ihdr : &root->ihdr; /* Step 1: Scan one level. */ for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { if (!e) return ERR_PTR(-EINVAL); if (de_has_vcn(e) && vbn == de_get_vbn_le(e)) return n; if (de_is_last(e)) break; } /* Step2: Do recursion. */ e = Add2Ptr(hdr, le32_to_cpu(hdr->de_off)); for (;;) { if (de_has_vcn_ex(e)) { err = indx_read(indx, ni, de_get_vbn(e), &n); if (err) return ERR_PTR(err); r = indx_find_buffer(indx, ni, root, vbn, n); if (r) return r; } if (de_is_last(e)) break; e = Add2Ptr(e, le16_to_cpu(e->size)); } return NULL; } /* * indx_shrink - Deallocate unused tail indexes. */ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni, size_t bit) { int err = 0; u64 bpb, new_data; size_t nbits; struct ATTRIB *b; struct ATTR_LIST_ENTRY *le = NULL; const struct INDEX_NAMES *in = &s_index_names[indx->type]; b = ni_find_attr(ni, NULL, &le, ATTR_BITMAP, in->name, in->name_len, NULL, NULL); if (!b) return -ENOENT; if (!b->non_res) { unsigned long pos; const unsigned long *bm = resident_data(b); nbits = (size_t)le32_to_cpu(b->res.data_size) * 8; if (bit >= nbits) return 0; pos = find_next_bit_le(bm, nbits, bit); if (pos < nbits) return 0; } else { size_t used = MINUS_ONE_T; nbits = le64_to_cpu(b->nres.data_size) * 8; if (bit >= nbits) return 0; err = scan_nres_bitmap(ni, b, indx, bit, &scan_for_used, &used); if (err) return err; if (used != MINUS_ONE_T) return 0; } new_data = (u64)bit << indx->index_bits; err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, new_data, &new_data, false, NULL); if (err) return err; if (in->name == I30_NAME) ni->vfs_inode.i_size = new_data; bpb = bitmap_size(bit); if (bpb * 8 == nbits) return 0; err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, bpb, &bpb, false, NULL); return err; } static int indx_free_children(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *e, bool trim) { int err; struct indx_node *n = NULL; struct INDEX_HDR *hdr; CLST vbn = de_get_vbn(e); size_t i; err = indx_read(indx, ni, vbn, &n); if (err) return err; hdr = &n->index->ihdr; /* First, recurse into the children, if any. */ if (hdr_has_subnode(hdr)) { for (e = hdr_first_de(hdr); e; e = hdr_next_de(hdr, e)) { indx_free_children(indx, ni, e, false); if (de_is_last(e)) break; } } put_indx_node(n); i = vbn >> indx->idx2vbn_bits; /* * We've gotten rid of the children; add this buffer to the free list. */ indx_mark_free(indx, ni, i); if (!trim) return 0; /* * If there are no used indexes after current free index * then we can truncate allocation and bitmap. * Use bitmap to estimate the case. */ indx_shrink(indx, ni, i + 1); return 0; } /* * indx_get_entry_to_replace * * Find a replacement entry for a deleted entry. * Always returns a node entry: * NTFS_IE_HAS_SUBNODES is set the flags and the size includes the sub_vcn. */ static int indx_get_entry_to_replace(struct ntfs_index *indx, struct ntfs_inode *ni, const struct NTFS_DE *de_next, struct NTFS_DE **de_to_replace, struct ntfs_fnd *fnd) { int err; int level = -1; CLST vbn; struct NTFS_DE *e, *te, *re; struct indx_node *n; struct INDEX_BUFFER *ib; *de_to_replace = NULL; /* Find first leaf entry down from de_next. */ vbn = de_get_vbn(de_next); for (;;) { n = NULL; err = indx_read(indx, ni, vbn, &n); if (err) goto out; e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); if (!de_is_last(e)) { /* * This buffer is non-empty, so its first entry * could be used as the replacement entry. */ level = fnd->level - 1; } if (!de_has_vcn(e)) break; /* This buffer is a node. Continue to go down. */ vbn = de_get_vbn(e); } if (level == -1) goto out; n = fnd->nodes[level]; te = hdr_first_de(&n->index->ihdr); /* Copy the candidate entry into the replacement entry buffer. */ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); if (!re) { err = -ENOMEM; goto out; } *de_to_replace = re; memcpy(re, te, le16_to_cpu(te->size)); if (!de_has_vcn(re)) { /* * The replacement entry we found doesn't have a sub_vcn. * increase its size to hold one. */ le16_add_cpu(&re->size, sizeof(u64)); re->flags |= NTFS_IE_HAS_SUBNODES; } else { /* * The replacement entry we found was a node entry, which * means that all its child buffers are empty. Return them * to the free pool. */ indx_free_children(indx, ni, te, true); } /* * Expunge the replacement entry from its former location, * and then write that buffer. */ ib = n->index; e = hdr_delete_de(&ib->ihdr, te); fnd->de[level] = e; indx_write(indx, ni, n, 0); if (ib_is_leaf(ib) && ib_is_empty(ib)) { /* An empty leaf. */ return 0; } out: fnd_clear(fnd); return err; } /* * indx_delete_entry - Delete an entry from the index. */ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni, const void *key, u32 key_len, const void *ctx) { int err, diff; struct INDEX_ROOT *root; struct INDEX_HDR *hdr; struct ntfs_fnd *fnd, *fnd2; struct INDEX_BUFFER *ib; struct NTFS_DE *e, *re, *next, *prev, *me; struct indx_node *n, *n2d = NULL; __le64 sub_vbn; int level, level2; struct ATTRIB *attr; struct mft_inode *mi; u32 e_size, root_size, new_root_size; size_t trim_bit; const struct INDEX_NAMES *in; fnd = fnd_get(); if (!fnd) { err = -ENOMEM; goto out2; } fnd2 = fnd_get(); if (!fnd2) { err = -ENOMEM; goto out1; } root = indx_get_root(indx, ni, &attr, &mi); if (!root) { err = -EINVAL; goto out; } /* Locate the entry to remove. */ err = indx_find(indx, ni, root, key, key_len, ctx, &diff, &e, fnd); if (err) goto out; if (!e || diff) { err = -ENOENT; goto out; } level = fnd->level; if (level) { n = fnd->nodes[level - 1]; e = fnd->de[level - 1]; ib = n->index; hdr = &ib->ihdr; } else { hdr = &root->ihdr; e = fnd->root_de; n = NULL; } e_size = le16_to_cpu(e->size); if (!de_has_vcn_ex(e)) { /* The entry to delete is a leaf, so we can just rip it out. */ hdr_delete_de(hdr, e); if (!level) { hdr->total = hdr->used; /* Shrink resident root attribute. */ mi_resize_attr(mi, attr, 0 - e_size); goto out; } indx_write(indx, ni, n, 0); /* * Check to see if removing that entry made * the leaf empty. */ if (ib_is_leaf(ib) && ib_is_empty(ib)) { fnd_pop(fnd); fnd_push(fnd2, n, e); } } else { /* * The entry we wish to delete is a node buffer, so we * have to find a replacement for it. */ next = de_get_next(e); err = indx_get_entry_to_replace(indx, ni, next, &re, fnd2); if (err) goto out; if (re) { de_set_vbn_le(re, de_get_vbn_le(e)); hdr_delete_de(hdr, e); err = level ? indx_insert_into_buffer(indx, ni, root, re, ctx, fnd->level - 1, fnd) : indx_insert_into_root(indx, ni, re, e, ctx, fnd, 0); kfree(re); if (err) goto out; } else { /* * There is no replacement for the current entry. * This means that the subtree rooted at its node * is empty, and can be deleted, which turn means * that the node can just inherit the deleted * entry sub_vcn. */ indx_free_children(indx, ni, next, true); de_set_vbn_le(next, de_get_vbn_le(e)); hdr_delete_de(hdr, e); if (level) { indx_write(indx, ni, n, 0); } else { hdr->total = hdr->used; /* Shrink resident root attribute. */ mi_resize_attr(mi, attr, 0 - e_size); } } } /* Delete a branch of tree. */ if (!fnd2 || !fnd2->level) goto out; /* Reinit root 'cause it can be changed. */ root = indx_get_root(indx, ni, &attr, &mi); if (!root) { err = -EINVAL; goto out; } n2d = NULL; sub_vbn = fnd2->nodes[0]->index->vbn; level2 = 0; level = fnd->level; hdr = level ? &fnd->nodes[level - 1]->index->ihdr : &root->ihdr; /* Scan current level. */ for (e = hdr_first_de(hdr);; e = hdr_next_de(hdr, e)) { if (!e) { err = -EINVAL; goto out; } if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) break; if (de_is_last(e)) { e = NULL; break; } } if (!e) { /* Do slow search from root. */ struct indx_node *in; fnd_clear(fnd); in = indx_find_buffer(indx, ni, root, sub_vbn, NULL); if (IS_ERR(in)) { err = PTR_ERR(in); goto out; } if (in) fnd_push(fnd, in, NULL); } /* Merge fnd2 -> fnd. */ for (level = 0; level < fnd2->level; level++) { fnd_push(fnd, fnd2->nodes[level], fnd2->de[level]); fnd2->nodes[level] = NULL; } fnd2->level = 0; hdr = NULL; for (level = fnd->level; level; level--) { struct indx_node *in = fnd->nodes[level - 1]; ib = in->index; if (ib_is_empty(ib)) { sub_vbn = ib->vbn; } else { hdr = &ib->ihdr; n2d = in; level2 = level; break; } } if (!hdr) hdr = &root->ihdr; e = hdr_first_de(hdr); if (!e) { err = -EINVAL; goto out; } if (hdr != &root->ihdr || !de_is_last(e)) { prev = NULL; while (!de_is_last(e)) { if (de_has_vcn(e) && sub_vbn == de_get_vbn_le(e)) break; prev = e; e = hdr_next_de(hdr, e); if (!e) { err = -EINVAL; goto out; } } if (sub_vbn != de_get_vbn_le(e)) { /* * Didn't find the parent entry, although this buffer * is the parent trail. Something is corrupt. */ err = -EINVAL; goto out; } if (de_is_last(e)) { /* * Since we can't remove the end entry, we'll remove * its predecessor instead. This means we have to * transfer the predecessor's sub_vcn to the end entry. * Note: This index block is not empty, so the * predecessor must exist. */ if (!prev) { err = -EINVAL; goto out; } if (de_has_vcn(prev)) { de_set_vbn_le(e, de_get_vbn_le(prev)); } else if (de_has_vcn(e)) { le16_sub_cpu(&e->size, sizeof(u64)); e->flags &= ~NTFS_IE_HAS_SUBNODES; le32_sub_cpu(&hdr->used, sizeof(u64)); } e = prev; } /* * Copy the current entry into a temporary buffer (stripping * off its down-pointer, if any) and delete it from the current * buffer or root, as appropriate. */ e_size = le16_to_cpu(e->size); me = kmemdup(e, e_size, GFP_NOFS); if (!me) { err = -ENOMEM; goto out; } if (de_has_vcn(me)) { me->flags &= ~NTFS_IE_HAS_SUBNODES; le16_sub_cpu(&me->size, sizeof(u64)); } hdr_delete_de(hdr, e); if (hdr == &root->ihdr) { level = 0; hdr->total = hdr->used; /* Shrink resident root attribute. */ mi_resize_attr(mi, attr, 0 - e_size); } else { indx_write(indx, ni, n2d, 0); level = level2; } /* Mark unused buffers as free. */ trim_bit = -1; for (; level < fnd->level; level++) { ib = fnd->nodes[level]->index; if (ib_is_empty(ib)) { size_t k = le64_to_cpu(ib->vbn) >> indx->idx2vbn_bits; indx_mark_free(indx, ni, k); if (k < trim_bit) trim_bit = k; } } fnd_clear(fnd); /*fnd->root_de = NULL;*/ /* * Re-insert the entry into the tree. * Find the spot the tree where we want to insert the new entry. */ err = indx_insert_entry(indx, ni, me, ctx, fnd, 0); kfree(me); if (err) goto out; if (trim_bit != -1) indx_shrink(indx, ni, trim_bit); } else { /* * This tree needs to be collapsed down to an empty root. * Recreate the index root as an empty leaf and free all * the bits the index allocation bitmap. */ fnd_clear(fnd); fnd_clear(fnd2); in = &s_index_names[indx->type]; err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len, &indx->alloc_run, 0, NULL, false, NULL); if (in->name == I30_NAME) ni->vfs_inode.i_size = 0; err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len, false, NULL); run_close(&indx->alloc_run); err = attr_set_size(ni, ATTR_BITMAP, in->name, in->name_len, &indx->bitmap_run, 0, NULL, false, NULL); err = ni_remove_attr(ni, ATTR_BITMAP, in->name, in->name_len, false, NULL); run_close(&indx->bitmap_run); root = indx_get_root(indx, ni, &attr, &mi); if (!root) { err = -EINVAL; goto out; } root_size = le32_to_cpu(attr->res.data_size); new_root_size = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE); if (new_root_size != root_size && !mi_resize_attr(mi, attr, new_root_size - root_size)) { err = -EINVAL; goto out; } /* Fill first entry. */ e = (struct NTFS_DE *)(root + 1); e->ref.low = 0; e->ref.high = 0; e->ref.seq = 0; e->size = cpu_to_le16(sizeof(struct NTFS_DE)); e->flags = NTFS_IE_LAST; // 0x02 e->key_size = 0; e->res = 0; hdr = &root->ihdr; hdr->flags = 0; hdr->used = hdr->total = cpu_to_le32( new_root_size - offsetof(struct INDEX_ROOT, ihdr)); mi->dirty = true; } out: fnd_put(fnd2); out1: fnd_put(fnd); out2: return err; } /* * Update duplicated information in directory entry * 'dup' - info from MFT record */ int indx_update_dup(struct ntfs_inode *ni, struct ntfs_sb_info *sbi, const struct ATTR_FILE_NAME *fname, const struct NTFS_DUP_INFO *dup, int sync) { int err, diff; struct NTFS_DE *e = NULL; struct ATTR_FILE_NAME *e_fname; struct ntfs_fnd *fnd; struct INDEX_ROOT *root; struct mft_inode *mi; struct ntfs_index *indx = &ni->dir; fnd = fnd_get(); if (!fnd) return -ENOMEM; root = indx_get_root(indx, ni, NULL, &mi); if (!root) { err = -EINVAL; goto out; } /* Find entry in directory. */ err = indx_find(indx, ni, root, fname, fname_full_size(fname), sbi, &diff, &e, fnd); if (err) goto out; if (!e) { err = -EINVAL; goto out; } if (diff) { err = -EINVAL; goto out; } e_fname = (struct ATTR_FILE_NAME *)(e + 1); if (!memcmp(&e_fname->dup, dup, sizeof(*dup))) { /* * Nothing to update in index! Try to avoid this call. */ goto out; } memcpy(&e_fname->dup, dup, sizeof(*dup)); if (fnd->level) { /* Directory entry in index. */ err = indx_write(indx, ni, fnd->nodes[fnd->level - 1], sync); } else { /* Directory entry in directory MFT record. */ mi->dirty = true; if (sync) err = mi_write(mi, 1); else mark_inode_dirty(&ni->vfs_inode); } out: fnd_put(fnd); return err; } |
198 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 | // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) /* Copyright (C) 2019 Netronome Systems, Inc. */ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <net/snmp.h> #include <net/tls.h> #include "tls.h" #ifdef CONFIG_PROC_FS static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsCurrTxSw", LINUX_MIB_TLSCURRTXSW), SNMP_MIB_ITEM("TlsCurrRxSw", LINUX_MIB_TLSCURRRXSW), SNMP_MIB_ITEM("TlsCurrTxDevice", LINUX_MIB_TLSCURRTXDEVICE), SNMP_MIB_ITEM("TlsCurrRxDevice", LINUX_MIB_TLSCURRRXDEVICE), SNMP_MIB_ITEM("TlsTxSw", LINUX_MIB_TLSTXSW), SNMP_MIB_ITEM("TlsRxSw", LINUX_MIB_TLSRXSW), SNMP_MIB_ITEM("TlsTxDevice", LINUX_MIB_TLSTXDEVICE), SNMP_MIB_ITEM("TlsRxDevice", LINUX_MIB_TLSRXDEVICE), SNMP_MIB_ITEM("TlsDecryptError", LINUX_MIB_TLSDECRYPTERROR), SNMP_MIB_ITEM("TlsRxDeviceResync", LINUX_MIB_TLSRXDEVICERESYNC), SNMP_MIB_ITEM("TlsDecryptRetry", LINUX_MIB_TLSDECRYPTRETRY), SNMP_MIB_ITEM("TlsRxNoPadViolation", LINUX_MIB_TLSRXNOPADVIOL), SNMP_MIB_SENTINEL }; static int tls_statistics_seq_show(struct seq_file *seq, void *v) { unsigned long buf[LINUX_MIB_TLSMAX] = {}; struct net *net = seq->private; int i; snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); for (i = 0; tls_mib_list[i].name; i++) seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); return 0; } #endif int __net_init tls_proc_init(struct net *net) { #ifdef CONFIG_PROC_FS if (!proc_create_net_single("tls_stat", 0444, net->proc_net, tls_statistics_seq_show, NULL)) return -ENOMEM; #endif /* CONFIG_PROC_FS */ return 0; } void __net_exit tls_proc_fini(struct net *net) { remove_proc_entry("tls_stat", net->proc_net); } |
17802 6154 78 78 78 12 78 848 33 739 735 74 30 2 24 18 736 1 98 89 9 1 109 1763 12912 697 35 32 31 3 921 152 1604 915 915 346 534 40 1156 1155 590 15 94 156 529 14397 69 13521 12796 768 12797 1653 9 28 12 4 24 14 2 11 2 9 15 5 1 8 1 1 2 7 23 1 11 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 | // SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * * (C) 2002-2003 Nadia Yvette Chambers, IBM * (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2002-2004 Ingo Molnar, Red Hat * * pid-structures are backing objects for tasks sharing a given ID to chain * against. There is very little to them aside from hashing them and * parking tasks using given ID's on a list. * * The hash is always changed with the tasklist_lock write-acquired, * and the hash is only accessed with the tasklist_lock at least * read-acquired, so there's no additional SMP locking needed here. * * We have a list of bitmap pages, which bitmaps represent the PID space. * Allocating and freeing PIDs is completely lockless. The worst-case * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). * * Pid namespaces: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/anon_inodes.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, { .first = NULL }, }, .level = 0, .numbers = { { .nr = 0, .ns = &init_pid_ns, }, } }; int pid_max = PID_MAX_DEFAULT; #define RESERVED_PIDS 300 int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif #if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) .memfd_noexec_scope = MEMFD_NOEXEC_SCOPE_EXEC, #endif }; EXPORT_SYMBOL_GPL(init_pid_ns); /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). * * If we don't disable interrupts there is a nasty deadlock between * detach_pid()->free_pid() and another cpu that does * spin_lock(&pidmap_lock) followed by an interrupt routine that does * read_lock(&tasklist_lock); * * After we clean up the tasklist_lock and know there are no * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); void put_pid(struct pid *pid) { struct pid_namespace *ns; if (!pid) return; ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } } EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { struct pid *pid = container_of(rhp, struct pid, rcu); put_pid(pid); } void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace * is the reaper wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ wake_up_process(ns->child_reaper); break; case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; break; } idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); } struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; /* * set_tid_size contains the size of the set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace * up to set_tid_size PID namespaces. It does not have to set the PID * for a process in all nested PID namespaces but set_tid_size must * never be greater than the current ns->level + 1. */ if (set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { int tid = 0; if (set_tid_size) { tid = set_tid[ns->level - i]; retval = -EINVAL; if (tid < 1 || tid >= pid_max) goto out_free; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. */ if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* * If ENOSPC is returned it means that the PID is * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) nr = -EEXIST; } else { int pid_min = 1; /* * init really needs pid 1, but after reaching the * maximum wrap back to RESERVED_PIDS */ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) pid_min = RESERVED_PIDS; /* * Store a null pointer so find_pid_ns does not find * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } spin_unlock_irq(&pidmap_lock); idr_preload_end(); if (nr < 0) { retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; } /* * ENOMEM is not the most obvious choice especially for the case * where the child subreaper has already exited and the pid * namespace denies the creation of any new processes. But ENOMEM * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. */ retval = -ENOMEM; get_pid_ns(ns); refcount_set(&pid->count, 1); spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); put_pid_ns(ns); out_free: spin_lock_irq(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); } /* On failure to allocate the first pid, reset the state */ if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : &task->signal->pids[type]; } /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { struct pid *pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; free_pid(pid); } void detach_pid(struct task_struct *task, enum pid_type type) { __change_pid(task, type, NULL); } void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); attach_pid(task, type); } void exchange_tids(struct task_struct *left, struct task_struct *right) { struct pid *pid1 = left->thread_pid; struct pid *pid2 = right->thread_pid; struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); /* Swap the per task_struct pid */ rcu_assign_pointer(left->thread_pid, pid2); rcu_assign_pointer(right->thread_pid, pid1); /* Swap the cached value */ WRITE_ONCE(left->pid, pid_nr(pid2)); WRITE_ONCE(right->pid, pid_nr(pid1)); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { if (type == PIDTYPE_PID) new->thread_pid = old->thread_pid; hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct task_struct *find_get_task_by_vpid(pid_t nr) { struct task_struct *task; rcu_read_lock(); task = find_task_by_vpid(nr); if (task) get_task_struct(task); rcu_read_unlock(); return task; } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); result = pid_task(pid, type); if (result) get_task_struct(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { struct pid *pid; rcu_read_lock(); pid = get_pid(find_vpid(nr)); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; pid_t nr = 0; if (pid && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; } return nr; } EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns) { pid_t nr = 0; rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; } EXPORT_SYMBOL(__task_pid_nr_ns); struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); } EXPORT_SYMBOL_GPL(task_active_pid_ns); /* * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } EXPORT_SYMBOL_GPL(find_ge_pid); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { struct fd f; struct pid *pid; f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); pid = pidfd_pid(f.file); if (!IS_ERR(pid)) { get_pid(pid); *flags = f.file->f_flags; } fdput(f); return pid; } /** * pidfd_get_task() - Get the task associated with a pidfd * * @pidfd: pidfd for which to get the task * @flags: flags associated with this pidfd * * Return the task associated with @pidfd. The function takes a reference on * the returned task. The caller is responsible for releasing that reference. * * Currently, the process identified by @pidfd is always a thread-group leader. * This restriction currently exists for all aspects of pidfds including pidfd * creation (CLONE_PIDFD cannot be used with CLONE_THREAD) and pidfd polling * (only supports thread group leaders). * * Return: On success, the task_struct associated with the pidfd. * On error, a negative errno number will be returned. */ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) { unsigned int f_flags; struct pid *pid; struct task_struct *task; pid = pidfd_get_pid(pidfd, &f_flags); if (IS_ERR(pid)) return ERR_CAST(pid); task = get_pid_task(pid, PIDTYPE_TGID); put_pid(pid); if (!task) return ERR_PTR(-ESRCH); *flags = f_flags; return task; } /** * pidfd_create() - Create a new pid file descriptor. * * @pid: struct pid that the pidfd will reference * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * * This symbol should not be explicitly exported to loadable modules. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ int pidfd_create(struct pid *pid, unsigned int flags) { int pidfd; struct file *pidfd_file; pidfd = pidfd_prepare(pid, flags, &pidfd_file); if (pidfd < 0) return pidfd; fd_install(pidfd, pidfd_file); return pidfd; } /** * sys_pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for * the process identified by @pid. Currently, the process identified by * @pid must be a thread-group leader. This restriction currently exists * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot * be used with CLONE_THREAD) and pidfd polling (only supports thread group * leaders). * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd; struct pid *p; if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; fd = pidfd_create(p, flags); put_pid(p); return fd; } void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = kmem_cache_create("pid", struct_size_t(struct pid, numbers, 1), __alignof__(struct pid), SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); } static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; int ret; ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) file = fget_task(task, fd); else file = ERR_PTR(-EPERM); up_read(&task->signal->exec_update_lock); return file ?: ERR_PTR(-EBADF); } static int pidfd_getfd(struct pid *pid, int fd) { struct task_struct *task; struct file *file; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; file = __pidfd_fget(task, fd); put_task_struct(task); if (IS_ERR(file)) return PTR_ERR(file); ret = receive_fd(file, O_CLOEXEC); fput(file); return ret; } /** * sys_pidfd_getfd() - Get a file descriptor from another process * * @pidfd: the pidfd file descriptor of the process * @fd: the file descriptor number to get * @flags: flags on how to get the fd (reserved) * * This syscall gets a copy of a file descriptor from another process * based on the pidfd, and file descriptor number. It requires that * the calling process has the ability to ptrace the process represented * by the pidfd. The process which is having its file descriptor copied * is otherwise unaffected. * * Return: On success, a cloexec file descriptor is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; struct fd f; int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; f = fdget(pidfd); if (!f.file) return -EBADF; pid = pidfd_pid(f.file); if (IS_ERR(pid)) ret = PTR_ERR(pid); else ret = pidfd_getfd(pid, fd); fdput(f); return ret; } |
74 2 30 2 15 52 1 5 9 2 2 2 2 2 1 2 5 177 188 3 186 30 1 1 41 10 38 1 37 78 70 3 10 42 5 2 80 1 53 27 281 11 4 20 6 5 7 7 5 13 19 2 21 7 1 6 62 5 57 5 58 4 58 69 1 1 73 64 14 1 73 63 120 1 93 32 2 117 122 2 97 32 8 8 18 1 17 22 17 1 16 68 59 67 55 3 58 1 54 21 14 19 254 2 1 290 281 3 2 1 279 35 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 | // SPDX-License-Identifier: GPL-2.0+ /* * 2002-10-15 Posix Clocks & timers * by George Anzinger george@mvista.com * Copyright (C) 2002 2003 by MontaVista Software. * * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. * Copyright (C) 2004 Boris Hu * * These are all the functions necessary to implement POSIX clocks & timers */ #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/mutex.h> #include <linux/sched/task.h> #include <linux/uaccess.h> #include <linux/list.h> #include <linux/init.h> #include <linux/compiler.h> #include <linux/hash.h> #include <linux/posix-clock.h> #include <linux/posix-timers.h> #include <linux/syscalls.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <linux/export.h> #include <linux/hashtable.h> #include <linux/compat.h> #include <linux/nospec.h> #include <linux/time_namespace.h> #include "timekeeping.h" #include "posix-timers.h" static struct kmem_cache *posix_timers_cache; /* * Timers are managed in a hash table for lockless lookup. The hash key is * constructed from current::signal and the timer ID and the timer is * matched against current::signal and the timer ID when walking the hash * bucket list. * * This allows checkpoint/restore to reconstruct the exact timer IDs for * a process. */ static DEFINE_HASHTABLE(posix_timers_hashtable, 9); static DEFINE_SPINLOCK(hash_lock); static const struct k_clock * const posix_clocks[]; static const struct k_clock *clockid_to_kclock(const clockid_t id); static const struct k_clock clock_realtime, clock_monotonic; /* SIGEV_THREAD_ID cannot share a bit with the other SIGEV values. */ #if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) #error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" #endif static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); #define lock_timer(tid, flags) \ ({ struct k_itimer *__timr; \ __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ __timr; \ }) static int hash(struct signal_struct *sig, unsigned int nr) { return hash_32(hash32_ptr(sig) ^ nr, HASH_BITS(posix_timers_hashtable)); } static struct k_itimer *__posix_timers_find(struct hlist_head *head, struct signal_struct *sig, timer_t id) { struct k_itimer *timer; hlist_for_each_entry_rcu(timer, head, t_hash, lockdep_is_held(&hash_lock)) { /* timer->it_signal can be set concurrently */ if ((READ_ONCE(timer->it_signal) == sig) && (timer->it_id == id)) return timer; } return NULL; } static struct k_itimer *posix_timer_by_id(timer_t id) { struct signal_struct *sig = current->signal; struct hlist_head *head = &posix_timers_hashtable[hash(sig, id)]; return __posix_timers_find(head, sig, id); } static int posix_timer_add(struct k_itimer *timer) { struct signal_struct *sig = current->signal; struct hlist_head *head; unsigned int cnt, id; /* * FIXME: Replace this by a per signal struct xarray once there is * a plan to handle the resulting CRIU regression gracefully. */ for (cnt = 0; cnt <= INT_MAX; cnt++) { spin_lock(&hash_lock); id = sig->next_posix_timer_id; /* Write the next ID back. Clamp it to the positive space */ sig->next_posix_timer_id = (id + 1) & INT_MAX; head = &posix_timers_hashtable[hash(sig, id)]; if (!__posix_timers_find(head, sig, id)) { hlist_add_head_rcu(&timer->t_hash, head); spin_unlock(&hash_lock); return id; } spin_unlock(&hash_lock); } /* POSIX return code when no timer ID could be allocated */ return -EAGAIN; } static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) { spin_unlock_irqrestore(&timr->it_lock, flags); } static int posix_get_realtime_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_real_ts64(tp); return 0; } static ktime_t posix_get_realtime_ktime(clockid_t which_clock) { return ktime_get_real(); } static int posix_clock_realtime_set(const clockid_t which_clock, const struct timespec64 *tp) { return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, struct __kernel_timex *t) { return do_adjtimex(t); } static int posix_get_monotonic_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_ts64(tp); timens_add_monotonic(tp); return 0; } static ktime_t posix_get_monotonic_ktime(clockid_t which_clock) { return ktime_get(); } static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { ktime_get_raw_ts64(tp); timens_add_monotonic(tp); return 0; } static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_real_ts64(tp); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, struct timespec64 *tp) { ktime_get_coarse_ts64(tp); timens_add_monotonic(tp); return 0; } static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } static int posix_get_boottime_timespec(const clockid_t which_clock, struct timespec64 *tp) { ktime_get_boottime_ts64(tp); timens_add_boottime(tp); return 0; } static ktime_t posix_get_boottime_ktime(const clockid_t which_clock) { return ktime_get_boottime(); } static int posix_get_tai_timespec(clockid_t which_clock, struct timespec64 *tp) { ktime_get_clocktai_ts64(tp); return 0; } static ktime_t posix_get_tai_ktime(clockid_t which_clock) { return ktime_get_clocktai(); } static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; return 0; } static __init int init_posix_timers(void) { posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof(struct k_itimer), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL); return 0; } __initcall(init_posix_timers); /* * The siginfo si_overrun field and the return value of timer_getoverrun(2) * are of type int. Clamp the overrun value to INT_MAX */ static inline int timer_overrun_to_int(struct k_itimer *timr, int baseval) { s64 sum = timr->it_overrun_last + (s64)baseval; return sum > (s64)INT_MAX ? INT_MAX : (int)sum; } static void common_hrtimer_rearm(struct k_itimer *timr) { struct hrtimer *timer = &timr->it.real.timer; timr->it_overrun += hrtimer_forward(timer, timer->base->get_time(), timr->it_interval); hrtimer_restart(timer); } /* * This function is called from the signal delivery code if * info->si_sys_private is not zero, which indicates that the timer has to * be rearmed. Restart the timer and update info::si_overrun. */ void posixtimer_rearm(struct kernel_siginfo *info) { struct k_itimer *timr; unsigned long flags; timr = lock_timer(info->si_tid, &flags); if (!timr) return; if (timr->it_interval && timr->it_requeue_pending == info->si_sys_private) { timr->kclock->timer_rearm(timr); timr->it_active = 1; timr->it_overrun_last = timr->it_overrun; timr->it_overrun = -1LL; ++timr->it_requeue_pending; info->si_overrun = timer_overrun_to_int(timr, info->si_overrun); } unlock_timer(timr, flags); } int posix_timer_event(struct k_itimer *timr, int si_private) { enum pid_type type; int ret; /* * FIXME: if ->sigq is queued we can race with * dequeue_signal()->posixtimer_rearm(). * * If dequeue_signal() sees the "right" value of * si_sys_private it calls posixtimer_rearm(). * We re-queue ->sigq and drop ->it_lock(). * posixtimer_rearm() locks the timer * and re-schedules it while ->sigq is pending. * Not really bad, but not that we want. */ timr->sigq->info.si_sys_private = si_private; type = !(timr->it_sigev_notify & SIGEV_THREAD_ID) ? PIDTYPE_TGID : PIDTYPE_PID; ret = send_sigqueue(timr->sigq, timr->it_pid, type); /* If we failed to send the signal the timer stops. */ return ret > 0; } /* * This function gets called when a POSIX.1b interval timer expires from * the HRTIMER interrupt (soft interrupt on RT kernels). * * Handles CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME and CLOCK_TAI * based timers. */ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) { enum hrtimer_restart ret = HRTIMER_NORESTART; struct k_itimer *timr; unsigned long flags; int si_private = 0; timr = container_of(timer, struct k_itimer, it.real.timer); spin_lock_irqsave(&timr->it_lock, flags); timr->it_active = 0; if (timr->it_interval != 0) si_private = ++timr->it_requeue_pending; if (posix_timer_event(timr, si_private)) { /* * The signal was not queued due to SIG_IGN. As a * consequence the timer is not going to be rearmed from * the signal delivery path. But as a real signal handler * can be installed later the timer must be rearmed here. */ if (timr->it_interval != 0) { ktime_t now = hrtimer_cb_get_time(timer); /* * FIXME: What we really want, is to stop this * timer completely and restart it in case the * SIG_IGN is removed. This is a non trivial * change to the signal handling code. * * For now let timers with an interval less than a * jiffie expire every jiffie and recheck for a * valid signal handler. * * This avoids interrupt starvation in case of a * very small interval, which would expire the * timer immediately again. * * Moving now ahead of time by one jiffie tricks * hrtimer_forward() to expire the timer later, * while it still maintains the overrun accuracy * for the price of a slight inconsistency in the * timer_gettime() case. This is at least better * than a timer storm. * * Only required when high resolution timers are * enabled as the periodic tick based timers are * automatically aligned to the next tick. */ if (IS_ENABLED(CONFIG_HIGH_RES_TIMERS)) { ktime_t kj = TICK_NSEC; if (timr->it_interval < kj) now = ktime_add(now, kj); } timr->it_overrun += hrtimer_forward(timer, now, timr->it_interval); ret = HRTIMER_RESTART; ++timr->it_requeue_pending; timr->it_active = 1; } } unlock_timer(timr, flags); return ret; } static struct pid *good_sigevent(sigevent_t * event) { struct pid *pid = task_tgid(current); struct task_struct *rtn; switch (event->sigev_notify) { case SIGEV_SIGNAL | SIGEV_THREAD_ID: pid = find_vpid(event->sigev_notify_thread_id); rtn = pid_task(pid, PIDTYPE_PID); if (!rtn || !same_thread_group(rtn, current)) return NULL; fallthrough; case SIGEV_SIGNAL: case SIGEV_THREAD: if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) return NULL; fallthrough; case SIGEV_NONE: return pid; default: return NULL; } } static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); return NULL; } clear_siginfo(&tmr->sigq->info); return tmr; } static void k_itimer_rcu_free(struct rcu_head *head) { struct k_itimer *tmr = container_of(head, struct k_itimer, rcu); kmem_cache_free(posix_timers_cache, tmr); } static void posix_timer_free(struct k_itimer *tmr) { put_pid(tmr->it_pid); sigqueue_free(tmr->sigq); call_rcu(&tmr->rcu, k_itimer_rcu_free); } static void posix_timer_unhash_and_free(struct k_itimer *tmr) { spin_lock(&hash_lock); hlist_del_rcu(&tmr->t_hash); spin_unlock(&hash_lock); posix_timer_free(tmr); } static int common_timer_create(struct k_itimer *new_timer) { hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); return 0; } /* Create a POSIX.1b interval timer. */ static int do_timer_create(clockid_t which_clock, struct sigevent *event, timer_t __user *created_timer_id) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct k_itimer *new_timer; int error, new_timer_id; if (!kc) return -EINVAL; if (!kc->timer_create) return -EOPNOTSUPP; new_timer = alloc_posix_timer(); if (unlikely(!new_timer)) return -EAGAIN; spin_lock_init(&new_timer->it_lock); /* * Add the timer to the hash table. The timer is not yet valid * because new_timer::it_signal is still NULL. The timer id is also * not yet visible to user space. */ new_timer_id = posix_timer_add(new_timer); if (new_timer_id < 0) { posix_timer_free(new_timer); return new_timer_id; } new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->kclock = kc; new_timer->it_overrun = -1LL; if (event) { rcu_read_lock(); new_timer->it_pid = get_pid(good_sigevent(event)); rcu_read_unlock(); if (!new_timer->it_pid) { error = -EINVAL; goto out; } new_timer->it_sigev_notify = event->sigev_notify; new_timer->sigq->info.si_signo = event->sigev_signo; new_timer->sigq->info.si_value = event->sigev_value; } else { new_timer->it_sigev_notify = SIGEV_SIGNAL; new_timer->sigq->info.si_signo = SIGALRM; memset(&new_timer->sigq->info.si_value, 0, sizeof(sigval_t)); new_timer->sigq->info.si_value.sival_int = new_timer->it_id; new_timer->it_pid = get_pid(task_tgid(current)); } new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; goto out; } /* * After succesful copy out, the timer ID is visible to user space * now but not yet valid because new_timer::signal is still NULL. * * Complete the initialization with the clock specific create * callback. */ error = kc->timer_create(new_timer); if (error) goto out; spin_lock_irq(¤t->sighand->siglock); /* This makes the timer valid in the hash table */ WRITE_ONCE(new_timer->it_signal, current->signal); list_add(&new_timer->list, ¤t->signal->posix_timers); spin_unlock_irq(¤t->sighand->siglock); /* * After unlocking sighand::siglock @new_timer is subject to * concurrent removal and cannot be touched anymore */ return 0; out: posix_timer_unhash_and_free(new_timer); return error; } SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, struct sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { if (timer_event_spec) { sigevent_t event; if (copy_from_user(&event, timer_event_spec, sizeof (event))) return -EFAULT; return do_timer_create(which_clock, &event, created_timer_id); } return do_timer_create(which_clock, NULL, created_timer_id); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE3(timer_create, clockid_t, which_clock, struct compat_sigevent __user *, timer_event_spec, timer_t __user *, created_timer_id) { if (timer_event_spec) { sigevent_t event; if (get_compat_sigevent(&event, timer_event_spec)) return -EFAULT; return do_timer_create(which_clock, &event, created_timer_id); } return do_timer_create(which_clock, NULL, created_timer_id); } #endif static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) { struct k_itimer *timr; /* * timer_t could be any type >= int and we want to make sure any * @timer_id outside positive int range fails lookup. */ if ((unsigned long long)timer_id > INT_MAX) return NULL; /* * The hash lookup and the timers are RCU protected. * * Timers are added to the hash in invalid state where * timr::it_signal == NULL. timer::it_signal is only set after the * rest of the initialization succeeded. * * Timer destruction happens in steps: * 1) Set timr::it_signal to NULL with timr::it_lock held * 2) Release timr::it_lock * 3) Remove from the hash under hash_lock * 4) Call RCU for removal after the grace period * * Holding rcu_read_lock() accross the lookup ensures that * the timer cannot be freed. * * The lookup validates locklessly that timr::it_signal == * current::it_signal and timr::it_id == @timer_id. timr::it_id * can't change, but timr::it_signal becomes NULL during * destruction. */ rcu_read_lock(); timr = posix_timer_by_id(timer_id); if (timr) { spin_lock_irqsave(&timr->it_lock, *flags); /* * Validate under timr::it_lock that timr::it_signal is * still valid. Pairs with #1 above. */ if (timr->it_signal == current->signal) { rcu_read_unlock(); return timr; } spin_unlock_irqrestore(&timr->it_lock, *flags); } rcu_read_unlock(); return NULL; } static ktime_t common_hrtimer_remaining(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; return __hrtimer_expires_remaining_adjusted(timer, now); } static s64 common_hrtimer_forward(struct k_itimer *timr, ktime_t now) { struct hrtimer *timer = &timr->it.real.timer; return hrtimer_forward(timer, now, timr->it_interval); } /* * Get the time remaining on a POSIX.1b interval timer. * * Two issues to handle here: * * 1) The timer has a requeue pending. The return value must appear as * if the timer has been requeued right now. * * 2) The timer is a SIGEV_NONE timer. These timers are never enqueued * into the hrtimer queue and therefore never expired. Emulate expiry * here taking #1 into account. */ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { const struct k_clock *kc = timr->kclock; ktime_t now, remaining, iv; bool sig_none; sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ if (iv) { cur_setting->it_interval = ktime_to_timespec64(iv); } else if (!timr->it_active) { /* * SIGEV_NONE oneshot timers are never queued and therefore * timr->it_active is always false. The check below * vs. remaining time will handle this case. * * For all other timers there is nothing to update here, so * return. */ if (!sig_none) return; } now = kc->clock_get_ktime(timr->it_clock); /* * If this is an interval timer and either has requeue pending or * is a SIGEV_NONE timer move the expiry time forward by intervals, * so expiry is > now. */ if (iv && (timr->it_requeue_pending & REQUEUE_PENDING || sig_none)) timr->it_overrun += kc->timer_forward(timr, now); remaining = kc->timer_remaining(timr, now); /* * As @now is retrieved before a possible timer_forward() and * cannot be reevaluated by the compiler @remaining is based on the * same @now value. Therefore @remaining is consistent vs. @now. * * Consequently all interval timers, i.e. @iv > 0, cannot have a * remaining time <= 0 because timer_forward() guarantees to move * them forward so that the next timer expiry is > @now. */ if (remaining <= 0) { /* * A single shot SIGEV_NONE timer must return 0, when it is * expired! Timers which have a real signal delivery mode * must return a remaining time greater than 0 because the * signal has not yet been delivered. */ if (!sig_none) cur_setting->it_value.tv_nsec = 1; } else { cur_setting->it_value = ktime_to_timespec64(remaining); } } static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting) { const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; int ret = 0; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; memset(setting, 0, sizeof(*setting)); kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else kc->timer_get(timr, setting); unlock_timer(timr, flags); return ret; } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct __kernel_itimerspec __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { if (put_itimerspec64(&cur_setting, setting)) ret = -EFAULT; } return ret; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(timer_gettime32, timer_t, timer_id, struct old_itimerspec32 __user *, setting) { struct itimerspec64 cur_setting; int ret = do_timer_gettime(timer_id, &cur_setting); if (!ret) { if (put_old_itimerspec32(&cur_setting, setting)) ret = -EFAULT; } return ret; } #endif /** * sys_timer_getoverrun - Get the number of overruns of a POSIX.1b interval timer * @timer_id: The timer ID which identifies the timer * * The "overrun count" of a timer is one plus the number of expiration * intervals which have elapsed between the first expiry, which queues the * signal and the actual signal delivery. On signal delivery the "overrun * count" is calculated and cached, so it can be returned directly here. * * As this is relative to the last queued signal the returned overrun count * is meaningless outside of the signal delivery path and even there it * does not accurately reflect the current state when user space evaluates * it. * * Returns: * -EINVAL @timer_id is invalid * 1..INT_MAX The number of overruns related to the last delivered signal */ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) { struct k_itimer *timr; unsigned long flags; int overrun; timr = lock_timer(timer_id, &flags); if (!timr) return -EINVAL; overrun = timer_overrun_to_int(timr, 0); unlock_timer(timr, flags); return overrun; } static void common_hrtimer_arm(struct k_itimer *timr, ktime_t expires, bool absolute, bool sigev_none) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; mode = absolute ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; /* * Posix magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they become CLOCK_MONOTONIC based under the * hood. See hrtimer_init(). Update timr->kclock, so the generic * functions which use timr->kclock->clock_get_*() work. * * Note: it_clock stays unmodified, because the next timer_set() might * use ABSTIME, so it needs to switch back. */ if (timr->it_clock == CLOCK_REALTIME) timr->kclock = absolute ? &clock_realtime : &clock_monotonic; hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; if (!absolute) expires = ktime_add_safe(expires, timer->base->get_time()); hrtimer_set_expires(timer, expires); if (!sigev_none) hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static int common_hrtimer_try_to_cancel(struct k_itimer *timr) { return hrtimer_try_to_cancel(&timr->it.real.timer); } static void common_timer_wait_running(struct k_itimer *timer) { hrtimer_cancel_wait_running(&timer->it.real.timer); } /* * On PREEMPT_RT this prevents priority inversion and a potential livelock * against the ksoftirqd thread in case that ksoftirqd gets preempted while * executing a hrtimer callback. * * See the comments in hrtimer_cancel_wait_running(). For PREEMPT_RT=n this * just results in a cpu_relax(). * * For POSIX CPU timers with CONFIG_POSIX_CPU_TIMERS_TASK_WORK=n this is * just a cpu_relax(). With CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y this * prevents spinning on an eventually scheduled out task and a livelock * when the task which tries to delete or disarm the timer has preempted * the task which runs the expiry in task work context. */ static struct k_itimer *timer_wait_running(struct k_itimer *timer, unsigned long *flags) { const struct k_clock *kc = READ_ONCE(timer->kclock); timer_t timer_id = READ_ONCE(timer->it_id); /* Prevent kfree(timer) after dropping the lock */ rcu_read_lock(); unlock_timer(timer, *flags); /* * kc->timer_wait_running() might drop RCU lock. So @timer * cannot be touched anymore after the function returns! */ if (!WARN_ON_ONCE(!kc->timer_wait_running)) kc->timer_wait_running(timer); rcu_read_unlock(); /* Relock the timer. It might be not longer hashed. */ return lock_timer(timer_id, flags); } /* Set a POSIX.1b interval timer. */ int common_timer_set(struct k_itimer *timr, int flags, struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { const struct k_clock *kc = timr->kclock; bool sigev_none; ktime_t expires; if (old_setting) common_timer_get(timr, old_setting); /* Prevent rearming by clearing the interval */ timr->it_interval = 0; /* * Careful here. On SMP systems the timer expiry function could be * active and spinning on timr->it_lock. */ if (kc->timer_try_to_cancel(timr) < 0) return TIMER_RETRY; timr->it_active = 0; timr->it_requeue_pending = (timr->it_requeue_pending + 2) & ~REQUEUE_PENDING; timr->it_overrun_last = 0; /* Switch off the timer when it_value is zero */ if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) return 0; timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); if (flags & TIMER_ABSTIME) expires = timens_ktime_to_host(timr->it_clock, expires); sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; return 0; } static int do_timer_settime(timer_t timer_id, int tmr_flags, struct itimerspec64 *new_spec64, struct itimerspec64 *old_spec64) { const struct k_clock *kc; struct k_itimer *timr; unsigned long flags; int error = 0; if (!timespec64_valid(&new_spec64->it_interval) || !timespec64_valid(&new_spec64->it_value)) return -EINVAL; if (old_spec64) memset(old_spec64, 0, sizeof(*old_spec64)); timr = lock_timer(timer_id, &flags); retry: if (!timr) return -EINVAL; kc = timr->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else error = kc->timer_set(timr, tmr_flags, new_spec64, old_spec64); if (error == TIMER_RETRY) { // We already got the old time... old_spec64 = NULL; /* Unlocks and relocks the timer if it still exists */ timr = timer_wait_running(timr, &flags); goto retry; } unlock_timer(timr, flags); return error; } /* Set a POSIX.1b interval timer */ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct __kernel_itimerspec __user *, new_setting, struct __kernel_itimerspec __user *, old_setting) { struct itimerspec64 new_spec, old_spec, *rtn; int error = 0; if (!new_setting) return -EINVAL; if (get_itimerspec64(&new_spec, new_setting)) return -EFAULT; rtn = old_setting ? &old_spec : NULL; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old_setting) { if (put_itimerspec64(&old_spec, old_setting)) error = -EFAULT; } return error; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(timer_settime32, timer_t, timer_id, int, flags, struct old_itimerspec32 __user *, new, struct old_itimerspec32 __user *, old) { struct itimerspec64 new_spec, old_spec; struct itimerspec64 *rtn = old ? &old_spec : NULL; int error = 0; if (!new) return -EINVAL; if (get_old_itimerspec32(&new_spec, new)) return -EFAULT; error = do_timer_settime(timer_id, flags, &new_spec, rtn); if (!error && old) { if (put_old_itimerspec32(&old_spec, old)) error = -EFAULT; } return error; } #endif int common_timer_del(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; timer->it_interval = 0; if (kc->timer_try_to_cancel(timer) < 0) return TIMER_RETRY; timer->it_active = 0; return 0; } static inline int timer_delete_hook(struct k_itimer *timer) { const struct k_clock *kc = timer->kclock; if (WARN_ON_ONCE(!kc || !kc->timer_del)) return -EINVAL; return kc->timer_del(timer); } /* Delete a POSIX.1b interval timer. */ SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) { struct k_itimer *timer; unsigned long flags; timer = lock_timer(timer_id, &flags); retry_delete: if (!timer) return -EINVAL; if (unlikely(timer_delete_hook(timer) == TIMER_RETRY)) { /* Unlocks and relocks the timer if it still exists */ timer = timer_wait_running(timer, &flags); goto retry_delete; } spin_lock(¤t->sighand->siglock); list_del(&timer->list); spin_unlock(¤t->sighand->siglock); /* * A concurrent lookup could check timer::it_signal lockless. It * will reevaluate with timer::it_lock held and observe the NULL. */ WRITE_ONCE(timer->it_signal, NULL); unlock_timer(timer, flags); posix_timer_unhash_and_free(timer); return 0; } /* * Delete a timer if it is armed, remove it from the hash and schedule it * for RCU freeing. */ static void itimer_delete(struct k_itimer *timer) { unsigned long flags; /* * irqsave is required to make timer_wait_running() work. */ spin_lock_irqsave(&timer->it_lock, flags); retry_delete: /* * Even if the timer is not longer accessible from other tasks * it still might be armed and queued in the underlying timer * mechanism. Worse, that timer mechanism might run the expiry * function concurrently. */ if (timer_delete_hook(timer) == TIMER_RETRY) { /* * Timer is expired concurrently, prevent livelocks * and pointless spinning on RT. * * timer_wait_running() drops timer::it_lock, which opens * the possibility for another task to delete the timer. * * That's not possible here because this is invoked from * do_exit() only for the last thread of the thread group. * So no other task can access and delete that timer. */ if (WARN_ON_ONCE(timer_wait_running(timer, &flags) != timer)) return; goto retry_delete; } list_del(&timer->list); /* * Setting timer::it_signal to NULL is technically not required * here as nothing can access the timer anymore legitimately via * the hash table. Set it to NULL nevertheless so that all deletion * paths are consistent. */ WRITE_ONCE(timer->it_signal, NULL); spin_unlock_irqrestore(&timer->it_lock, flags); posix_timer_unhash_and_free(timer); } /* * Invoked from do_exit() when the last thread of a thread group exits. * At that point no other task can access the timers of the dying * task anymore. */ void exit_itimers(struct task_struct *tsk) { struct list_head timers; struct k_itimer *tmr; if (list_empty(&tsk->signal->posix_timers)) return; /* Protect against concurrent read via /proc/$PID/timers */ spin_lock_irq(&tsk->sighand->siglock); list_replace_init(&tsk->signal->posix_timers, &timers); spin_unlock_irq(&tsk->sighand->siglock); /* The timers are not longer accessible via tsk::signal */ while (!list_empty(&timers)) { tmr = list_first_entry(&timers, struct k_itimer, list); itimer_delete(tmr); } } SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 new_tp; if (!kc || !kc->clock_set) return -EINVAL; if (get_timespec64(&new_tp, tp)) return -EFAULT; /* * Permission checks have to be done inside the clock specific * setter callback. */ return kc->clock_set(which_clock, &new_tp); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 kernel_tp; int error; if (!kc) return -EINVAL; error = kc->clock_get_timespec(which_clock, &kernel_tp); if (!error && put_timespec64(&kernel_tp, tp)) error = -EFAULT; return error; } int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * ktx) { const struct k_clock *kc = clockid_to_kclock(which_clock); if (!kc) return -EINVAL; if (!kc->clock_adj) return -EOPNOTSUPP; return kc->clock_adj(which_clock, ktx); } SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, struct __kernel_timex __user *, utx) { struct __kernel_timex ktx; int err; if (copy_from_user(&ktx, utx, sizeof(ktx))) return -EFAULT; err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && copy_to_user(utx, &ktx, sizeof(ktx))) return -EFAULT; return err; } /** * sys_clock_getres - Get the resolution of a clock * @which_clock: The clock to get the resolution for * @tp: Pointer to a a user space timespec64 for storage * * POSIX defines: * * "The clock_getres() function shall return the resolution of any * clock. Clock resolutions are implementation-defined and cannot be set by * a process. If the argument res is not NULL, the resolution of the * specified clock shall be stored in the location pointed to by res. If * res is NULL, the clock resolution is not returned. If the time argument * of clock_settime() is not a multiple of res, then the value is truncated * to a multiple of res." * * Due to the various hardware constraints the real resolution can vary * wildly and even change during runtime when the underlying devices are * replaced. The kernel also can use hardware devices with different * resolutions for reading the time and for arming timers. * * The kernel therefore deviates from the POSIX spec in various aspects: * * 1) The resolution returned to user space * * For CLOCK_REALTIME, CLOCK_MONOTONIC, CLOCK_BOOTTIME, CLOCK_TAI, * CLOCK_REALTIME_ALARM, CLOCK_BOOTTIME_ALAREM and CLOCK_MONOTONIC_RAW * the kernel differentiates only two cases: * * I) Low resolution mode: * * When high resolution timers are disabled at compile or runtime * the resolution returned is nanoseconds per tick, which represents * the precision at which timers expire. * * II) High resolution mode: * * When high resolution timers are enabled the resolution returned * is always one nanosecond independent of the actual resolution of * the underlying hardware devices. * * For CLOCK_*_ALARM the actual resolution depends on system * state. When system is running the resolution is the same as the * resolution of the other clocks. During suspend the actual * resolution is the resolution of the underlying RTC device which * might be way less precise than the clockevent device used during * running state. * * For CLOCK_REALTIME_COARSE and CLOCK_MONOTONIC_COARSE the resolution * returned is always nanoseconds per tick. * * For CLOCK_PROCESS_CPUTIME and CLOCK_THREAD_CPUTIME the resolution * returned is always one nanosecond under the assumption that the * underlying scheduler clock has a better resolution than nanoseconds * per tick. * * For dynamic POSIX clocks (PTP devices) the resolution returned is * always one nanosecond. * * 2) Affect on sys_clock_settime() * * The kernel does not truncate the time which is handed in to * sys_clock_settime(). The kernel internal timekeeping is always using * nanoseconds precision independent of the clocksource device which is * used to read the time from. The resolution of that device only * affects the presicion of the time returned by sys_clock_gettime(). * * Returns: * 0 Success. @tp contains the resolution * -EINVAL @which_clock is not a valid clock ID * -EFAULT Copying the resolution to @tp faulted * -ENODEV Dynamic POSIX clock is not backed by a device * -EOPNOTSUPP Dynamic POSIX clock does not support getres() */ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct __kernel_timespec __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 rtn_tp; int error; if (!kc) return -EINVAL; error = kc->clock_getres(which_clock, &rtn_tp); if (!error && tp && put_timespec64(&rtn_tp, tp)) error = -EFAULT; return error; } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(clock_settime32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; if (!kc || !kc->clock_set) return -EINVAL; if (get_old_timespec32(&ts, tp)) return -EFAULT; return kc->clock_set(which_clock, &ts); } SYSCALL_DEFINE2(clock_gettime32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; int err; if (!kc) return -EINVAL; err = kc->clock_get_timespec(which_clock, &ts); if (!err && put_old_timespec32(&ts, tp)) err = -EFAULT; return err; } SYSCALL_DEFINE2(clock_adjtime32, clockid_t, which_clock, struct old_timex32 __user *, utp) { struct __kernel_timex ktx; int err; err = get_old_timex32(&ktx, utp); if (err) return err; err = do_clock_adjtime(which_clock, &ktx); if (err >= 0 && put_old_timex32(utp, &ktx)) return -EFAULT; return err; } SYSCALL_DEFINE2(clock_getres_time32, clockid_t, which_clock, struct old_timespec32 __user *, tp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 ts; int err; if (!kc) return -EINVAL; err = kc->clock_getres(which_clock, &ts); if (!err && tp && put_old_timespec32(&ts, tp)) return -EFAULT; return err; } #endif /* * sys_clock_nanosleep() for CLOCK_REALTIME and CLOCK_TAI */ static int common_nsleep(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } /* * sys_clock_nanosleep() for CLOCK_MONOTONIC and CLOCK_BOOTTIME * * Absolute nanosleeps for these clocks are time-namespace adjusted. */ static int common_nsleep_timens(const clockid_t which_clock, int flags, const struct timespec64 *rqtp) { ktime_t texp = timespec64_to_ktime(*rqtp); if (flags & TIMER_ABSTIME) texp = timens_ktime_to_host(which_clock, texp); return hrtimer_nanosleep(texp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); } SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -EOPNOTSUPP; if (get_timespec64(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return kc->nsleep(which_clock, flags, &t); } #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE4(clock_nanosleep_time32, clockid_t, which_clock, int, flags, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { const struct k_clock *kc = clockid_to_kclock(which_clock); struct timespec64 t; if (!kc) return -EINVAL; if (!kc->nsleep) return -EOPNOTSUPP; if (get_old_timespec32(&t, rqtp)) return -EFAULT; if (!timespec64_valid(&t)) return -EINVAL; if (flags & TIMER_ABSTIME) rmtp = NULL; current->restart_block.fn = do_no_restart_syscall; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return kc->nsleep(which_clock, flags, &t); } #endif static const struct k_clock clock_realtime = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_realtime_timespec, .clock_get_ktime = posix_get_realtime_ktime, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_timespec, .clock_get_ktime = posix_get_monotonic_ktime, .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_monotonic_raw = { .clock_getres = posix_get_hrtimer_res, .clock_get_timespec = posix_get_monotonic_raw, }; static const struct k_clock clock_realtime_coarse = { .clock_getres = posix_get_coarse_res, .clock_get_timespec = posix_get_realtime_coarse, }; static const struct k_clock clock_monotonic_coarse = { .clock_getres = posix_get_coarse_res, .clock_get_timespec = posix_get_monotonic_coarse, }; static const struct k_clock clock_tai = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_tai_ktime, .clock_get_timespec = posix_get_tai_timespec, .nsleep = common_nsleep, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock clock_boottime = { .clock_getres = posix_get_hrtimer_res, .clock_get_ktime = posix_get_boottime_ktime, .clock_get_timespec = posix_get_boottime_timespec, .nsleep = common_nsleep_timens, .timer_create = common_timer_create, .timer_set = common_timer_set, .timer_get = common_timer_get, .timer_del = common_timer_del, .timer_rearm = common_hrtimer_rearm, .timer_forward = common_hrtimer_forward, .timer_remaining = common_hrtimer_remaining, .timer_try_to_cancel = common_hrtimer_try_to_cancel, .timer_wait_running = common_timer_wait_running, .timer_arm = common_hrtimer_arm, }; static const struct k_clock * const posix_clocks[] = { [CLOCK_REALTIME] = &clock_realtime, [CLOCK_MONOTONIC] = &clock_monotonic, [CLOCK_PROCESS_CPUTIME_ID] = &clock_process, [CLOCK_THREAD_CPUTIME_ID] = &clock_thread, [CLOCK_MONOTONIC_RAW] = &clock_monotonic_raw, [CLOCK_REALTIME_COARSE] = &clock_realtime_coarse, [CLOCK_MONOTONIC_COARSE] = &clock_monotonic_coarse, [CLOCK_BOOTTIME] = &clock_boottime, [CLOCK_REALTIME_ALARM] = &alarm_clock, [CLOCK_BOOTTIME_ALARM] = &alarm_clock, [CLOCK_TAI] = &clock_tai, }; static const struct k_clock *clockid_to_kclock(const clockid_t id) { clockid_t idx = id; if (id < 0) { return (id & CLOCKFD_MASK) == CLOCKFD ? &clock_posix_dynamic : &clock_posix_cpu; } if (id >= ARRAY_SIZE(posix_clocks)) return NULL; return posix_clocks[array_index_nospec(idx, ARRAY_SIZE(posix_clocks))]; } |
103 78 78 2 78 2 102 102 102 102 102 102 91 97 2 81 101 1 78 102 67 91 67 67 90 81 90 57 1 47 15 27 1 78 27 2 2 2 31 90 3 1 102 3 60 2 92 78 92 7 40 94 6 31 80 100 78 26 10 19 103 113 19 110 27 75 75 75 2 74 75 2 1 73 75 75 1 3 3 8 1 66 35 41 75 36 69 69 66 67 26 65 9 7 1 1 2 2 3 2 1 1 1 1 28 21 21 8 21 16 5 9 9 9 9 8 9 9 1 9 32 11 2 1 1 1 40 29 11 6 2 11 21 40 2 9 33 36 40 37 11 10 12 1 1 6 1 1 1 6 1 23 1 25 13 39 38 1 1 29 11 2 38 2 2 13 12 9 2 90 3 3 3 3 3 2 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 | // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * */ #include <linux/buffer_head.h> #include <linux/fs.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/nls.h> #include <linux/uio.h> #include <linux/writeback.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" /* * ntfs_read_mft - Read record and parses MFT. */ static struct inode *ntfs_read_mft(struct inode *inode, const struct cpu_str *name, const struct MFT_REF *ref) { int err = 0; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; mode_t mode = 0; struct ATTR_STD_INFO5 *std5 = NULL; struct ATTR_LIST_ENTRY *le; struct ATTRIB *attr; bool is_match = false; bool is_root = false; bool is_dir; unsigned long ino = inode->i_ino; u32 rp_fa = 0, asize, t32; u16 roff, rsize, names = 0; const struct ATTR_FILE_NAME *fname = NULL; const struct INDEX_ROOT *root; struct REPARSE_DATA_BUFFER rp; // 0x18 bytes u64 t64; struct MFT_REC *rec; struct runs_tree *run; struct timespec64 ts; inode->i_op = NULL; /* Setup 'uid' and 'gid' */ inode->i_uid = sbi->options->fs_uid; inode->i_gid = sbi->options->fs_gid; err = mi_init(&ni->mi, sbi, ino); if (err) goto out; if (!sbi->mft.ni && ino == MFT_REC_MFT && !sb->s_root) { t64 = sbi->mft.lbo >> sbi->cluster_bits; t32 = bytes_to_cluster(sbi, MFT_REC_VOL * sbi->record_size); sbi->mft.ni = ni; init_rwsem(&ni->file.run_lock); if (!run_add_entry(&ni->file.run, 0, t64, t32, true)) { err = -ENOMEM; goto out; } } err = mi_read(&ni->mi, ino == MFT_REC_MFT); if (err) goto out; rec = ni->mi.mrec; if (sbi->flags & NTFS_FLAGS_LOG_REPLAYING) { ; } else if (ref->seq != rec->seq) { err = -EINVAL; ntfs_err(sb, "MFT: r=%lx, expect seq=%x instead of %x!", ino, le16_to_cpu(ref->seq), le16_to_cpu(rec->seq)); goto out; } else if (!is_rec_inuse(rec)) { err = -ESTALE; ntfs_err(sb, "Inode r=%x is not in use!", (u32)ino); goto out; } if (le32_to_cpu(rec->total) != sbi->record_size) { /* Bad inode? */ err = -EINVAL; goto out; } if (!is_rec_base(rec)) { err = -EINVAL; goto out; } /* Record should contain $I30 root. */ is_dir = rec->flags & RECORD_FLAG_DIR; /* MFT_REC_MFT is not a dir */ if (is_dir && ino == MFT_REC_MFT) { err = -EINVAL; goto out; } inode->i_generation = le16_to_cpu(rec->seq); /* Enumerate all struct Attributes MFT. */ le = NULL; attr = NULL; /* * To reduce tab pressure use goto instead of * while( (attr = ni_enum_attr_ex(ni, attr, &le, NULL) )) */ next_attr: run = NULL; err = -EINVAL; attr = ni_enum_attr_ex(ni, attr, &le, NULL); if (!attr) goto end_enum; if (le && le->vcn) { /* This is non primary attribute segment. Ignore if not MFT. */ if (ino != MFT_REC_MFT || attr->type != ATTR_DATA) goto next_attr; run = &ni->file.run; asize = le32_to_cpu(attr->size); goto attr_unpack_run; } roff = attr->non_res ? 0 : le16_to_cpu(attr->res.data_off); rsize = attr->non_res ? 0 : le32_to_cpu(attr->res.data_size); asize = le32_to_cpu(attr->size); /* * Really this check was done in 'ni_enum_attr_ex' -> ... 'mi_enum_attr'. * There not critical to check this case again */ if (attr->name_len && sizeof(short) * attr->name_len + le16_to_cpu(attr->name_off) > asize) goto out; if (attr->non_res) { t64 = le64_to_cpu(attr->nres.alloc_size); if (le64_to_cpu(attr->nres.data_size) > t64 || le64_to_cpu(attr->nres.valid_size) > t64) goto out; } switch (attr->type) { case ATTR_STD: if (attr->non_res || asize < sizeof(struct ATTR_STD_INFO) + roff || rsize < sizeof(struct ATTR_STD_INFO)) goto out; if (std5) goto next_attr; std5 = Add2Ptr(attr, roff); #ifdef STATX_BTIME nt2kernel(std5->cr_time, &ni->i_crtime); #endif nt2kernel(std5->a_time, &ts); inode_set_atime_to_ts(inode, ts); nt2kernel(std5->c_time, &ts); inode_set_ctime_to_ts(inode, ts); nt2kernel(std5->m_time, &ts); inode_set_mtime_to_ts(inode, ts); ni->std_fa = std5->fa; if (asize >= sizeof(struct ATTR_STD_INFO5) + roff && rsize >= sizeof(struct ATTR_STD_INFO5)) ni->std_security_id = std5->security_id; goto next_attr; case ATTR_LIST: if (attr->name_len || le || ino == MFT_REC_LOG) goto out; err = ntfs_load_attr_list(ni, attr); if (err) goto out; le = NULL; attr = NULL; goto next_attr; case ATTR_NAME: if (attr->non_res || asize < SIZEOF_ATTRIBUTE_FILENAME + roff || rsize < SIZEOF_ATTRIBUTE_FILENAME) goto out; fname = Add2Ptr(attr, roff); if (fname->type == FILE_NAME_DOS) goto next_attr; names += 1; if (name && name->len == fname->name_len && !ntfs_cmp_names_cpu(name, (struct le_str *)&fname->name_len, NULL, false)) is_match = true; goto next_attr; case ATTR_DATA: if (is_dir) { /* Ignore data attribute in dir record. */ goto next_attr; } if (ino == MFT_REC_BADCLUST && !attr->non_res) goto next_attr; if (attr->name_len && ((ino != MFT_REC_BADCLUST || !attr->non_res || attr->name_len != ARRAY_SIZE(BAD_NAME) || memcmp(attr_name(attr), BAD_NAME, sizeof(BAD_NAME))) && (ino != MFT_REC_SECURE || !attr->non_res || attr->name_len != ARRAY_SIZE(SDS_NAME) || memcmp(attr_name(attr), SDS_NAME, sizeof(SDS_NAME))))) { /* File contains stream attribute. Ignore it. */ goto next_attr; } if (is_attr_sparsed(attr)) ni->std_fa |= FILE_ATTRIBUTE_SPARSE_FILE; else ni->std_fa &= ~FILE_ATTRIBUTE_SPARSE_FILE; if (is_attr_compressed(attr)) ni->std_fa |= FILE_ATTRIBUTE_COMPRESSED; else ni->std_fa &= ~FILE_ATTRIBUTE_COMPRESSED; if (is_attr_encrypted(attr)) ni->std_fa |= FILE_ATTRIBUTE_ENCRYPTED; else ni->std_fa &= ~FILE_ATTRIBUTE_ENCRYPTED; if (!attr->non_res) { ni->i_valid = inode->i_size = rsize; inode_set_bytes(inode, rsize); } mode = S_IFREG | (0777 & sbi->options->fs_fmask_inv); if (!attr->non_res) { ni->ni_flags |= NI_FLAG_RESIDENT; goto next_attr; } inode_set_bytes(inode, attr_ondisk_size(attr)); ni->i_valid = le64_to_cpu(attr->nres.valid_size); inode->i_size = le64_to_cpu(attr->nres.data_size); if (!attr->nres.alloc_size) goto next_attr; run = ino == MFT_REC_BITMAP ? &sbi->used.bitmap.run : &ni->file.run; break; case ATTR_ROOT: if (attr->non_res) goto out; root = Add2Ptr(attr, roff); if (attr->name_len != ARRAY_SIZE(I30_NAME) || memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) goto next_attr; if (root->type != ATTR_NAME || root->rule != NTFS_COLLATION_TYPE_FILENAME) goto out; if (!is_dir) goto next_attr; is_root = true; ni->ni_flags |= NI_FLAG_DIR; err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); if (err) goto out; mode = sb->s_root ? (S_IFDIR | (0777 & sbi->options->fs_dmask_inv)) : (S_IFDIR | 0777); goto next_attr; case ATTR_ALLOC: if (!is_root || attr->name_len != ARRAY_SIZE(I30_NAME) || memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME))) goto next_attr; inode->i_size = le64_to_cpu(attr->nres.data_size); ni->i_valid = le64_to_cpu(attr->nres.valid_size); inode_set_bytes(inode, le64_to_cpu(attr->nres.alloc_size)); run = &ni->dir.alloc_run; break; case ATTR_BITMAP: if (ino == MFT_REC_MFT) { if (!attr->non_res) goto out; #ifndef CONFIG_NTFS3_64BIT_CLUSTER /* 0x20000000 = 2^32 / 8 */ if (le64_to_cpu(attr->nres.alloc_size) >= 0x20000000) goto out; #endif run = &sbi->mft.bitmap.run; break; } else if (is_dir && attr->name_len == ARRAY_SIZE(I30_NAME) && !memcmp(attr_name(attr), I30_NAME, sizeof(I30_NAME)) && attr->non_res) { run = &ni->dir.bitmap_run; break; } goto next_attr; case ATTR_REPARSE: if (attr->name_len) goto next_attr; rp_fa = ni_parse_reparse(ni, attr, &rp); switch (rp_fa) { case REPARSE_LINK: /* * Normal symlink. * Assume one unicode symbol == one utf8. */ inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer .PrintNameLength) / sizeof(u16); ni->i_valid = inode->i_size; /* Clear directory bit. */ if (ni->ni_flags & NI_FLAG_DIR) { indx_clear(&ni->dir); memset(&ni->dir, 0, sizeof(ni->dir)); ni->ni_flags &= ~NI_FLAG_DIR; } else { run_close(&ni->file.run); } mode = S_IFLNK | 0777; is_dir = false; if (attr->non_res) { run = &ni->file.run; goto attr_unpack_run; // Double break. } break; case REPARSE_COMPRESSED: break; case REPARSE_DEDUPLICATED: break; } goto next_attr; case ATTR_EA_INFO: if (!attr->name_len && resident_data_ex(attr, sizeof(struct EA_INFO))) { ni->ni_flags |= NI_FLAG_EA; /* * ntfs_get_wsl_perm updates inode->i_uid, inode->i_gid, inode->i_mode */ inode->i_mode = mode; ntfs_get_wsl_perm(inode); mode = inode->i_mode; } goto next_attr; default: goto next_attr; } attr_unpack_run: roff = le16_to_cpu(attr->nres.run_off); if (roff > asize) { err = -EINVAL; goto out; } t64 = le64_to_cpu(attr->nres.svcn); err = run_unpack_ex(run, sbi, ino, t64, le64_to_cpu(attr->nres.evcn), t64, Add2Ptr(attr, roff), asize - roff); if (err < 0) goto out; err = 0; goto next_attr; end_enum: if (!std5) goto out; if (!is_match && name) { /* Reuse rec as buffer for ascii name. */ err = -ENOENT; goto out; } if (std5->fa & FILE_ATTRIBUTE_READONLY) mode &= ~0222; if (!names) { err = -EINVAL; goto out; } if (names != le16_to_cpu(rec->hard_links)) { /* Correct minor error on the fly. Do not mark inode as dirty. */ rec->hard_links = cpu_to_le16(names); ni->mi.dirty = true; } set_nlink(inode, names); if (S_ISDIR(mode)) { ni->std_fa |= FILE_ATTRIBUTE_DIRECTORY; /* * Dot and dot-dot should be included in count but was not * included in enumeration. * Usually a hard links to directories are disabled. */ inode->i_op = &ntfs_dir_inode_operations; inode->i_fop = &ntfs_dir_operations; ni->i_valid = 0; } else if (S_ISLNK(mode)) { ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; inode_nohighmem(inode); } else if (S_ISREG(mode)) { ni->std_fa &= ~FILE_ATTRIBUTE_DIRECTORY; inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; if (ino != MFT_REC_MFT) init_rwsem(&ni->file.run_lock); } else if (S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)) { inode->i_op = &ntfs_special_inode_operations; init_special_inode(inode, mode, inode->i_rdev); } else if (fname && fname->home.low == cpu_to_le32(MFT_REC_EXTEND) && fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) { /* Records in $Extend are not a files or general directories. */ inode->i_op = &ntfs_file_inode_operations; } else { err = -EINVAL; goto out; } if ((sbi->options->sys_immutable && (std5->fa & FILE_ATTRIBUTE_SYSTEM)) && !S_ISFIFO(mode) && !S_ISSOCK(mode) && !S_ISLNK(mode)) { inode->i_flags |= S_IMMUTABLE; } else { inode->i_flags &= ~S_IMMUTABLE; } inode->i_mode = mode; if (!(ni->ni_flags & NI_FLAG_EA)) { /* If no xattr then no security (stored in xattr). */ inode->i_flags |= S_NOSEC; } if (ino == MFT_REC_MFT && !sb->s_root) sbi->mft.ni = NULL; unlock_new_inode(inode); return inode; out: if (ino == MFT_REC_MFT && !sb->s_root) sbi->mft.ni = NULL; iget_failed(inode); return ERR_PTR(err); } /* * ntfs_test_inode * * Return: 1 if match. */ static int ntfs_test_inode(struct inode *inode, void *data) { struct MFT_REF *ref = data; return ino_get(ref) == inode->i_ino; } static int ntfs_set_inode(struct inode *inode, void *data) { const struct MFT_REF *ref = data; inode->i_ino = ino_get(ref); return 0; } struct inode *ntfs_iget5(struct super_block *sb, const struct MFT_REF *ref, const struct cpu_str *name) { struct inode *inode; inode = iget5_locked(sb, ino_get(ref), ntfs_test_inode, ntfs_set_inode, (void *)ref); if (unlikely(!inode)) return ERR_PTR(-ENOMEM); /* If this is a freshly allocated inode, need to read it now. */ if (inode->i_state & I_NEW) inode = ntfs_read_mft(inode, name, ref); else if (ref->seq != ntfs_i(inode)->mi.mrec->seq) { /* Inode overlaps? */ _ntfs_bad_inode(inode); } if (IS_ERR(inode) && name) ntfs_set_state(sb->s_fs_info, NTFS_DIRTY_ERROR); return inode; } enum get_block_ctx { GET_BLOCK_GENERAL = 0, GET_BLOCK_WRITE_BEGIN = 1, GET_BLOCK_DIRECT_IO_R = 2, GET_BLOCK_DIRECT_IO_W = 3, GET_BLOCK_BMAP = 4, }; static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo, struct buffer_head *bh, int create, enum get_block_ctx ctx) { struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); struct folio *folio = bh->b_folio; u8 cluster_bits = sbi->cluster_bits; u32 block_size = sb->s_blocksize; u64 bytes, lbo, valid; u32 off; int err; CLST vcn, lcn, len; bool new; /* Clear previous state. */ clear_buffer_new(bh); clear_buffer_uptodate(bh); if (is_resident(ni)) { ni_lock(ni); err = attr_data_read_resident(ni, &folio->page); ni_unlock(ni); if (!err) set_buffer_uptodate(bh); bh->b_size = block_size; return err; } vcn = vbo >> cluster_bits; off = vbo & sbi->cluster_mask; new = false; err = attr_data_get_block(ni, vcn, 1, &lcn, &len, create ? &new : NULL, create && sbi->cluster_size > PAGE_SIZE); if (err) goto out; if (!len) return 0; bytes = ((u64)len << cluster_bits) - off; if (lcn == SPARSE_LCN) { if (!create) { if (bh->b_size > bytes) bh->b_size = bytes; return 0; } WARN_ON(1); } if (new) set_buffer_new(bh); lbo = ((u64)lcn << cluster_bits) + off; set_buffer_mapped(bh); bh->b_bdev = sb->s_bdev; bh->b_blocknr = lbo >> sb->s_blocksize_bits; valid = ni->i_valid; if (ctx == GET_BLOCK_DIRECT_IO_W) { /* ntfs_direct_IO will update ni->i_valid. */ if (vbo >= valid) set_buffer_new(bh); } else if (create) { /* Normal write. */ if (bytes > bh->b_size) bytes = bh->b_size; if (vbo >= valid) set_buffer_new(bh); if (vbo + bytes > valid) { ni->i_valid = vbo + bytes; mark_inode_dirty(inode); } } else if (vbo >= valid) { /* Read out of valid data. */ clear_buffer_mapped(bh); } else if (vbo + bytes <= valid) { /* Normal read. */ } else if (vbo + block_size <= valid) { /* Normal short read. */ bytes = block_size; } else { /* * Read across valid size: vbo < valid && valid < vbo + block_size */ bytes = block_size; if (folio) { u32 voff = valid - vbo; bh->b_size = block_size; off = vbo & (PAGE_SIZE - 1); folio_set_bh(bh, folio, off); err = bh_read(bh, 0); if (err < 0) goto out; folio_zero_segment(folio, off + voff, off + block_size); } } if (bh->b_size > bytes) bh->b_size = bytes; #ifndef __LP64__ if (ctx == GET_BLOCK_DIRECT_IO_W || ctx == GET_BLOCK_DIRECT_IO_R) { static_assert(sizeof(size_t) < sizeof(loff_t)); if (bytes > 0x40000000u) bh->b_size = 0x40000000u; } #endif return 0; out: return err; } int ntfs_get_block(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create) { return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits, bh_result, create, GET_BLOCK_GENERAL); } static int ntfs_get_block_bmap(struct inode *inode, sector_t vsn, struct buffer_head *bh_result, int create) { return ntfs_get_block_vbo(inode, (u64)vsn << inode->i_sb->s_blocksize_bits, bh_result, create, GET_BLOCK_BMAP); } static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping, block, ntfs_get_block_bmap); } static int ntfs_read_folio(struct file *file, struct folio *folio) { struct page *page = &folio->page; int err; struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); if (is_resident(ni)) { ni_lock(ni); err = attr_data_read_resident(ni, page); ni_unlock(ni); if (err != E_NTFS_NONRESIDENT) { unlock_page(page); return err; } } if (is_compressed(ni)) { ni_lock(ni); err = ni_readpage_cmpr(ni, page); ni_unlock(ni); return err; } /* Normal + sparse files. */ return mpage_read_folio(folio, ntfs_get_block); } static void ntfs_readahead(struct readahead_control *rac) { struct address_space *mapping = rac->mapping; struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); u64 valid; loff_t pos; if (is_resident(ni)) { /* No readahead for resident. */ return; } if (is_compressed(ni)) { /* No readahead for compressed. */ return; } valid = ni->i_valid; pos = readahead_pos(rac); if (valid < i_size_read(inode) && pos <= valid && valid < pos + readahead_length(rac)) { /* Range cross 'valid'. Read it page by page. */ return; } mpage_readahead(rac, ntfs_get_block); } static int ntfs_get_block_direct_IO_R(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits, bh_result, create, GET_BLOCK_DIRECT_IO_R); } static int ntfs_get_block_direct_IO_W(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return ntfs_get_block_vbo(inode, (u64)iblock << inode->i_blkbits, bh_result, create, GET_BLOCK_DIRECT_IO_W); } static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); loff_t vbo = iocb->ki_pos; loff_t end; int wr = iov_iter_rw(iter) & WRITE; size_t iter_count = iov_iter_count(iter); loff_t valid; ssize_t ret; if (is_resident(ni)) { /* Switch to buffered write. */ ret = 0; goto out; } ret = blockdev_direct_IO(iocb, inode, iter, wr ? ntfs_get_block_direct_IO_W : ntfs_get_block_direct_IO_R); if (ret > 0) end = vbo + ret; else if (wr && ret == -EIOCBQUEUED) end = vbo + iter_count; else goto out; valid = ni->i_valid; if (wr) { if (end > valid && !S_ISBLK(inode->i_mode)) { ni->i_valid = end; mark_inode_dirty(inode); } } else if (vbo < valid && valid < end) { /* Fix page. */ iov_iter_revert(iter, end - valid); iov_iter_zero(end - valid, iter); } out: return ret; } int ntfs_set_size(struct inode *inode, u64 new_size) { struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni = ntfs_i(inode); int err; /* Check for maximum file size. */ if (is_sparsed(ni) || is_compressed(ni)) { if (new_size > sbi->maxbytes_sparse) { err = -EFBIG; goto out; } } else if (new_size > sbi->maxbytes) { err = -EFBIG; goto out; } ni_lock(ni); down_write(&ni->file.run_lock); err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run, new_size, &ni->i_valid, true, NULL); up_write(&ni->file.run_lock); ni_unlock(ni); mark_inode_dirty(inode); out: return err; } static int ntfs_resident_writepage(struct folio *folio, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; struct ntfs_inode *ni = ntfs_i(mapping->host); int ret; ni_lock(ni); ret = attr_data_write_resident(ni, &folio->page); ni_unlock(ni); if (ret != E_NTFS_NONRESIDENT) folio_unlock(folio); mapping_set_error(mapping, ret); return ret; } static int ntfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { if (is_resident(ntfs_i(mapping->host))) return write_cache_pages(mapping, wbc, ntfs_resident_writepage, mapping); return mpage_writepages(mapping, wbc, ntfs_get_block); } static int ntfs_get_block_write_begin(struct inode *inode, sector_t vbn, struct buffer_head *bh_result, int create) { return ntfs_get_block_vbo(inode, (u64)vbn << inode->i_blkbits, bh_result, create, GET_BLOCK_WRITE_BEGIN); } int ntfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, u32 len, struct page **pagep, void **fsdata) { int err; struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); *pagep = NULL; if (is_resident(ni)) { struct page *page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT); if (!page) { err = -ENOMEM; goto out; } ni_lock(ni); err = attr_data_read_resident(ni, page); ni_unlock(ni); if (!err) { *pagep = page; goto out; } unlock_page(page); put_page(page); if (err != E_NTFS_NONRESIDENT) goto out; } err = block_write_begin(mapping, pos, len, pagep, ntfs_get_block_write_begin); out: return err; } /* * ntfs_write_end - Address_space_operations::write_end. */ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; bool dirty = false; int err; if (is_resident(ni)) { ni_lock(ni); err = attr_data_write_resident(ni, page); ni_unlock(ni); if (!err) { dirty = true; /* Clear any buffers in page. */ if (page_has_buffers(page)) { struct buffer_head *head, *bh; bh = head = page_buffers(page); do { clear_buffer_dirty(bh); clear_buffer_mapped(bh); set_buffer_uptodate(bh); } while (head != (bh = bh->b_this_page)); } SetPageUptodate(page); err = copied; } unlock_page(page); put_page(page); } else { err = generic_write_end(file, mapping, pos, len, copied, page, fsdata); } if (err >= 0) { if (!(ni->std_fa & FILE_ATTRIBUTE_ARCHIVE)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ni->std_fa |= FILE_ATTRIBUTE_ARCHIVE; dirty = true; } if (valid != ni->i_valid) { /* ni->i_valid is changed in ntfs_get_block_vbo. */ dirty = true; } if (pos + err > inode->i_size) { inode->i_size = pos + err; dirty = true; } if (dirty) mark_inode_dirty(inode); } return err; } int reset_log_file(struct inode *inode) { int err; loff_t pos = 0; u32 log_size = inode->i_size; struct address_space *mapping = inode->i_mapping; for (;;) { u32 len; void *kaddr; struct page *page; len = pos + PAGE_SIZE > log_size ? (log_size - pos) : PAGE_SIZE; err = block_write_begin(mapping, pos, len, &page, ntfs_get_block_write_begin); if (err) goto out; kaddr = kmap_atomic(page); memset(kaddr, -1, len); kunmap_atomic(kaddr); flush_dcache_page(page); err = block_write_end(NULL, mapping, pos, len, len, page, NULL); if (err < 0) goto out; pos += len; if (pos >= log_size) break; balance_dirty_pages_ratelimited(mapping); } out: mark_inode_dirty_sync(inode); return err; } int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc) { return _ni_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int ntfs_sync_inode(struct inode *inode) { return _ni_write_inode(inode, 1); } /* * writeback_inode - Helper function for ntfs_flush_inodes(). * * This writes both the inode and the file data blocks, waiting * for in flight data blocks before the start of the call. It * does not wait for any io started during the call. */ static int writeback_inode(struct inode *inode) { int ret = sync_inode_metadata(inode, 0); if (!ret) ret = filemap_fdatawrite(inode->i_mapping); return ret; } /* * ntfs_flush_inodes * * Write data and metadata corresponding to i1 and i2. The io is * started but we do not wait for any of it to finish. * * filemap_flush() is used for the block device, so if there is a dirty * page for a block already in flight, we will not wait and start the * io over again. */ int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, struct inode *i2) { int ret = 0; if (i1) ret = writeback_inode(i1); if (!ret && i2) ret = writeback_inode(i2); if (!ret) ret = sync_blockdev_nowait(sb->s_bdev); return ret; } int inode_write_data(struct inode *inode, const void *data, size_t bytes) { pgoff_t idx; /* Write non resident data. */ for (idx = 0; bytes; idx++) { size_t op = bytes > PAGE_SIZE ? PAGE_SIZE : bytes; struct page *page = ntfs_map_page(inode->i_mapping, idx); if (IS_ERR(page)) return PTR_ERR(page); lock_page(page); WARN_ON(!PageUptodate(page)); ClearPageUptodate(page); memcpy(page_address(page), data, op); flush_dcache_page(page); SetPageUptodate(page); unlock_page(page); ntfs_unmap_page(page); bytes -= op; data = Add2Ptr(data, PAGE_SIZE); } return 0; } /* * ntfs_reparse_bytes * * Number of bytes for REPARSE_DATA_BUFFER(IO_REPARSE_TAG_SYMLINK) * for unicode string of @uni_len length. */ static inline u32 ntfs_reparse_bytes(u32 uni_len) { /* Header + unicode string + decorated unicode string. */ return sizeof(short) * (2 * uni_len + 4) + offsetof(struct REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer.PathBuffer); } static struct REPARSE_DATA_BUFFER * ntfs_create_reparse_buffer(struct ntfs_sb_info *sbi, const char *symname, u32 size, u16 *nsize) { int i, err; struct REPARSE_DATA_BUFFER *rp; __le16 *rp_name; typeof(rp->SymbolicLinkReparseBuffer) *rs; rp = kzalloc(ntfs_reparse_bytes(2 * size + 2), GFP_NOFS); if (!rp) return ERR_PTR(-ENOMEM); rs = &rp->SymbolicLinkReparseBuffer; rp_name = rs->PathBuffer; /* Convert link name to UTF-16. */ err = ntfs_nls_to_utf16(sbi, symname, size, (struct cpu_str *)(rp_name - 1), 2 * size, UTF16_LITTLE_ENDIAN); if (err < 0) goto out; /* err = the length of unicode name of symlink. */ *nsize = ntfs_reparse_bytes(err); if (*nsize > sbi->reparse.max_size) { err = -EFBIG; goto out; } /* Translate Linux '/' into Windows '\'. */ for (i = 0; i < err; i++) { if (rp_name[i] == cpu_to_le16('/')) rp_name[i] = cpu_to_le16('\\'); } rp->ReparseTag = IO_REPARSE_TAG_SYMLINK; rp->ReparseDataLength = cpu_to_le16(*nsize - offsetof(struct REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer)); /* PrintName + SubstituteName. */ rs->SubstituteNameOffset = cpu_to_le16(sizeof(short) * err); rs->SubstituteNameLength = cpu_to_le16(sizeof(short) * err + 8); rs->PrintNameLength = rs->SubstituteNameOffset; /* * TODO: Use relative path if possible to allow Windows to * parse this path. * 0-absolute path 1- relative path (SYMLINK_FLAG_RELATIVE). */ rs->Flags = 0; memmove(rp_name + err + 4, rp_name, sizeof(short) * err); /* Decorate SubstituteName. */ rp_name += err; rp_name[0] = cpu_to_le16('\\'); rp_name[1] = cpu_to_le16('?'); rp_name[2] = cpu_to_le16('?'); rp_name[3] = cpu_to_le16('\\'); return rp; out: kfree(rp); return ERR_PTR(err); } /* * ntfs_create_inode * * Helper function for: * - ntfs_create * - ntfs_mknod * - ntfs_symlink * - ntfs_mkdir * - ntfs_atomic_open * * NOTE: if fnd != NULL (ntfs_atomic_open) then @dir is locked */ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, umode_t mode, dev_t dev, const char *symname, u32 size, struct ntfs_fnd *fnd) { int err; struct super_block *sb = dir->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; const struct qstr *name = &dentry->d_name; CLST ino = 0; struct ntfs_inode *dir_ni = ntfs_i(dir); struct ntfs_inode *ni = NULL; struct inode *inode = NULL; struct ATTRIB *attr; struct ATTR_STD_INFO5 *std5; struct ATTR_FILE_NAME *fname; struct MFT_REC *rec; u32 asize, dsize, sd_size; enum FILE_ATTRIBUTE fa; __le32 security_id = SECURITY_ID_INVALID; CLST vcn; const void *sd; u16 t16, nsize = 0, aid = 0; struct INDEX_ROOT *root, *dir_root; struct NTFS_DE *e, *new_de = NULL; struct REPARSE_DATA_BUFFER *rp = NULL; bool rp_inserted = false; if (!fnd) ni_lock_dir(dir_ni); dir_root = indx_get_root(&dir_ni->dir, dir_ni, NULL, NULL); if (!dir_root) { err = -EINVAL; goto out1; } if (S_ISDIR(mode)) { /* Use parent's directory attributes. */ fa = dir_ni->std_fa | FILE_ATTRIBUTE_DIRECTORY | FILE_ATTRIBUTE_ARCHIVE; /* * By default child directory inherits parent attributes. * Root directory is hidden + system. * Make an exception for children in root. */ if (dir->i_ino == MFT_REC_ROOT) fa &= ~(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_SYSTEM); } else if (S_ISLNK(mode)) { /* It is good idea that link should be the same type (file/dir) as target */ fa = FILE_ATTRIBUTE_REPARSE_POINT; /* * Linux: there are dir/file/symlink and so on. * NTFS: symlinks are "dir + reparse" or "file + reparse" * It is good idea to create: * dir + reparse if 'symname' points to directory * or * file + reparse if 'symname' points to file * Unfortunately kern_path hangs if symname contains 'dir'. */ /* * struct path path; * * if (!kern_path(symname, LOOKUP_FOLLOW, &path)){ * struct inode *target = d_inode(path.dentry); * * if (S_ISDIR(target->i_mode)) * fa |= FILE_ATTRIBUTE_DIRECTORY; * // if ( target->i_sb == sb ){ * // use relative path? * // } * path_put(&path); * } */ } else if (S_ISREG(mode)) { if (sbi->options->sparse) { /* Sparsed regular file, cause option 'sparse'. */ fa = FILE_ATTRIBUTE_SPARSE_FILE | FILE_ATTRIBUTE_ARCHIVE; } else if (dir_ni->std_fa & FILE_ATTRIBUTE_COMPRESSED) { /* Compressed regular file, if parent is compressed. */ fa = FILE_ATTRIBUTE_COMPRESSED | FILE_ATTRIBUTE_ARCHIVE; } else { /* Regular file, default attributes. */ fa = FILE_ATTRIBUTE_ARCHIVE; } } else { fa = FILE_ATTRIBUTE_ARCHIVE; } /* If option "hide_dot_files" then set hidden attribute for dot files. */ if (sbi->options->hide_dot_files && name->name[0] == '.') fa |= FILE_ATTRIBUTE_HIDDEN; if (!(mode & 0222)) fa |= FILE_ATTRIBUTE_READONLY; /* Allocate PATH_MAX bytes. */ new_de = __getname(); if (!new_de) { err = -ENOMEM; goto out1; } /* Mark rw ntfs as dirty. it will be cleared at umount. */ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); /* Step 1: allocate and fill new mft record. */ err = ntfs_look_free_mft(sbi, &ino, false, NULL, NULL); if (err) goto out2; ni = ntfs_new_inode(sbi, ino, S_ISDIR(mode) ? RECORD_FLAG_DIR : 0); if (IS_ERR(ni)) { err = PTR_ERR(ni); ni = NULL; goto out3; } inode = &ni->vfs_inode; inode_init_owner(idmap, inode, dir, mode); mode = inode->i_mode; ni->i_crtime = current_time(inode); rec = ni->mi.mrec; rec->hard_links = cpu_to_le16(1); attr = Add2Ptr(rec, le16_to_cpu(rec->attr_off)); /* Get default security id. */ sd = s_default_security; sd_size = sizeof(s_default_security); if (is_ntfs3(sbi)) { security_id = dir_ni->std_security_id; if (le32_to_cpu(security_id) < SECURITY_ID_FIRST) { security_id = sbi->security.def_security_id; if (security_id == SECURITY_ID_INVALID && !ntfs_insert_security(sbi, sd, sd_size, &security_id, NULL)) sbi->security.def_security_id = security_id; } } /* Insert standard info. */ std5 = Add2Ptr(attr, SIZEOF_RESIDENT); if (security_id == SECURITY_ID_INVALID) { dsize = sizeof(struct ATTR_STD_INFO); } else { dsize = sizeof(struct ATTR_STD_INFO5); std5->security_id = security_id; ni->std_security_id = security_id; } asize = SIZEOF_RESIDENT + dsize; attr->type = ATTR_STD; attr->size = cpu_to_le32(asize); attr->id = cpu_to_le16(aid++); attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(dsize); std5->cr_time = std5->m_time = std5->c_time = std5->a_time = kernel2nt(&ni->i_crtime); std5->fa = ni->std_fa = fa; attr = Add2Ptr(attr, asize); /* Insert file name. */ err = fill_name_de(sbi, new_de, name, uni); if (err) goto out4; mi_get_ref(&ni->mi, &new_de->ref); fname = (struct ATTR_FILE_NAME *)(new_de + 1); if (sbi->options->windows_names && !valid_windows_name(sbi, (struct le_str *)&fname->name_len)) { err = -EINVAL; goto out4; } mi_get_ref(&dir_ni->mi, &fname->home); fname->dup.cr_time = fname->dup.m_time = fname->dup.c_time = fname->dup.a_time = std5->cr_time; fname->dup.alloc_size = fname->dup.data_size = 0; fname->dup.fa = std5->fa; fname->dup.ea_size = fname->dup.reparse = 0; dsize = le16_to_cpu(new_de->key_size); asize = ALIGN(SIZEOF_RESIDENT + dsize, 8); attr->type = ATTR_NAME; attr->size = cpu_to_le32(asize); attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.flags = RESIDENT_FLAG_INDEXED; attr->id = cpu_to_le16(aid++); attr->res.data_size = cpu_to_le32(dsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), fname, dsize); attr = Add2Ptr(attr, asize); if (security_id == SECURITY_ID_INVALID) { /* Insert security attribute. */ asize = SIZEOF_RESIDENT + ALIGN(sd_size, 8); attr->type = ATTR_SECURE; attr->size = cpu_to_le32(asize); attr->id = cpu_to_le16(aid++); attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(sd_size); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), sd, sd_size); attr = Add2Ptr(attr, asize); } attr->id = cpu_to_le16(aid++); if (fa & FILE_ATTRIBUTE_DIRECTORY) { /* * Regular directory or symlink to directory. * Create root attribute. */ dsize = sizeof(struct INDEX_ROOT) + sizeof(struct NTFS_DE); asize = sizeof(I30_NAME) + SIZEOF_RESIDENT + dsize; attr->type = ATTR_ROOT; attr->size = cpu_to_le32(asize); attr->name_len = ARRAY_SIZE(I30_NAME); attr->name_off = SIZEOF_RESIDENT_LE; attr->res.data_off = cpu_to_le16(sizeof(I30_NAME) + SIZEOF_RESIDENT); attr->res.data_size = cpu_to_le32(dsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), I30_NAME, sizeof(I30_NAME)); root = Add2Ptr(attr, sizeof(I30_NAME) + SIZEOF_RESIDENT); memcpy(root, dir_root, offsetof(struct INDEX_ROOT, ihdr)); root->ihdr.de_off = cpu_to_le32(sizeof(struct INDEX_HDR)); root->ihdr.used = cpu_to_le32(sizeof(struct INDEX_HDR) + sizeof(struct NTFS_DE)); root->ihdr.total = root->ihdr.used; e = Add2Ptr(root, sizeof(struct INDEX_ROOT)); e->size = cpu_to_le16(sizeof(struct NTFS_DE)); e->flags = NTFS_IE_LAST; } else if (S_ISLNK(mode)) { /* * Symlink to file. * Create empty resident data attribute. */ asize = SIZEOF_RESIDENT; /* Insert empty ATTR_DATA */ attr->type = ATTR_DATA; attr->size = cpu_to_le32(SIZEOF_RESIDENT); attr->name_off = SIZEOF_RESIDENT_LE; attr->res.data_off = SIZEOF_RESIDENT_LE; } else if (S_ISREG(mode)) { /* * Regular file. Create empty non resident data attribute. */ attr->type = ATTR_DATA; attr->non_res = 1; attr->nres.evcn = cpu_to_le64(-1ll); if (fa & FILE_ATTRIBUTE_SPARSE_FILE) { attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8); attr->name_off = SIZEOF_NONRESIDENT_EX_LE; attr->flags = ATTR_FLAG_SPARSED; asize = SIZEOF_NONRESIDENT_EX + 8; } else if (fa & FILE_ATTRIBUTE_COMPRESSED) { attr->size = cpu_to_le32(SIZEOF_NONRESIDENT_EX + 8); attr->name_off = SIZEOF_NONRESIDENT_EX_LE; attr->flags = ATTR_FLAG_COMPRESSED; attr->nres.c_unit = COMPRESSION_UNIT; asize = SIZEOF_NONRESIDENT_EX + 8; } else { attr->size = cpu_to_le32(SIZEOF_NONRESIDENT + 8); attr->name_off = SIZEOF_NONRESIDENT_LE; asize = SIZEOF_NONRESIDENT + 8; } attr->nres.run_off = attr->name_off; } else { /* * Node. Create empty resident data attribute. */ attr->type = ATTR_DATA; attr->size = cpu_to_le32(SIZEOF_RESIDENT); attr->name_off = SIZEOF_RESIDENT_LE; if (fa & FILE_ATTRIBUTE_SPARSE_FILE) attr->flags = ATTR_FLAG_SPARSED; else if (fa & FILE_ATTRIBUTE_COMPRESSED) attr->flags = ATTR_FLAG_COMPRESSED; attr->res.data_off = SIZEOF_RESIDENT_LE; asize = SIZEOF_RESIDENT; ni->ni_flags |= NI_FLAG_RESIDENT; } if (S_ISDIR(mode)) { ni->ni_flags |= NI_FLAG_DIR; err = indx_init(&ni->dir, sbi, attr, INDEX_MUTEX_I30); if (err) goto out4; } else if (S_ISLNK(mode)) { rp = ntfs_create_reparse_buffer(sbi, symname, size, &nsize); if (IS_ERR(rp)) { err = PTR_ERR(rp); rp = NULL; goto out4; } /* * Insert ATTR_REPARSE. */ attr = Add2Ptr(attr, asize); attr->type = ATTR_REPARSE; attr->id = cpu_to_le16(aid++); /* Resident or non resident? */ asize = ALIGN(SIZEOF_RESIDENT + nsize, 8); t16 = PtrOffset(rec, attr); /* * Below function 'ntfs_save_wsl_perm' requires 0x78 bytes. * It is good idea to keep extened attributes resident. */ if (asize + t16 + 0x78 + 8 > sbi->record_size) { CLST alen; CLST clst = bytes_to_cluster(sbi, nsize); /* Bytes per runs. */ t16 = sbi->record_size - t16 - SIZEOF_NONRESIDENT; attr->non_res = 1; attr->nres.evcn = cpu_to_le64(clst - 1); attr->name_off = SIZEOF_NONRESIDENT_LE; attr->nres.run_off = attr->name_off; attr->nres.data_size = cpu_to_le64(nsize); attr->nres.valid_size = attr->nres.data_size; attr->nres.alloc_size = cpu_to_le64(ntfs_up_cluster(sbi, nsize)); err = attr_allocate_clusters(sbi, &ni->file.run, 0, 0, clst, NULL, ALLOCATE_DEF, &alen, 0, NULL, NULL); if (err) goto out5; err = run_pack(&ni->file.run, 0, clst, Add2Ptr(attr, SIZEOF_NONRESIDENT), t16, &vcn); if (err < 0) goto out5; if (vcn != clst) { err = -EINVAL; goto out5; } asize = SIZEOF_NONRESIDENT + ALIGN(err, 8); /* Write non resident data. */ err = ntfs_sb_write_run(sbi, &ni->file.run, 0, rp, nsize, 0); if (err) goto out5; } else { attr->res.data_off = SIZEOF_RESIDENT_LE; attr->res.data_size = cpu_to_le32(nsize); memcpy(Add2Ptr(attr, SIZEOF_RESIDENT), rp, nsize); } /* Size of symlink equals the length of input string. */ inode->i_size = size; attr->size = cpu_to_le32(asize); err = ntfs_insert_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); if (err) goto out5; rp_inserted = true; } attr = Add2Ptr(attr, asize); attr->type = ATTR_END; rec->used = cpu_to_le32(PtrOffset(rec, attr) + 8); rec->next_attr_id = cpu_to_le16(aid); inode->i_generation = le16_to_cpu(rec->seq); if (S_ISDIR(mode)) { inode->i_op = &ntfs_dir_inode_operations; inode->i_fop = &ntfs_dir_operations; } else if (S_ISLNK(mode)) { inode->i_op = &ntfs_link_inode_operations; inode->i_fop = NULL; inode->i_mapping->a_ops = &ntfs_aops; inode->i_size = size; inode_nohighmem(inode); } else if (S_ISREG(mode)) { inode->i_op = &ntfs_file_inode_operations; inode->i_fop = &ntfs_file_operations; inode->i_mapping->a_ops = is_compressed(ni) ? &ntfs_aops_cmpr : &ntfs_aops; init_rwsem(&ni->file.run_lock); } else { inode->i_op = &ntfs_special_inode_operations; init_special_inode(inode, mode, dev); } #ifdef CONFIG_NTFS3_FS_POSIX_ACL if (!S_ISLNK(mode) && (sb->s_flags & SB_POSIXACL)) { err = ntfs_init_acl(idmap, inode, dir); if (err) goto out5; } else #endif { inode->i_flags |= S_NOSEC; } /* * ntfs_init_acl and ntfs_save_wsl_perm update extended attribute. * The packed size of extended attribute is stored in direntry too. * 'fname' here points to inside new_de. */ ntfs_save_wsl_perm(inode, &fname->dup.ea_size); /* * update ea_size in file_name attribute too. * Use ni_find_attr cause layout of MFT record may be changed * in ntfs_init_acl and ntfs_save_wsl_perm. */ attr = ni_find_attr(ni, NULL, NULL, ATTR_NAME, NULL, 0, NULL, NULL); if (attr) { struct ATTR_FILE_NAME *fn; fn = resident_data_ex(attr, SIZEOF_ATTRIBUTE_FILENAME); if (fn) fn->dup.ea_size = fname->dup.ea_size; } /* We do not need to update parent directory later */ ni->ni_flags &= ~NI_FLAG_UPDATE_PARENT; /* Step 2: Add new name in index. */ err = indx_insert_entry(&dir_ni->dir, dir_ni, new_de, sbi, fnd, 0); if (err) goto out6; /* * Call 'd_instantiate' after inode->i_op is set * but before finish_open. */ d_instantiate(dentry, inode); /* Set original time. inode times (i_ctime) may be changed in ntfs_init_acl. */ inode_set_atime_to_ts(inode, ni->i_crtime); inode_set_ctime_to_ts(inode, ni->i_crtime); inode_set_mtime_to_ts(inode, ni->i_crtime); inode_set_mtime_to_ts(dir, ni->i_crtime); inode_set_ctime_to_ts(dir, ni->i_crtime); mark_inode_dirty(dir); mark_inode_dirty(inode); /* Normal exit. */ goto out2; out6: if (rp_inserted) ntfs_remove_reparse(sbi, IO_REPARSE_TAG_SYMLINK, &new_de->ref); out5: if (!S_ISDIR(mode)) run_deallocate(sbi, &ni->file.run, false); out4: clear_rec_inuse(rec); clear_nlink(inode); ni->mi.dirty = false; discard_new_inode(inode); out3: ntfs_mark_rec_free(sbi, ino, false); out2: __putname(new_de); kfree(rp); out1: if (!fnd) ni_unlock(dir_ni); if (err) return ERR_PTR(err); unlock_new_inode(inode); return inode; } int ntfs_link_inode(struct inode *inode, struct dentry *dentry) { int err; struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info; struct NTFS_DE *de; /* Allocate PATH_MAX bytes. */ de = __getname(); if (!de) return -ENOMEM; /* Mark rw ntfs as dirty. It will be cleared at umount. */ ntfs_set_state(sbi, NTFS_DIRTY_DIRTY); /* Construct 'de'. */ err = fill_name_de(sbi, de, &dentry->d_name, NULL); if (err) goto out; err = ni_add_name(ntfs_i(d_inode(dentry->d_parent)), ni, de); out: __putname(de); return err; } /* * ntfs_unlink_inode * * inode_operations::unlink * inode_operations::rmdir */ int ntfs_unlink_inode(struct inode *dir, const struct dentry *dentry) { int err; struct ntfs_sb_info *sbi = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); struct ntfs_inode *ni = ntfs_i(inode); struct ntfs_inode *dir_ni = ntfs_i(dir); struct NTFS_DE *de, *de2 = NULL; int undo_remove; if (ntfs_is_meta_file(sbi, ni->mi.rno)) return -EINVAL; /* Allocate PATH_MAX bytes. */ de = __getname(); if (!de) return -ENOMEM; ni_lock(ni); if (S_ISDIR(inode->i_mode) && !dir_is_empty(inode)) { err = -ENOTEMPTY; goto out; } err = fill_name_de(sbi, de, &dentry->d_name, NULL); if (err < 0) goto out; undo_remove = 0; err = ni_remove_name(dir_ni, ni, de, &de2, &undo_remove); if (!err) { drop_nlink(inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); if (inode->i_nlink) mark_inode_dirty(inode); } else if (!ni_remove_name_undo(dir_ni, ni, de, de2, undo_remove)) { _ntfs_bad_inode(inode); } else { if (ni_is_dirty(dir)) mark_inode_dirty(dir); if (ni_is_dirty(inode)) mark_inode_dirty(inode); } out: ni_unlock(ni); __putname(de); return err; } void ntfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); clear_inode(inode); ni_clear(ntfs_i(inode)); } /* * ntfs_translate_junction * * Translate a Windows junction target to the Linux equivalent. * On junctions, targets are always absolute (they include the drive * letter). We have no way of knowing if the target is for the current * mounted device or not so we just assume it is. */ static int ntfs_translate_junction(const struct super_block *sb, const struct dentry *link_de, char *target, int target_len, int target_max) { int tl_len, err = target_len; char *link_path_buffer = NULL, *link_path; char *translated = NULL; char *target_start; int copy_len; link_path_buffer = kmalloc(PATH_MAX, GFP_NOFS); if (!link_path_buffer) { err = -ENOMEM; goto out; } /* Get link path, relative to mount point */ link_path = dentry_path_raw(link_de, link_path_buffer, PATH_MAX); if (IS_ERR(link_path)) { ntfs_err(sb, "Error getting link path"); err = -EINVAL; goto out; } translated = kmalloc(PATH_MAX, GFP_NOFS); if (!translated) { err = -ENOMEM; goto out; } /* Make translated path a relative path to mount point */ strcpy(translated, "./"); ++link_path; /* Skip leading / */ for (tl_len = sizeof("./") - 1; *link_path; ++link_path) { if (*link_path == '/') { if (PATH_MAX - tl_len < sizeof("../")) { ntfs_err(sb, "Link path %s has too many components", link_path); err = -EINVAL; goto out; } strcpy(translated + tl_len, "../"); tl_len += sizeof("../") - 1; } } /* Skip drive letter */ target_start = target; while (*target_start && *target_start != ':') ++target_start; if (!*target_start) { ntfs_err(sb, "Link target (%s) missing drive separator", target); err = -EINVAL; goto out; } /* Skip drive separator and leading /, if exists */ target_start += 1 + (target_start[1] == '/'); copy_len = target_len - (target_start - target); if (PATH_MAX - tl_len <= copy_len) { ntfs_err(sb, "Link target %s too large for buffer (%d <= %d)", target_start, PATH_MAX - tl_len, copy_len); err = -EINVAL; goto out; } /* translated path has a trailing / and target_start does not */ strcpy(translated + tl_len, target_start); tl_len += copy_len; if (target_max <= tl_len) { ntfs_err(sb, "Target path %s too large for buffer (%d <= %d)", translated, target_max, tl_len); err = -EINVAL; goto out; } strcpy(target, translated); err = tl_len; out: kfree(link_path_buffer); kfree(translated); return err; } static noinline int ntfs_readlink_hlp(const struct dentry *link_de, struct inode *inode, char *buffer, int buflen) { int i, err = -EINVAL; struct ntfs_inode *ni = ntfs_i(inode); struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; u64 size; u16 ulen = 0; void *to_free = NULL; struct REPARSE_DATA_BUFFER *rp; const __le16 *uname; struct ATTRIB *attr; /* Reparse data present. Try to parse it. */ static_assert(!offsetof(struct REPARSE_DATA_BUFFER, ReparseTag)); static_assert(sizeof(u32) == sizeof(rp->ReparseTag)); *buffer = 0; attr = ni_find_attr(ni, NULL, NULL, ATTR_REPARSE, NULL, 0, NULL, NULL); if (!attr) goto out; if (!attr->non_res) { rp = resident_data_ex(attr, sizeof(struct REPARSE_DATA_BUFFER)); if (!rp) goto out; size = le32_to_cpu(attr->res.data_size); } else { size = le64_to_cpu(attr->nres.data_size); rp = NULL; } if (size > sbi->reparse.max_size || size <= sizeof(u32)) goto out; if (!rp) { rp = kmalloc(size, GFP_NOFS); if (!rp) { err = -ENOMEM; goto out; } to_free = rp; /* Read into temporal buffer. */ err = ntfs_read_run_nb(sbi, &ni->file.run, 0, rp, size, NULL); if (err) goto out; } /* Microsoft Tag. */ switch (rp->ReparseTag) { case IO_REPARSE_TAG_MOUNT_POINT: /* Mount points and junctions. */ /* Can we use 'Rp->MountPointReparseBuffer.PrintNameLength'? */ if (size <= offsetof(struct REPARSE_DATA_BUFFER, MountPointReparseBuffer.PathBuffer)) goto out; uname = Add2Ptr(rp, offsetof(struct REPARSE_DATA_BUFFER, MountPointReparseBuffer.PathBuffer) + le16_to_cpu(rp->MountPointReparseBuffer .PrintNameOffset)); ulen = le16_to_cpu(rp->MountPointReparseBuffer.PrintNameLength); break; case IO_REPARSE_TAG_SYMLINK: /* FolderSymbolicLink */ /* Can we use 'Rp->SymbolicLinkReparseBuffer.PrintNameLength'? */ if (size <= offsetof(struct REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer.PathBuffer)) goto out; uname = Add2Ptr( rp, offsetof(struct REPARSE_DATA_BUFFER, SymbolicLinkReparseBuffer.PathBuffer) + le16_to_cpu(rp->SymbolicLinkReparseBuffer .PrintNameOffset)); ulen = le16_to_cpu( rp->SymbolicLinkReparseBuffer.PrintNameLength); break; case IO_REPARSE_TAG_CLOUD: case IO_REPARSE_TAG_CLOUD_1: case IO_REPARSE_TAG_CLOUD_2: case IO_REPARSE_TAG_CLOUD_3: case IO_REPARSE_TAG_CLOUD_4: case IO_REPARSE_TAG_CLOUD_5: case IO_REPARSE_TAG_CLOUD_6: case IO_REPARSE_TAG_CLOUD_7: case IO_REPARSE_TAG_CLOUD_8: case IO_REPARSE_TAG_CLOUD_9: case IO_REPARSE_TAG_CLOUD_A: case IO_REPARSE_TAG_CLOUD_B: case IO_REPARSE_TAG_CLOUD_C: case IO_REPARSE_TAG_CLOUD_D: case IO_REPARSE_TAG_CLOUD_E: case IO_REPARSE_TAG_CLOUD_F: err = sizeof("OneDrive") - 1; if (err > buflen) err = buflen; memcpy(buffer, "OneDrive", err); goto out; default: if (IsReparseTagMicrosoft(rp->ReparseTag)) { /* Unknown Microsoft Tag. */ goto out; } if (!IsReparseTagNameSurrogate(rp->ReparseTag) || size <= sizeof(struct REPARSE_POINT)) { goto out; } /* Users tag. */ uname = Add2Ptr(rp, sizeof(struct REPARSE_POINT)); ulen = le16_to_cpu(rp->ReparseDataLength) - sizeof(struct REPARSE_POINT); } /* Convert nlen from bytes to UNICODE chars. */ ulen >>= 1; /* Check that name is available. */ if (!ulen || uname + ulen > (__le16 *)Add2Ptr(rp, size)) goto out; /* If name is already zero terminated then truncate it now. */ if (!uname[ulen - 1]) ulen -= 1; err = ntfs_utf16_to_nls(sbi, uname, ulen, buffer, buflen); if (err < 0) goto out; /* Translate Windows '\' into Linux '/'. */ for (i = 0; i < err; i++) { if (buffer[i] == '\\') buffer[i] = '/'; } /* Always set last zero. */ buffer[err] = 0; /* If this is a junction, translate the link target. */ if (rp->ReparseTag == IO_REPARSE_TAG_MOUNT_POINT) err = ntfs_translate_junction(sb, link_de, buffer, err, buflen); out: kfree(to_free); return err; } static const char *ntfs_get_link(struct dentry *de, struct inode *inode, struct delayed_call *done) { int err; char *ret; if (!de) return ERR_PTR(-ECHILD); ret = kmalloc(PAGE_SIZE, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); err = ntfs_readlink_hlp(de, inode, ret, PAGE_SIZE); if (err < 0) { kfree(ret); return ERR_PTR(err); } set_delayed_call(done, kfree_link, ret); return ret; } // clang-format off const struct inode_operations ntfs_link_inode_operations = { .get_link = ntfs_get_link, .setattr = ntfs3_setattr, .listxattr = ntfs_listxattr, }; const struct address_space_operations ntfs_aops = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, .writepages = ntfs_writepages, .write_begin = ntfs_write_begin, .write_end = ntfs_write_end, .direct_IO = ntfs_direct_IO, .bmap = ntfs_bmap, .dirty_folio = block_dirty_folio, .migrate_folio = buffer_migrate_folio, .invalidate_folio = block_invalidate_folio, }; const struct address_space_operations ntfs_aops_cmpr = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, }; // clang-format on |
1 2 1 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 | // SPDX-License-Identifier: GPL-2.0-only /* * QNX4 file system, Linux implementation. * * Version : 0.2.1 * * Using parts of the xiafs filesystem. * * History : * * 01-06-1998 by Richard Frowijn : first release. * 20-06-1998 by Frank Denis : Linux 2.1.99+ support, boot signature, misc. * 30-06-1998 by Frank Denis : first step to write inodes. */ #include <linux/module.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/statfs.h> #include "qnx4.h" #define QNX4_VERSION 4 #define QNX4_BMNAME ".bitmap" static const struct super_operations qnx4_sops; static struct inode *qnx4_alloc_inode(struct super_block *sb); static void qnx4_free_inode(struct inode *inode); static int qnx4_remount(struct super_block *sb, int *flags, char *data); static int qnx4_statfs(struct dentry *, struct kstatfs *); static const struct super_operations qnx4_sops = { .alloc_inode = qnx4_alloc_inode, .free_inode = qnx4_free_inode, .statfs = qnx4_statfs, .remount_fs = qnx4_remount, }; static int qnx4_remount(struct super_block *sb, int *flags, char *data) { struct qnx4_sb_info *qs; sync_filesystem(sb); qs = qnx4_sb(sb); qs->Version = QNX4_VERSION; *flags |= SB_RDONLY; return 0; } static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_head *bh, int create ) { unsigned long phys; QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); phys = qnx4_block_map( inode, iblock ); if ( phys ) { // logical block is before EOF map_bh(bh, inode->i_sb, phys); } return 0; } static inline u32 try_extent(qnx4_xtnt_t *extent, u32 *offset) { u32 size = le32_to_cpu(extent->xtnt_size); if (*offset < size) return le32_to_cpu(extent->xtnt_blk) + *offset - 1; *offset -= size; return 0; } unsigned long qnx4_block_map( struct inode *inode, long iblock ) { int ix; long i_xblk; struct buffer_head *bh = NULL; struct qnx4_xblk *xblk = NULL; struct qnx4_inode_entry *qnx4_inode = qnx4_raw_inode(inode); u16 nxtnt = le16_to_cpu(qnx4_inode->di_num_xtnts); u32 offset = iblock; u32 block = try_extent(&qnx4_inode->di_first_xtnt, &offset); if (block) { // iblock is in the first extent. This is easy. } else { // iblock is beyond first extent. We have to follow the extent chain. i_xblk = le32_to_cpu(qnx4_inode->di_xblk); ix = 0; while ( --nxtnt > 0 ) { if ( ix == 0 ) { // read next xtnt block. bh = sb_bread(inode->i_sb, i_xblk - 1); if ( !bh ) { QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); return -EIO; } xblk = (struct qnx4_xblk*)bh->b_data; if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); return -EIO; } } block = try_extent(&xblk->xblk_xtnts[ix], &offset); if (block) { // got it! break; } if ( ++ix >= xblk->xblk_num_xtnts ) { i_xblk = le32_to_cpu(xblk->xblk_next_xblk); ix = 0; brelse( bh ); bh = NULL; } } if ( bh ) brelse( bh ); } QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); return block; } static int qnx4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); buf->f_type = sb->s_magic; buf->f_bsize = sb->s_blocksize; buf->f_blocks = le32_to_cpu(qnx4_sb(sb)->BitMap->di_size) * 8; buf->f_bfree = qnx4_count_free_blocks(sb); buf->f_bavail = buf->f_bfree; buf->f_namelen = QNX4_NAME_MAX; buf->f_fsid = u64_to_fsid(id); return 0; } /* * Check the root directory of the filesystem to make sure * it really _is_ a qnx4 filesystem, and to check the size * of the directory entry. */ static const char *qnx4_checkroot(struct super_block *sb, struct qnx4_super_block *s) { struct buffer_head *bh; struct qnx4_inode_entry *rootdir; int rd, rl; int i, j; if (s->RootDir.di_fname[0] != '/' || s->RootDir.di_fname[1] != '\0') return "no qnx4 filesystem (no root dir)."; QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id)); rd = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_blk) - 1; rl = le32_to_cpu(s->RootDir.di_first_xtnt.xtnt_size); for (j = 0; j < rl; j++) { bh = sb_bread(sb, rd + j); /* root dir, first block */ if (bh == NULL) return "unable to read root entry."; rootdir = (struct qnx4_inode_entry *) bh->b_data; for (i = 0; i < QNX4_INODES_PER_BLOCK; i++, rootdir++) { QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname)); if (strcmp(rootdir->di_fname, QNX4_BMNAME) != 0) continue; qnx4_sb(sb)->BitMap = kmemdup(rootdir, sizeof(struct qnx4_inode_entry), GFP_KERNEL); brelse(bh); if (!qnx4_sb(sb)->BitMap) return "not enough memory for bitmap inode"; /* keep bitmap inode known */ return NULL; } brelse(bh); } return "bitmap file not found."; } static int qnx4_fill_super(struct super_block *s, void *data, int silent) { struct buffer_head *bh; struct inode *root; const char *errmsg; struct qnx4_sb_info *qs; qs = kzalloc(sizeof(struct qnx4_sb_info), GFP_KERNEL); if (!qs) return -ENOMEM; s->s_fs_info = qs; sb_set_blocksize(s, QNX4_BLOCK_SIZE); s->s_op = &qnx4_sops; s->s_magic = QNX4_SUPER_MAGIC; s->s_flags |= SB_RDONLY; /* Yup, read-only yet */ s->s_time_min = 0; s->s_time_max = U32_MAX; /* Check the superblock signature. Since the qnx4 code is dangerous, we should leave as quickly as possible if we don't belong here... */ bh = sb_bread(s, 1); if (!bh) { printk(KERN_ERR "qnx4: unable to read the superblock\n"); return -EINVAL; } /* check before allocating dentries, inodes, .. */ errmsg = qnx4_checkroot(s, (struct qnx4_super_block *) bh->b_data); brelse(bh); if (errmsg != NULL) { if (!silent) printk(KERN_ERR "qnx4: %s\n", errmsg); return -EINVAL; } /* does root not have inode number QNX4_ROOT_INO ?? */ root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); if (IS_ERR(root)) { printk(KERN_ERR "qnx4: get inode failed\n"); return PTR_ERR(root); } s->s_root = d_make_root(root); if (s->s_root == NULL) return -ENOMEM; return 0; } static void qnx4_kill_sb(struct super_block *sb) { struct qnx4_sb_info *qs = qnx4_sb(sb); kill_block_super(sb); if (qs) { kfree(qs->BitMap); kfree(qs); } } static int qnx4_read_folio(struct file *file, struct folio *folio) { return block_read_full_folio(folio, qnx4_get_block); } static sector_t qnx4_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,qnx4_get_block); } static const struct address_space_operations qnx4_aops = { .read_folio = qnx4_read_folio, .bmap = qnx4_bmap }; struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) { struct buffer_head *bh; struct qnx4_inode_entry *raw_inode; int block; struct qnx4_inode_entry *qnx4_inode; struct inode *inode; inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) return inode; qnx4_inode = qnx4_raw_inode(inode); inode->i_mode = 0; QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino)); if (!ino) { printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " "out of range\n", sb->s_id, ino); iget_failed(inode); return ERR_PTR(-EIO); } block = ino / QNX4_INODES_PER_BLOCK; if (!(bh = sb_bread(sb, block))) { printk(KERN_ERR "qnx4: major problem: unable to read inode from dev " "%s\n", sb->s_id); iget_failed(inode); return ERR_PTR(-EIO); } raw_inode = ((struct qnx4_inode_entry *) bh->b_data) + (ino % QNX4_INODES_PER_BLOCK); inode->i_mode = le16_to_cpu(raw_inode->di_mode); i_uid_write(inode, (uid_t)le16_to_cpu(raw_inode->di_uid)); i_gid_write(inode, (gid_t)le16_to_cpu(raw_inode->di_gid)); set_nlink(inode, le16_to_cpu(raw_inode->di_nlink)); inode->i_size = le32_to_cpu(raw_inode->di_size); inode_set_mtime(inode, le32_to_cpu(raw_inode->di_mtime), 0); inode_set_atime(inode, le32_to_cpu(raw_inode->di_atime), 0); inode_set_ctime(inode, le32_to_cpu(raw_inode->di_ctime), 0); inode->i_blocks = le32_to_cpu(raw_inode->di_first_xtnt.xtnt_size); memcpy(qnx4_inode, raw_inode, QNX4_DIR_ENTRY_SIZE); if (S_ISREG(inode->i_mode)) { inode->i_fop = &generic_ro_fops; inode->i_mapping->a_ops = &qnx4_aops; qnx4_i(inode)->mmu_private = inode->i_size; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &qnx4_dir_inode_operations; inode->i_fop = &qnx4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx4_aops; qnx4_i(inode)->mmu_private = inode->i_size; } else { printk(KERN_ERR "qnx4: bad inode %lu on dev %s\n", ino, sb->s_id); iget_failed(inode); brelse(bh); return ERR_PTR(-EIO); } brelse(bh); unlock_new_inode(inode); return inode; } static struct kmem_cache *qnx4_inode_cachep; static struct inode *qnx4_alloc_inode(struct super_block *sb) { struct qnx4_inode_info *ei; ei = alloc_inode_sb(sb, qnx4_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void qnx4_free_inode(struct inode *inode) { kmem_cache_free(qnx4_inode_cachep, qnx4_i(inode)); } static void init_once(void *foo) { struct qnx4_inode_info *ei = (struct qnx4_inode_info *) foo; inode_init_once(&ei->vfs_inode); } static int init_inodecache(void) { qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; return 0; } static void destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(qnx4_inode_cachep); } static struct dentry *qnx4_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { return mount_bdev(fs_type, flags, dev_name, data, qnx4_fill_super); } static struct file_system_type qnx4_fs_type = { .owner = THIS_MODULE, .name = "qnx4", .mount = qnx4_mount, .kill_sb = qnx4_kill_sb, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("qnx4"); static int __init init_qnx4_fs(void) { int err; err = init_inodecache(); if (err) return err; err = register_filesystem(&qnx4_fs_type); if (err) { destroy_inodecache(); return err; } printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n"); return 0; } static void __exit exit_qnx4_fs(void) { unregister_filesystem(&qnx4_fs_type); destroy_inodecache(); } module_init(init_qnx4_fs) module_exit(exit_qnx4_fs) MODULE_LICENSE("GPL"); |
2 1 1 148 145 146 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 | // SPDX-License-Identifier: GPL-2.0-only /* * Landlock LSM - Ptrace hooks * * Copyright © 2017-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI */ #include <asm/current.h> #include <linux/cred.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/rcupdate.h> #include <linux/sched.h> #include "common.h" #include "cred.h" #include "ptrace.h" #include "ruleset.h" #include "setup.h" /** * domain_scope_le - Checks domain ordering for scoped ptrace * * @parent: Parent domain. * @child: Potential child of @parent. * * Checks if the @parent domain is less or equal to (i.e. an ancestor, which * means a subset of) the @child domain. */ static bool domain_scope_le(const struct landlock_ruleset *const parent, const struct landlock_ruleset *const child) { const struct landlock_hierarchy *walker; if (!parent) return true; if (!child) return false; for (walker = child->hierarchy; walker; walker = walker->parent) { if (walker == parent->hierarchy) /* @parent is in the scoped hierarchy of @child. */ return true; } /* There is no relationship between @parent and @child. */ return false; } static bool task_is_scoped(const struct task_struct *const parent, const struct task_struct *const child) { bool is_scoped; const struct landlock_ruleset *dom_parent, *dom_child; rcu_read_lock(); dom_parent = landlock_get_task_domain(parent); dom_child = landlock_get_task_domain(child); is_scoped = domain_scope_le(dom_parent, dom_child); rcu_read_unlock(); return is_scoped; } static int task_ptrace(const struct task_struct *const parent, const struct task_struct *const child) { /* Quick return for non-landlocked tasks. */ if (!landlocked(parent)) return 0; if (task_is_scoped(parent, child)) return 0; return -EPERM; } /** * hook_ptrace_access_check - Determines whether the current process may access * another * * @child: Process to be accessed. * @mode: Mode of attachment. * * If the current task has Landlock rules, then the child must have at least * the same rules. Else denied. * * Determines whether a process may access another, returning 0 if permission * granted, -errno if denied. */ static int hook_ptrace_access_check(struct task_struct *const child, const unsigned int mode) { return task_ptrace(current, child); } /** * hook_ptrace_traceme - Determines whether another process may trace the * current one * * @parent: Task proposed to be the tracer. * * If the parent has Landlock rules, then the current task must have the same * or more rules. Else denied. * * Determines whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -errno if denied. */ static int hook_ptrace_traceme(struct task_struct *const parent) { return task_ptrace(parent, current); } static struct security_hook_list landlock_hooks[] __ro_after_init = { LSM_HOOK_INIT(ptrace_access_check, hook_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, hook_ptrace_traceme), }; __init void landlock_add_ptrace_hooks(void) { security_add_hooks(landlock_hooks, ARRAY_SIZE(landlock_hooks), LANDLOCK_NAME); } |
36 29 4 4 35 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) by Jaroslav Kysela <perex@perex.cz> * Takashi Iwai <tiwai@suse.de> * * Generic memory allocators */ #include <linux/slab.h> #include <linux/mm.h> #include <linux/dma-mapping.h> #include <linux/dma-map-ops.h> #include <linux/genalloc.h> #include <linux/highmem.h> #include <linux/vmalloc.h> #ifdef CONFIG_X86 #include <asm/set_memory.h> #endif #include <sound/memalloc.h> #include "memalloc_local.h" #define DEFAULT_GFP \ (GFP_KERNEL | \ __GFP_RETRY_MAYFAIL | /* don't trigger OOM-killer */ \ __GFP_NOWARN) /* no stack trace print - this call is non-critical */ static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab); #ifdef CONFIG_SND_DMA_SGBUF static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size); #endif static void *__snd_dma_alloc_pages(struct snd_dma_buffer *dmab, size_t size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (WARN_ON_ONCE(!ops || !ops->alloc)) return NULL; return ops->alloc(dmab, size); } /** * snd_dma_alloc_dir_pages - allocate the buffer area according to the given * type and direction * @type: the DMA buffer type * @device: the device pointer * @dir: DMA direction * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_dir_pages(int type, struct device *device, enum dma_data_direction dir, size_t size, struct snd_dma_buffer *dmab) { if (WARN_ON(!size)) return -ENXIO; if (WARN_ON(!dmab)) return -ENXIO; size = PAGE_ALIGN(size); dmab->dev.type = type; dmab->dev.dev = device; dmab->dev.dir = dir; dmab->bytes = 0; dmab->addr = 0; dmab->private_data = NULL; dmab->area = __snd_dma_alloc_pages(dmab, size); if (!dmab->area) return -ENOMEM; dmab->bytes = size; return 0; } EXPORT_SYMBOL(snd_dma_alloc_dir_pages); /** * snd_dma_alloc_pages_fallback - allocate the buffer area according to the given type with fallback * @type: the DMA buffer type * @device: the device pointer * @size: the buffer size to allocate * @dmab: buffer allocation record to store the allocated data * * Calls the memory-allocator function for the corresponding * buffer type. When no space is left, this function reduces the size and * tries to allocate again. The size actually allocated is stored in * res_size argument. * * Return: Zero if the buffer with the given size is allocated successfully, * otherwise a negative value on error. */ int snd_dma_alloc_pages_fallback(int type, struct device *device, size_t size, struct snd_dma_buffer *dmab) { int err; while ((err = snd_dma_alloc_pages(type, device, size, dmab)) < 0) { if (err != -ENOMEM) return err; if (size <= PAGE_SIZE) return -ENOMEM; size >>= 1; size = PAGE_SIZE << get_order(size); } if (! dmab->area) return -ENOMEM; return 0; } EXPORT_SYMBOL(snd_dma_alloc_pages_fallback); /** * snd_dma_free_pages - release the allocated buffer * @dmab: the buffer allocation record to release * * Releases the allocated buffer via snd_dma_alloc_pages(). */ void snd_dma_free_pages(struct snd_dma_buffer *dmab) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->free) ops->free(dmab); } EXPORT_SYMBOL(snd_dma_free_pages); /* called by devres */ static void __snd_release_pages(struct device *dev, void *res) { snd_dma_free_pages(res); } /** * snd_devm_alloc_dir_pages - allocate the buffer and manage with devres * @dev: the device pointer * @type: the DMA buffer type * @dir: DMA direction * @size: the buffer size to allocate * * Allocate buffer pages depending on the given type and manage using devres. * The pages will be released automatically at the device removal. * * Unlike snd_dma_alloc_pages(), this function requires the real device pointer, * hence it can't work with SNDRV_DMA_TYPE_CONTINUOUS or * SNDRV_DMA_TYPE_VMALLOC type. * * Return: the snd_dma_buffer object at success, or NULL if failed */ struct snd_dma_buffer * snd_devm_alloc_dir_pages(struct device *dev, int type, enum dma_data_direction dir, size_t size) { struct snd_dma_buffer *dmab; int err; if (WARN_ON(type == SNDRV_DMA_TYPE_CONTINUOUS || type == SNDRV_DMA_TYPE_VMALLOC)) return NULL; dmab = devres_alloc(__snd_release_pages, sizeof(*dmab), GFP_KERNEL); if (!dmab) return NULL; err = snd_dma_alloc_dir_pages(type, dev, dir, size, dmab); if (err < 0) { devres_free(dmab); return NULL; } devres_add(dev, dmab); return dmab; } EXPORT_SYMBOL_GPL(snd_devm_alloc_dir_pages); /** * snd_dma_buffer_mmap - perform mmap of the given DMA buffer * @dmab: buffer allocation information * @area: VM area information * * Return: zero if successful, or a negative error code */ int snd_dma_buffer_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { const struct snd_malloc_ops *ops; if (!dmab) return -ENOENT; ops = snd_dma_get_ops(dmab); if (ops && ops->mmap) return ops->mmap(dmab, area); else return -ENOENT; } EXPORT_SYMBOL(snd_dma_buffer_mmap); #ifdef CONFIG_HAS_DMA /** * snd_dma_buffer_sync - sync DMA buffer between CPU and device * @dmab: buffer allocation information * @mode: sync mode */ void snd_dma_buffer_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { const struct snd_malloc_ops *ops; if (!dmab || !dmab->dev.need_sync) return; ops = snd_dma_get_ops(dmab); if (ops && ops->sync) ops->sync(dmab, mode); } EXPORT_SYMBOL_GPL(snd_dma_buffer_sync); #endif /* CONFIG_HAS_DMA */ /** * snd_sgbuf_get_addr - return the physical address at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the physical address */ dma_addr_t snd_sgbuf_get_addr(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_addr) return ops->get_addr(dmab, offset); else return dmab->addr + offset; } EXPORT_SYMBOL(snd_sgbuf_get_addr); /** * snd_sgbuf_get_page - return the physical page at the corresponding offset * @dmab: buffer allocation information * @offset: offset in the ring buffer * * Return: the page pointer */ struct page *snd_sgbuf_get_page(struct snd_dma_buffer *dmab, size_t offset) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_page) return ops->get_page(dmab, offset); else return virt_to_page(dmab->area + offset); } EXPORT_SYMBOL(snd_sgbuf_get_page); /** * snd_sgbuf_get_chunk_size - compute the max chunk size with continuous pages * on sg-buffer * @dmab: buffer allocation information * @ofs: offset in the ring buffer * @size: the requested size * * Return: the chunk size */ unsigned int snd_sgbuf_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { const struct snd_malloc_ops *ops = snd_dma_get_ops(dmab); if (ops && ops->get_chunk_size) return ops->get_chunk_size(dmab, ofs, size); else return size; } EXPORT_SYMBOL(snd_sgbuf_get_chunk_size); /* * Continuous pages allocator */ static void *do_alloc_pages(struct device *dev, size_t size, dma_addr_t *addr, bool wc) { void *p; gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; again: p = alloc_pages_exact(size, gfp); if (!p) return NULL; *addr = page_to_phys(virt_to_page(p)); if (!dev) return p; if ((*addr + size - 1) & ~dev->coherent_dma_mask) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && !(gfp & GFP_DMA32)) { gfp |= GFP_DMA32; goto again; } if (IS_ENABLED(CONFIG_ZONE_DMA) && !(gfp & GFP_DMA)) { gfp = (gfp & ~GFP_DMA32) | GFP_DMA; goto again; } } #ifdef CONFIG_X86 if (wc) set_memory_wc((unsigned long)(p), size >> PAGE_SHIFT); #endif return p; } static void do_free_pages(void *p, size_t size, bool wc) { #ifdef CONFIG_X86 if (wc) set_memory_wb((unsigned long)(p), size >> PAGE_SHIFT); #endif free_pages_exact(p, size); } static void *snd_dma_continuous_alloc(struct snd_dma_buffer *dmab, size_t size) { return do_alloc_pages(dmab->dev.dev, size, &dmab->addr, false); } static void snd_dma_continuous_free(struct snd_dma_buffer *dmab) { do_free_pages(dmab->area, dmab->bytes, false); } static int snd_dma_continuous_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_continuous_ops = { .alloc = snd_dma_continuous_alloc, .free = snd_dma_continuous_free, .mmap = snd_dma_continuous_mmap, }; /* * VMALLOC allocator */ static void *snd_dma_vmalloc_alloc(struct snd_dma_buffer *dmab, size_t size) { return vmalloc(size); } static void snd_dma_vmalloc_free(struct snd_dma_buffer *dmab) { vfree(dmab->area); } static int snd_dma_vmalloc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return remap_vmalloc_range(area, dmab->area, 0); } #define get_vmalloc_page_addr(dmab, offset) \ page_to_phys(vmalloc_to_page((dmab)->area + (offset))) static dma_addr_t snd_dma_vmalloc_get_addr(struct snd_dma_buffer *dmab, size_t offset) { return get_vmalloc_page_addr(dmab, offset) + offset % PAGE_SIZE; } static struct page *snd_dma_vmalloc_get_page(struct snd_dma_buffer *dmab, size_t offset) { return vmalloc_to_page(dmab->area + offset); } static unsigned int snd_dma_vmalloc_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ /* check page continuity */ addr = get_vmalloc_page_addr(dmab, start); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (get_vmalloc_page_addr(dmab, start) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_vmalloc_ops = { .alloc = snd_dma_vmalloc_alloc, .free = snd_dma_vmalloc_free, .mmap = snd_dma_vmalloc_mmap, .get_addr = snd_dma_vmalloc_get_addr, .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; #ifdef CONFIG_HAS_DMA /* * IRAM allocator */ #ifdef CONFIG_GENERIC_ALLOCATOR static void *snd_dma_iram_alloc(struct snd_dma_buffer *dmab, size_t size) { struct device *dev = dmab->dev.dev; struct gen_pool *pool; void *p; if (dev->of_node) { pool = of_gen_pool_get(dev->of_node, "iram", 0); /* Assign the pool into private_data field */ dmab->private_data = pool; p = gen_pool_dma_alloc_align(pool, size, &dmab->addr, PAGE_SIZE); if (p) return p; } /* Internal memory might have limited size and no enough space, * so if we fail to malloc, try to fetch memory traditionally. */ dmab->dev.type = SNDRV_DMA_TYPE_DEV; return __snd_dma_alloc_pages(dmab, size); } static void snd_dma_iram_free(struct snd_dma_buffer *dmab) { struct gen_pool *pool = dmab->private_data; if (pool && dmab->area) gen_pool_free(pool, (unsigned long)dmab->area, dmab->bytes); } static int snd_dma_iram_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return remap_pfn_range(area, area->vm_start, dmab->addr >> PAGE_SHIFT, area->vm_end - area->vm_start, area->vm_page_prot); } static const struct snd_malloc_ops snd_dma_iram_ops = { .alloc = snd_dma_iram_alloc, .free = snd_dma_iram_free, .mmap = snd_dma_iram_mmap, }; #endif /* CONFIG_GENERIC_ALLOCATOR */ /* * Coherent device pages allocator */ static void *snd_dma_dev_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_coherent(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_dev_free(struct snd_dma_buffer *dmab) { dma_free_coherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_dev_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_coherent(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } static const struct snd_malloc_ops snd_dma_dev_ops = { .alloc = snd_dma_dev_alloc, .free = snd_dma_dev_free, .mmap = snd_dma_dev_mmap, }; /* * Write-combined pages */ /* x86-specific allocations */ #ifdef CONFIG_SND_DMA_SGBUF static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { return do_alloc_pages(dmab->dev.dev, size, &dmab->addr, true); } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { do_free_pages(dmab->area, dmab->bytes, true); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return snd_dma_continuous_mmap(dmab, area); } #else static void *snd_dma_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { return dma_alloc_wc(dmab->dev.dev, size, &dmab->addr, DEFAULT_GFP); } static void snd_dma_wc_free(struct snd_dma_buffer *dmab) { dma_free_wc(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr); } static int snd_dma_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_wc(dmab->dev.dev, area, dmab->area, dmab->addr, dmab->bytes); } #endif /* CONFIG_SND_DMA_SGBUF */ static const struct snd_malloc_ops snd_dma_wc_ops = { .alloc = snd_dma_wc_alloc, .free = snd_dma_wc_free, .mmap = snd_dma_wc_mmap, }; /* * Non-contiguous pages allocator */ static void *snd_dma_noncontig_alloc(struct snd_dma_buffer *dmab, size_t size) { struct sg_table *sgt; void *p; #ifdef CONFIG_SND_DMA_SGBUF if (cpu_feature_enabled(X86_FEATURE_XENPV)) return snd_dma_sg_fallback_alloc(dmab, size); #endif sgt = dma_alloc_noncontiguous(dmab->dev.dev, size, dmab->dev.dir, DEFAULT_GFP, 0); #ifdef CONFIG_SND_DMA_SGBUF if (!sgt && !get_dma_ops(dmab->dev.dev)) return snd_dma_sg_fallback_alloc(dmab, size); #endif if (!sgt) return NULL; dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, sg_dma_address(sgt->sgl)); p = dma_vmap_noncontiguous(dmab->dev.dev, size, sgt); if (p) { dmab->private_data = sgt; /* store the first page address for convenience */ dmab->addr = snd_sgbuf_get_addr(dmab, 0); } else { dma_free_noncontiguous(dmab->dev.dev, size, sgt, dmab->dev.dir); } return p; } static void snd_dma_noncontig_free(struct snd_dma_buffer *dmab) { dma_vunmap_noncontiguous(dmab->dev.dev, dmab->area); dma_free_noncontiguous(dmab->dev.dev, dmab->bytes, dmab->private_data, dmab->dev.dir); } static int snd_dma_noncontig_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { return dma_mmap_noncontiguous(dmab->dev.dev, area, dmab->bytes, dmab->private_data); } static void snd_dma_noncontig_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir == DMA_TO_DEVICE) return; invalidate_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_cpu(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } else { if (dmab->dev.dir == DMA_FROM_DEVICE) return; flush_kernel_vmap_range(dmab->area, dmab->bytes); dma_sync_sgtable_for_device(dmab->dev.dev, dmab->private_data, dmab->dev.dir); } } static inline void snd_dma_noncontig_iter_set(struct snd_dma_buffer *dmab, struct sg_page_iter *piter, size_t offset) { struct sg_table *sgt = dmab->private_data; __sg_page_iter_start(piter, sgt->sgl, sgt->orig_nents, offset >> PAGE_SHIFT); } static dma_addr_t snd_dma_noncontig_get_addr(struct snd_dma_buffer *dmab, size_t offset) { struct sg_dma_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter.base, offset); __sg_page_iter_dma_next(&iter); return sg_page_iter_dma_address(&iter) + offset % PAGE_SIZE; } static struct page *snd_dma_noncontig_get_page(struct snd_dma_buffer *dmab, size_t offset) { struct sg_page_iter iter; snd_dma_noncontig_iter_set(dmab, &iter, offset); __sg_page_iter_next(&iter); return sg_page_iter_page(&iter); } static unsigned int snd_dma_noncontig_get_chunk_size(struct snd_dma_buffer *dmab, unsigned int ofs, unsigned int size) { struct sg_dma_page_iter iter; unsigned int start, end; unsigned long addr; start = ALIGN_DOWN(ofs, PAGE_SIZE); end = ofs + size - 1; /* the last byte address */ snd_dma_noncontig_iter_set(dmab, &iter.base, start); if (!__sg_page_iter_dma_next(&iter)) return 0; /* check page continuity */ addr = sg_page_iter_dma_address(&iter); for (;;) { start += PAGE_SIZE; if (start > end) break; addr += PAGE_SIZE; if (!__sg_page_iter_dma_next(&iter) || sg_page_iter_dma_address(&iter) != addr) return start - ofs; } /* ok, all on continuous pages */ return size; } static const struct snd_malloc_ops snd_dma_noncontig_ops = { .alloc = snd_dma_noncontig_alloc, .free = snd_dma_noncontig_free, .mmap = snd_dma_noncontig_mmap, .sync = snd_dma_noncontig_sync, .get_addr = snd_dma_noncontig_get_addr, .get_page = snd_dma_noncontig_get_page, .get_chunk_size = snd_dma_noncontig_get_chunk_size, }; /* x86-specific SG-buffer with WC pages */ #ifdef CONFIG_SND_DMA_SGBUF #define sg_wc_address(it) ((unsigned long)page_address(sg_page_iter_page(it))) static void *snd_dma_sg_wc_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p = snd_dma_noncontig_alloc(dmab, size); struct sg_table *sgt = dmab->private_data; struct sg_page_iter iter; if (!p) return NULL; if (dmab->dev.type != SNDRV_DMA_TYPE_DEV_WC_SG) return p; for_each_sgtable_page(sgt, &iter, 0) set_memory_wc(sg_wc_address(&iter), 1); return p; } static void snd_dma_sg_wc_free(struct snd_dma_buffer *dmab) { struct sg_table *sgt = dmab->private_data; struct sg_page_iter iter; for_each_sgtable_page(sgt, &iter, 0) set_memory_wb(sg_wc_address(&iter), 1); snd_dma_noncontig_free(dmab); } static int snd_dma_sg_wc_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return dma_mmap_noncontiguous(dmab->dev.dev, area, dmab->bytes, dmab->private_data); } static const struct snd_malloc_ops snd_dma_sg_wc_ops = { .alloc = snd_dma_sg_wc_alloc, .free = snd_dma_sg_wc_free, .mmap = snd_dma_sg_wc_mmap, .sync = snd_dma_noncontig_sync, .get_addr = snd_dma_noncontig_get_addr, .get_page = snd_dma_noncontig_get_page, .get_chunk_size = snd_dma_noncontig_get_chunk_size, }; /* Fallback SG-buffer allocations for x86 */ struct snd_dma_sg_fallback { bool use_dma_alloc_coherent; size_t count; struct page **pages; /* DMA address array; the first page contains #pages in ~PAGE_MASK */ dma_addr_t *addrs; }; static void __snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab, struct snd_dma_sg_fallback *sgbuf) { size_t i, size; if (sgbuf->pages && sgbuf->addrs) { i = 0; while (i < sgbuf->count) { if (!sgbuf->pages[i] || !sgbuf->addrs[i]) break; size = sgbuf->addrs[i] & ~PAGE_MASK; if (WARN_ON(!size)) break; if (sgbuf->use_dma_alloc_coherent) dma_free_coherent(dmab->dev.dev, size << PAGE_SHIFT, page_address(sgbuf->pages[i]), sgbuf->addrs[i] & PAGE_MASK); else do_free_pages(page_address(sgbuf->pages[i]), size << PAGE_SHIFT, false); i += size; } } kvfree(sgbuf->pages); kvfree(sgbuf->addrs); kfree(sgbuf); } static void *snd_dma_sg_fallback_alloc(struct snd_dma_buffer *dmab, size_t size) { struct snd_dma_sg_fallback *sgbuf; struct page **pagep, *curp; size_t chunk, npages; dma_addr_t *addrp; dma_addr_t addr; void *p; /* correct the type */ if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_SG) dmab->dev.type = SNDRV_DMA_TYPE_DEV_SG_FALLBACK; else if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG) dmab->dev.type = SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK; sgbuf = kzalloc(sizeof(*sgbuf), GFP_KERNEL); if (!sgbuf) return NULL; sgbuf->use_dma_alloc_coherent = cpu_feature_enabled(X86_FEATURE_XENPV); size = PAGE_ALIGN(size); sgbuf->count = size >> PAGE_SHIFT; sgbuf->pages = kvcalloc(sgbuf->count, sizeof(*sgbuf->pages), GFP_KERNEL); sgbuf->addrs = kvcalloc(sgbuf->count, sizeof(*sgbuf->addrs), GFP_KERNEL); if (!sgbuf->pages || !sgbuf->addrs) goto error; pagep = sgbuf->pages; addrp = sgbuf->addrs; chunk = (PAGE_SIZE - 1) << PAGE_SHIFT; /* to fit in low bits in addrs */ while (size > 0) { chunk = min(size, chunk); if (sgbuf->use_dma_alloc_coherent) p = dma_alloc_coherent(dmab->dev.dev, chunk, &addr, DEFAULT_GFP); else p = do_alloc_pages(dmab->dev.dev, chunk, &addr, false); if (!p) { if (chunk <= PAGE_SIZE) goto error; chunk >>= 1; chunk = PAGE_SIZE << get_order(chunk); continue; } size -= chunk; /* fill pages */ npages = chunk >> PAGE_SHIFT; *addrp = npages; /* store in lower bits */ curp = virt_to_page(p); while (npages--) { *pagep++ = curp++; *addrp++ |= addr; addr += PAGE_SIZE; } } p = vmap(sgbuf->pages, sgbuf->count, VM_MAP, PAGE_KERNEL); if (!p) goto error; if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) set_pages_array_wc(sgbuf->pages, sgbuf->count); dmab->private_data = sgbuf; /* store the first page address for convenience */ dmab->addr = sgbuf->addrs[0] & PAGE_MASK; return p; error: __snd_dma_sg_fallback_free(dmab, sgbuf); return NULL; } static void snd_dma_sg_fallback_free(struct snd_dma_buffer *dmab) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) set_pages_array_wb(sgbuf->pages, sgbuf->count); vunmap(dmab->area); __snd_dma_sg_fallback_free(dmab, dmab->private_data); } static dma_addr_t snd_dma_sg_fallback_get_addr(struct snd_dma_buffer *dmab, size_t offset) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; size_t index = offset >> PAGE_SHIFT; return (sgbuf->addrs[index] & PAGE_MASK) | (offset & ~PAGE_MASK); } static int snd_dma_sg_fallback_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { struct snd_dma_sg_fallback *sgbuf = dmab->private_data; if (dmab->dev.type == SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK) area->vm_page_prot = pgprot_writecombine(area->vm_page_prot); return vm_map_pages(area, sgbuf->pages, sgbuf->count); } static const struct snd_malloc_ops snd_dma_sg_fallback_ops = { .alloc = snd_dma_sg_fallback_alloc, .free = snd_dma_sg_fallback_free, .mmap = snd_dma_sg_fallback_mmap, .get_addr = snd_dma_sg_fallback_get_addr, /* reuse vmalloc helpers */ .get_page = snd_dma_vmalloc_get_page, .get_chunk_size = snd_dma_vmalloc_get_chunk_size, }; #endif /* CONFIG_SND_DMA_SGBUF */ /* * Non-coherent pages allocator */ static void *snd_dma_noncoherent_alloc(struct snd_dma_buffer *dmab, size_t size) { void *p; p = dma_alloc_noncoherent(dmab->dev.dev, size, &dmab->addr, dmab->dev.dir, DEFAULT_GFP); if (p) dmab->dev.need_sync = dma_need_sync(dmab->dev.dev, dmab->addr); return p; } static void snd_dma_noncoherent_free(struct snd_dma_buffer *dmab) { dma_free_noncoherent(dmab->dev.dev, dmab->bytes, dmab->area, dmab->addr, dmab->dev.dir); } static int snd_dma_noncoherent_mmap(struct snd_dma_buffer *dmab, struct vm_area_struct *area) { area->vm_page_prot = vm_get_page_prot(area->vm_flags); return dma_mmap_pages(dmab->dev.dev, area, area->vm_end - area->vm_start, virt_to_page(dmab->area)); } static void snd_dma_noncoherent_sync(struct snd_dma_buffer *dmab, enum snd_dma_sync_mode mode) { if (mode == SNDRV_DMA_SYNC_CPU) { if (dmab->dev.dir != DMA_TO_DEVICE) dma_sync_single_for_cpu(dmab->dev.dev, dmab->addr, dmab->bytes, dmab->dev.dir); } else { if (dmab->dev.dir != DMA_FROM_DEVICE) dma_sync_single_for_device(dmab->dev.dev, dmab->addr, dmab->bytes, dmab->dev.dir); } } static const struct snd_malloc_ops snd_dma_noncoherent_ops = { .alloc = snd_dma_noncoherent_alloc, .free = snd_dma_noncoherent_free, .mmap = snd_dma_noncoherent_mmap, .sync = snd_dma_noncoherent_sync, }; #endif /* CONFIG_HAS_DMA */ /* * Entry points */ static const struct snd_malloc_ops *snd_dma_ops[] = { [SNDRV_DMA_TYPE_CONTINUOUS] = &snd_dma_continuous_ops, [SNDRV_DMA_TYPE_VMALLOC] = &snd_dma_vmalloc_ops, #ifdef CONFIG_HAS_DMA [SNDRV_DMA_TYPE_DEV] = &snd_dma_dev_ops, [SNDRV_DMA_TYPE_DEV_WC] = &snd_dma_wc_ops, [SNDRV_DMA_TYPE_NONCONTIG] = &snd_dma_noncontig_ops, [SNDRV_DMA_TYPE_NONCOHERENT] = &snd_dma_noncoherent_ops, #ifdef CONFIG_SND_DMA_SGBUF [SNDRV_DMA_TYPE_DEV_WC_SG] = &snd_dma_sg_wc_ops, #endif #ifdef CONFIG_GENERIC_ALLOCATOR [SNDRV_DMA_TYPE_DEV_IRAM] = &snd_dma_iram_ops, #endif /* CONFIG_GENERIC_ALLOCATOR */ #ifdef CONFIG_SND_DMA_SGBUF [SNDRV_DMA_TYPE_DEV_SG_FALLBACK] = &snd_dma_sg_fallback_ops, [SNDRV_DMA_TYPE_DEV_WC_SG_FALLBACK] = &snd_dma_sg_fallback_ops, #endif #endif /* CONFIG_HAS_DMA */ }; static const struct snd_malloc_ops *snd_dma_get_ops(struct snd_dma_buffer *dmab) { if (WARN_ON_ONCE(!dmab)) return NULL; if (WARN_ON_ONCE(dmab->dev.type <= SNDRV_DMA_TYPE_UNKNOWN || dmab->dev.type >= ARRAY_SIZE(snd_dma_ops))) return NULL; return snd_dma_ops[dmab->dev.type]; } |
180 179 46 39 196 196 217 204 51 38 7 180 178 188 51 74 36 8 187 38 38 36 38 38 178 178 178 178 176 7 177 178 178 38 8 196 196 196 196 196 196 196 179 179 179 177 7 180 46 179 39 39 9 38 178 178 176 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 | /* * zsmalloc memory allocator * * Copyright (C) 2011 Nitin Gupta * Copyright (C) 2012, 2013 Minchan Kim * * This code is released using a dual license strategy: BSD/GPL * You can choose the license that better fits your requirements. * * Released under the terms of 3-clause BSD License * Released under the terms of GNU General Public License Version 2.0 */ /* * Following is how we use various fields and flags of underlying * struct page(s) to form a zspage. * * Usage of struct page fields: * page->private: points to zspage * page->index: links together all component pages of a zspage * For the huge page, this is always 0, so we use this field * to store handle. * page->page_type: first object offset in a subpage of zspage * * Usage of struct page flags: * PG_private: identifies the first component page * PG_owner_priv_1: identifies the huge component page * */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt /* * lock ordering: * page_lock * pool->lock * zspage->lock */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/bitops.h> #include <linux/errno.h> #include <linux/highmem.h> #include <linux/string.h> #include <linux/slab.h> #include <linux/pgtable.h> #include <asm/tlbflush.h> #include <linux/cpumask.h> #include <linux/cpu.h> #include <linux/vmalloc.h> #include <linux/preempt.h> #include <linux/spinlock.h> #include <linux/shrinker.h> #include <linux/types.h> #include <linux/debugfs.h> #include <linux/zsmalloc.h> #include <linux/zpool.h> #include <linux/migrate.h> #include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> #include <linux/local_lock.h> #define ZSPAGE_MAGIC 0x58 /* * This must be power of 2 and greater than or equal to sizeof(link_free). * These two conditions ensure that any 'struct link_free' itself doesn't * span more than 1 page which avoids complex case of mapping 2 pages simply * to restore link_free pointer values. */ #define ZS_ALIGN 8 #define ZS_HANDLE_SIZE (sizeof(unsigned long)) /* * Object location (<PFN>, <obj_idx>) is encoded as * a single (unsigned long) handle value. * * Note that object index <obj_idx> starts from 0. * * This is made more complicated by various memory models and PAE. */ #ifndef MAX_POSSIBLE_PHYSMEM_BITS #ifdef MAX_PHYSMEM_BITS #define MAX_POSSIBLE_PHYSMEM_BITS MAX_PHYSMEM_BITS #else /* * If this definition of MAX_PHYSMEM_BITS is used, OBJ_INDEX_BITS will just * be PAGE_SHIFT */ #define MAX_POSSIBLE_PHYSMEM_BITS BITS_PER_LONG #endif #endif #define _PFN_BITS (MAX_POSSIBLE_PHYSMEM_BITS - PAGE_SHIFT) /* * Head in allocated object should have OBJ_ALLOCATED_TAG * to identify the object was allocated or not. * It's okay to add the status bit in the least bit because * header keeps handle which is 4byte-aligned address so we * have room for two bit at least. */ #define OBJ_ALLOCATED_TAG 1 #define OBJ_TAG_BITS 1 #define OBJ_TAG_MASK OBJ_ALLOCATED_TAG #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define HUGE_BITS 1 #define FULLNESS_BITS 4 #define CLASS_BITS 8 #define ISOLATED_BITS 5 #define MAGIC_VAL_BITS 8 #define MAX(a, b) ((a) >= (b) ? (a) : (b)) #define ZS_MAX_PAGES_PER_ZSPAGE (_AC(CONFIG_ZSMALLOC_CHAIN_SIZE, UL)) /* ZS_MIN_ALLOC_SIZE must be multiple of ZS_ALIGN */ #define ZS_MIN_ALLOC_SIZE \ MAX(32, (ZS_MAX_PAGES_PER_ZSPAGE << PAGE_SHIFT >> OBJ_INDEX_BITS)) /* each chunk includes extra space to keep handle */ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes * - Small number of size classes causes large internal fragmentation * - Probably its better to use specific size classes (empirically * determined). NOTE: all those class sizes must be set as multiple of * ZS_ALIGN to make sure link_free itself never has to span 2 pages. * * ZS_MIN_ALLOC_SIZE and ZS_SIZE_CLASS_DELTA must be multiple of ZS_ALIGN * (reason above) */ #define ZS_SIZE_CLASS_DELTA (PAGE_SIZE >> CLASS_BITS) #define ZS_SIZE_CLASSES (DIV_ROUND_UP(ZS_MAX_ALLOC_SIZE - ZS_MIN_ALLOC_SIZE, \ ZS_SIZE_CLASS_DELTA) + 1) /* * Pages are distinguished by the ratio of used memory (that is the ratio * of ->inuse objects to all objects that page can store). For example, * INUSE_RATIO_10 means that the ratio of used objects is > 0% and <= 10%. * * The number of fullness groups is not random. It allows us to keep * difference between the least busy page in the group (minimum permitted * number of ->inuse objects) and the most busy page (maximum permitted * number of ->inuse objects) at a reasonable value. */ enum fullness_group { ZS_INUSE_RATIO_0, ZS_INUSE_RATIO_10, /* NOTE: 8 more fullness groups here */ ZS_INUSE_RATIO_99 = 10, ZS_INUSE_RATIO_100, NR_FULLNESS_GROUPS, }; enum class_stat_type { /* NOTE: stats for 12 fullness groups here: from inuse 0 to 100 */ ZS_OBJS_ALLOCATED = NR_FULLNESS_GROUPS, ZS_OBJS_INUSE, NR_CLASS_STAT_TYPES, }; struct zs_size_stat { unsigned long objs[NR_CLASS_STAT_TYPES]; }; #ifdef CONFIG_ZSMALLOC_STAT static struct dentry *zs_stat_root; #endif static size_t huge_class_size; struct size_class { struct list_head fullness_list[NR_FULLNESS_GROUPS]; /* * Size of objects stored in this class. Must be multiple * of ZS_ALIGN. */ int size; int objs_per_zspage; /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ int pages_per_zspage; unsigned int index; struct zs_size_stat stats; }; /* * Placed within free objects to form a singly linked list. * For every zspage, zspage->freeobj gives head of this list. * * This must be power of 2 and less than or equal to ZS_ALIGN */ struct link_free { union { /* * Free object index; * It's valid for non-allocated object */ unsigned long next; /* * Handle of allocated object. */ unsigned long handle; }; }; struct zs_pool { const char *name; struct size_class *size_class[ZS_SIZE_CLASSES]; struct kmem_cache *handle_cachep; struct kmem_cache *zspage_cachep; atomic_long_t pages_allocated; struct zs_pool_stats stats; /* Compact classes */ struct shrinker *shrinker; #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif #ifdef CONFIG_COMPACTION struct work_struct free_work; #endif spinlock_t lock; atomic_t compaction_in_progress; }; struct zspage { struct { unsigned int huge:HUGE_BITS; unsigned int fullness:FULLNESS_BITS; unsigned int class:CLASS_BITS + 1; unsigned int isolated:ISOLATED_BITS; unsigned int magic:MAGIC_VAL_BITS; }; unsigned int inuse; unsigned int freeobj; struct page *first_page; struct list_head list; /* fullness list */ struct zs_pool *pool; rwlock_t lock; }; struct mapping_area { local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ char *vm_addr; /* address of kmap_atomic()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; /* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */ static void SetZsHugePage(struct zspage *zspage) { zspage->huge = 1; } static bool ZsHugePage(struct zspage *zspage) { return zspage->huge; } static void migrate_lock_init(struct zspage *zspage); static void migrate_read_lock(struct zspage *zspage); static void migrate_read_unlock(struct zspage *zspage); #ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage); static void migrate_write_lock_nested(struct zspage *zspage); static void migrate_write_unlock(struct zspage *zspage); static void kick_deferred_free(struct zs_pool *pool); static void init_deferred_free(struct zs_pool *pool); static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage); #else static void migrate_write_lock(struct zspage *zspage) {} static void migrate_write_lock_nested(struct zspage *zspage) {} static void migrate_write_unlock(struct zspage *zspage) {} static void kick_deferred_free(struct zs_pool *pool) {} static void init_deferred_free(struct zs_pool *pool) {} static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} #endif static int create_cache(struct zs_pool *pool) { pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, 0, 0, NULL); if (!pool->handle_cachep) return 1; pool->zspage_cachep = kmem_cache_create("zspage", sizeof(struct zspage), 0, 0, NULL); if (!pool->zspage_cachep) { kmem_cache_destroy(pool->handle_cachep); pool->handle_cachep = NULL; return 1; } return 0; } static void destroy_cache(struct zs_pool *pool) { kmem_cache_destroy(pool->handle_cachep); kmem_cache_destroy(pool->zspage_cachep); } static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) { return (unsigned long)kmem_cache_alloc(pool->handle_cachep, gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } static void cache_free_handle(struct zs_pool *pool, unsigned long handle) { kmem_cache_free(pool->handle_cachep, (void *)handle); } static struct zspage *cache_alloc_zspage(struct zs_pool *pool, gfp_t flags) { return kmem_cache_zalloc(pool->zspage_cachep, flags & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); } static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) { kmem_cache_free(pool->zspage_cachep, zspage); } /* pool->lock(which owns the handle) synchronizes races */ static void record_obj(unsigned long handle, unsigned long obj) { *(unsigned long *)handle = obj; } /* zpool driver */ #ifdef CONFIG_ZPOOL static void *zs_zpool_create(const char *name, gfp_t gfp) { /* * Ignore global gfp flags: zs_malloc() may be invoked from * different contexts and its caller must provide a valid * gfp mask. */ return zs_create_pool(name); } static void zs_zpool_destroy(void *pool) { zs_destroy_pool(pool); } static int zs_zpool_malloc(void *pool, size_t size, gfp_t gfp, unsigned long *handle) { *handle = zs_malloc(pool, size, gfp); if (IS_ERR_VALUE(*handle)) return PTR_ERR((void *)*handle); return 0; } static void zs_zpool_free(void *pool, unsigned long handle) { zs_free(pool, handle); } static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { enum zs_mapmode zs_mm; switch (mm) { case ZPOOL_MM_RO: zs_mm = ZS_MM_RO; break; case ZPOOL_MM_WO: zs_mm = ZS_MM_WO; break; case ZPOOL_MM_RW: default: zs_mm = ZS_MM_RW; break; } return zs_map_object(pool, handle, zs_mm); } static void zs_zpool_unmap(void *pool, unsigned long handle) { zs_unmap_object(pool, handle); } static u64 zs_zpool_total_size(void *pool) { return zs_get_total_pages(pool) << PAGE_SHIFT; } static struct zpool_driver zs_zpool_driver = { .type = "zsmalloc", .owner = THIS_MODULE, .create = zs_zpool_create, .destroy = zs_zpool_destroy, .malloc_support_movable = true, .malloc = zs_zpool_malloc, .free = zs_zpool_free, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, }; MODULE_ALIAS("zpool-zsmalloc"); #endif /* CONFIG_ZPOOL */ /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { .lock = INIT_LOCAL_LOCK(lock), }; static __maybe_unused int is_first_page(struct page *page) { return PagePrivate(page); } /* Protected by pool->lock */ static inline int get_zspage_inuse(struct zspage *zspage) { return zspage->inuse; } static inline void mod_zspage_inuse(struct zspage *zspage, int val) { zspage->inuse += val; } static inline struct page *get_first_page(struct zspage *zspage) { struct page *first_page = zspage->first_page; VM_BUG_ON_PAGE(!is_first_page(first_page), first_page); return first_page; } static inline unsigned int get_first_obj_offset(struct page *page) { return page->page_type; } static inline void set_first_obj_offset(struct page *page, unsigned int offset) { page->page_type = offset; } static inline unsigned int get_freeobj(struct zspage *zspage) { return zspage->freeobj; } static inline void set_freeobj(struct zspage *zspage, unsigned int obj) { zspage->freeobj = obj; } static void get_zspage_mapping(struct zspage *zspage, unsigned int *class_idx, int *fullness) { BUG_ON(zspage->magic != ZSPAGE_MAGIC); *fullness = zspage->fullness; *class_idx = zspage->class; } static struct size_class *zspage_class(struct zs_pool *pool, struct zspage *zspage) { return pool->size_class[zspage->class]; } static void set_zspage_mapping(struct zspage *zspage, unsigned int class_idx, int fullness) { zspage->class = class_idx; zspage->fullness = fullness; } /* * zsmalloc divides the pool into various size classes where each * class maintains a list of zspages where each zspage is divided * into equal sized chunks. Each allocation falls into one of these * classes depending on its size. This function returns index of the * size class which has chunk size big enough to hold the given size. */ static int get_size_class_index(int size) { int idx = 0; if (likely(size > ZS_MIN_ALLOC_SIZE)) idx = DIV_ROUND_UP(size - ZS_MIN_ALLOC_SIZE, ZS_SIZE_CLASS_DELTA); return min_t(int, ZS_SIZE_CLASSES - 1, idx); } static inline void class_stat_inc(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] += cnt; } static inline void class_stat_dec(struct size_class *class, int type, unsigned long cnt) { class->stats.objs[type] -= cnt; } static inline unsigned long zs_stat_get(struct size_class *class, int type) { return class->stats.objs[type]; } #ifdef CONFIG_ZSMALLOC_STAT static void __init zs_stat_init(void) { if (!debugfs_initialized()) { pr_warn("debugfs not available, stat dir not created\n"); return; } zs_stat_root = debugfs_create_dir("zsmalloc", NULL); } static void __exit zs_stat_exit(void) { debugfs_remove_recursive(zs_stat_root); } static unsigned long zs_can_compact(struct size_class *class); static int zs_stats_size_show(struct seq_file *s, void *v) { int i, fg; struct zs_pool *pool = s->private; struct size_class *class; int objs_per_zspage; unsigned long obj_allocated, obj_used, pages_used, freeable; unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0; unsigned long total_freeable = 0; unsigned long inuse_totals[NR_FULLNESS_GROUPS] = {0, }; seq_printf(s, " %5s %5s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %9s %13s %10s %10s %16s %8s\n", "class", "size", "10%", "20%", "30%", "40%", "50%", "60%", "70%", "80%", "90%", "99%", "100%", "obj_allocated", "obj_used", "pages_used", "pages_per_zspage", "freeable"); for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) continue; spin_lock(&pool->lock); seq_printf(s, " %5u %5u ", i, class->size); for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) { inuse_totals[fg] += zs_stat_get(class, fg); seq_printf(s, "%9lu ", zs_stat_get(class, fg)); } obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED); obj_used = zs_stat_get(class, ZS_OBJS_INUSE); freeable = zs_can_compact(class); spin_unlock(&pool->lock); objs_per_zspage = class->objs_per_zspage; pages_used = obj_allocated / objs_per_zspage * class->pages_per_zspage; seq_printf(s, "%13lu %10lu %10lu %16d %8lu\n", obj_allocated, obj_used, pages_used, class->pages_per_zspage, freeable); total_objs += obj_allocated; total_used_objs += obj_used; total_pages += pages_used; total_freeable += freeable; } seq_puts(s, "\n"); seq_printf(s, " %5s %5s ", "Total", ""); for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) seq_printf(s, "%9lu ", inuse_totals[fg]); seq_printf(s, "%13lu %10lu %10lu %16s %8lu\n", total_objs, total_used_objs, total_pages, "", total_freeable); return 0; } DEFINE_SHOW_ATTRIBUTE(zs_stats_size); static void zs_pool_stat_create(struct zs_pool *pool, const char *name) { if (!zs_stat_root) { pr_warn("no root stat dir, not creating <%s> stat dir\n", name); return; } pool->stat_dentry = debugfs_create_dir(name, zs_stat_root); debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool, &zs_stats_size_fops); } static void zs_pool_stat_destroy(struct zs_pool *pool) { debugfs_remove_recursive(pool->stat_dentry); } #else /* CONFIG_ZSMALLOC_STAT */ static void __init zs_stat_init(void) { } static void __exit zs_stat_exit(void) { } static inline void zs_pool_stat_create(struct zs_pool *pool, const char *name) { } static inline void zs_pool_stat_destroy(struct zs_pool *pool) { } #endif /* * For each size class, zspages are divided into different groups * depending on their usage ratio. This function returns fullness * status of the given page. */ static int get_fullness_group(struct size_class *class, struct zspage *zspage) { int inuse, objs_per_zspage, ratio; inuse = get_zspage_inuse(zspage); objs_per_zspage = class->objs_per_zspage; if (inuse == 0) return ZS_INUSE_RATIO_0; if (inuse == objs_per_zspage) return ZS_INUSE_RATIO_100; ratio = 100 * inuse / objs_per_zspage; /* * Take integer division into consideration: a page with one inuse * object out of 127 possible, will end up having 0 usage ratio, * which is wrong as it belongs in ZS_INUSE_RATIO_10 fullness group. */ return ratio / 10 + 1; } /* * Each size class maintains various freelists and zspages are assigned * to one of these freelists based on the number of live objects they * have. This functions inserts the given zspage into the freelist * identified by <class, fullness_group>. */ static void insert_zspage(struct size_class *class, struct zspage *zspage, int fullness) { class_stat_inc(class, fullness, 1); list_add(&zspage->list, &class->fullness_list[fullness]); } /* * This function removes the given zspage from the freelist identified * by <class, fullness_group>. */ static void remove_zspage(struct size_class *class, struct zspage *zspage, int fullness) { VM_BUG_ON(list_empty(&class->fullness_list[fullness])); list_del_init(&zspage->list); class_stat_dec(class, fullness, 1); } /* * Each size class maintains zspages in different fullness groups depending * on the number of live objects they contain. When allocating or freeing * objects, the fullness status of the page can change, for instance, from * INUSE_RATIO_80 to INUSE_RATIO_70 when freeing an object. This function * checks if such a status change has occurred for the given page and * accordingly moves the page from the list of the old fullness group to that * of the new fullness group. */ static int fix_fullness_group(struct size_class *class, struct zspage *zspage) { int class_idx; int currfg, newfg; get_zspage_mapping(zspage, &class_idx, &currfg); newfg = get_fullness_group(class, zspage); if (newfg == currfg) goto out; remove_zspage(class, zspage, currfg); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class_idx, newfg); out: return newfg; } static struct zspage *get_zspage(struct page *page) { struct zspage *zspage = (struct zspage *)page_private(page); BUG_ON(zspage->magic != ZSPAGE_MAGIC); return zspage; } static struct page *get_next_page(struct page *page) { struct zspage *zspage = get_zspage(page); if (unlikely(ZsHugePage(zspage))) return NULL; return (struct page *)page->index; } /** * obj_to_location - get (<page>, <obj_idx>) from encoded object value * @obj: the encoded object value * @page: page object resides in zspage * @obj_idx: object index */ static void obj_to_location(unsigned long obj, struct page **page, unsigned int *obj_idx) { obj >>= OBJ_TAG_BITS; *page = pfn_to_page(obj >> OBJ_INDEX_BITS); *obj_idx = (obj & OBJ_INDEX_MASK); } static void obj_to_page(unsigned long obj, struct page **page) { obj >>= OBJ_TAG_BITS; *page = pfn_to_page(obj >> OBJ_INDEX_BITS); } /** * location_to_obj - get obj value encoded from (<page>, <obj_idx>) * @page: page object resides in zspage * @obj_idx: object index */ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) { unsigned long obj; obj = page_to_pfn(page) << OBJ_INDEX_BITS; obj |= obj_idx & OBJ_INDEX_MASK; obj <<= OBJ_TAG_BITS; return obj; } static unsigned long handle_to_obj(unsigned long handle) { return *(unsigned long *)handle; } static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) { unsigned long handle; struct zspage *zspage = get_zspage(page); if (unlikely(ZsHugePage(zspage))) { VM_BUG_ON_PAGE(!is_first_page(page), page); handle = page->index; } else handle = *(unsigned long *)obj; if (!(handle & OBJ_ALLOCATED_TAG)) return false; /* Clear all tags before returning the handle */ *phandle = handle & ~OBJ_TAG_MASK; return true; } static void reset_page(struct page *page) { __ClearPageMovable(page); ClearPagePrivate(page); set_page_private(page, 0); page_mapcount_reset(page); page->index = 0; } static int trylock_zspage(struct zspage *zspage) { struct page *cursor, *fail; for (cursor = get_first_page(zspage); cursor != NULL; cursor = get_next_page(cursor)) { if (!trylock_page(cursor)) { fail = cursor; goto unlock; } } return 1; unlock: for (cursor = get_first_page(zspage); cursor != fail; cursor = get_next_page(cursor)) unlock_page(cursor); return 0; } static void __free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { struct page *page, *next; int fg; unsigned int class_idx; get_zspage_mapping(zspage, &class_idx, &fg); assert_spin_locked(&pool->lock); VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(fg != ZS_INUSE_RATIO_0); next = page = get_first_page(zspage); do { VM_BUG_ON_PAGE(!PageLocked(page), page); next = get_next_page(page); reset_page(page); unlock_page(page); dec_zone_page_state(page, NR_ZSPAGES); put_page(page); page = next; } while (page != NULL); cache_free_zspage(pool, zspage); class_stat_dec(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated); } static void free_zspage(struct zs_pool *pool, struct size_class *class, struct zspage *zspage) { VM_BUG_ON(get_zspage_inuse(zspage)); VM_BUG_ON(list_empty(&zspage->list)); /* * Since zs_free couldn't be sleepable, this function cannot call * lock_page. The page locks trylock_zspage got will be released * by __free_zspage. */ if (!trylock_zspage(zspage)) { kick_deferred_free(pool); return; } remove_zspage(class, zspage, ZS_INUSE_RATIO_0); __free_zspage(pool, class, zspage); } /* Initialize a newly allocated zspage */ static void init_zspage(struct size_class *class, struct zspage *zspage) { unsigned int freeobj = 1; unsigned long off = 0; struct page *page = get_first_page(zspage); while (page) { struct page *next_page; struct link_free *link; void *vaddr; set_first_obj_offset(page, off); vaddr = kmap_atomic(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { link->next = freeobj++ << OBJ_TAG_BITS; link += class->size / sizeof(*link); } /* * We now come to the last (full or partial) object on this * page, which must point to the first object on the next * page (if present) */ next_page = get_next_page(page); if (next_page) { link->next = freeobj++ << OBJ_TAG_BITS; } else { /* * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not. */ link->next = -1UL << OBJ_TAG_BITS; } kunmap_atomic(vaddr); page = next_page; off %= PAGE_SIZE; } set_freeobj(zspage, 0); } static void create_page_chain(struct size_class *class, struct zspage *zspage, struct page *pages[]) { int i; struct page *page; struct page *prev_page = NULL; int nr_pages = class->pages_per_zspage; /* * Allocate individual pages and link them together as: * 1. all pages are linked together using page->index * 2. each sub-page point to zspage using page->private * * we set PG_private to identify the first page (i.e. no other sub-page * has this flag set). */ for (i = 0; i < nr_pages; i++) { page = pages[i]; set_page_private(page, (unsigned long)zspage); page->index = 0; if (i == 0) { zspage->first_page = page; SetPagePrivate(page); if (unlikely(class->objs_per_zspage == 1 && class->pages_per_zspage == 1)) SetZsHugePage(zspage); } else { prev_page->index = (unsigned long)page; } prev_page = page; } } /* * Allocate a zspage for the given size class */ static struct zspage *alloc_zspage(struct zs_pool *pool, struct size_class *class, gfp_t gfp) { int i; struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE]; struct zspage *zspage = cache_alloc_zspage(pool, gfp); if (!zspage) return NULL; zspage->magic = ZSPAGE_MAGIC; migrate_lock_init(zspage); for (i = 0; i < class->pages_per_zspage; i++) { struct page *page; page = alloc_page(gfp); if (!page) { while (--i >= 0) { dec_zone_page_state(pages[i], NR_ZSPAGES); __free_page(pages[i]); } cache_free_zspage(pool, zspage); return NULL; } inc_zone_page_state(page, NR_ZSPAGES); pages[i] = page; } create_page_chain(class, zspage, pages); init_zspage(class, zspage); zspage->pool = pool; return zspage; } static struct zspage *find_get_zspage(struct size_class *class) { int i; struct zspage *zspage; for (i = ZS_INUSE_RATIO_99; i >= ZS_INUSE_RATIO_0; i--) { zspage = list_first_entry_or_null(&class->fullness_list[i], struct zspage, list); if (zspage) break; } return zspage; } static inline int __zs_cpu_up(struct mapping_area *area) { /* * Make sure we don't leak memory if a cpu UP notification * and zs_init() race and both call zs_cpu_up() on the same cpu */ if (area->vm_buf) return 0; area->vm_buf = kmalloc(ZS_MAX_ALLOC_SIZE, GFP_KERNEL); if (!area->vm_buf) return -ENOMEM; return 0; } static inline void __zs_cpu_down(struct mapping_area *area) { kfree(area->vm_buf); area->vm_buf = NULL; } static void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { int sizes[2]; void *addr; char *buf = area->vm_buf; /* disable page faults to match kmap_atomic() return conditions */ pagefault_disable(); /* no read fastpath */ if (area->vm_mm == ZS_MM_WO) goto out; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ addr = kmap_atomic(pages[0]); memcpy(buf, addr + off, sizes[0]); kunmap_atomic(addr); addr = kmap_atomic(pages[1]); memcpy(buf + sizes[0], addr, sizes[1]); kunmap_atomic(addr); out: return area->vm_buf; } static void __zs_unmap_object(struct mapping_area *area, struct page *pages[2], int off, int size) { int sizes[2]; void *addr; char *buf; /* no write fastpath */ if (area->vm_mm == ZS_MM_RO) goto out; buf = area->vm_buf; buf = buf + ZS_HANDLE_SIZE; size -= ZS_HANDLE_SIZE; off += ZS_HANDLE_SIZE; sizes[0] = PAGE_SIZE - off; sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ addr = kmap_atomic(pages[0]); memcpy(addr + off, buf, sizes[0]); kunmap_atomic(addr); addr = kmap_atomic(pages[1]); memcpy(addr, buf + sizes[0], sizes[1]); kunmap_atomic(addr); out: /* enable page faults to match kunmap_atomic() return conditions */ pagefault_enable(); } static int zs_cpu_prepare(unsigned int cpu) { struct mapping_area *area; area = &per_cpu(zs_map_area, cpu); return __zs_cpu_up(area); } static int zs_cpu_dead(unsigned int cpu) { struct mapping_area *area; area = &per_cpu(zs_map_area, cpu); __zs_cpu_down(area); return 0; } static bool can_merge(struct size_class *prev, int pages_per_zspage, int objs_per_zspage) { if (prev->pages_per_zspage == pages_per_zspage && prev->objs_per_zspage == objs_per_zspage) return true; return false; } static bool zspage_full(struct size_class *class, struct zspage *zspage) { return get_zspage_inuse(zspage) == class->objs_per_zspage; } static bool zspage_empty(struct zspage *zspage) { return get_zspage_inuse(zspage) == 0; } /** * zs_lookup_class_index() - Returns index of the zsmalloc &size_class * that hold objects of the provided size. * @pool: zsmalloc pool to use * @size: object size * * Context: Any context. * * Return: the index of the zsmalloc &size_class that hold objects of the * provided size. */ unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size) { struct size_class *class; class = pool->size_class[get_size_class_index(size)]; return class->index; } EXPORT_SYMBOL_GPL(zs_lookup_class_index); unsigned long zs_get_total_pages(struct zs_pool *pool) { return atomic_long_read(&pool->pages_allocated); } EXPORT_SYMBOL_GPL(zs_get_total_pages); /** * zs_map_object - get address of allocated object from handle. * @pool: pool from which the object was allocated * @handle: handle returned from zs_malloc * @mm: mapping mode to use * * Before using an object allocated from zs_malloc, it must be mapped using * this function. When done with the object, it must be unmapped using * zs_unmap_object. * * Only one object can be mapped per cpu at a time. There is no protection * against nested mappings. * * This function returns with preemption and page faults disabled. */ void *zs_map_object(struct zs_pool *pool, unsigned long handle, enum zs_mapmode mm) { struct zspage *zspage; struct page *page; unsigned long obj, off; unsigned int obj_idx; struct size_class *class; struct mapping_area *area; struct page *pages[2]; void *ret; /* * Because we use per-cpu mapping areas shared among the * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ BUG_ON(in_interrupt()); /* It guarantees it can get zspage from handle safely */ spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); /* * migration cannot move any zpages in this zspage. Here, pool->lock * is too heavy since callers would take some time until they calls * zs_unmap_object API so delegate the locking from class to zspage * which is smaller granularity. */ migrate_read_lock(zspage); spin_unlock(&pool->lock); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); local_lock(&zs_map_area.lock); area = this_cpu_ptr(&zs_map_area); area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ area->vm_addr = kmap_atomic(page); ret = area->vm_addr + off; goto out; } /* this object spans two pages */ pages[0] = page; pages[1] = get_next_page(page); BUG_ON(!pages[1]); ret = __zs_map_object(area, pages, off, class->size); out: if (likely(!ZsHugePage(zspage))) ret += ZS_HANDLE_SIZE; return ret; } EXPORT_SYMBOL_GPL(zs_map_object); void zs_unmap_object(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; struct page *page; unsigned long obj, off; unsigned int obj_idx; struct size_class *class; struct mapping_area *area; obj = handle_to_obj(handle); obj_to_location(obj, &page, &obj_idx); zspage = get_zspage(page); class = zspage_class(pool, zspage); off = offset_in_page(class->size * obj_idx); area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { struct page *pages[2]; pages[0] = page; pages[1] = get_next_page(page); BUG_ON(!pages[1]); __zs_unmap_object(area, pages, off, class->size); } local_unlock(&zs_map_area.lock); migrate_read_unlock(zspage); } EXPORT_SYMBOL_GPL(zs_unmap_object); /** * zs_huge_class_size() - Returns the size (in bytes) of the first huge * zsmalloc &size_class. * @pool: zsmalloc pool to use * * The function returns the size of the first huge class - any object of equal * or bigger size will be stored in zspage consisting of a single physical * page. * * Context: Any context. * * Return: the size (in bytes) of the first huge zsmalloc &size_class. */ size_t zs_huge_class_size(struct zs_pool *pool) { return huge_class_size; } EXPORT_SYMBOL_GPL(zs_huge_class_size); static unsigned long obj_malloc(struct zs_pool *pool, struct zspage *zspage, unsigned long handle) { int i, nr_page, offset; unsigned long obj; struct link_free *link; struct size_class *class; struct page *m_page; unsigned long m_offset; void *vaddr; class = pool->size_class[zspage->class]; handle |= OBJ_ALLOCATED_TAG; obj = get_freeobj(zspage); offset = obj * class->size; nr_page = offset >> PAGE_SHIFT; m_offset = offset_in_page(offset); m_page = get_first_page(zspage); for (i = 0; i < nr_page; i++) m_page = get_next_page(m_page); vaddr = kmap_atomic(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) /* record handle in the header of allocated chunk */ link->handle = handle; else /* record handle to page->index */ zspage->first_page->index = handle; kunmap_atomic(vaddr); mod_zspage_inuse(zspage, 1); obj = location_to_obj(m_page, obj); return obj; } /** * zs_malloc - Allocate block of given size from pool. * @pool: pool to allocate from * @size: size of block to allocate * @gfp: gfp flags when allocating object * * On success, handle to the allocated object is returned, * otherwise an ERR_PTR(). * Allocation requests with size > ZS_MAX_ALLOC_SIZE will fail. */ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) { unsigned long handle, obj; struct size_class *class; int newfg; struct zspage *zspage; if (unlikely(!size || size > ZS_MAX_ALLOC_SIZE)) return (unsigned long)ERR_PTR(-EINVAL); handle = cache_alloc_handle(pool, gfp); if (!handle) return (unsigned long)ERR_PTR(-ENOMEM); /* extra space in chunk to keep the handle */ size += ZS_HANDLE_SIZE; class = pool->size_class[get_size_class_index(size)]; /* pool->lock effectively protects the zpage migration */ spin_lock(&pool->lock); zspage = find_get_zspage(class); if (likely(zspage)) { obj = obj_malloc(pool, zspage, handle); /* Now move the zspage to another fullness group, if required */ fix_fullness_group(class, zspage); record_obj(handle, obj); class_stat_inc(class, ZS_OBJS_INUSE, 1); goto out; } spin_unlock(&pool->lock); zspage = alloc_zspage(pool, class, gfp); if (!zspage) { cache_free_handle(pool, handle); return (unsigned long)ERR_PTR(-ENOMEM); } spin_lock(&pool->lock); obj = obj_malloc(pool, zspage, handle); newfg = get_fullness_group(class, zspage); insert_zspage(class, zspage, newfg); set_zspage_mapping(zspage, class->index, newfg); record_obj(handle, obj); atomic_long_add(class->pages_per_zspage, &pool->pages_allocated); class_stat_inc(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage); class_stat_inc(class, ZS_OBJS_INUSE, 1); /* We completely set up zspage so mark them as movable */ SetZsPageMovable(pool, zspage); out: spin_unlock(&pool->lock); return handle; } EXPORT_SYMBOL_GPL(zs_malloc); static void obj_free(int class_size, unsigned long obj) { struct link_free *link; struct zspage *zspage; struct page *f_page; unsigned long f_offset; unsigned int f_objidx; void *vaddr; obj_to_location(obj, &f_page, &f_objidx); f_offset = offset_in_page(class_size * f_objidx); zspage = get_zspage(f_page); vaddr = kmap_atomic(f_page); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ if (likely(!ZsHugePage(zspage))) link->next = get_freeobj(zspage) << OBJ_TAG_BITS; else f_page->index = 0; set_freeobj(zspage, f_objidx); kunmap_atomic(vaddr); mod_zspage_inuse(zspage, -1); } void zs_free(struct zs_pool *pool, unsigned long handle) { struct zspage *zspage; struct page *f_page; unsigned long obj; struct size_class *class; int fullness; if (IS_ERR_OR_NULL((void *)handle)) return; /* * The pool->lock protects the race with zpage's migration * so it's safe to get the page from handle. */ spin_lock(&pool->lock); obj = handle_to_obj(handle); obj_to_page(obj, &f_page); zspage = get_zspage(f_page); class = zspage_class(pool, zspage); class_stat_dec(class, ZS_OBJS_INUSE, 1); obj_free(class->size, obj); fullness = fix_fullness_group(class, zspage); if (fullness == ZS_INUSE_RATIO_0) free_zspage(pool, class, zspage); spin_unlock(&pool->lock); cache_free_handle(pool, handle); } EXPORT_SYMBOL_GPL(zs_free); static void zs_object_copy(struct size_class *class, unsigned long dst, unsigned long src) { struct page *s_page, *d_page; unsigned int s_objidx, d_objidx; unsigned long s_off, d_off; void *s_addr, *d_addr; int s_size, d_size, size; int written = 0; s_size = d_size = class->size; obj_to_location(src, &s_page, &s_objidx); obj_to_location(dst, &d_page, &d_objidx); s_off = offset_in_page(class->size * s_objidx); d_off = offset_in_page(class->size * d_objidx); if (s_off + class->size > PAGE_SIZE) s_size = PAGE_SIZE - s_off; if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); while (1) { size = min(s_size, d_size); memcpy(d_addr + d_off, s_addr + s_off, size); written += size; if (written == class->size) break; s_off += size; s_size -= size; d_off += size; d_size -= size; /* * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() * calls must occurs in reverse order of calls to kmap_atomic(). * So, to call kunmap_atomic(s_addr) we should first call * kunmap_atomic(d_addr). For more details see * Documentation/mm/highmem.rst. */ if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); s_page = get_next_page(s_page); s_addr = kmap_atomic(s_page); d_addr = kmap_atomic(d_page); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { kunmap_atomic(d_addr); d_page = get_next_page(d_page); d_addr = kmap_atomic(d_page); d_size = class->size - written; d_off = 0; } } kunmap_atomic(d_addr); kunmap_atomic(s_addr); } /* * Find alloced object in zspage from index object and * return handle. */ static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); offset = get_first_obj_offset(page); offset += class->size * index; while (offset < PAGE_SIZE) { if (obj_allocated(page, addr + offset, &handle)) break; offset += class->size; index++; } kunmap_atomic(addr); *obj_idx = index; return handle; } static void migrate_zspage(struct zs_pool *pool, struct zspage *src_zspage, struct zspage *dst_zspage) { unsigned long used_obj, free_obj; unsigned long handle; int obj_idx = 0; struct page *s_page = get_first_page(src_zspage); struct size_class *class = pool->size_class[src_zspage->class]; while (1) { handle = find_alloced_obj(class, s_page, &obj_idx); if (!handle) { s_page = get_next_page(s_page); if (!s_page) break; obj_idx = 0; continue; } used_obj = handle_to_obj(handle); free_obj = obj_malloc(pool, dst_zspage, handle); zs_object_copy(class, free_obj, used_obj); obj_idx++; record_obj(handle, free_obj); obj_free(class->size, used_obj); /* Stop if there is no more space */ if (zspage_full(class, dst_zspage)) break; /* Stop if there are no more objects to migrate */ if (zspage_empty(src_zspage)) break; } } static struct zspage *isolate_src_zspage(struct size_class *class) { struct zspage *zspage; int fg; for (fg = ZS_INUSE_RATIO_10; fg <= ZS_INUSE_RATIO_99; fg++) { zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { remove_zspage(class, zspage, fg); return zspage; } } return zspage; } static struct zspage *isolate_dst_zspage(struct size_class *class) { struct zspage *zspage; int fg; for (fg = ZS_INUSE_RATIO_99; fg >= ZS_INUSE_RATIO_10; fg--) { zspage = list_first_entry_or_null(&class->fullness_list[fg], struct zspage, list); if (zspage) { remove_zspage(class, zspage, fg); return zspage; } } return zspage; } /* * putback_zspage - add @zspage into right class's fullness list * @class: destination class * @zspage: target page * * Return @zspage's fullness status */ static int putback_zspage(struct size_class *class, struct zspage *zspage) { int fullness; fullness = get_fullness_group(class, zspage); insert_zspage(class, zspage, fullness); set_zspage_mapping(zspage, class->index, fullness); return fullness; } #ifdef CONFIG_COMPACTION /* * To prevent zspage destroy during migration, zspage freeing should * hold locks of all pages in the zspage. */ static void lock_zspage(struct zspage *zspage) { struct page *curr_page, *page; /* * Pages we haven't locked yet can be migrated off the list while we're * trying to lock them, so we need to be careful and only attempt to * lock each page under migrate_read_lock(). Otherwise, the page we lock * may no longer belong to the zspage. This means that we may wait for * the wrong page to unlock, so we must take a reference to the page * prior to waiting for it to unlock outside migrate_read_lock(). */ while (1) { migrate_read_lock(zspage); page = get_first_page(zspage); if (trylock_page(page)) break; get_page(page); migrate_read_unlock(zspage); wait_on_page_locked(page); put_page(page); } curr_page = page; while ((page = get_next_page(curr_page))) { if (trylock_page(page)) { curr_page = page; } else { get_page(page); migrate_read_unlock(zspage); wait_on_page_locked(page); put_page(page); migrate_read_lock(zspage); } } migrate_read_unlock(zspage); } #endif /* CONFIG_COMPACTION */ static void migrate_lock_init(struct zspage *zspage) { rwlock_init(&zspage->lock); } static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock) { read_lock(&zspage->lock); } static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock) { read_unlock(&zspage->lock); } #ifdef CONFIG_COMPACTION static void migrate_write_lock(struct zspage *zspage) { write_lock(&zspage->lock); } static void migrate_write_lock_nested(struct zspage *zspage) { write_lock_nested(&zspage->lock, SINGLE_DEPTH_NESTING); } static void migrate_write_unlock(struct zspage *zspage) { write_unlock(&zspage->lock); } /* Number of isolated subpage for *page migration* in this zspage */ static void inc_zspage_isolation(struct zspage *zspage) { zspage->isolated++; } static void dec_zspage_isolation(struct zspage *zspage) { VM_BUG_ON(zspage->isolated == 0); zspage->isolated--; } static const struct movable_operations zsmalloc_mops; static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { struct page *page; struct page *pages[ZS_MAX_PAGES_PER_ZSPAGE] = {NULL, }; int idx = 0; page = get_first_page(zspage); do { if (page == oldpage) pages[idx] = newpage; else pages[idx] = page; idx++; } while ((page = get_next_page(page)) != NULL); create_page_chain(class, zspage, pages); set_first_obj_offset(newpage, get_first_obj_offset(oldpage)); if (unlikely(ZsHugePage(zspage))) newpage->index = oldpage->index; __SetPageMovable(newpage, &zsmalloc_mops); } static bool zs_page_isolate(struct page *page, isolate_mode_t mode) { struct zs_pool *pool; struct zspage *zspage; /* * Page is locked so zspage couldn't be destroyed. For detail, look at * lock_zspage in free_zspage. */ VM_BUG_ON_PAGE(PageIsolated(page), page); zspage = get_zspage(page); pool = zspage->pool; spin_lock(&pool->lock); inc_zspage_isolation(zspage); spin_unlock(&pool->lock); return true; } static int zs_page_migrate(struct page *newpage, struct page *page, enum migrate_mode mode) { struct zs_pool *pool; struct size_class *class; struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; /* * We cannot support the _NO_COPY case here, because copy needs to * happen under the zs lock, which does not work with * MIGRATE_SYNC_NO_COPY workflow. */ if (mode == MIGRATE_SYNC_NO_COPY) return -EINVAL; VM_BUG_ON_PAGE(!PageIsolated(page), page); /* The page is locked, so this pointer must remain valid */ zspage = get_zspage(page); pool = zspage->pool; /* * The pool's lock protects the race between zpage migration * and zs_free. */ spin_lock(&pool->lock); class = zspage_class(pool, zspage); /* the migrate_write_lock protects zpage access via zs_map_object */ migrate_write_lock(zspage); offset = get_first_obj_offset(page); s_addr = kmap_atomic(page); /* * Here, any user cannot access all objects in the zspage so let's move. */ d_addr = kmap_atomic(newpage); copy_page(d_addr, s_addr); kunmap_atomic(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { if (obj_allocated(page, addr, &handle)) { old_obj = handle_to_obj(handle); obj_to_location(old_obj, &dummy, &obj_idx); new_obj = (unsigned long)location_to_obj(newpage, obj_idx); record_obj(handle, new_obj); } } kunmap_atomic(s_addr); replace_sub_page(class, zspage, newpage, page); dec_zspage_isolation(zspage); /* * Since we complete the data copy and set up new zspage structure, * it's okay to release the pool's lock. */ spin_unlock(&pool->lock); migrate_write_unlock(zspage); get_page(newpage); if (page_zone(newpage) != page_zone(page)) { dec_zone_page_state(page, NR_ZSPAGES); inc_zone_page_state(newpage, NR_ZSPAGES); } reset_page(page); put_page(page); return MIGRATEPAGE_SUCCESS; } static void zs_page_putback(struct page *page) { struct zs_pool *pool; struct zspage *zspage; VM_BUG_ON_PAGE(!PageIsolated(page), page); zspage = get_zspage(page); pool = zspage->pool; spin_lock(&pool->lock); dec_zspage_isolation(zspage); spin_unlock(&pool->lock); } static const struct movable_operations zsmalloc_mops = { .isolate_page = zs_page_isolate, .migrate_page = zs_page_migrate, .putback_page = zs_page_putback, }; /* * Caller should hold page_lock of all pages in the zspage * In here, we cannot use zspage meta data. */ static void async_free_zspage(struct work_struct *work) { int i; struct size_class *class; unsigned int class_idx; int fullness; struct zspage *zspage, *tmp; LIST_HEAD(free_pages); struct zs_pool *pool = container_of(work, struct zs_pool, free_work); for (i = 0; i < ZS_SIZE_CLASSES; i++) { class = pool->size_class[i]; if (class->index != i) continue; spin_lock(&pool->lock); list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0], &free_pages); spin_unlock(&pool->lock); } list_for_each_entry_safe(zspage, tmp, &free_pages, list) { list_del(&zspage->list); lock_zspage(zspage); get_zspage_mapping(zspage, &class_idx, &fullness); VM_BUG_ON(fullness != ZS_INUSE_RATIO_0); class = pool->size_class[class_idx]; spin_lock(&pool->lock); __free_zspage(pool, class, zspage); spin_unlock(&pool->lock); } }; static void kick_deferred_free(struct zs_pool *pool) { schedule_work(&pool->free_work); } static void zs_flush_migration(struct zs_pool *pool) { flush_work(&pool->free_work); } static void init_deferred_free(struct zs_pool *pool) { INIT_WORK(&pool->free_work, async_free_zspage); } static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) { struct page *page = get_first_page(zspage); do { WARN_ON(!trylock_page(page)); __SetPageMovable(page, &zsmalloc_mops); unlock_page(page); } while ((page = get_next_page(page)) != NULL); } #else static inline void zs_flush_migration(struct zs_pool *pool) { } #endif /* * * Based on the number of unused allocated objects calculate * and return the number of pages that we can free. */ static unsigned long zs_can_compact(struct size_class *class) { unsigned long obj_wasted; unsigned long obj_allocated = zs_stat_get(class, ZS_OBJS_ALLOCATED); unsigned long obj_used = zs_stat_get(class, ZS_OBJS_INUSE); if (obj_allocated <= obj_used) return 0; obj_wasted = obj_allocated - obj_used; obj_wasted /= class->objs_per_zspage; return obj_wasted * class->pages_per_zspage; } static unsigned long __zs_compact(struct zs_pool *pool, struct size_class *class) { struct zspage *src_zspage = NULL; struct zspage *dst_zspage = NULL; unsigned long pages_freed = 0; /* * protect the race between zpage migration and zs_free * as well as zpage allocation/free */ spin_lock(&pool->lock); while (zs_can_compact(class)) { int fg; if (!dst_zspage) { dst_zspage = isolate_dst_zspage(class); if (!dst_zspage) break; migrate_write_lock(dst_zspage); } src_zspage = isolate_src_zspage(class); if (!src_zspage) break; migrate_write_lock_nested(src_zspage); migrate_zspage(pool, src_zspage, dst_zspage); fg = putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); if (fg == ZS_INUSE_RATIO_0) { free_zspage(pool, class, src_zspage); pages_freed += class->pages_per_zspage; } src_zspage = NULL; if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100 || spin_is_contended(&pool->lock)) { putback_zspage(class, dst_zspage); migrate_write_unlock(dst_zspage); dst_zspage = NULL; spin_unlock(&pool->lock); cond_resched(); spin_lock(&pool->lock); } } if (src_zspage) { putback_zspage(class, src_zspage); migrate_write_unlock(src_zspage); } if (dst_zspage) { putback_zspage(class, dst_zspage); migrate_write_unlock(dst_zspage); } spin_unlock(&pool->lock); return pages_freed; } unsigned long zs_compact(struct zs_pool *pool) { int i; struct size_class *class; unsigned long pages_freed = 0; /* * Pool compaction is performed under pool->lock so it is basically * single-threaded. Having more than one thread in __zs_compact() * will increase pool->lock contention, which will impact other * zsmalloc operations that need pool->lock. */ if (atomic_xchg(&pool->compaction_in_progress, 1)) return 0; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (class->index != i) continue; pages_freed += __zs_compact(pool, class); } atomic_long_add(pages_freed, &pool->stats.pages_compacted); atomic_set(&pool->compaction_in_progress, 0); return pages_freed; } EXPORT_SYMBOL_GPL(zs_compact); void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats) { memcpy(stats, &pool->stats, sizeof(struct zs_pool_stats)); } EXPORT_SYMBOL_GPL(zs_pool_stats); static unsigned long zs_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc) { unsigned long pages_freed; struct zs_pool *pool = shrinker->private_data; /* * Compact classes and calculate compaction delta. * Can run concurrently with a manually triggered * (by user) compaction. */ pages_freed = zs_compact(pool); return pages_freed ? pages_freed : SHRINK_STOP; } static unsigned long zs_shrinker_count(struct shrinker *shrinker, struct shrink_control *sc) { int i; struct size_class *class; unsigned long pages_to_free = 0; struct zs_pool *pool = shrinker->private_data; for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; if (class->index != i) continue; pages_to_free += zs_can_compact(class); } return pages_to_free; } static void zs_unregister_shrinker(struct zs_pool *pool) { shrinker_free(pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) { pool->shrinker = shrinker_alloc(0, "mm-zspool:%s", pool->name); if (!pool->shrinker) return -ENOMEM; pool->shrinker->scan_objects = zs_shrinker_scan; pool->shrinker->count_objects = zs_shrinker_count; pool->shrinker->batch = 0; pool->shrinker->private_data = pool; shrinker_register(pool->shrinker); return 0; } static int calculate_zspage_chain_size(int class_size) { int i, min_waste = INT_MAX; int chain_size = 1; if (is_power_of_2(class_size)) return chain_size; for (i = 1; i <= ZS_MAX_PAGES_PER_ZSPAGE; i++) { int waste; waste = (i * PAGE_SIZE) % class_size; if (waste < min_waste) { min_waste = waste; chain_size = i; } } return chain_size; } /** * zs_create_pool - Creates an allocation pool to work from. * @name: pool name to be created * * This function must be called before anything when using * the zsmalloc allocator. * * On success, a pointer to the newly created pool is returned, * otherwise NULL. */ struct zs_pool *zs_create_pool(const char *name) { int i; struct zs_pool *pool; struct size_class *prev_class = NULL; pool = kzalloc(sizeof(*pool), GFP_KERNEL); if (!pool) return NULL; init_deferred_free(pool); spin_lock_init(&pool->lock); atomic_set(&pool->compaction_in_progress, 0); pool->name = kstrdup(name, GFP_KERNEL); if (!pool->name) goto err; if (create_cache(pool)) goto err; /* * Iterate reversely, because, size of size_class that we want to use * for merging should be larger or equal to current size. */ for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { int size; int pages_per_zspage; int objs_per_zspage; struct size_class *class; int fullness; size = ZS_MIN_ALLOC_SIZE + i * ZS_SIZE_CLASS_DELTA; if (size > ZS_MAX_ALLOC_SIZE) size = ZS_MAX_ALLOC_SIZE; pages_per_zspage = calculate_zspage_chain_size(size); objs_per_zspage = pages_per_zspage * PAGE_SIZE / size; /* * We iterate from biggest down to smallest classes, * so huge_class_size holds the size of the first huge * class. Any object bigger than or equal to that will * endup in the huge class. */ if (pages_per_zspage != 1 && objs_per_zspage != 1 && !huge_class_size) { huge_class_size = size; /* * The object uses ZS_HANDLE_SIZE bytes to store the * handle. We need to subtract it, because zs_malloc() * unconditionally adds handle size before it performs * size class search - so object may be smaller than * huge class size, yet it still can end up in the huge * class because it grows by ZS_HANDLE_SIZE extra bytes * right before class lookup. */ huge_class_size -= (ZS_HANDLE_SIZE - 1); } /* * size_class is used for normal zsmalloc operation such * as alloc/free for that size. Although it is natural that we * have one size_class for each size, there is a chance that we * can get more memory utilization if we use one size_class for * many different sizes whose size_class have same * characteristics. So, we makes size_class point to * previous size_class if possible. */ if (prev_class) { if (can_merge(prev_class, pages_per_zspage, objs_per_zspage)) { pool->size_class[i] = prev_class; continue; } } class = kzalloc(sizeof(struct size_class), GFP_KERNEL); if (!class) goto err; class->size = size; class->index = i; class->pages_per_zspage = pages_per_zspage; class->objs_per_zspage = objs_per_zspage; pool->size_class[i] = class; fullness = ZS_INUSE_RATIO_0; while (fullness < NR_FULLNESS_GROUPS) { INIT_LIST_HEAD(&class->fullness_list[fullness]); fullness++; } prev_class = class; } /* debug only, don't abort if it fails */ zs_pool_stat_create(pool, name); /* * Not critical since shrinker is only used to trigger internal * defragmentation of the pool which is pretty optional thing. If * registration fails we still can use the pool normally and user can * trigger compaction manually. Thus, ignore return code. */ zs_register_shrinker(pool); return pool; err: zs_destroy_pool(pool); return NULL; } EXPORT_SYMBOL_GPL(zs_create_pool); void zs_destroy_pool(struct zs_pool *pool) { int i; zs_unregister_shrinker(pool); zs_flush_migration(pool); zs_pool_stat_destroy(pool); for (i = 0; i < ZS_SIZE_CLASSES; i++) { int fg; struct size_class *class = pool->size_class[i]; if (!class) continue; if (class->index != i) continue; for (fg = ZS_INUSE_RATIO_0; fg < NR_FULLNESS_GROUPS; fg++) { if (list_empty(&class->fullness_list[fg])) continue; pr_err("Class-%d fullness group %d is not empty\n", class->size, fg); } kfree(class); } destroy_cache(pool); kfree(pool->name); kfree(pool); } EXPORT_SYMBOL_GPL(zs_destroy_pool); static int __init zs_init(void) { int ret; ret = cpuhp_setup_state(CPUHP_MM_ZS_PREPARE, "mm/zsmalloc:prepare", zs_cpu_prepare, zs_cpu_dead); if (ret) goto out; #ifdef CONFIG_ZPOOL zpool_register_driver(&zs_zpool_driver); #endif zs_stat_init(); return 0; out: return ret; } static void __exit zs_exit(void) { #ifdef CONFIG_ZPOOL zpool_unregister_driver(&zs_zpool_driver); #endif cpuhp_remove_state(CPUHP_MM_ZS_PREPARE); zs_stat_exit(); } module_init(zs_init); module_exit(zs_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>"); |
3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FIRMWARE_LOADER_H #define __FIRMWARE_LOADER_H #include <linux/bitops.h> #include <linux/firmware.h> #include <linux/types.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/completion.h> /** * enum fw_opt - options to control firmware loading behaviour * * @FW_OPT_UEVENT: Enables the fallback mechanism to send a kobject uevent * when the firmware is not found. Userspace is in charge to load the * firmware using the sysfs loading facility. * @FW_OPT_NOWAIT: Used to describe the firmware request is asynchronous. * @FW_OPT_USERHELPER: Enable the fallback mechanism, in case the direct * filesystem lookup fails at finding the firmware. For details refer to * firmware_fallback_sysfs(). * @FW_OPT_NO_WARN: Quiet, avoid printing warning messages. * @FW_OPT_NOCACHE: Disables firmware caching. Firmware caching is used to * cache the firmware upon suspend, so that upon resume races against the * firmware file lookup on storage is avoided. Used for calls where the * file may be too big, or where the driver takes charge of its own * firmware caching mechanism. * @FW_OPT_NOFALLBACK_SYSFS: Disable the sysfs fallback mechanism. Takes * precedence over &FW_OPT_UEVENT and &FW_OPT_USERHELPER. * @FW_OPT_FALLBACK_PLATFORM: Enable fallback to device fw copy embedded in * the platform's main firmware. If both this fallback and the sysfs * fallback are enabled, then this fallback will be tried first. * @FW_OPT_PARTIAL: Allow partial read of firmware instead of needing to read * entire file. */ enum fw_opt { FW_OPT_UEVENT = BIT(0), FW_OPT_NOWAIT = BIT(1), FW_OPT_USERHELPER = BIT(2), FW_OPT_NO_WARN = BIT(3), FW_OPT_NOCACHE = BIT(4), FW_OPT_NOFALLBACK_SYSFS = BIT(5), FW_OPT_FALLBACK_PLATFORM = BIT(6), FW_OPT_PARTIAL = BIT(7), }; enum fw_status { FW_STATUS_UNKNOWN, FW_STATUS_LOADING, FW_STATUS_DONE, FW_STATUS_ABORTED, }; /* * Concurrent request_firmware() for the same firmware need to be * serialized. struct fw_state is simple state machine which hold the * state of the firmware loading. */ struct fw_state { struct completion completion; enum fw_status status; }; struct fw_priv { struct kref ref; struct list_head list; struct firmware_cache *fwc; struct fw_state fw_st; void *data; size_t size; size_t allocated_size; size_t offset; u32 opt_flags; #ifdef CONFIG_FW_LOADER_PAGED_BUF bool is_paged_buf; struct page **pages; int nr_pages; int page_array_size; #endif #ifdef CONFIG_FW_LOADER_USER_HELPER bool need_uevent; struct list_head pending_list; #endif const char *fw_name; }; extern struct mutex fw_lock; extern struct firmware_cache fw_cache; extern bool fw_load_abort_all; static inline bool __fw_state_check(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; return fw_st->status == status; } static inline int __fw_state_wait_common(struct fw_priv *fw_priv, long timeout) { struct fw_state *fw_st = &fw_priv->fw_st; long ret; ret = wait_for_completion_killable_timeout(&fw_st->completion, timeout); if (ret != 0 && fw_st->status == FW_STATUS_ABORTED) return -ENOENT; if (!ret) return -ETIMEDOUT; return ret < 0 ? ret : 0; } static inline void __fw_state_set(struct fw_priv *fw_priv, enum fw_status status) { struct fw_state *fw_st = &fw_priv->fw_st; WRITE_ONCE(fw_st->status, status); if (status == FW_STATUS_DONE || status == FW_STATUS_ABORTED) { #ifdef CONFIG_FW_LOADER_USER_HELPER /* * Doing this here ensures that the fw_priv is deleted from * the pending list in all abort/done paths. */ list_del_init(&fw_priv->pending_list); #endif complete_all(&fw_st->completion); } } static inline void fw_state_aborted(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_ABORTED); } static inline bool fw_state_is_aborted(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_ABORTED); } static inline void fw_state_start(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_LOADING); } static inline void fw_state_done(struct fw_priv *fw_priv) { __fw_state_set(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_done(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_DONE); } static inline bool fw_state_is_loading(struct fw_priv *fw_priv) { return __fw_state_check(fw_priv, FW_STATUS_LOADING); } int alloc_lookup_fw_priv(const char *fw_name, struct firmware_cache *fwc, struct fw_priv **fw_priv, void *dbuf, size_t size, size_t offset, u32 opt_flags); int assign_fw(struct firmware *fw, struct device *device); void free_fw_priv(struct fw_priv *fw_priv); void fw_state_init(struct fw_priv *fw_priv); #ifdef CONFIG_FW_LOADER bool firmware_is_builtin(const struct firmware *fw); bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size); #else /* module case */ static inline bool firmware_is_builtin(const struct firmware *fw) { return false; } static inline bool firmware_request_builtin_buf(struct firmware *fw, const char *name, void *buf, size_t size) { return false; } #endif #ifdef CONFIG_FW_LOADER_PAGED_BUF void fw_free_paged_buf(struct fw_priv *fw_priv); int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed); int fw_map_paged_buf(struct fw_priv *fw_priv); bool fw_is_paged_buf(struct fw_priv *fw_priv); #else static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {} static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; } static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; } static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; } #endif #endif /* __FIRMWARE_LOADER_H */ |
6 3 3 4 4 2 3 3 3 2 1 5 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C)2006 USAGI/WIDE Project * * Author: * Kazunori Miyazawa <miyazawa@linux-ipv6.org> */ #include <crypto/internal/cipher.h> #include <crypto/internal/hash.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/module.h> static u_int32_t ks[12] = {0x01010101, 0x01010101, 0x01010101, 0x01010101, 0x02020202, 0x02020202, 0x02020202, 0x02020202, 0x03030303, 0x03030303, 0x03030303, 0x03030303}; /* * +------------------------ * | <parent tfm> * +------------------------ * | xcbc_tfm_ctx * +------------------------ * | consts (block size * 2) * +------------------------ */ struct xcbc_tfm_ctx { struct crypto_cipher *child; u8 consts[]; }; /* * +------------------------ * | <shash desc> * +------------------------ * | xcbc_desc_ctx * +------------------------ * | odds (block size) * +------------------------ * | prev (block size) * +------------------------ */ struct xcbc_desc_ctx { unsigned int len; u8 odds[]; }; #define XCBC_BLOCKSIZE 16 static int crypto_xcbc_digest_setkey(struct crypto_shash *parent, const u8 *inkey, unsigned int keylen) { struct xcbc_tfm_ctx *ctx = crypto_shash_ctx(parent); u8 *consts = ctx->consts; int err = 0; u8 key1[XCBC_BLOCKSIZE]; int bs = sizeof(key1); if ((err = crypto_cipher_setkey(ctx->child, inkey, keylen))) return err; crypto_cipher_encrypt_one(ctx->child, consts, (u8 *)ks + bs); crypto_cipher_encrypt_one(ctx->child, consts + bs, (u8 *)ks + bs * 2); crypto_cipher_encrypt_one(ctx->child, key1, (u8 *)ks); return crypto_cipher_setkey(ctx->child, key1, bs); } static int crypto_xcbc_digest_init(struct shash_desc *pdesc) { struct xcbc_desc_ctx *ctx = shash_desc_ctx(pdesc); int bs = crypto_shash_blocksize(pdesc->tfm); u8 *prev = &ctx->odds[bs]; ctx->len = 0; memset(prev, 0, bs); return 0; } static int crypto_xcbc_digest_update(struct shash_desc *pdesc, const u8 *p, unsigned int len) { struct crypto_shash *parent = pdesc->tfm; struct xcbc_tfm_ctx *tctx = crypto_shash_ctx(parent); struct xcbc_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *odds = ctx->odds; u8 *prev = odds + bs; /* checking the data can fill the block */ if ((ctx->len + len) <= bs) { memcpy(odds + ctx->len, p, len); ctx->len += len; return 0; } /* filling odds with new data and encrypting it */ memcpy(odds + ctx->len, p, bs - ctx->len); len -= bs - ctx->len; p += bs - ctx->len; crypto_xor(prev, odds, bs); crypto_cipher_encrypt_one(tfm, prev, prev); /* clearing the length */ ctx->len = 0; /* encrypting the rest of data */ while (len > bs) { crypto_xor(prev, p, bs); crypto_cipher_encrypt_one(tfm, prev, prev); p += bs; len -= bs; } /* keeping the surplus of blocksize */ if (len) { memcpy(odds, p, len); ctx->len = len; } return 0; } static int crypto_xcbc_digest_final(struct shash_desc *pdesc, u8 *out) { struct crypto_shash *parent = pdesc->tfm; struct xcbc_tfm_ctx *tctx = crypto_shash_ctx(parent); struct xcbc_desc_ctx *ctx = shash_desc_ctx(pdesc); struct crypto_cipher *tfm = tctx->child; int bs = crypto_shash_blocksize(parent); u8 *odds = ctx->odds; u8 *prev = odds + bs; unsigned int offset = 0; if (ctx->len != bs) { unsigned int rlen; u8 *p = odds + ctx->len; *p = 0x80; p++; rlen = bs - ctx->len -1; if (rlen) memset(p, 0, rlen); offset += bs; } crypto_xor(prev, odds, bs); crypto_xor(prev, &tctx->consts[offset], bs); crypto_cipher_encrypt_one(tfm, out, prev); return 0; } static int xcbc_init_tfm(struct crypto_tfm *tfm) { struct crypto_cipher *cipher; struct crypto_instance *inst = (void *)tfm->__crt_alg; struct crypto_cipher_spawn *spawn = crypto_instance_ctx(inst); struct xcbc_tfm_ctx *ctx = crypto_tfm_ctx(tfm); cipher = crypto_spawn_cipher(spawn); if (IS_ERR(cipher)) return PTR_ERR(cipher); ctx->child = cipher; return 0; }; static void xcbc_exit_tfm(struct crypto_tfm *tfm) { struct xcbc_tfm_ctx *ctx = crypto_tfm_ctx(tfm); crypto_free_cipher(ctx->child); } static int xcbc_create(struct crypto_template *tmpl, struct rtattr **tb) { struct shash_instance *inst; struct crypto_cipher_spawn *spawn; struct crypto_alg *alg; u32 mask; int err; err = crypto_check_attr_type(tb, CRYPTO_ALG_TYPE_SHASH, &mask); if (err) return err; inst = kzalloc(sizeof(*inst) + sizeof(*spawn), GFP_KERNEL); if (!inst) return -ENOMEM; spawn = shash_instance_ctx(inst); err = crypto_grab_cipher(spawn, shash_crypto_instance(inst), crypto_attr_alg_name(tb[1]), 0, mask); if (err) goto err_free_inst; alg = crypto_spawn_cipher_alg(spawn); err = -EINVAL; if (alg->cra_blocksize != XCBC_BLOCKSIZE) goto err_free_inst; err = crypto_inst_setname(shash_crypto_instance(inst), tmpl->name, alg); if (err) goto err_free_inst; inst->alg.base.cra_priority = alg->cra_priority; inst->alg.base.cra_blocksize = alg->cra_blocksize; inst->alg.base.cra_ctxsize = sizeof(struct xcbc_tfm_ctx) + alg->cra_blocksize * 2; inst->alg.digestsize = alg->cra_blocksize; inst->alg.descsize = sizeof(struct xcbc_desc_ctx) + alg->cra_blocksize * 2; inst->alg.base.cra_init = xcbc_init_tfm; inst->alg.base.cra_exit = xcbc_exit_tfm; inst->alg.init = crypto_xcbc_digest_init; inst->alg.update = crypto_xcbc_digest_update; inst->alg.final = crypto_xcbc_digest_final; inst->alg.setkey = crypto_xcbc_digest_setkey; inst->free = shash_free_singlespawn_instance; err = shash_register_instance(tmpl, inst); if (err) { err_free_inst: shash_free_singlespawn_instance(inst); } return err; } static struct crypto_template crypto_xcbc_tmpl = { .name = "xcbc", .create = xcbc_create, .module = THIS_MODULE, }; static int __init crypto_xcbc_module_init(void) { return crypto_register_template(&crypto_xcbc_tmpl); } static void __exit crypto_xcbc_module_exit(void) { crypto_unregister_template(&crypto_xcbc_tmpl); } subsys_initcall(crypto_xcbc_module_init); module_exit(crypto_xcbc_module_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("XCBC keyed hash algorithm"); MODULE_ALIAS_CRYPTO("xcbc"); MODULE_IMPORT_NS(CRYPTO_INTERNAL); |
42 22 22 22 22 22 22 22 22 22 42 39 43 43 43 43 42 42 3 41 42 38 41 42 26 42 22 5 43 42 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Kernel-based Virtual Machine driver for Linux * * This module enables machines with Intel VT-x extensions to run virtual * machines without emulation or binary translation. * * MMU support * * Copyright (C) 2006 Qumranet, Inc. * Copyright 2010 Red Hat, Inc. and/or its affiliates. * * Authors: * Yaniv Kamay <yaniv@qumranet.com> * Avi Kivity <avi@qumranet.com> */ /* * The MMU needs to be able to access/walk 32-bit and 64-bit guest page tables, * as well as guest EPT tables, so the code in this file is compiled thrice, * once per guest PTE type. The per-type defines are #undef'd at the end. */ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 #define FNAME(name) paging##64_##name #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #define PT_MAX_FULL_LEVELS 2 #endif #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 #define FNAME(name) paging##32_##name #define PT_LEVEL_BITS 10 #define PT_MAX_FULL_LEVELS 2 #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT #define PT_HAVE_ACCESSED_DIRTY(mmu) true #define PT32_DIR_PSE36_SIZE 4 #define PT32_DIR_PSE36_SHIFT 13 #define PT32_DIR_PSE36_MASK \ (((1ULL << PT32_DIR_PSE36_SIZE) - 1) << PT32_DIR_PSE36_SHIFT) #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 #define guest_walker guest_walkerEPT #define FNAME(name) ept_##name #define PT_LEVEL_BITS 9 #define PT_GUEST_DIRTY_SHIFT 9 #define PT_GUEST_ACCESSED_SHIFT 8 #define PT_HAVE_ACCESSED_DIRTY(mmu) (!(mmu)->cpu_role.base.ad_disabled) #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL #else #error Invalid PTTYPE value #endif /* Common logic, but per-type values. These also need to be undefined. */ #define PT_BASE_ADDR_MASK ((pt_element_t)(((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1))) #define PT_LVL_ADDR_MASK(lvl) __PT_LVL_ADDR_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_LVL_OFFSET_MASK(lvl) __PT_LVL_OFFSET_MASK(PT_BASE_ADDR_MASK, lvl, PT_LEVEL_BITS) #define PT_INDEX(addr, lvl) __PT_INDEX(addr, lvl, PT_LEVEL_BITS) #define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) #define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PG_LEVEL_4K) /* * The guest_walker structure emulates the behavior of the hardware page * table walker. */ struct guest_walker { int level; unsigned max_level; gfn_t table_gfn[PT_MAX_FULL_LEVELS]; pt_element_t ptes[PT_MAX_FULL_LEVELS]; pt_element_t prefetch_ptes[PTE_PREFETCH_NUM]; gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS]; bool pte_writable[PT_MAX_FULL_LEVELS]; unsigned int pt_access[PT_MAX_FULL_LEVELS]; unsigned int pte_access; gfn_t gfn; struct x86_exception fault; }; #if PTTYPE == 32 static inline gfn_t pse36_gfn_delta(u32 gpte) { int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT; return (gpte & PT32_DIR_PSE36_MASK) << shift; } #endif static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) { return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); mask = (unsigned)~ACC_WRITE_MASK; /* Allow write access to dirty gptes */ mask |= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) & PT_WRITABLE_MASK; *access &= mask; } static inline int FNAME(is_present_gpte)(unsigned long pte) { #if PTTYPE != PTTYPE_EPT return pte & PT_PRESENT_MASK; #else return pte & 7; #endif } static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte) { #if PTTYPE != PTTYPE_EPT return false; #else return __is_bad_mt_xwr(rsvd_check, gpte); #endif } static bool FNAME(is_rsvd_bits_set)(struct kvm_mmu *mmu, u64 gpte, int level) { return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level) || FNAME(is_bad_mt_xwr)(&mmu->guest_rsvd_check, gpte); } static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, u64 gpte) { if (!FNAME(is_present_gpte)(gpte)) goto no_present; /* Prefetch only accessed entries (unless A/D bits are disabled). */ if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; if (FNAME(is_rsvd_bits_set)(vcpu->arch.mmu, gpte, PG_LEVEL_4K)) goto no_present; return false; no_present: drop_spte(vcpu->kvm, spte); return true; } /* * For PTTYPE_EPT, a page table can be executable but not readable * on supported processors. Therefore, set_spte does not automatically * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK * to signify readability since it isn't used in the EPT case */ static inline unsigned FNAME(gpte_access)(u64 gpte) { unsigned access; #if PTTYPE == PTTYPE_EPT access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) | ((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) | ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0); #else BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK); BUILD_BUG_ON(ACC_EXEC_MASK != 1); access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK); /* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */ access ^= (gpte >> PT64_NX_SHIFT); #endif return access; } static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct guest_walker *walker, gpa_t addr, int write_fault) { unsigned level, index; pt_element_t pte, orig_pte; pt_element_t __user *ptep_user; gfn_t table_gfn; int ret; /* dirty/accessed bits are not supported, so no need to update them */ if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return 0; for (level = walker->max_level; level >= walker->level; --level) { pte = orig_pte = walker->ptes[level - 1]; table_gfn = walker->table_gfn[level - 1]; ptep_user = walker->ptep_user[level - 1]; index = offset_in_page(ptep_user) / sizeof(pt_element_t); if (!(pte & PT_GUEST_ACCESSED_MASK)) { trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); pte |= PT_GUEST_ACCESSED_MASK; } if (level == walker->level && write_fault && !(pte & PT_GUEST_DIRTY_MASK)) { trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); #if PTTYPE == PTTYPE_EPT if (kvm_x86_ops.nested_ops->write_log_dirty(vcpu, addr)) return -EINVAL; #endif pte |= PT_GUEST_DIRTY_MASK; } if (pte == orig_pte) continue; /* * If the slot is read-only, simply do not process the accessed * and dirty bits. This is the correct thing to do if the slot * is ROM, and page tables in read-as-ROM/write-as-MMIO slots * are only supported if the accessed and dirty bits are already * set in the ROM (so that MMIO writes are never needed). * * Note that NPT does not allow this at all and faults, since * it always wants nested page table entries for the guest * page tables to be writable. And EPT works but will simply * overwrite the read-only memory to set the accessed and dirty * bits. */ if (unlikely(!walker->pte_writable[level - 1])) continue; ret = __try_cmpxchg_user(ptep_user, &orig_pte, pte, fault); if (ret) return ret; kvm_vcpu_mark_page_dirty(vcpu, table_gfn); walker->ptes[level - 1] = pte; } return 0; } static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte) { unsigned pkeys = 0; #if PTTYPE == 64 pte_t pte = {.pte = gpte}; pkeys = pte_flags_pkey(pte_flags(pte)); #endif return pkeys; } static inline bool FNAME(is_last_gpte)(struct kvm_mmu *mmu, unsigned int level, unsigned int gpte) { /* * For EPT and PAE paging (both variants), bit 7 is either reserved at * all level or indicates a huge page (ignoring CR3/EPTP). In either * case, bit 7 being set terminates the walk. */ #if PTTYPE == 32 /* * 32-bit paging requires special handling because bit 7 is ignored if * CR4.PSE=0, not reserved. Clear bit 7 in the gpte if the level is * greater than the last level for which bit 7 is the PAGE_SIZE bit. * * The RHS has bit 7 set iff level < (2 + PSE). If it is clear, bit 7 * is not reserved and does not indicate a large page at this level, * so clear PT_PAGE_SIZE_MASK in gpte if that is the case. */ gpte &= level - (PT32_ROOT_LEVEL + mmu->cpu_role.ext.cr4_pse); #endif /* * PG_LEVEL_4K always terminates. The RHS has bit 7 set * iff level <= PG_LEVEL_4K, which for our purpose means * level == PG_LEVEL_4K; set PT_PAGE_SIZE_MASK in gpte then. */ gpte |= level - PG_LEVEL_4K - 1; return gpte & PT_PAGE_SIZE_MASK; } /* * Fetch a guest pte for a guest virtual address, or for an L2's GPA. */ static int FNAME(walk_addr_generic)(struct guest_walker *walker, struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t addr, u64 access) { int ret; pt_element_t pte; pt_element_t __user *ptep_user; gfn_t table_gfn; u64 pt_access, pte_access; unsigned index, accessed_dirty, pte_pkey; u64 nested_access; gpa_t pte_gpa; bool have_ad; int offset; u64 walk_nx_mask = 0; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; const int fetch_fault = access & PFERR_FETCH_MASK; u16 errcode = 0; gpa_t real_gpa; gfn_t gfn; trace_kvm_mmu_pagetable_walk(addr, access); retry_walk: walker->level = mmu->cpu_role.base.level; pte = kvm_mmu_get_guest_pgd(vcpu, mmu); have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 walk_nx_mask = 1ULL << PT64_NX_SHIFT; if (walker->level == PT32E_ROOT_LEVEL) { pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!FNAME(is_present_gpte)(pte)) goto error; --walker->level; } #endif walker->max_level = walker->level; /* * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging * by the MOV to CR instruction are treated as reads and do not cause the * processor to set the dirty flag in any EPT paging-structure entry. */ nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; pte_access = ~0; /* * Queue a page fault for injection if this assertion fails, as callers * assume that walker.fault contains sane info on a walk failure. I.e. * avoid making the situation worse by inducing even worse badness * between when the assertion fails and when KVM kicks the vCPU out to * userspace (because the VM is bugged). */ if (KVM_BUG_ON(is_long_mode(vcpu) && !is_pae(vcpu), vcpu->kvm)) goto error; ++walker->level; do { struct kvm_memory_slot *slot; unsigned long host_addr; pt_access = pte_access; --walker->level; index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); offset = index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(table_gfn), nested_access, &walker->fault); /* * FIXME: This can happen if emulation (for of an INS/OUTS * instruction) triggers a nested page fault. The exit * qualification / exit info field will incorrectly have * "guest page access" as the nested page fault's cause, * instead of "guest page structure access". To fix this, * the x86_exception struct should be augmented with enough * information to fix the exit_qualification or exit_info_1 * fields. */ if (unlikely(real_gpa == INVALID_GPA)) return 0; slot = kvm_vcpu_gfn_to_memslot(vcpu, gpa_to_gfn(real_gpa)); if (!kvm_is_visible_memslot(slot)) goto error; host_addr = gfn_to_hva_memslot_prot(slot, gpa_to_gfn(real_gpa), &walker->pte_writable[walker->level - 1]); if (unlikely(kvm_is_error_hva(host_addr))) goto error; ptep_user = (pt_element_t __user *)((void *)host_addr + offset); if (unlikely(__get_user(pte, ptep_user))) goto error; walker->ptep_user[walker->level - 1] = ptep_user; trace_kvm_mmu_paging_element(pte, walker->level); /* * Inverting the NX it lets us AND it like other * permission bits. */ pte_access = pt_access & (pte ^ walk_nx_mask); if (unlikely(!FNAME(is_present_gpte)(pte))) goto error; if (unlikely(FNAME(is_rsvd_bits_set)(mmu, pte, walker->level))) { errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK; goto error; } walker->ptes[walker->level - 1] = pte; /* Convert to ACC_*_MASK flags for struct guest_walker. */ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask); } while (!FNAME(is_last_gpte)(mmu, walker->level, pte)); pte_pkey = FNAME(gpte_pkeys)(vcpu, pte); accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0; /* Convert to ACC_*_MASK flags for struct guest_walker. */ walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask); errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access); if (unlikely(errcode)) goto error; gfn = gpte_to_gfn_lvl(pte, walker->level); gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT; #if PTTYPE == 32 if (walker->level > PG_LEVEL_4K && is_cpuid_PSE36()) gfn += pse36_gfn_delta(pte); #endif real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(gfn), access, &walker->fault); if (real_gpa == INVALID_GPA) return 0; walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. * For modes without A/D bits support accessed_dirty will be * always clear. */ accessed_dirty &= pte >> (PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT); if (unlikely(!accessed_dirty)) { ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, addr, write_fault); if (unlikely(ret < 0)) goto error; else if (ret) goto retry_walk; } return 1; error: errcode |= write_fault | user_fault; if (fetch_fault && (is_efer_nx(mmu) || is_cr4_smep(mmu))) errcode |= PFERR_FETCH_MASK; walker->fault.vector = PF_VECTOR; walker->fault.error_code_valid = true; walker->fault.error_code = errcode; #if PTTYPE == PTTYPE_EPT /* * Use PFERR_RSVD_MASK in error_code to tell if EPT * misconfiguration requires to be injected. The detection is * done by is_rsvd_bits_set() above. * * We set up the value of exit_qualification to inject: * [2:0] - Derive from the access bits. The exit_qualification might be * out of date if it is serving an EPT misconfiguration. * [5:3] - Calculated by the page walk of the guest EPT page tables * [7:8] - Derived from [7:8] of real exit_qualification * * The other bits are set to 0. */ if (!(errcode & PFERR_RSVD_MASK)) { vcpu->arch.exit_qualification &= (EPT_VIOLATION_GVA_IS_VALID | EPT_VIOLATION_GVA_TRANSLATED); if (write_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE; if (user_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ; if (fetch_fault) vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR; /* * Note, pte_access holds the raw RWX bits from the EPTE, not * ACC_*_MASK flags! */ vcpu->arch.exit_qualification |= (pte_access & VMX_EPT_RWX_MASK) << EPT_VIOLATION_RWX_SHIFT; } #endif walker->fault.address = addr; walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu; walker->fault.async_page_fault = false; trace_kvm_mmu_walker_error(walker->fault.error_code); return 0; } static int FNAME(walk_addr)(struct guest_walker *walker, struct kvm_vcpu *vcpu, gpa_t addr, u64 access) { return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr, access); } static bool FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, pt_element_t gpte) { struct kvm_memory_slot *slot; unsigned pte_access; gfn_t gfn; kvm_pfn_t pfn; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte)) return false; gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, pte_access & ACC_WRITE_MASK); if (!slot) return false; pfn = gfn_to_pfn_memslot_atomic(slot, gfn); if (is_error_pfn(pfn)) return false; mmu_set_spte(vcpu, slot, spte, pte_access, gfn, pfn, NULL); kvm_release_pfn_clean(pfn); return true; } static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, struct guest_walker *gw, int level) { pt_element_t curr_pte; gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1]; u64 mask; int r, index; if (level == PG_LEVEL_4K) { mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1; base_gpa = pte_gpa & ~mask; index = (pte_gpa - base_gpa) / sizeof(pt_element_t); r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa, gw->prefetch_ptes, sizeof(gw->prefetch_ptes)); curr_pte = gw->prefetch_ptes[index]; } else r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &curr_pte, sizeof(curr_pte)); return r || curr_pte != gw->ptes[level - 1]; } static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw, u64 *sptep) { struct kvm_mmu_page *sp; pt_element_t *gptep = gw->prefetch_ptes; u64 *spte; int i; sp = sptep_to_sp(sptep); if (sp->role.level > PG_LEVEL_4K) return; /* * If addresses are being invalidated, skip prefetching to avoid * accidentally prefetching those addresses. */ if (unlikely(vcpu->kvm->mmu_invalidate_in_progress)) return; if (sp->role.direct) return __direct_pte_prefetch(vcpu, sp, sptep); i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - 1); spte = sp->spt + i; for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) { if (spte == sptep) continue; if (is_shadow_present_pte(*spte)) continue; if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i])) break; } } /* * Fetch a shadow pte for a specific level in the paging hierarchy. * If the guest tries to write a write-protected page, we need to * emulate this operation, return 1 to indicate this case. */ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, struct guest_walker *gw) { struct kvm_mmu_page *sp = NULL; struct kvm_shadow_walk_iterator it; unsigned int direct_access, access; int top_level, ret; gfn_t base_gfn = fault->gfn; WARN_ON_ONCE(gw->gfn != base_gfn); direct_access = gw->pte_access; top_level = vcpu->arch.mmu->cpu_role.base.level; if (top_level == PT32E_ROOT_LEVEL) top_level = PT32_ROOT_LEVEL; /* * Verify that the top-level gpte is still there. Since the page * is a root page, it is either write protected (and cannot be * changed from now on) or it is invalid (in which case, we don't * really care if it changes underneath us after this point). */ if (FNAME(gpte_changed)(vcpu, gw, top_level)) goto out_gpte_changed; if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa))) goto out_gpte_changed; /* * Load a new root and retry the faulting instruction in the extremely * unlikely scenario that the guest root gfn became visible between * loading a dummy root and handling the resulting page fault, e.g. if * userspace create a memslot in the interim. */ if (unlikely(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa))) { kvm_make_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu); goto out_gpte_changed; } for_each_shadow_entry(vcpu, fault->addr, it) { gfn_t table_gfn; clear_sp_write_flooding_count(it.sptep); if (it.level == gw->level) break; table_gfn = gw->table_gfn[it.level - 2]; access = gw->pt_access[it.level - 2]; sp = kvm_mmu_get_child_sp(vcpu, it.sptep, table_gfn, false, access); if (sp != ERR_PTR(-EEXIST)) { /* * We must synchronize the pagetable before linking it * because the guest doesn't need to flush tlb when * the gpte is changed from non-present to present. * Otherwise, the guest may use the wrong mapping. * * For PG_LEVEL_4K, kvm_mmu_get_page() has already * synchronized it transiently via kvm_sync_page(). * * For higher level pagetable, we synchronize it via * the slower mmu_sync_children(). If it needs to * break, some progress has been made; return * RET_PF_RETRY and retry on the next #PF. * KVM_REQ_MMU_SYNC is not necessary but it * expedites the process. */ if (sp->unsync_children && mmu_sync_children(vcpu, sp, false)) return RET_PF_RETRY; } /* * Verify that the gpte in the page we've just write * protected is still there. */ if (FNAME(gpte_changed)(vcpu, gw, it.level - 1)) goto out_gpte_changed; if (sp != ERR_PTR(-EEXIST)) link_shadow_page(vcpu, it.sptep, sp); if (fault->write && table_gfn == fault->gfn) fault->write_fault_to_shadow_pgtable = true; } /* * Adjust the hugepage size _after_ resolving indirect shadow pages. * KVM doesn't support mapping hugepages into the guest for gfns that * are being shadowed by KVM, i.e. allocating a new shadow page may * affect the allowed hugepage size. */ kvm_mmu_hugepage_adjust(vcpu, fault); trace_kvm_mmu_spte_requested(fault); for (; shadow_walk_okay(&it); shadow_walk_next(&it)) { /* * We cannot overwrite existing page tables with an NX * large page, as the leaf could be executable. */ if (fault->nx_huge_page_workaround_enabled) disallowed_hugepage_adjust(fault, *it.sptep, it.level); base_gfn = gfn_round_for_level(fault->gfn, it.level); if (it.level == fault->goal_level) break; validate_direct_spte(vcpu, it.sptep, direct_access); sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, direct_access); if (sp == ERR_PTR(-EEXIST)) continue; link_shadow_page(vcpu, it.sptep, sp); if (fault->huge_page_disallowed) account_nx_huge_page(vcpu->kvm, sp, fault->req_level >= it.level); } if (WARN_ON_ONCE(it.level != fault->goal_level)) return -EFAULT; ret = mmu_set_spte(vcpu, fault->slot, it.sptep, gw->pte_access, base_gfn, fault->pfn, fault); if (ret == RET_PF_SPURIOUS) return ret; FNAME(pte_prefetch)(vcpu, gw, it.sptep); return ret; out_gpte_changed: return RET_PF_RETRY; } /* * Page fault handler. There are several causes for a page fault: * - there is no shadow pte for the guest pte * - write access through a shadow pte marked read only so that we can set * the dirty bit * - write access to a shadow pte marked read only so we can update the page * dirty bitmap, when userspace requests it * - mmio access; in this case we will never install a present shadow pte * - normal guest page fault due to the guest pte marked not present, not * writable, or not executable * * Returns: 1 if we need to emulate the instruction, 0 otherwise, or * a negative value on error. */ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct guest_walker walker; int r; WARN_ON_ONCE(fault->is_tdp); /* * Look up the guest pte for the faulting address. * If PFEC.RSVD is set, this is a shadow page fault. * The bit needs to be cleared before walking guest page tables. */ r = FNAME(walk_addr)(&walker, vcpu, fault->addr, fault->error_code & ~PFERR_RSVD_MASK); /* * The page is not mapped by the guest. Let the guest handle it. */ if (!r) { if (!fault->prefetch) kvm_inject_emulated_page_fault(vcpu, &walker.fault); return RET_PF_RETRY; } fault->gfn = walker.gfn; fault->max_level = walker.level; fault->slot = kvm_vcpu_gfn_to_memslot(vcpu, fault->gfn); if (page_fault_handle_page_track(vcpu, fault)) { shadow_page_table_clear_flood(vcpu, fault->addr); return RET_PF_EMULATE; } r = mmu_topup_memory_caches(vcpu, true); if (r) return r; r = kvm_faultin_pfn(vcpu, fault, walker.pte_access); if (r != RET_PF_CONTINUE) return r; /* * Do not change pte_access if the pfn is a mmio page, otherwise * we will cache the incorrect access into mmio spte. */ if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) && !is_cr0_wp(vcpu->arch.mmu) && !fault->user && fault->slot) { walker.pte_access |= ACC_WRITE_MASK; walker.pte_access &= ~ACC_USER_MASK; /* * If we converted a user page to a kernel page, * so that the kernel can write to it when cr0.wp=0, * then we should prevent the kernel from executing it * if SMEP is enabled. */ if (is_cr4_smep(vcpu->arch.mmu)) walker.pte_access &= ~ACC_EXEC_MASK; } r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); if (is_page_fault_stale(vcpu, fault)) goto out_unlock; r = make_mmu_pages_available(vcpu); if (r) goto out_unlock; r = FNAME(fetch)(vcpu, fault, &walker); out_unlock: write_unlock(&vcpu->kvm->mmu_lock); kvm_release_pfn_clean(fault->pfn); return r; } static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp) { int offset = 0; WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K); if (PTTYPE == 32) offset = sp->role.quadrant << SPTE_LEVEL_BITS; return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t); } /* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gpa_t addr, u64 access, struct x86_exception *exception) { struct guest_walker walker; gpa_t gpa = INVALID_GPA; int r; #ifndef CONFIG_X86_64 /* A 64-bit GVA should be impossible on 32-bit KVM. */ WARN_ON_ONCE((addr >> 32) && mmu == vcpu->arch.walk_mmu); #endif r = FNAME(walk_addr_generic)(&walker, vcpu, mmu, addr, access); if (r) { gpa = gfn_to_gpa(walker.gfn); gpa |= addr & ~PAGE_MASK; } else if (exception) *exception = walker.fault; return gpa; } /* * Using the information in sp->shadowed_translation (kvm_mmu_page_get_gfn()) is * safe because: * - The spte has a reference to the struct page, so the pfn for a given gfn * can't change unless all sptes pointing to it are nuked first. * * Returns * < 0: failed to sync spte * 0: the spte is synced and no tlb flushing is required * > 0: the spte is synced and tlb flushing is required */ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int i) { bool host_writable; gpa_t first_pte_gpa; u64 *sptep, spte; struct kvm_memory_slot *slot; unsigned pte_access; pt_element_t gpte; gpa_t pte_gpa; gfn_t gfn; if (WARN_ON_ONCE(!sp->spt[i])) return 0; first_pte_gpa = FNAME(get_level1_sp_gpa)(sp); pte_gpa = first_pte_gpa + i * sizeof(pt_element_t); if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte, sizeof(pt_element_t))) return -1; if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) return 1; gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(gpte); FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access)) return 0; /* * Drop the SPTE if the new protections would result in a RWX=0 * SPTE or if the gfn is changing. The RWX=0 case only affects * EPT with execute-only support, i.e. EPT without an effective * "present" bit, as all other paging modes will create a * read-only SPTE if pte_access is zero. */ if ((!pte_access && !shadow_present_mask) || gfn != kvm_mmu_page_get_gfn(sp, i)) { drop_spte(vcpu->kvm, &sp->spt[i]); return 1; } /* * Do nothing if the permissions are unchanged. The existing SPTE is * still, and prefetch_invalid_gpte() has verified that the A/D bits * are set in the "new" gPTE, i.e. there is no danger of missing an A/D * update due to A/D bits being set in the SPTE but not the gPTE. */ if (kvm_mmu_page_get_access(sp, i) == pte_access) return 0; /* Update the shadowed access bits in case they changed. */ kvm_mmu_page_set_access(sp, i, pte_access); sptep = &sp->spt[i]; spte = *sptep; host_writable = spte & shadow_host_writable_mask; slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); make_spte(vcpu, sp, slot, pte_access, gfn, spte_to_pfn(spte), spte, true, false, host_writable, &spte); return mmu_spte_update(sptep, spte); } #undef pt_element_t #undef guest_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX #undef PT_LVL_ADDR_MASK #undef PT_LVL_OFFSET_MASK #undef PT_LEVEL_BITS #undef PT_MAX_FULL_LEVELS #undef gpte_to_gfn #undef gpte_to_gfn_lvl #undef PT_GUEST_ACCESSED_MASK #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT #undef PT_GUEST_ACCESSED_SHIFT #undef PT_HAVE_ACCESSED_DIRTY |
50 51 52 52 52 6 46 4 46 2 52 1 52 34 51 2 35 36 52 52 49 2 46 6 51 52 52 52 52 2 52 57 57 56 56 57 3 3 3 3 3 3 3 3 3 3 5 6 6 6 6 6 6 6 1 1 1 1 1 1 6 6 6 6 5 6 6 3 3 3 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 | // SPDX-License-Identifier: GPL-2.0-only /* * Scanning implementation * * Copyright 2003, Jouni Malinen <jkmaline@cc.hut.fi> * Copyright 2004, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright 2016-2017 Intel Deutschland GmbH * Copyright (C) 2018-2023 Intel Corporation */ #include <linux/if_arp.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <net/sch_generic.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/random.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "mesh.h" #define IEEE80211_PROBE_DELAY (HZ / 33) #define IEEE80211_CHANNEL_TIME (HZ / 33) #define IEEE80211_PASSIVE_CHANNEL_TIME (HZ / 9) void ieee80211_rx_bss_put(struct ieee80211_local *local, struct ieee80211_bss *bss) { if (!bss) return; cfg80211_put_bss(local->hw.wiphy, container_of((void *)bss, struct cfg80211_bss, priv)); } static bool is_uapsd_supported(struct ieee802_11_elems *elems) { u8 qos_info; if (elems->wmm_info && elems->wmm_info_len == 7 && elems->wmm_info[5] == 1) qos_info = elems->wmm_info[6]; else if (elems->wmm_param && elems->wmm_param_len == 24 && elems->wmm_param[5] == 1) qos_info = elems->wmm_param[6]; else /* no valid wmm information or parameter element found */ return false; return qos_info & IEEE80211_WMM_IE_AP_QOSINFO_UAPSD; } struct inform_bss_update_data { struct ieee80211_rx_status *rx_status; bool beacon; }; void ieee80211_inform_bss(struct wiphy *wiphy, struct cfg80211_bss *cbss, const struct cfg80211_bss_ies *ies, void *data) { struct ieee80211_local *local = wiphy_priv(wiphy); struct inform_bss_update_data *update_data = data; struct ieee80211_bss *bss = (void *)cbss->priv; struct ieee80211_rx_status *rx_status; struct ieee802_11_elems *elems; int clen, srlen; /* This happens while joining an IBSS */ if (!update_data) return; elems = ieee802_11_parse_elems(ies->data, ies->len, false, NULL); if (!elems) return; rx_status = update_data->rx_status; if (update_data->beacon) bss->device_ts_beacon = rx_status->device_timestamp; else bss->device_ts_presp = rx_status->device_timestamp; if (elems->parse_error) { if (update_data->beacon) bss->corrupt_data |= IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data |= IEEE80211_BSS_CORRUPT_PROBE_RESP; } else { if (update_data->beacon) bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_BEACON; else bss->corrupt_data &= ~IEEE80211_BSS_CORRUPT_PROBE_RESP; } /* save the ERP value so that it is available at association time */ if (elems->erp_info && (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_ERP))) { bss->erp_value = elems->erp_info[0]; bss->has_erp_value = true; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_ERP; } /* replace old supported rates if we get new values */ if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_RATES)) { srlen = 0; if (elems->supp_rates) { clen = IEEE80211_MAX_SUPP_RATES; if (clen > elems->supp_rates_len) clen = elems->supp_rates_len; memcpy(bss->supp_rates, elems->supp_rates, clen); srlen += clen; } if (elems->ext_supp_rates) { clen = IEEE80211_MAX_SUPP_RATES - srlen; if (clen > elems->ext_supp_rates_len) clen = elems->ext_supp_rates_len; memcpy(bss->supp_rates + srlen, elems->ext_supp_rates, clen); srlen += clen; } if (srlen) { bss->supp_rates_len = srlen; if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_RATES; } } if (!elems->parse_error || !(bss->valid_data & IEEE80211_BSS_VALID_WMM)) { bss->wmm_used = elems->wmm_param || elems->wmm_info; bss->uapsd_supported = is_uapsd_supported(elems); if (!elems->parse_error) bss->valid_data |= IEEE80211_BSS_VALID_WMM; } if (update_data->beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; if (!(rx_status->encoding == RX_ENC_HT) && !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } if (elems->vht_cap_elem) bss->vht_cap_info = le32_to_cpu(elems->vht_cap_elem->vht_cap_info); else bss->vht_cap_info = 0; kfree(elems); } struct ieee80211_bss * ieee80211_bss_info_update(struct ieee80211_local *local, struct ieee80211_rx_status *rx_status, struct ieee80211_mgmt *mgmt, size_t len, struct ieee80211_channel *channel) { bool beacon = ieee80211_is_beacon(mgmt->frame_control) || ieee80211_is_s1g_beacon(mgmt->frame_control); struct cfg80211_bss *cbss; struct inform_bss_update_data update_data = { .rx_status = rx_status, .beacon = beacon, }; struct cfg80211_inform_bss bss_meta = { .boottime_ns = rx_status->boottime_ns, .drv_data = (void *)&update_data, }; bool signal_valid; struct ieee80211_sub_if_data *scan_sdata; if (rx_status->flag & RX_FLAG_NO_SIGNAL_VAL) bss_meta.signal = 0; /* invalid signal indication */ else if (ieee80211_hw_check(&local->hw, SIGNAL_DBM)) bss_meta.signal = rx_status->signal * 100; else if (ieee80211_hw_check(&local->hw, SIGNAL_UNSPEC)) bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.chan = channel; rcu_read_lock(); scan_sdata = rcu_dereference(local->scan_sdata); if (scan_sdata && scan_sdata->vif.type == NL80211_IFTYPE_STATION && scan_sdata->vif.cfg.assoc && ieee80211_have_rx_timestamp(rx_status)) { bss_meta.parent_tsf = ieee80211_calculate_rx_timestamp(local, rx_status, len + FCS_LEN, 24); ether_addr_copy(bss_meta.parent_bssid, scan_sdata->vif.bss_conf.bssid); } rcu_read_unlock(); cbss = cfg80211_inform_bss_frame_data(local->hw.wiphy, &bss_meta, mgmt, len, GFP_ATOMIC); if (!cbss) return NULL; /* In case the signal is invalid update the status */ signal_valid = channel == cbss->channel; if (!signal_valid) rx_status->flag |= RX_FLAG_NO_SIGNAL_VAL; return (void *)cbss->priv; } static bool ieee80211_scan_accept_presp(struct ieee80211_sub_if_data *sdata, u32 scan_flags, const u8 *da) { if (!sdata) return false; /* accept broadcast for OCE */ if (scan_flags & NL80211_SCAN_FLAG_ACCEPT_BCAST_PROBE_RESP && is_broadcast_ether_addr(da)) return true; if (scan_flags & NL80211_SCAN_FLAG_RANDOM_ADDR) return true; return ether_addr_equal(da, sdata->vif.addr); } void ieee80211_scan_rx(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); struct ieee80211_sub_if_data *sdata1, *sdata2; struct ieee80211_mgmt *mgmt = (void *)skb->data; struct ieee80211_bss *bss; struct ieee80211_channel *channel; size_t min_hdr_len = offsetof(struct ieee80211_mgmt, u.probe_resp.variable); if (!ieee80211_is_probe_resp(mgmt->frame_control) && !ieee80211_is_beacon(mgmt->frame_control) && !ieee80211_is_s1g_beacon(mgmt->frame_control)) return; if (ieee80211_is_s1g_beacon(mgmt->frame_control)) { if (ieee80211_is_s1g_short_beacon(mgmt->frame_control)) min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_short_beacon.variable); else min_hdr_len = offsetof(struct ieee80211_ext, u.s1g_beacon); } if (skb->len < min_hdr_len) return; sdata1 = rcu_dereference(local->scan_sdata); sdata2 = rcu_dereference(local->sched_scan_sdata); if (likely(!sdata1 && !sdata2)) return; if (test_and_clear_bit(SCAN_BEACON_WAIT, &local->scanning)) { /* * we were passive scanning because of radar/no-IR, but * the beacon/proberesp rx gives us an opportunity to upgrade * to active scan */ set_bit(SCAN_BEACON_DONE, &local->scanning); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } if (ieee80211_is_probe_resp(mgmt->frame_control)) { struct cfg80211_scan_request *scan_req; struct cfg80211_sched_scan_request *sched_scan_req; u32 scan_req_flags = 0, sched_scan_req_flags = 0; scan_req = rcu_dereference(local->scan_req); sched_scan_req = rcu_dereference(local->sched_scan_req); if (scan_req) scan_req_flags = scan_req->flags; if (sched_scan_req) sched_scan_req_flags = sched_scan_req->flags; /* ignore ProbeResp to foreign address or non-bcast (OCE) * unless scanning with randomised address */ if (!ieee80211_scan_accept_presp(sdata1, scan_req_flags, mgmt->da) && !ieee80211_scan_accept_presp(sdata2, sched_scan_req_flags, mgmt->da)) return; } channel = ieee80211_get_channel_khz(local->hw.wiphy, ieee80211_rx_status_to_khz(rx_status)); if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; bss = ieee80211_bss_info_update(local, rx_status, mgmt, skb->len, channel); if (bss) ieee80211_rx_bss_put(local, bss); } static void ieee80211_prepare_scan_chandef(struct cfg80211_chan_def *chandef) { memset(chandef, 0, sizeof(*chandef)); chandef->width = NL80211_CHAN_WIDTH_20_NOHT; } /* return false if no more work */ static bool ieee80211_prep_hw_scan(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct cfg80211_scan_request *req; struct cfg80211_chan_def chandef; u8 bands_used = 0; int i, ielen, n_chans; u32 flags = 0; req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); if (test_bit(SCAN_HW_CANCELLED, &local->scanning)) return false; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { for (i = 0; i < req->n_channels; i++) { local->hw_scan_req->req.channels[i] = req->channels[i]; bands_used |= BIT(req->channels[i]->band); } n_chans = req->n_channels; } else { do { if (local->hw_scan_band == NUM_NL80211_BANDS) return false; n_chans = 0; for (i = 0; i < req->n_channels; i++) { if (req->channels[i]->band != local->hw_scan_band) continue; local->hw_scan_req->req.channels[n_chans] = req->channels[i]; n_chans++; bands_used |= BIT(req->channels[i]->band); } local->hw_scan_band++; } while (!n_chans); } local->hw_scan_req->req.n_channels = n_chans; ieee80211_prepare_scan_chandef(&chandef); if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ielen = ieee80211_build_preq_ies(sdata, (u8 *)local->hw_scan_req->req.ie, local->hw_scan_ies_bufsize, &local->hw_scan_req->ies, req->ie, req->ie_len, bands_used, req->rates, &chandef, flags); local->hw_scan_req->req.ie_len = ielen; local->hw_scan_req->req.no_cck = req->no_cck; ether_addr_copy(local->hw_scan_req->req.mac_addr, req->mac_addr); ether_addr_copy(local->hw_scan_req->req.mac_addr_mask, req->mac_addr_mask); ether_addr_copy(local->hw_scan_req->req.bssid, req->bssid); return true; } static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); bool was_scanning = local->scanning; struct cfg80211_scan_request *scan_req; struct ieee80211_sub_if_data *scan_sdata; struct ieee80211_sub_if_data *sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * It's ok to abort a not-yet-running scan (that * we have one at all will be verified by checking * local->scan_req next), but not to complete it * successfully. */ if (WARN_ON(!local->scanning && !aborted)) aborted = true; if (WARN_ON(!local->scan_req)) return; scan_sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (hw_scan && !aborted && !ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS) && ieee80211_prep_hw_scan(scan_sdata)) { int rc; rc = drv_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)), local->hw_scan_req); if (rc == 0) return; /* HW scan failed and is going to be reported as aborted, * so clear old scan info. */ memset(&local->scan_info, 0, sizeof(local->scan_info)); aborted = true; } kfree(local->hw_scan_req); local->hw_scan_req = NULL; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); local->scanning = 0; local->scan_chandef.chan = NULL; synchronize_rcu(); if (scan_req != local->int_scan_req) { local->scan_info.aborted = aborted; cfg80211_scan_done(scan_req, &local->scan_info); } /* Set power back to normal operating levels. */ ieee80211_hw_config(local, 0); if (!hw_scan && was_scanning) { ieee80211_configure_filter(local); drv_sw_scan_complete(local, scan_sdata); ieee80211_offchannel_return(local); } ieee80211_recalc_idle(local); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); /* Requeue all the work that might have been ignored while * the scan was in progress; if there was none this will * just be a no-op for the particular interface. */ list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (ieee80211_sdata_running(sdata)) wiphy_work_queue(sdata->local->hw.wiphy, &sdata->work); } if (was_scanning) ieee80211_start_next_roc(local); } void ieee80211_scan_completed(struct ieee80211_hw *hw, struct cfg80211_scan_info *info) { struct ieee80211_local *local = hw_to_local(hw); trace_api_scan_completed(local, info->aborted); set_bit(SCAN_COMPLETED, &local->scanning); if (info->aborted) set_bit(SCAN_ABORTED, &local->scanning); memcpy(&local->scan_info, info, sizeof(*info)); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); } EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { /* Software scan is not supported in multi-channel cases */ if (local->use_chanctx) return -EOPNOTSUPP; /* * Hardware/driver doesn't support hw_scan, so use software * scanning instead. First send a nullfunc frame with power save * bit on so that AP will buffer the frames for us while we are not * listening, then send probe requests to each channel and wait for * the responses. After all channels are scanned, tune back to the * original channel and send a nullfunc frame with power save bit * off to trigger the AP to send us all the buffered frames. * * Note that while local->sw_scanning is true everything else but * nullfunc frames and probe requests will be dropped in * ieee80211_tx_h_check_assoc(). */ drv_sw_scan_start(local, sdata, local->scan_addr); local->leave_oper_channel_time = jiffies; local->next_scan_state = SCAN_DECISION; local->scan_channel_idx = 0; ieee80211_offchannel_stop_vifs(local); /* ensure nullfunc is transmitted before leaving operating channel */ ieee80211_flush_queues(local, NULL, false); ieee80211_configure_filter(local); /* We need to set power level at maximum rate for scanning. */ ieee80211_hw_config(local, 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, 0); return 0; } static bool __ieee80211_can_leave_ch(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; struct ieee80211_sub_if_data *sdata_iter; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_is_radar_required(local)) return true; if (!regulatory_pre_cac_allowed(local->hw.wiphy)) return false; list_for_each_entry(sdata_iter, &local->interfaces, list) { if (sdata_iter->wdev.cac_started) return false; } return true; } static bool ieee80211_can_scan(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata) { if (!__ieee80211_can_leave_ch(sdata)) return false; if (!list_empty(&local->roc_list)) return false; if (sdata->vif.type == NL80211_IFTYPE_STATION && sdata->u.mgd.flags & IEEE80211_STA_CONNECTION_POLL) return false; return true; } void ieee80211_run_deferred_scan(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!local->scan_req || local->scanning) return; if (!ieee80211_can_scan(local, rcu_dereference_protected( local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)))) return; wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, round_jiffies_relative(0)); } static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, const u8 *src, const u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len, u32 ratemask, u32 flags, u32 tx_flags, struct ieee80211_channel *channel) { struct sk_buff *skb; skb = ieee80211_build_probe_req(sdata, src, dst, ratemask, channel, ssid, ssid_len, ie, ie_len, flags); if (skb) { if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = cpu_to_le16(IEEE80211_SN_TO_SEQ(sn)); } IEEE80211_SKB_CB(skb)->flags |= tx_flags; ieee80211_tx_skb_tid_band(sdata, skb, 7, channel->band); } } static void ieee80211_scan_state_send_probe(struct ieee80211_local *local, unsigned long *next_delay) { int i; struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; enum nl80211_band band = local->hw.conf.chandef.chan->band; u32 flags = 0, tx_flags; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); tx_flags = IEEE80211_TX_INTFL_OFFCHAN_TX_OK; if (scan_req->no_cck) tx_flags |= IEEE80211_TX_CTL_NO_CCK_RATE; if (scan_req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; if (scan_req->flags & NL80211_SCAN_FLAG_RANDOM_SN) flags |= IEEE80211_PROBE_FLAG_RANDOM_SN; sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); for (i = 0; i < scan_req->n_ssids; i++) ieee80211_send_scan_probe_req( sdata, local->scan_addr, scan_req->bssid, scan_req->ssids[i].ssid, scan_req->ssids[i].ssid_len, scan_req->ie, scan_req->ie_len, scan_req->rates[band], flags, tx_flags, local->hw.conf.chandef.chan); /* * After sending probe requests, wait for probe responses * on the channel. */ *next_delay = IEEE80211_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; } static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { struct ieee80211_local *local = sdata->local; bool hw_scan = local->ops->hw_scan; int rc; lockdep_assert_wiphy(local->hw.wiphy); if (local->scan_req) return -EBUSY; if (!__ieee80211_can_leave_ch(sdata)) return -EBUSY; if (!ieee80211_can_scan(local, sdata)) { /* wait for the work to finish/time out */ rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); return 0; } again: if (hw_scan) { u8 *ies; local->hw_scan_ies_bufsize = local->scan_ies_len + req->ie_len; if (ieee80211_hw_check(&local->hw, SINGLE_SCAN_ON_ALL_BANDS)) { int i, n_bands = 0; u8 bands_counted = 0; for (i = 0; i < req->n_channels; i++) { if (bands_counted & BIT(req->channels[i]->band)) continue; bands_counted |= BIT(req->channels[i]->band); n_bands++; } local->hw_scan_ies_bufsize *= n_bands; } local->hw_scan_req = kmalloc( sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]) + local->hw_scan_ies_bufsize, GFP_KERNEL); if (!local->hw_scan_req) return -ENOMEM; local->hw_scan_req->req.ssids = req->ssids; local->hw_scan_req->req.n_ssids = req->n_ssids; ies = (u8 *)local->hw_scan_req + sizeof(*local->hw_scan_req) + req->n_channels * sizeof(req->channels[0]); local->hw_scan_req->req.ie = ies; local->hw_scan_req->req.flags = req->flags; eth_broadcast_addr(local->hw_scan_req->req.bssid); local->hw_scan_req->req.duration = req->duration; local->hw_scan_req->req.duration_mandatory = req->duration_mandatory; local->hw_scan_band = 0; local->hw_scan_req->req.n_6ghz_params = req->n_6ghz_params; local->hw_scan_req->req.scan_6ghz_params = req->scan_6ghz_params; local->hw_scan_req->req.scan_6ghz = req->scan_6ghz; /* * After allocating local->hw_scan_req, we must * go through until ieee80211_prep_hw_scan(), so * anything that might be changed here and leave * this function early must not go after this * allocation. */ } rcu_assign_pointer(local->scan_req, req); rcu_assign_pointer(local->scan_sdata, sdata); if (req->flags & NL80211_SCAN_FLAG_RANDOM_ADDR) get_random_mask_addr(local->scan_addr, req->mac_addr, req->mac_addr_mask); else memcpy(local->scan_addr, sdata->vif.addr, ETH_ALEN); if (hw_scan) { __set_bit(SCAN_HW_SCANNING, &local->scanning); } else if ((req->n_channels == 1) && (req->channels[0] == local->_oper_chandef.chan)) { /* * If we are scanning only on the operating channel * then we do not need to stop normal activities */ unsigned long next_delay; __set_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); ieee80211_recalc_idle(local); /* Notify driver scan is starting, keep order of operations * same as normal software scan, in case that matters. */ drv_sw_scan_start(local, sdata, local->scan_addr); ieee80211_configure_filter(local); /* accept probe-responses */ /* We need to ensure power level is at max for scanning. */ ieee80211_hw_config(local, 0); if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !req->n_ssids) { next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; if (req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); } else { ieee80211_scan_state_send_probe(local, &next_delay); next_delay = IEEE80211_CHANNEL_TIME; } /* Now, just wait a bit and we are all done! */ wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return 0; } else { /* Do normal software scan */ __set_bit(SCAN_SW_SCANNING, &local->scanning); } ieee80211_recalc_idle(local); if (hw_scan) { WARN_ON(!ieee80211_prep_hw_scan(sdata)); rc = drv_hw_scan(local, sdata, local->hw_scan_req); } else { rc = ieee80211_start_sw_scan(local, sdata); } if (rc) { kfree(local->hw_scan_req); local->hw_scan_req = NULL; local->scanning = 0; ieee80211_recalc_idle(local); local->scan_req = NULL; RCU_INIT_POINTER(local->scan_sdata, NULL); } if (hw_scan && rc == 1) { /* * we can't fall back to software for P2P-GO * as it must update NoA etc. */ if (ieee80211_vif_type_p2p(&sdata->vif) == NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; hw_scan = false; goto again; } return rc; } static unsigned long ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) { /* * TODO: channel switching also consumes quite some time, * add that delay as well to get a better estimation */ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) return IEEE80211_PASSIVE_CHANNEL_TIME; return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } static void ieee80211_scan_state_decision(struct ieee80211_local *local, unsigned long *next_delay) { bool associated = false; bool tx_empty = true; bool bad_latency; struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; enum mac80211_scan_state next_scan_state; struct cfg80211_scan_request *scan_req; lockdep_assert_wiphy(local->hw.wiphy); /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames * and grab the lowest used beacon interval */ list_for_each_entry(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { if (sdata->u.mgd.associated) { associated = true; if (!qdisc_all_tx_empty(sdata->dev)) { tx_empty = false; break; } } } } scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); next_chan = scan_req->channels[local->scan_channel_idx]; /* * we're currently scanning a different channel, let's * see if we can scan another channel without interfering * with the current traffic situation. * * Keep good latency, do not stay off-channel more than 125 ms. */ bad_latency = time_after(jiffies + ieee80211_scan_get_channel_time(next_chan), local->leave_oper_channel_time + HZ / 8); if (associated && !tx_empty) { if (scan_req->flags & NL80211_SCAN_FLAG_LOW_PRIORITY) next_scan_state = SCAN_ABORT; else next_scan_state = SCAN_SUSPEND; } else if (associated && bad_latency) { next_scan_state = SCAN_SUSPEND; } else { next_scan_state = SCAN_SET_CHANNEL; } local->next_scan_state = next_scan_state; *next_delay = 0; } static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, unsigned long *next_delay) { int skip; struct ieee80211_channel *chan; struct cfg80211_scan_request *scan_req; scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); skip = 0; chan = scan_req->channels[local->scan_channel_idx]; local->scan_chandef.chan = chan; local->scan_chandef.center_freq1 = chan->center_freq; local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; /* For scanning on the S1G band, detect the channel width according to * the channel being scanned. */ if (chan->band == NL80211_BAND_S1GHZ) { local->scan_chandef.width = ieee80211_s1g_channel_width(chan); goto set_channel; } /* If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->_oper_chandef.chan) local->scan_chandef = local->_oper_chandef; else local->scan_chandef.width = NL80211_CHAN_WIDTH_20_NOHT; set_channel: if (ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL)) skip = 1; /* advance state machine to next channel/band */ local->scan_channel_idx++; if (skip) { /* if we skip this channel return to the decision state */ local->next_scan_state = SCAN_DECISION; return; } /* * Probe delay is used to update the NAV, cf. 11.1.3.2.2 * (which unfortunately doesn't say _why_ step a) is done, * but it waits for the probe delay or until a frame is * received - and the received frame would update the NAV). * For now, we do not support waiting until a frame is * received. * * In any case, it is not necessary for a passive scan. */ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) || !scan_req->n_ssids) { *next_delay = IEEE80211_PASSIVE_CHANNEL_TIME; local->next_scan_state = SCAN_DECISION; if (scan_req->n_ssids) set_bit(SCAN_BEACON_WAIT, &local->scanning); return; } /* active scan, send probes */ *next_delay = IEEE80211_PROBE_DELAY; local->next_scan_state = SCAN_SEND_PROBE; } static void ieee80211_scan_state_suspend(struct ieee80211_local *local, unsigned long *next_delay) { /* switch back to the operating channel */ local->scan_chandef.chan = NULL; ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); /* disable PS */ ieee80211_offchannel_return(local); *next_delay = HZ / 5; /* afterwards, resume scan & go to next channel */ local->next_scan_state = SCAN_RESUME; } static void ieee80211_scan_state_resume(struct ieee80211_local *local, unsigned long *next_delay) { ieee80211_offchannel_stop_vifs(local); if (local->ops->flush) { ieee80211_flush_queues(local, NULL, false); *next_delay = 0; } else *next_delay = HZ / 10; /* remember when we left the operating channel */ local->leave_oper_channel_time = jiffies; /* advance to the next channel to be scanned */ local->next_scan_state = SCAN_SET_CHANNEL; } void ieee80211_scan_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata; struct cfg80211_scan_request *scan_req; unsigned long next_delay = 0; bool aborted; lockdep_assert_wiphy(local->hw.wiphy); if (!ieee80211_can_run_worker(local)) { aborted = true; goto out_complete; } sdata = rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); scan_req = rcu_dereference_protected(local->scan_req, lockdep_is_held(&local->hw.wiphy->mtx)); /* When scanning on-channel, the first-callback means completed. */ if (test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); goto out_complete; } if (!sdata || !scan_req) return; if (!local->scanning) { int rc; RCU_INIT_POINTER(local->scan_req, NULL); RCU_INIT_POINTER(local->scan_sdata, NULL); rc = __ieee80211_start_scan(sdata, scan_req); if (!rc) return; /* need to complete scan in cfg80211 */ rcu_assign_pointer(local->scan_req, scan_req); aborted = true; goto out_complete; } clear_bit(SCAN_BEACON_WAIT, &local->scanning); /* * as long as no delay is required advance immediately * without scheduling a new work */ do { if (!ieee80211_sdata_running(sdata)) { aborted = true; goto out_complete; } if (test_and_clear_bit(SCAN_BEACON_DONE, &local->scanning) && local->next_scan_state == SCAN_DECISION) local->next_scan_state = SCAN_SEND_PROBE; switch (local->next_scan_state) { case SCAN_DECISION: /* if no more bands/channels left, complete scan */ if (local->scan_channel_idx >= scan_req->n_channels) { aborted = false; goto out_complete; } ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); break; case SCAN_SEND_PROBE: ieee80211_scan_state_send_probe(local, &next_delay); break; case SCAN_SUSPEND: ieee80211_scan_state_suspend(local, &next_delay); break; case SCAN_RESUME: ieee80211_scan_state_resume(local, &next_delay); break; case SCAN_ABORT: aborted = true; goto out_complete; } } while (next_delay == 0); wiphy_delayed_work_queue(local->hw.wiphy, &local->scan_work, next_delay); return; out_complete: __ieee80211_scan_completed(&local->hw, aborted); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, struct cfg80211_scan_request *req) { lockdep_assert_wiphy(sdata->local->hw.wiphy); return __ieee80211_start_scan(sdata, req); } int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, const u8 *ssid, u8 ssid_len, struct ieee80211_channel **channels, unsigned int n_channels) { struct ieee80211_local *local = sdata->local; int ret = -EBUSY, i, n_ch = 0; enum nl80211_band band; lockdep_assert_wiphy(local->hw.wiphy); /* busy scanning */ if (local->scan_req) goto unlock; /* fill internal scan request */ if (!channels) { int max_n; for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || band == NL80211_BAND_6GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; for (i = 0; i < max_n; i++) { struct ieee80211_channel *tmp_ch = &local->hw.wiphy->bands[band]->channels[i]; if (tmp_ch->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED)) continue; local->int_scan_req->channels[n_ch] = tmp_ch; n_ch++; } } if (WARN_ON_ONCE(n_ch == 0)) goto unlock; local->int_scan_req->n_channels = n_ch; } else { for (i = 0; i < n_channels; i++) { if (channels[i]->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_DISABLED)) continue; local->int_scan_req->channels[n_ch] = channels[i]; n_ch++; } if (WARN_ON_ONCE(n_ch == 0)) goto unlock; local->int_scan_req->n_channels = n_ch; } local->int_scan_req->ssids = &local->scan_ssid; local->int_scan_req->n_ssids = 1; memcpy(local->int_scan_req->ssids[0].ssid, ssid, IEEE80211_MAX_SSID_LEN); local->int_scan_req->ssids[0].ssid_len = ssid_len; ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); unlock: return ret; } void ieee80211_scan_cancel(struct ieee80211_local *local) { /* ensure a new scan cannot be queued */ lockdep_assert_wiphy(local->hw.wiphy); /* * We are canceling software scan, or deferred scan that was not * yet really started (see __ieee80211_start_scan ). * * Regarding hardware scan: * - we can not call __ieee80211_scan_completed() as when * SCAN_HW_SCANNING bit is set this function change * local->hw_scan_req to operate on 5G band, what race with * driver which can use local->hw_scan_req * * - we can not cancel scan_work since driver can schedule it * by ieee80211_scan_completed(..., true) to finish scan * * Hence we only call the cancel_hw_scan() callback, but the low-level * driver is still responsible for calling ieee80211_scan_completed() * after the scan was completed/aborted. */ if (!local->scan_req) return; /* * We have a scan running and the driver already reported completion, * but the worker hasn't run yet or is stuck on the mutex - mark it as * cancelled. */ if (test_bit(SCAN_HW_SCANNING, &local->scanning) && test_bit(SCAN_COMPLETED, &local->scanning)) { set_bit(SCAN_HW_CANCELLED, &local->scanning); return; } if (test_bit(SCAN_HW_SCANNING, &local->scanning)) { /* * Make sure that __ieee80211_scan_completed doesn't trigger a * scan on another band. */ set_bit(SCAN_HW_CANCELLED, &local->scanning); if (local->ops->cancel_hw_scan) drv_cancel_hw_scan(local, rcu_dereference_protected(local->scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx))); return; } wiphy_delayed_work_cancel(local->hw.wiphy, &local->scan_work); /* and clean up */ memset(&local->scan_info, 0, sizeof(local->scan_info)); __ieee80211_scan_completed(&local->hw, true); } int __ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; struct ieee80211_scan_ies sched_scan_ies = {}; struct cfg80211_chan_def chandef; int ret, i, iebufsz, num_bands = 0; u32 rate_masks[NUM_NL80211_BANDS] = {}; u8 bands_used = 0; u8 *ie; u32 flags = 0; lockdep_assert_wiphy(local->hw.wiphy); iebufsz = local->scan_ies_len + req->ie_len; if (!local->ops->sched_scan_start) return -ENOTSUPP; for (i = 0; i < NUM_NL80211_BANDS; i++) { if (local->hw.wiphy->bands[i]) { bands_used |= BIT(i); rate_masks[i] = (u32) -1; num_bands++; } } if (req->flags & NL80211_SCAN_FLAG_MIN_PREQ_CONTENT) flags |= IEEE80211_PROBE_FLAG_MIN_CONTENT; ie = kcalloc(iebufsz, num_bands, GFP_KERNEL); if (!ie) { ret = -ENOMEM; goto out; } ieee80211_prepare_scan_chandef(&chandef); ieee80211_build_preq_ies(sdata, ie, num_bands * iebufsz, &sched_scan_ies, req->ie, req->ie_len, bands_used, rate_masks, &chandef, flags); ret = drv_sched_scan_start(local, sdata, req, &sched_scan_ies); if (ret == 0) { rcu_assign_pointer(local->sched_scan_sdata, sdata); rcu_assign_pointer(local->sched_scan_req, req); } kfree(ie); out: if (ret) { /* Clean in case of failure after HW restart or upon resume. */ RCU_INIT_POINTER(local->sched_scan_sdata, NULL); RCU_INIT_POINTER(local->sched_scan_req, NULL); } return ret; } int ieee80211_request_sched_scan_start(struct ieee80211_sub_if_data *sdata, struct cfg80211_sched_scan_request *req) { struct ieee80211_local *local = sdata->local; lockdep_assert_wiphy(local->hw.wiphy); if (rcu_access_pointer(local->sched_scan_sdata)) return -EBUSY; return __ieee80211_request_sched_scan_start(sdata, req); } int ieee80211_request_sched_scan_stop(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sched_scan_sdata; int ret = -ENOENT; lockdep_assert_wiphy(local->hw.wiphy); if (!local->ops->sched_scan_stop) return -ENOTSUPP; /* We don't want to restart sched scan anymore. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); sched_scan_sdata = rcu_dereference_protected(local->sched_scan_sdata, lockdep_is_held(&local->hw.wiphy->mtx)); if (sched_scan_sdata) { ret = drv_sched_scan_stop(local, sched_scan_sdata); if (!ret) RCU_INIT_POINTER(local->sched_scan_sdata, NULL); } return ret; } void ieee80211_sched_scan_results(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_results(local); cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); void ieee80211_sched_scan_end(struct ieee80211_local *local) { lockdep_assert_wiphy(local->hw.wiphy); if (!rcu_access_pointer(local->sched_scan_sdata)) return; RCU_INIT_POINTER(local->sched_scan_sdata, NULL); /* If sched scan was aborted by the driver. */ RCU_INIT_POINTER(local->sched_scan_req, NULL); cfg80211_sched_scan_stopped_locked(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct wiphy *wiphy, struct wiphy_work *work) { struct ieee80211_local *local = container_of(work, struct ieee80211_local, sched_scan_stopped_work); ieee80211_sched_scan_end(local); } void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw) { struct ieee80211_local *local = hw_to_local(hw); trace_api_sched_scan_stopped(local); /* * this shouldn't really happen, so for simplicity * simply ignore it, and let mac80211 reconfigure * the sched scan later on. */ if (local->in_reconfig) return; wiphy_work_queue(hw->wiphy, &local->sched_scan_stopped_work); } EXPORT_SYMBOL(ieee80211_sched_scan_stopped); |
70 70 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sysfs.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" struct xfs_sysfs_attr { struct attribute attr; ssize_t (*show)(struct kobject *kobject, char *buf); ssize_t (*store)(struct kobject *kobject, const char *buf, size_t count); }; static inline struct xfs_sysfs_attr * to_attr(struct attribute *attr) { return container_of(attr, struct xfs_sysfs_attr, attr); } #define XFS_SYSFS_ATTR_RW(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) #define XFS_SYSFS_ATTR_RO(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) #define XFS_SYSFS_ATTR_WO(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr STATIC ssize_t xfs_sysfs_object_show( struct kobject *kobject, struct attribute *attr, char *buf) { struct xfs_sysfs_attr *xfs_attr = to_attr(attr); return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0; } STATIC ssize_t xfs_sysfs_object_store( struct kobject *kobject, struct attribute *attr, const char *buf, size_t count) { struct xfs_sysfs_attr *xfs_attr = to_attr(attr); return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0; } static const struct sysfs_ops xfs_sysfs_ops = { .show = xfs_sysfs_object_show, .store = xfs_sysfs_object_store, }; static struct attribute *xfs_mp_attrs[] = { NULL, }; ATTRIBUTE_GROUPS(xfs_mp); const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, }; #ifdef DEBUG /* debug */ STATIC ssize_t bug_on_assert_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val == 1) xfs_globals.bug_on_assert = true; else if (val == 0) xfs_globals.bug_on_assert = false; else return -EINVAL; return count; } STATIC ssize_t bug_on_assert_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert); } XFS_SYSFS_ATTR_RW(bug_on_assert); STATIC ssize_t log_recovery_delay_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < 0 || val > 60) return -EINVAL; xfs_globals.log_recovery_delay = val; return count; } STATIC ssize_t log_recovery_delay_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay); } XFS_SYSFS_ATTR_RW(log_recovery_delay); STATIC ssize_t mount_delay_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < 0 || val > 60) return -EINVAL; xfs_globals.mount_delay = val; return count; } STATIC ssize_t mount_delay_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay); } XFS_SYSFS_ATTR_RW(mount_delay); static ssize_t always_cow_store( struct kobject *kobject, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &xfs_globals.always_cow); if (ret < 0) return ret; return count; } static ssize_t always_cow_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.always_cow); } XFS_SYSFS_ATTR_RW(always_cow); #ifdef DEBUG /* * Override how many threads the parallel work queue is allowed to create. * This has to be a debug-only global (instead of an errortag) because one of * the main users of parallel workqueues is mount time quotacheck. */ STATIC ssize_t pwork_threads_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < -1 || val > num_possible_cpus()) return -EINVAL; xfs_globals.pwork_threads = val; return count; } STATIC ssize_t pwork_threads_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); static ssize_t larp_store( struct kobject *kobject, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &xfs_globals.larp); if (ret < 0) return ret; return count; } STATIC ssize_t larp_show( struct kobject *kobject, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); } XFS_SYSFS_ATTR_RW(larp); #endif /* DEBUG */ static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), ATTR_LIST(always_cow), #ifdef DEBUG ATTR_LIST(pwork_threads), ATTR_LIST(larp), #endif NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); const struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_dbg_groups, }; #endif /* DEBUG */ /* stats */ static inline struct xstats * to_xstats(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xstats, xs_kobj); } STATIC ssize_t stats_show( struct kobject *kobject, char *buf) { struct xstats *stats = to_xstats(kobject); return xfs_stats_format(stats->xs_stats, buf); } XFS_SYSFS_ATTR_RO(stats); STATIC ssize_t stats_clear_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; struct xstats *stats = to_xstats(kobject); ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val != 1) return -EINVAL; xfs_stats_clearall(stats->xs_stats); return count; } XFS_SYSFS_ATTR_WO(stats_clear); static struct attribute *xfs_stats_attrs[] = { ATTR_LIST(stats), ATTR_LIST(stats_clear), NULL, }; ATTRIBUTE_GROUPS(xfs_stats); const struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_stats_groups, }; /* xlog */ static inline struct xlog * to_xlog(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xlog, l_kobj); } STATIC ssize_t log_head_lsn_show( struct kobject *kobject, char *buf) { int cycle; int block; struct xlog *log = to_xlog(kobject); spin_lock(&log->l_icloglock); cycle = log->l_curr_cycle; block = log->l_curr_block; spin_unlock(&log->l_icloglock); return sysfs_emit(buf, "%d:%d\n", cycle, block); } XFS_SYSFS_ATTR_RO(log_head_lsn); STATIC ssize_t log_tail_lsn_show( struct kobject *kobject, char *buf) { int cycle; int block; struct xlog *log = to_xlog(kobject); xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); return sysfs_emit(buf, "%d:%d\n", cycle, block); } XFS_SYSFS_ATTR_RO(log_tail_lsn); STATIC ssize_t reserve_grant_head_show( struct kobject *kobject, char *buf) { int cycle; int bytes; struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_reserve_head.grant, &cycle, &bytes); return sysfs_emit(buf, "%d:%d\n", cycle, bytes); } XFS_SYSFS_ATTR_RO(reserve_grant_head); STATIC ssize_t write_grant_head_show( struct kobject *kobject, char *buf) { int cycle; int bytes; struct xlog *log = to_xlog(kobject); xlog_crack_grant_head(&log->l_write_head.grant, &cycle, &bytes); return sysfs_emit(buf, "%d:%d\n", cycle, bytes); } XFS_SYSFS_ATTR_RO(write_grant_head); static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head), ATTR_LIST(write_grant_head), NULL, }; ATTRIBUTE_GROUPS(xfs_log); const struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_log_groups, }; /* * Metadata IO error configuration * * The sysfs structure here is: * ...xfs/<dev>/error/<class>/<errno>/<error_attrs> * * where <class> allows us to discriminate between data IO and metadata IO, * and any other future type of IO (e.g. special inode or directory error * handling) we care to support. */ static inline struct xfs_error_cfg * to_error_cfg(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xfs_error_cfg, kobj); } static inline struct xfs_mount * err_to_mp(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xfs_mount, m_error_kobj); } static ssize_t max_retries_show( struct kobject *kobject, char *buf) { int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); if (cfg->max_retries == XFS_ERR_RETRY_FOREVER) retries = -1; else retries = cfg->max_retries; return sysfs_emit(buf, "%d\n", retries); } static ssize_t max_retries_store( struct kobject *kobject, const char *buf, size_t count) { struct xfs_error_cfg *cfg = to_error_cfg(kobject); int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < -1) return -EINVAL; if (val == -1) cfg->max_retries = XFS_ERR_RETRY_FOREVER; else cfg->max_retries = val; return count; } XFS_SYSFS_ATTR_RW(max_retries); static ssize_t retry_timeout_seconds_show( struct kobject *kobject, char *buf) { int timeout; struct xfs_error_cfg *cfg = to_error_cfg(kobject); if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) timeout = -1; else timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; return sysfs_emit(buf, "%d\n", timeout); } static ssize_t retry_timeout_seconds_store( struct kobject *kobject, const char *buf, size_t count) { struct xfs_error_cfg *cfg = to_error_cfg(kobject); int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; /* 1 day timeout maximum, -1 means infinite */ if (val < -1 || val > 86400) return -EINVAL; if (val == -1) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else { cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); } return count; } XFS_SYSFS_ATTR_RW(retry_timeout_seconds); static ssize_t fail_at_unmount_show( struct kobject *kobject, char *buf) { struct xfs_mount *mp = err_to_mp(kobject); return sysfs_emit(buf, "%d\n", mp->m_fail_unmount); } static ssize_t fail_at_unmount_store( struct kobject *kobject, const char *buf, size_t count) { struct xfs_mount *mp = err_to_mp(kobject); int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < 0 || val > 1) return -EINVAL; mp->m_fail_unmount = val; return count; } XFS_SYSFS_ATTR_RW(fail_at_unmount); static struct attribute *xfs_error_attrs[] = { ATTR_LIST(max_retries), ATTR_LIST(retry_timeout_seconds), NULL, }; ATTRIBUTE_GROUPS(xfs_error); static const struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_error_groups, }; static const struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; /* * Error initialization tables. These need to be ordered in the same * order as the enums used to index the array. All class init tables need to * define a "default" behaviour as the first entry, all other entries can be * empty. */ struct xfs_error_init { char *name; int max_retries; int retry_timeout; /* in seconds */ }; static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { { .name = "default", .max_retries = XFS_ERR_RETRY_FOREVER, .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "EIO", .max_retries = XFS_ERR_RETRY_FOREVER, .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENOSPC", .max_retries = XFS_ERR_RETRY_FOREVER, .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENODEV", .max_retries = 0, /* We can't recover from devices disappearing */ .retry_timeout = 0, }, }; static int xfs_error_sysfs_init_class( struct xfs_mount *mp, int class, const char *parent_name, struct xfs_kobj *parent_kobj, const struct xfs_error_init init[]) { struct xfs_error_cfg *cfg; int error; int i; ASSERT(class < XFS_ERR_CLASS_MAX); error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype, &mp->m_error_kobj, parent_name); if (error) return error; for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) { cfg = &mp->m_error_cfg[class][i]; error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype, parent_kobj, init[i].name); if (error) goto out_error; cfg->max_retries = init[i].max_retries; if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else cfg->retry_timeout = msecs_to_jiffies( init[i].retry_timeout * MSEC_PER_SEC); } return 0; out_error: /* unwind the entries that succeeded */ for (i--; i >= 0; i--) { cfg = &mp->m_error_cfg[class][i]; xfs_sysfs_del(&cfg->kobj); } xfs_sysfs_del(parent_kobj); return error; } int xfs_error_sysfs_init( struct xfs_mount *mp) { int error; /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) return error; error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) goto out_error; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) goto out_error; return 0; out_error: xfs_sysfs_del(&mp->m_error_kobj); return error; } void xfs_error_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; xfs_sysfs_del(&cfg->kobj); } } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); } struct xfs_error_cfg * xfs_error_get_cfg( struct xfs_mount *mp, int error_class, int error) { struct xfs_error_cfg *cfg; if (error < 0) error = -error; switch (error) { case EIO: cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; break; case ENOSPC: cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC]; break; case ENODEV: cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV]; break; default: cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT]; break; } return cfg; } |
1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 | // SPDX-License-Identifier: GPL-2.0 #include <linux/cpumask.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/proc_fs.h> #include <linux/sched.h> #include <linux/sched/stat.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/time_namespace.h> #include <linux/irqnr.h> #include <linux/sched/cputime.h> #include <linux/tick.h> #ifndef arch_irq_stat_cpu #define arch_irq_stat_cpu(cpu) 0 #endif #ifndef arch_irq_stat #define arch_irq_stat() 0 #endif u64 get_idle_time(struct kernel_cpustat *kcs, int cpu) { u64 idle, idle_usecs = -1ULL; if (cpu_online(cpu)) idle_usecs = get_cpu_idle_time_us(cpu, NULL); if (idle_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.idle */ idle = kcs->cpustat[CPUTIME_IDLE]; else idle = idle_usecs * NSEC_PER_USEC; return idle; } static u64 get_iowait_time(struct kernel_cpustat *kcs, int cpu) { u64 iowait, iowait_usecs = -1ULL; if (cpu_online(cpu)) iowait_usecs = get_cpu_iowait_time_us(cpu, NULL); if (iowait_usecs == -1ULL) /* !NO_HZ or cpu offline so we can rely on cpustat.iowait */ iowait = kcs->cpustat[CPUTIME_IOWAIT]; else iowait = iowait_usecs * NSEC_PER_USEC; return iowait; } static void show_irq_gap(struct seq_file *p, unsigned int gap) { static const char zeros[] = " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"; while (gap > 0) { unsigned int inc; inc = min_t(unsigned int, gap, ARRAY_SIZE(zeros) / 2); seq_write(p, zeros, 2 * inc); gap -= inc; } } static void show_all_irqs(struct seq_file *p) { unsigned int i, next = 0; for_each_active_irq(i) { show_irq_gap(p, i - next); seq_put_decimal_ull(p, " ", kstat_irqs_usr(i)); next = i + 1; } show_irq_gap(p, nr_irqs - next); } static int show_stat(struct seq_file *p, void *v) { int i, j; u64 user, nice, system, idle, iowait, irq, softirq, steal; u64 guest, guest_nice; u64 sum = 0; u64 sum_softirq = 0; unsigned int per_softirq_sums[NR_SOFTIRQS] = {0}; struct timespec64 boottime; user = nice = system = idle = iowait = irq = softirq = steal = 0; guest = guest_nice = 0; getboottime64(&boottime); /* shift boot timestamp according to the timens offset */ timens_sub_boottime(&boottime); for_each_possible_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); user += cpustat[CPUTIME_USER]; nice += cpustat[CPUTIME_NICE]; system += cpustat[CPUTIME_SYSTEM]; idle += get_idle_time(&kcpustat, i); iowait += get_iowait_time(&kcpustat, i); irq += cpustat[CPUTIME_IRQ]; softirq += cpustat[CPUTIME_SOFTIRQ]; steal += cpustat[CPUTIME_STEAL]; guest += cpustat[CPUTIME_GUEST]; guest_nice += cpustat[CPUTIME_GUEST_NICE]; sum += kstat_cpu_irqs_sum(i); sum += arch_irq_stat_cpu(i); for (j = 0; j < NR_SOFTIRQS; j++) { unsigned int softirq_stat = kstat_softirqs_cpu(j, i); per_softirq_sums[j] += softirq_stat; sum_softirq += softirq_stat; } } sum += arch_irq_stat(); seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); for_each_online_cpu(i) { struct kernel_cpustat kcpustat; u64 *cpustat = kcpustat.cpustat; kcpustat_cpu_fetch(&kcpustat, i); /* Copy values here to work around gcc-2.95.3, gcc-2.96 */ user = cpustat[CPUTIME_USER]; nice = cpustat[CPUTIME_NICE]; system = cpustat[CPUTIME_SYSTEM]; idle = get_idle_time(&kcpustat, i); iowait = get_iowait_time(&kcpustat, i); irq = cpustat[CPUTIME_IRQ]; softirq = cpustat[CPUTIME_SOFTIRQ]; steal = cpustat[CPUTIME_STEAL]; guest = cpustat[CPUTIME_GUEST]; guest_nice = cpustat[CPUTIME_GUEST_NICE]; seq_printf(p, "cpu%d", i); seq_put_decimal_ull(p, " ", nsec_to_clock_t(user)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(system)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest)); seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice)); seq_putc(p, '\n'); } seq_put_decimal_ull(p, "intr ", (unsigned long long)sum); show_all_irqs(p); seq_printf(p, "\nctxt %llu\n" "btime %llu\n" "processes %lu\n" "procs_running %u\n" "procs_blocked %u\n", nr_context_switches(), (unsigned long long)boottime.tv_sec, total_forks, nr_running(), nr_iowait()); seq_put_decimal_ull(p, "softirq ", (unsigned long long)sum_softirq); for (i = 0; i < NR_SOFTIRQS; i++) seq_put_decimal_ull(p, " ", per_softirq_sums[i]); seq_putc(p, '\n'); return 0; } static int stat_open(struct inode *inode, struct file *file) { unsigned int size = 1024 + 128 * num_online_cpus(); /* minimum size to display an interrupt count : 2 bytes */ size += 2 * nr_irqs; return single_open_size(file, show_stat, NULL, size); } static const struct proc_ops stat_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_open = stat_open, .proc_read_iter = seq_read_iter, .proc_lseek = seq_lseek, .proc_release = single_release, }; static int __init proc_stat_init(void) { proc_create("stat", 0, NULL, &stat_proc_ops); return 0; } fs_initcall(proc_stat_init); |
159 159 159 155 139 154 3 157 3 21 16 176 69 159 4 2 1 1 68 169 69 3 129 151 5 3 137 137 10 10 141 141 7 5 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_trans.h" #include "xfs_buf_item.h" #include "xfs_trans_priv.h" #include "xfs_trace.h" /* * Check to see if a buffer matching the given parameters is already * a part of the given transaction. */ STATIC struct xfs_buf * xfs_trans_buf_item_match( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps) { struct xfs_log_item *lip; struct xfs_buf_log_item *blip; int len = 0; int i; for (i = 0; i < nmaps; i++) len += map[i].bm_len; list_for_each_entry(lip, &tp->t_items, li_trans) { blip = (struct xfs_buf_log_item *)lip; if (blip->bli_item.li_type == XFS_LI_BUF && blip->bli_buf->b_target == target && xfs_buf_daddr(blip->bli_buf) == map[0].bm_bn && blip->bli_buf->b_length == len) { ASSERT(blip->bli_buf->b_map_count == nmaps); return blip->bli_buf; } } return NULL; } /* * Add the locked buffer to the transaction. * * The buffer must be locked, and it cannot be associated with any * transaction. * * If the buffer does not yet have a buf log item associated with it, * then allocate one for it. Then add the buf item to the transaction. */ STATIC void _xfs_trans_bjoin( struct xfs_trans *tp, struct xfs_buf *bp, int reset_recur) { struct xfs_buf_log_item *bip; ASSERT(bp->b_transp == NULL); /* * The xfs_buf_log_item pointer is stored in b_log_item. If * it doesn't have one yet, then allocate one and initialize it. * The checks to see if one is there are in xfs_buf_item_init(). */ xfs_buf_item_init(bp, tp->t_mountp); bip = bp->b_log_item; ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); if (reset_recur) bip->bli_recur = 0; /* * Take a reference for this transaction on the buf item. */ atomic_inc(&bip->bli_refcount); /* * Attach the item to the transaction so we can find it in * xfs_trans_get_buf() and friends. */ xfs_trans_add_item(tp, &bip->bli_item); bp->b_transp = tp; } void xfs_trans_bjoin( struct xfs_trans *tp, struct xfs_buf *bp) { _xfs_trans_bjoin(tp, bp, 0); trace_xfs_trans_bjoin(bp->b_log_item); } /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it is already locked * within the transaction, just increment its lock recursion count * and return a pointer to it. * * If the transaction pointer is NULL, make this just a normal * get_buf() call. */ int xfs_trans_get_buf_map( struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp) { struct xfs_buf *bp; struct xfs_buf_log_item *bip; int error; *bpp = NULL; if (!tp) return xfs_buf_get_map(target, map, nmaps, flags, bpp); /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already * have it locked. In this case we just increment the lock * recursion count and return the buffer to the caller. */ bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp != NULL) { ASSERT(xfs_buf_islocked(bp)); if (xfs_is_shutdown(tp->t_mountp)) { xfs_buf_stale(bp); bp->b_flags |= XBF_DONE; } ASSERT(bp->b_transp == tp); bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_get_buf_recur(bip); *bpp = bp; return 0; } error = xfs_buf_get_map(target, map, nmaps, flags, &bp); if (error) return error; ASSERT(!bp->b_error); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_get_buf(bp->b_log_item); *bpp = bp; return 0; } /* * Get and lock the superblock buffer for the given transaction. */ struct xfs_buf * xfs_trans_getsb( struct xfs_trans *tp) { struct xfs_buf *bp = tp->t_mountp->m_sb_bp; /* * Just increment the lock recursion count if the buffer is already * attached to this transaction. */ if (bp->b_transp == tp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_recur++; trace_xfs_trans_getsb_recur(bip); } else { xfs_buf_lock(bp); xfs_buf_hold(bp); _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_getsb(bp->b_log_item); } return bp; } /* * Get and lock the buffer for the caller if it is not already * locked within the given transaction. If it has not yet been * read in, read it from disk. If it is already locked * within the transaction and already read in, just increment its * lock recursion count and return a pointer to it. * * If the transaction pointer is NULL, make this just a normal * read_buf() call. */ int xfs_trans_read_buf_map( struct xfs_mount *mp, struct xfs_trans *tp, struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp = NULL; struct xfs_buf_log_item *bip; int error; *bpp = NULL; /* * If we find the buffer in the cache with this transaction * pointer in its b_fsprivate2 field, then we know we already * have it locked. If it is already read in we just increment * the lock recursion count and return the buffer to the caller. * If the buffer is not yet read in, then we read it in, increment * the lock recursion count, and return it to the caller. */ if (tp) bp = xfs_trans_buf_item_match(tp, target, map, nmaps); if (bp) { ASSERT(xfs_buf_islocked(bp)); ASSERT(bp->b_transp == tp); ASSERT(bp->b_log_item != NULL); ASSERT(!bp->b_error); ASSERT(bp->b_flags & XBF_DONE); /* * We never locked this buf ourselves, so we shouldn't * brelse it either. Just get out. */ if (xfs_is_shutdown(mp)) { trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } /* * Check if the caller is trying to read a buffer that is * already attached to the transaction yet has no buffer ops * assigned. Ops are usually attached when the buffer is * attached to the transaction, or by the read caller if * special circumstances. That didn't happen, which is not * how this is supposed to go. * * If the buffer passes verification we'll let this go, but if * not we have to shut down. Let the transaction cleanup code * release this buffer when it kills the tranaction. */ ASSERT(bp->b_ops != NULL); error = xfs_buf_reverify(bp, ops); if (error) { xfs_buf_ioerror_alert(bp, __return_address); if (tp->t_flags & XFS_TRANS_DIRTY) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; return error; } bip = bp->b_log_item; bip->bli_recur++; ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_read_buf_recur(bip); ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } error = xfs_buf_read_map(target, map, nmaps, flags, &bp, ops, __return_address); switch (error) { case 0: break; default: if (tp && (tp->t_flags & XFS_TRANS_DIRTY)) xfs_force_shutdown(tp->t_mountp, SHUTDOWN_META_IO_ERROR); fallthrough; case -ENOMEM: case -EAGAIN: return error; } if (xfs_is_shutdown(mp)) { xfs_buf_relse(bp); trace_xfs_trans_read_buf_shut(bp, _RET_IP_); return -EIO; } if (tp) { _xfs_trans_bjoin(tp, bp, 1); trace_xfs_trans_read_buf(bp->b_log_item); } ASSERT(bp->b_ops != NULL || ops == NULL); *bpp = bp; return 0; } /* Has this buffer been dirtied by anyone? */ bool xfs_trans_buf_is_dirty( struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; if (!bip) return false; ASSERT(bip->bli_item.li_type == XFS_LI_BUF); return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* * Release a buffer previously joined to the transaction. If the buffer is * modified within this transaction, decrement the recursion count but do not * release the buffer even if the count goes to 0. If the buffer is not modified * within the transaction, decrement the recursion count and release the buffer * if the recursion count goes to 0. * * If the buffer is to be released and it was not already dirty before this * transaction began, then also free the buf_log_item associated with it. * * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call. */ void xfs_trans_brelse( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); if (!tp) { xfs_buf_relse(bp); return; } trace_xfs_trans_brelse(bip); ASSERT(bip->bli_item.li_type == XFS_LI_BUF); ASSERT(atomic_read(&bip->bli_refcount) > 0); /* * If the release is for a recursive lookup, then decrement the count * and return. */ if (bip->bli_recur > 0) { bip->bli_recur--; return; } /* * If the buffer is invalidated or dirty in this transaction, we can't * release it until we commit. */ if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)) return; if (bip->bli_flags & XFS_BLI_STALE) return; /* * Unlink the log item from the transaction and clear the hold flag, if * set. We wouldn't want the next user of the buffer to get confused. */ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); xfs_trans_del_item(&bip->bli_item); bip->bli_flags &= ~XFS_BLI_HOLD; /* drop the reference to the bli */ xfs_buf_item_put(bip); bp->b_transp = NULL; xfs_buf_relse(bp); } /* * Mark the buffer as not needing to be unlocked when the buf item's * iop_committing() routine is called. The buffer must already be locked * and associated with the given transaction. */ /* ARGSUSED */ void xfs_trans_bhold( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_HOLD; trace_xfs_trans_bhold(bip); } /* * Cancel the previous buffer hold request made on this buffer * for this transaction. */ void xfs_trans_bhold_release( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); ASSERT(atomic_read(&bip->bli_refcount) > 0); ASSERT(bip->bli_flags & XFS_BLI_HOLD); bip->bli_flags &= ~XFS_BLI_HOLD; trace_xfs_trans_bhold_release(bip); } /* * Mark a buffer dirty in the transaction. */ void xfs_trans_dirty_buf( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); /* * Mark the buffer as needing to be written out eventually, * and set its iodone function to remove the buffer's buf log * item from the AIL and free it when the buffer is flushed * to disk. */ bp->b_flags |= XBF_DONE; ASSERT(atomic_read(&bip->bli_refcount) > 0); /* * If we invalidated the buffer within this transaction, then * cancel the invalidation now that we're dirtying the buffer * again. There are no races with the code in xfs_buf_item_unpin(), * because we have a reference to the buffer this entire time. */ if (bip->bli_flags & XFS_BLI_STALE) { bip->bli_flags &= ~XFS_BLI_STALE; ASSERT(bp->b_flags & XBF_STALE); bp->b_flags &= ~XBF_STALE; bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; } bip->bli_flags |= XFS_BLI_DIRTY | XFS_BLI_LOGGED; tp->t_flags |= XFS_TRANS_DIRTY; set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); } /* * This is called to mark bytes first through last inclusive of the given * buffer as needing to be logged when the transaction is committed. * The buffer must already be associated with the given transaction. * * First and last are numbers relative to the beginning of this buffer, * so the first byte in the buffer is numbered 0 regardless of the * value of b_blkno. */ void xfs_trans_log_buf( struct xfs_trans *tp, struct xfs_buf *bp, uint first, uint last) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(first <= last && last < BBTOB(bp->b_length)); ASSERT(!(bip->bli_flags & XFS_BLI_ORDERED)); xfs_trans_dirty_buf(tp, bp); trace_xfs_trans_log_buf(bip); xfs_buf_item_log(bip, first, last); } /* * Invalidate a buffer that is being used within a transaction. * * Typically this is because the blocks in the buffer are being freed, so we * need to prevent it from being written out when we're done. Allowing it * to be written again might overwrite data in the free blocks if they are * reallocated to a file. * * We prevent the buffer from being written out by marking it stale. We can't * get rid of the buf log item at this point because the buffer may still be * pinned by another transaction. If that is the case, then we'll wait until * the buffer is committed to disk for the last time (we can tell by the ref * count) and free it in xfs_buf_item_unpin(). Until that happens we will * keep the buffer locked so that the buffer and buf log item are not reused. * * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log * the buf item. This will be used at recovery time to determine that copies * of the buffer in the log before this should not be replayed. * * We mark the item descriptor and the transaction dirty so that we'll hold * the buffer until after the commit. * * Since we're invalidating the buffer, we also clear the state about which * parts of the buffer have been logged. We also clear the flag indicating * that this is an inode buffer since the data in the buffer will no longer * be valid. * * We set the stale bit in the buffer as well since we're getting rid of it. */ void xfs_trans_binval( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; int i; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); trace_xfs_trans_binval(bip); if (bip->bli_flags & XFS_BLI_STALE) { /* * If the buffer is already invalidated, then * just return. */ ASSERT(bp->b_flags & XBF_STALE); ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); ASSERT(!(bip->__bli_format.blf_flags & XFS_BLFT_MASK)); ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); ASSERT(test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags)); ASSERT(tp->t_flags & XFS_TRANS_DIRTY); return; } xfs_buf_stale(bp); bip->bli_flags |= XFS_BLI_STALE; bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; bip->__bli_format.blf_flags &= ~XFS_BLFT_MASK; for (i = 0; i < bip->bli_format_count; i++) { memset(bip->bli_formats[i].blf_data_map, 0, (bip->bli_formats[i].blf_map_size * sizeof(uint))); } set_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags); tp->t_flags |= XFS_TRANS_DIRTY; } /* * This call is used to indicate that the buffer contains on-disk inodes which * must be handled specially during recovery. They require special handling * because only the di_next_unlinked from the inodes in the buffer should be * recovered. The rest of the data in the buffer is logged via the inodes * themselves. * * All we do is set the XFS_BLI_INODE_BUF flag in the items flags so it can be * transferred to the buffer's log format structure so that we'll know what to * do at recovery time. */ void xfs_trans_inode_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_BUF; bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * This call is used to indicate that the buffer is going to * be staled and was an inode buffer. This means it gets * special processing during unpin - where any inodes * associated with the buffer should be removed from ail. * There is also special processing during recovery, * any replay of the inodes in the buffer needs to be * prevented as the buffer may have been reused. */ void xfs_trans_stale_inode_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_STALE_INODE; bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * Mark the buffer as being one which contains newly allocated * inodes. We need to make sure that even if this buffer is * relogged as an 'inode buf' we still recover all of the inode * images in the face of a crash. This works in coordination with * xfs_buf_item_committed() to ensure that the buffer remains in the * AIL at its original location even after it has been relogged. */ /* ARGSUSED */ void xfs_trans_inode_alloc_buf( xfs_trans_t *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); bip->bli_flags |= XFS_BLI_INODE_ALLOC_BUF; bp->b_flags |= _XBF_INODES; xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF); } /* * Mark the buffer as ordered for this transaction. This means that the contents * of the buffer are not recorded in the transaction but it is tracked in the * AIL as though it was. This allows us to record logical changes in * transactions rather than the physical changes we make to the buffer without * changing writeback ordering constraints of metadata buffers. */ bool xfs_trans_ordered_buf( struct xfs_trans *tp, struct xfs_buf *bp) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); if (xfs_buf_item_dirty_format(bip)) return false; bip->bli_flags |= XFS_BLI_ORDERED; trace_xfs_buf_item_ordered(bip); /* * We don't log a dirty range of an ordered buffer but it still needs * to be marked dirty and that it has been logged. */ xfs_trans_dirty_buf(tp, bp); return true; } /* * Set the type of the buffer for log recovery so that it can correctly identify * and hence attach the correct buffer ops to the buffer after replay. */ void xfs_trans_buf_set_type( struct xfs_trans *tp, struct xfs_buf *bp, enum xfs_blft type) { struct xfs_buf_log_item *bip = bp->b_log_item; if (!tp) return; ASSERT(bp->b_transp == tp); ASSERT(bip != NULL); ASSERT(atomic_read(&bip->bli_refcount) > 0); xfs_blft_to_flags(&bip->__bli_format, type); } void xfs_trans_buf_copy_type( struct xfs_buf *dst_bp, struct xfs_buf *src_bp) { struct xfs_buf_log_item *sbip = src_bp->b_log_item; struct xfs_buf_log_item *dbip = dst_bp->b_log_item; enum xfs_blft type; type = xfs_blft_from_flags(&sbip->__bli_format); xfs_blft_to_flags(&dbip->__bli_format, type); } /* * Similar to xfs_trans_inode_buf(), this marks the buffer as a cluster of * dquots. However, unlike in inode buffer recovery, dquot buffers get * recovered in their entirety. (Hence, no XFS_BLI_DQUOT_ALLOC_BUF flag). * The only thing that makes dquot buffers different from regular * buffers is that we must not replay dquot bufs when recovering * if a _corresponding_ quotaoff has happened. We also have to distinguish * between usr dquot bufs and grp dquot bufs, because usr and grp quotas * can be turned off independently. */ /* ARGSUSED */ void xfs_trans_dquot_buf( xfs_trans_t *tp, struct xfs_buf *bp, uint type) { struct xfs_buf_log_item *bip = bp->b_log_item; ASSERT(type == XFS_BLF_UDQUOT_BUF || type == XFS_BLF_PDQUOT_BUF || type == XFS_BLF_GDQUOT_BUF); bip->__bli_format.blf_flags |= type; switch (type) { case XFS_BLF_UDQUOT_BUF: type = XFS_BLFT_UDQUOT_BUF; break; case XFS_BLF_PDQUOT_BUF: type = XFS_BLFT_PDQUOT_BUF; break; case XFS_BLF_GDQUOT_BUF: type = XFS_BLFT_GDQUOT_BUF; break; default: type = XFS_BLFT_UNKNOWN_BUF; break; } bp->b_flags |= _XBF_DQUOTS; xfs_trans_buf_set_type(tp, bp, type); } |
2447 5111 500 4 2372 371 104 2042 5105 2448 2495 714 501 501 233 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 | // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/fast_commit.c * * Written by Harshad Shirwadkar <harshadshirwadkar@gmail.com> * * Ext4 fast commits routines. */ #include "ext4.h" #include "ext4_jbd2.h" #include "ext4_extents.h" #include "mballoc.h" /* * Ext4 Fast Commits * ----------------- * * Ext4 fast commits implement fine grained journalling for Ext4. * * Fast commits are organized as a log of tag-length-value (TLV) structs. (See * struct ext4_fc_tl). Each TLV contains some delta that is replayed TLV by * TLV during the recovery phase. For the scenarios for which we currently * don't have replay code, fast commit falls back to full commits. * Fast commits record delta in one of the following three categories. * * (A) Directory entry updates: * * - EXT4_FC_TAG_UNLINK - records directory entry unlink * - EXT4_FC_TAG_LINK - records directory entry link * - EXT4_FC_TAG_CREAT - records inode and directory entry creation * * (B) File specific data range updates: * * - EXT4_FC_TAG_ADD_RANGE - records addition of new blocks to an inode * - EXT4_FC_TAG_DEL_RANGE - records deletion of blocks from an inode * * (C) Inode metadata (mtime / ctime etc): * * - EXT4_FC_TAG_INODE - record the inode that should be replayed * during recovery. Note that iblocks field is * not replayed and instead derived during * replay. * Commit Operation * ---------------- * With fast commits, we maintain all the directory entry operations in the * order in which they are issued in an in-memory queue. This queue is flushed * to disk during the commit operation. We also maintain a list of inodes * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * * [1] Lock inodes for any further data updates by setting COMMITTING state * [2] Submit data buffers of all the inodes * [3] Wait for [2] to complete * [4] Commit all the directory entry updates in the fast commit space * [5] Commit all the changed inode structures * [6] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). * [7] Wait for [4], [5] and [6] to complete. * * All the inode updates must call ext4_fc_start_update() before starting an * update. If such an ongoing update is present, fast commit waits for it to * complete. The completion of such an update is marked by * ext4_fc_stop_update(). * * Fast Commit Ineligibility * ------------------------- * * Not all operations are supported by fast commits today (e.g extended * attributes). Fast commit ineligibility is marked by calling * ext4_fc_mark_ineligible(): This makes next fast commit operation to fall back * to full commit. * * Atomicity of commits * -------------------- * In order to guarantee atomicity during the commit operation, fast commit * uses "EXT4_FC_TAG_TAIL" tag that marks a fast commit as complete. Tail * tag contains CRC of the contents and TID of the transaction after which * this fast commit should be applied. Recovery code replays fast commit * logs only if there's at least 1 valid tail present. For every fast commit * operation, there is 1 tail. This means, we may end up with multiple tails * in the fast commit space. Here's an example: * * - Create a new file A and remove existing file B * - fsync() * - Append contents to file A * - Truncate file A * - fsync() * * The fast commit space at the end of above operations would look like this: * [HEAD] [CREAT A] [UNLINK B] [TAIL] [ADD_RANGE A] [DEL_RANGE A] [TAIL] * |<--- Fast Commit 1 --->|<--- Fast Commit 2 ---->| * * Replay code should thus check for all the valid tails in the FC area. * * Fast Commit Replay Idempotence * ------------------------------ * * Fast commits tags are idempotent in nature provided the recovery code follows * certain rules. The guiding principle that the commit path follows while * committing is that it stores the result of a particular operation instead of * storing the procedure. * * Let's consider this rename operation: 'mv /a /b'. Let's assume dirent '/a' * was associated with inode 10. During fast commit, instead of storing this * operation as a procedure "rename a to b", we store the resulting file system * state as a "series" of outcomes: * * - Link dirent b to inode 10 * - Unlink dirent a * - Inode <10> with valid refcount * * Now when recovery code runs, it needs "enforce" this state on the file * system. This is what guarantees idempotence of fast commit replay. * * Let's take an example of a procedure that is not idempotent and see how fast * commits make it idempotent. Consider following sequence of operations: * * rm A; mv B A; read A * (x) (y) (z) * * (x), (y) and (z) are the points at which we can crash. If we store this * sequence of operations as is then the replay is not idempotent. Let's say * while in replay, we crash at (z). During the second replay, file A (which was * actually created as a result of "mv B A" operation) would get deleted. Thus, * file named A would be absent when we try to read A. So, this sequence of * operations is not idempotent. However, as mentioned above, instead of storing * the procedure fast commits store the outcome of each procedure. Thus the fast * commit log for above procedure would be as follows: * * (Let's assume dirent A was linked to inode 10 and dirent B was linked to * inode 11 before the replay) * * [Unlink A] [Link A to inode 11] [Unlink B] [Inode 11] * (w) (x) (y) (z) * * If we crash at (z), we will have file A linked to inode 11. During the second * replay, we will remove file A (inode 11). But we will create it back and make * it point to inode 11. We won't find B, so we'll just skip that step. At this * point, the refcount for inode 11 is not reliable, but that gets fixed by the * replay of last inode 11 tag. Crashes at points (w), (x) and (y) get handled * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * * TODOs * ----- * * 0) Fast commit replay path hardening: Fast commit replay code should use * journal handles to make sure all the updates it does during the replay * path are atomic. With that if we crash during fast commit replay, after * trying to do recovery again, we will find a file system where fast commit * area is invalid (because new full commit would be found). In order to deal * with that, fast commit replay code should ensure that the "FC_REPLAY" * superblock state is persisted before starting the replay, so that after * the crash, fast commit recovery code can look at that flag and perform * fast commit recovery even if that area is invalidated by later full * commits. * * 1) Fast commit's commit path locks the entire file system during fast * commit. This has significant performance penalty. Instead of that, we * should use ext4_fc_start/stop_update functions to start inode level * updates from ext4_journal_start/stop. Once we do that we can drop file * system locking during commit path. * * 2) Handle more ineligible cases. */ #include <trace/events/ext4.h> static struct kmem_cache *ext4_fc_dentry_cachep; static void ext4_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { BUFFER_TRACE(bh, ""); if (uptodate) { ext4_debug("%s: Block %lld up-to-date", __func__, bh->b_blocknr); set_buffer_uptodate(bh); } else { ext4_debug("%s: Block %lld not up-to-date", __func__, bh->b_blocknr); clear_buffer_uptodate(bh); } unlock_buffer(bh); } static inline void ext4_fc_reset_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ei->i_fc_lblk_start = 0; ei->i_fc_lblk_len = 0; } void ext4_fc_init_inode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_fc_reset_inode(inode); ext4_clear_inode_state(inode, EXT4_STATE_FC_COMMITTING); INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); atomic_set(&ei->i_fc_updates, 0); } /* This function must be called with sbi->s_fc_lock held. */ static void ext4_fc_wait_committing_inode(struct inode *inode) __releases(&EXT4_SB(inode->i_sb)->s_fc_lock) { wait_queue_head_t *wq; struct ext4_inode_info *ei = EXT4_I(inode); #if (BITS_PER_LONG < 64) DEFINE_WAIT_BIT(wait, &ei->i_state_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else DEFINE_WAIT_BIT(wait, &ei->i_flags, EXT4_STATE_FC_COMMITTING); wq = bit_waitqueue(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); schedule(); finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) { return (!test_opt2(sb, JOURNAL_FAST_COMMIT) || (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)); } /* * Inform Ext4's fast about start of an inode update * * This function is called by the high level call VFS callbacks before * performing any inode update. This function blocks if there's an ongoing * fast commit on the inode in question. */ void ext4_fc_start_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list)) goto out; if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } out: atomic_inc(&ei->i_fc_updates); spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); } /* * Stop inode update and wake up waiting fast commits if any. */ void ext4_fc_stop_update(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); if (ext4_fc_disabled(inode->i_sb)) return; if (atomic_dec_and_test(&ei->i_fc_updates)) wake_up_all(&ei->i_fc_wait); } /* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ void ext4_fc_del(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; if (ext4_fc_disabled(inode->i_sb)) return; restart: spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); return; } if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { ext4_fc_wait_committing_inode(inode); goto restart; } if (!list_empty(&ei->i_fc_list)) list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { spin_unlock(&sbi->s_fc_lock); return; } fc_dentry = list_first_entry(&ei->i_fc_dilist, struct ext4_fc_dentry_update, fcd_dilist); WARN_ON(fc_dentry->fcd_op != EXT4_FC_TAG_CREAT); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); spin_unlock(&sbi->s_fc_lock); if (fc_dentry->fcd_name.name && fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); return; } /* * Mark file system as fast commit ineligible, and record latest * ineligible transaction tid. This means until the recorded * transaction, commit operation would result in a full jbd2 commit. */ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle) { struct ext4_sb_info *sbi = EXT4_SB(sb); tid_t tid; if (ext4_fc_disabled(sb)) return; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (handle && !IS_ERR(handle)) tid = handle->h_transaction->t_tid; else { read_lock(&sbi->s_journal->j_state_lock); tid = sbi->s_journal->j_running_transaction ? sbi->s_journal->j_running_transaction->t_tid : 0; read_unlock(&sbi->s_journal->j_state_lock); } spin_lock(&sbi->s_fc_lock); if (sbi->s_fc_ineligible_tid < tid) sbi->s_fc_ineligible_tid = tid; spin_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } /* * Generic fast commit tracking function. If this is the first time this we are * called after a full commit, we initialize fast commit fields and then call * __fc_track_fn() with update = 0. If we have already been called after a full * commit, we pass update = 1. Based on that, the track function can determine * if it needs to track a field for the first time or if it needs to just * update the previously tracked value. * * If enqueue is set, this function enqueues the inode in fast commit list. */ static int ext4_fc_track_template( handle_t *handle, struct inode *inode, int (*__fc_track_fn)(struct inode *, void *, bool), void *args, int enqueue) { bool update = false; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); tid_t tid = 0; int ret; tid = handle->h_transaction->t_tid; mutex_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { ext4_fc_reset_inode(inode); ei->i_sync_tid = tid; } ret = __fc_track_fn(inode, args, update); mutex_unlock(&ei->i_fc_lock); if (!enqueue) return ret; spin_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? &sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); spin_unlock(&sbi->s_fc_lock); return ret; } struct __track_dentry_update_args { struct dentry *dentry; int op; }; /* __track_fn for directory entry updates. Called with ei->i_fc_lock. */ static int __track_dentry_update(struct inode *inode, void *arg, bool update) { struct ext4_fc_dentry_update *node; struct ext4_inode_info *ei = EXT4_I(inode); struct __track_dentry_update_args *dentry_update = (struct __track_dentry_update_args *)arg; struct dentry *dentry = dentry_update->dentry; struct inode *dir = dentry->d_parent->d_inode; struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); mutex_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, NULL); mutex_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } node->fcd_op = dentry_update->op; node->fcd_parent = dir->i_ino; node->fcd_ino = inode->i_ino; if (dentry->d_name.len > DNAME_INLINE_LEN) { node->fcd_name.name = kmalloc(dentry->d_name.len, GFP_NOFS); if (!node->fcd_name.name) { kmem_cache_free(ext4_fc_dentry_cachep, node); ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, NULL); mutex_lock(&ei->i_fc_lock); return -ENOMEM; } memcpy((u8 *)node->fcd_name.name, dentry->d_name.name, dentry->d_name.len); } else { memcpy(node->fcd_iname, dentry->d_name.name, dentry->d_name.len); node->fcd_name.name = node->fcd_iname; } node->fcd_name.len = dentry->d_name.len; INIT_LIST_HEAD(&node->fcd_dilist); spin_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_STAGING]); else list_add_tail(&node->fcd_list, &sbi->s_fc_dentry_q[FC_Q_MAIN]); /* * This helps us keep a track of all fc_dentry updates which is part of * this ext4 inode. So in case the inode is getting unlinked, before * even we get a chance to fsync, we could remove all fc_dentry * references while evicting the inode in ext4_fc_del(). * Also with this, we don't need to loop over all the inodes in * sbi->s_fc_q to get the corresponding inode in * ext4_fc_commit_dentry_updates(). */ if (dentry_update->op == EXT4_FC_TAG_CREAT) { WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } spin_unlock(&sbi->s_fc_lock); mutex_lock(&ei->i_fc_lock); return 0; } void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_UNLINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_unlink(handle, inode, dentry, ret); } void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_unlink(handle, inode, dentry); } void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_LINK; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_link(handle, inode, dentry, ret); } void ext4_fc_track_link(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_link(handle, inode, dentry); } void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry) { struct __track_dentry_update_args args; int ret; args.dentry = dentry; args.op = EXT4_FC_TAG_CREAT; ret = ext4_fc_track_template(handle, inode, __track_dentry_update, (void *)&args, 0); trace_ext4_fc_track_create(handle, inode, dentry, ret); } void ext4_fc_track_create(handle_t *handle, struct dentry *dentry) { struct inode *inode = d_inode(dentry); if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; __ext4_fc_track_create(handle, inode, dentry); } /* __track_fn for inode tracking */ static int __track_inode(struct inode *inode, void *arg, bool update) { if (update) return -EEXIST; EXT4_I(inode)->i_fc_lblk_len = 0; return 0; } void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_should_journal_data(inode)) { ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_INODE_JOURNAL_DATA, handle); return; } if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } struct __track_range_args { ext4_lblk_t start, end; }; /* __track_fn for tracking data updates */ static int __track_range(struct inode *inode, void *arg, bool update) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_lblk_t oldstart; struct __track_range_args *__arg = (struct __track_range_args *)arg; if (inode->i_ino < EXT4_FIRST_INO(inode->i_sb)) { ext4_debug("Special inode %ld being modified\n", inode->i_ino); return -ECANCELED; } oldstart = ei->i_fc_lblk_start; if (update && ei->i_fc_lblk_len > 0) { ei->i_fc_lblk_start = min(ei->i_fc_lblk_start, __arg->start); ei->i_fc_lblk_len = max(oldstart + ei->i_fc_lblk_len - 1, __arg->end) - ei->i_fc_lblk_start + 1; } else { ei->i_fc_lblk_start = __arg->start; ei->i_fc_lblk_len = __arg->end - __arg->start + 1; } return 0; } void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { struct __track_range_args args; int ret; if (S_ISDIR(inode->i_mode)) return; if (ext4_fc_disabled(inode->i_sb)) return; if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; args.start = start; args.end = end; ret = ext4_fc_track_template(handle, inode, __track_range, &args, 1); trace_ext4_fc_track_range(handle, inode, start, end, ret); } static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { blk_opf_t write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = ext4_end_buffer_io_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); EXT4_SB(sb)->s_fc_bh = NULL; } /* Ext4 commit path routines */ /* * Allocate len bytes on a fast commit buffer. * * During the commit time this function is used to manage fast commit * block space. We don't split a fast commit log onto different * blocks. So this function makes sure that if there's not enough space * on the current block, the remaining space in the current block is * marked as unused by adding EXT4_FC_TAG_PAD tag. In that case, * new block is from jbd2 and CRC is updated to reflect the padding * we added. */ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) { struct ext4_fc_tl tl; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; int bsize = sbi->s_journal->j_blocksize; int ret, off = sbi->s_fc_bytes % bsize; int remaining; u8 *dst; /* * If 'len' is too long to fit in any block alongside a PAD tlv, then we * cannot fulfill the request. */ if (len > bsize - EXT4_FC_TAG_BASE_LEN) return NULL; if (!sbi->s_fc_bh) { ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; } dst = sbi->s_fc_bh->b_data + off; /* * Allocate the bytes in the current block if we can do so while still * leaving enough space for a PAD tlv. */ remaining = bsize - EXT4_FC_TAG_BASE_LEN - off; if (len <= remaining) { sbi->s_fc_bytes += len; return dst; } /* * Else, terminate the current block with a PAD tlv, then allocate a new * block and allocate the bytes at the start of that new block. */ tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_PAD); tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) return NULL; sbi->s_fc_bh = bh; sbi->s_fc_bytes += bsize - off + len; return sbi->s_fc_bh->b_data; } /* * Complete a fast commit by writing tail tag. * * Writing tail tag marks the end of a fast commit. In order to guarantee * atomicity, after writing tail tag, even if there's space remaining * in the block, next commit shouldn't use it. That's why tail tag * has the length as that of the remaining space on the block. */ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl tl; struct ext4_fc_tail tail; int off, bsize = sbi->s_journal->j_blocksize; u8 *dst; /* * ext4_fc_reserve_space takes care of allocating an extra block if * there's no enough space on this block for accommodating this tail. */ dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(tail), &crc); if (!dst) return -ENOSPC; off = sbi->s_fc_bytes % bsize; tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_TAIL); tl.fc_len = cpu_to_le16(bsize - off + sizeof(struct ext4_fc_tail)); sbi->s_fc_bytes = round_up(sbi->s_fc_bytes, bsize); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); dst += sizeof(tail.fc_crc); memset(dst, 0, bsize - off); /* Don't leak uninitialized memory. */ ext4_fc_submit_bh(sb, true); return 0; } /* * Adds tag, length, value and updates CRC. Returns true if tlv was added. * Returns false if there's not enough space. */ static bool ext4_fc_add_tlv(struct super_block *sb, u16 tag, u16 len, u8 *val, u32 *crc) { struct ext4_fc_tl tl; u8 *dst; dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + len, crc); if (!dst) return false; tl.fc_tag = cpu_to_le16(tag); tl.fc_len = cpu_to_le16(len); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memcpy(dst + EXT4_FC_TAG_BASE_LEN, val, len); return true; } /* Same as above, but adds dentry tlv. */ static bool ext4_fc_add_dentry_tlv(struct super_block *sb, u32 *crc, struct ext4_fc_dentry_update *fc_dentry) { struct ext4_fc_dentry_info fcd; struct ext4_fc_tl tl; int dlen = fc_dentry->fcd_name.len; u8 *dst = ext4_fc_reserve_space(sb, EXT4_FC_TAG_BASE_LEN + sizeof(fcd) + dlen, crc); if (!dst) return false; fcd.fc_parent_ino = cpu_to_le32(fc_dentry->fcd_parent); fcd.fc_ino = cpu_to_le32(fc_dentry->fcd_ino); tl.fc_tag = cpu_to_le16(fc_dentry->fcd_op); tl.fc_len = cpu_to_le16(sizeof(fcd) + dlen); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fcd, sizeof(fcd)); dst += sizeof(fcd); memcpy(dst, fc_dentry->fcd_name.name, dlen); return true; } /* * Writes inode in the fast commit space under TLV with tag @tag. * Returns 0 on success, error on failure. */ static int ext4_fc_write_inode(struct inode *inode, u32 *crc) { struct ext4_inode_info *ei = EXT4_I(inode); int inode_len = EXT4_GOOD_OLD_INODE_SIZE; int ret; struct ext4_iloc iloc; struct ext4_fc_inode fc_inode; struct ext4_fc_tl tl; u8 *dst; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) inode_len = EXT4_INODE_SIZE(inode->i_sb); else if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) inode_len += ei->i_extra_isize; fc_inode.fc_ino = cpu_to_le32(inode->i_ino); tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE); tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino)); ret = -ECANCELED; dst = ext4_fc_reserve_space(inode->i_sb, EXT4_FC_TAG_BASE_LEN + inode_len + sizeof(fc_inode.fc_ino), crc); if (!dst) goto err; memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); dst += EXT4_FC_TAG_BASE_LEN; memcpy(dst, &fc_inode, sizeof(fc_inode)); dst += sizeof(fc_inode); memcpy(dst, (u8 *)ext4_raw_inode(&iloc), inode_len); ret = 0; err: brelse(iloc.bh); return ret; } /* * Writes updated data ranges for the inode in question. Updates CRC. * Returns 0 on success, error otherwise. */ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) { ext4_lblk_t old_blk_size, cur_lblk_off, new_blk_size; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_map_blocks map; struct ext4_fc_add_range fc_ext; struct ext4_fc_del_range lrange; struct ext4_extent *ex; int ret; mutex_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { mutex_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; mutex_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", cur_lblk_off, new_blk_size, inode->i_ino); while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return -ECANCELED; if (map.m_len == 0) { cur_lblk_off++; continue; } if (ret == 0) { lrange.fc_ino = cpu_to_le32(inode->i_ino); lrange.fc_lblk = cpu_to_le32(map.m_lblk); lrange.fc_len = cpu_to_le32(map.m_len); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_DEL_RANGE, sizeof(lrange), (u8 *)&lrange, crc)) return -ENOSPC; } else { unsigned int max = (map.m_flags & EXT4_MAP_UNWRITTEN) ? EXT_UNWRITTEN_MAX_LEN : EXT_INIT_MAX_LEN; /* Limit the number of blocks in one extent */ map.m_len = min(max, map.m_len); fc_ext.fc_ino = cpu_to_le32(inode->i_ino); ex = (struct ext4_extent *)&fc_ext.fc_ex; ex->ee_block = cpu_to_le32(map.m_lblk); ex->ee_len = cpu_to_le16(map.m_len); ext4_ext_store_pblock(ex, map.m_pblk); if (map.m_flags & EXT4_MAP_UNWRITTEN) ext4_ext_mark_unwritten(ex); else ext4_ext_mark_initialized(ex); if (!ext4_fc_add_tlv(inode->i_sb, EXT4_FC_TAG_ADD_RANGE, sizeof(fc_ext), (u8 *)&fc_ext, crc)) return -ENOSPC; } cur_lblk_off += map.m_len; } return 0; } /* Submit data for all the fast commit inodes */ static int ext4_fc_submit_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); while (atomic_read(&ei->i_fc_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&ei->i_fc_wait, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&ei->i_fc_updates)) { spin_unlock(&sbi->s_fc_lock); schedule(); spin_lock(&sbi->s_fc_lock); } finish_wait(&ei->i_fc_wait, &wait); } spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return ret; } /* Wait for completion of data for all the fast commit inodes */ static int ext4_fc_wait_inode_data_all(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *pos, *n; int ret = 0; spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { if (!ext4_test_inode_state(&pos->vfs_inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = jbd2_wait_inode_data(journal, pos->jinode); if (ret) return ret; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) __acquires(&sbi->s_fc_lock) __releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_dentry_update *fc_dentry, *fc_dentry_n; struct inode *inode; struct ext4_inode_info *ei; int ret; if (list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) return 0; list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { spin_unlock(&sbi->s_fc_lock); if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the * corresponding inode pointer */ WARN_ON(list_empty(&fc_dentry->fcd_dilist)); ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); spin_unlock(&sbi->s_fc_lock); /* * We first write the inode and then the create dirent. This * allows the recovery code to create an unnamed inode first * and then link it to a directory entry. This allows us * to use namei.c routines almost as is and simplifies * the recovery code. */ ret = ext4_fc_write_inode(inode, crc); if (ret) goto lock_and_exit; ret = ext4_fc_write_inode_data(inode, crc); if (ret) goto lock_and_exit; if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { ret = -ENOSPC; goto lock_and_exit; } spin_lock(&sbi->s_fc_lock); } return 0; lock_and_exit: spin_lock(&sbi->s_fc_lock); return ret; } static int ext4_fc_perform_commit(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter; struct ext4_fc_head head; struct inode *inode; struct blk_plug plug; int ret = 0; u32 crc = 0; ret = ext4_fc_submit_inode_data_all(journal); if (ret) return ret; ret = ext4_fc_wait_inode_data_all(journal); if (ret) return ret; /* * If file system device is different from journal device, issue a cache * flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); if (sbi->s_fc_bytes == 0) { /* * Add a head tag only if this is the first fast commit * in this TID. */ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), (u8 *)&head, &crc)) { ret = -ENOSPC; goto out; } } spin_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); if (ret) { spin_unlock(&sbi->s_fc_lock); goto out; } list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; spin_lock(&sbi->s_fc_lock); } spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_tail(sb, crc); out: blk_finish_plug(&plug); return ret; } static void ext4_fc_update_stats(struct super_block *sb, int status, u64 commit_time, int nblks, tid_t commit_tid) { struct ext4_fc_stats *stats = &EXT4_SB(sb)->s_fc_stats; ext4_debug("Fast commit ended with status = %d for tid %u", status, commit_tid); if (status == EXT4_FC_STATUS_OK) { stats->fc_num_commits++; stats->fc_numblks += nblks; if (likely(stats->s_fc_avg_commit_time)) stats->s_fc_avg_commit_time = (commit_time + stats->s_fc_avg_commit_time * 3) / 4; else stats->s_fc_avg_commit_time = commit_time; } else if (status == EXT4_FC_STATUS_FAILED || status == EXT4_FC_STATUS_INELIGIBLE) { if (status == EXT4_FC_STATUS_FAILED) stats->fc_failed_commits++; stats->fc_ineligible_commits++; } else { stats->fc_skipped_commits++; } trace_ext4_fc_commit_stop(sb, nblks, status, commit_tid); } /* * The main commit entry point. Performs a fast commit for transaction * commit_tid if needed. If it's not possible to perform a fast commit * due to various reasons, we fall back to full commit. Returns 0 * on success, error otherwise. */ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); int nblks = 0, ret, bsize = journal->j_blocksize; int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); if (ret == -EALREADY) { /* There was an ongoing commit, check if we need to restart */ if (atomic_read(&sbi->s_fc_subtid) <= subtid && commit_tid > journal->j_commit_sequence) goto restart_fc; ext4_fc_update_stats(sb, EXT4_FC_STATUS_SKIPPED, 0, 0, commit_tid); return 0; } else if (ret) { /* * Commit couldn't start. Just update stats and perform a * full commit. */ ext4_fc_update_stats(sb, EXT4_FC_STATUS_FAILED, 0, 0, commit_tid); return jbd2_complete_transaction(journal, commit_tid); } /* * After establishing journal barrier via jbd2_fc_begin_commit(), check * if we are fast commit ineligible. */ if (ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE)) { status = EXT4_FC_STATUS_INELIGIBLE; goto fallback; } fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } nblks = (sbi->s_fc_bytes + bsize - 1) / bsize - fc_bufs_before; ret = jbd2_fc_wait_bufs(journal, nblks); if (ret < 0) { status = EXT4_FC_STATUS_FAILED; goto fallback; } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time */ commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); ext4_fc_update_stats(sb, status, commit_time, nblks, commit_tid); return ret; fallback: ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; } /* * Fast commit cleanup routine. This is called after every fast commit and * full commit. full is true if we are called after a full commit. */ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *iter, *iter_n; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) sbi->s_fc_bh = NULL; trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); spin_lock(&sbi->s_fc_lock); list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { list_del_init(&iter->i_fc_list); ext4_clear_inode_state(&iter->vfs_inode, EXT4_STATE_FC_COMMITTING); if (iter->i_sync_tid <= tid) ext4_fc_reset_inode(&iter->vfs_inode); /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ smp_mb(); #if (BITS_PER_LONG < 64) wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); #else wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); #endif } while (!list_empty(&sbi->s_fc_dentry_q[FC_Q_MAIN])) { fc_dentry = list_first_entry(&sbi->s_fc_dentry_q[FC_Q_MAIN], struct ext4_fc_dentry_update, fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); spin_unlock(&sbi->s_fc_lock); if (fc_dentry->fcd_name.name && fc_dentry->fcd_name.len > DNAME_INLINE_LEN) kfree(fc_dentry->fcd_name.name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], &sbi->s_fc_q[FC_Q_MAIN]); if (tid >= sbi->s_fc_ineligible_tid) { sbi->s_fc_ineligible_tid = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); } if (full) sbi->s_fc_bytes = 0; spin_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } /* Ext4 Replay Path Routines */ /* Helper struct for dentry replay routines */ struct dentry_info_args { int parent_ino, dname_len, ino, inode_len; char *dname; }; /* Same as struct ext4_fc_tl, but uses native endianness fields */ struct ext4_fc_tl_mem { u16 fc_tag; u16 fc_len; }; static inline void tl_to_darg(struct dentry_info_args *darg, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_dentry_info fcd; memcpy(&fcd, val, sizeof(fcd)); darg->parent_ino = le32_to_cpu(fcd.fc_parent_ino); darg->ino = le32_to_cpu(fcd.fc_ino); darg->dname = val + offsetof(struct ext4_fc_dentry_info, fc_dname); darg->dname_len = tl->fc_len - sizeof(struct ext4_fc_dentry_info); } static inline void ext4_fc_get_tl(struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_tl tl_disk; memcpy(&tl_disk, val, EXT4_FC_TAG_BASE_LEN); tl->fc_len = le16_to_cpu(tl_disk.fc_len); tl->fc_tag = le16_to_cpu(tl_disk.fc_tag); } /* Unlink replay function */ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode, *old_parent; struct qstr entry; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_UNLINK, darg.ino, darg.parent_ino, darg.dname_len); entry.name = darg.dname; entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(old_parent)) { ext4_debug("Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; } ret = __ext4_unlink(old_parent, &entry, inode, NULL); /* -ENOENT ok coz it might not exist anymore. */ if (ret == -ENOENT) ret = 0; iput(old_parent); iput(inode); return ret; } static int ext4_fc_replay_link_internal(struct super_block *sb, struct dentry_info_args *darg, struct inode *inode) { struct inode *dir = NULL; struct dentry *dentry_dir = NULL, *dentry_inode = NULL; struct qstr qstr_dname = QSTR_INIT(darg->dname, darg->dname_len); int ret = 0; dir = ext4_iget(sb, darg->parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir with inode %d not found.", darg->parent_ino); dir = NULL; goto out; } dentry_dir = d_obtain_alias(dir); if (IS_ERR(dentry_dir)) { ext4_debug("Failed to obtain dentry"); dentry_dir = NULL; goto out; } dentry_inode = d_alloc(dentry_dir, &qstr_dname); if (!dentry_inode) { ext4_debug("Inode dentry not created."); ret = -ENOMEM; goto out; } ret = __ext4_link(dir, inode, dentry_inode); /* * It's possible that link already existed since data blocks * for the dir in question got persisted before we crashed OR * we replayed this tag and crashed before the entire replay * could complete. */ if (ret && ret != -EEXIST) { ext4_debug("Failed to link\n"); goto out; } ret = 0; out: if (dentry_dir) { d_drop(dentry_dir); dput(dentry_dir); } else if (dir) { iput(dir); } if (dentry_inode) { d_drop(dentry_inode); dput(dentry_inode); } return ret; } /* Link replay function */ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct dentry_info_args darg; int ret = 0; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_LINK, darg.ino, darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_replay_link_internal(sb, &darg, inode); iput(inode); return ret; } /* * Record all the modified inodes during replay. We use this later to setup * block bitmaps correctly. */ static int ext4_fc_record_modified_inode(struct super_block *sb, int ino) { struct ext4_fc_replay_state *state; int i; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) if (state->fc_modified_inodes[i] == ino) return 0; if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) { int *fc_modified_inodes; fc_modified_inodes = krealloc(state->fc_modified_inodes, sizeof(int) * (state->fc_modified_inodes_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_modified_inodes) return -ENOMEM; state->fc_modified_inodes = fc_modified_inodes; state->fc_modified_inodes_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; } state->fc_modified_inodes[state->fc_modified_inodes_used++] = ino; return 0; } /* * Inode replay function */ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_inode fc_inode; struct ext4_inode *raw_inode; struct ext4_inode *raw_fc_inode; struct inode *inode = NULL; struct ext4_iloc iloc; int inode_len, ino, ret, tag = tl->fc_tag; struct ext4_extent_header *eh; size_t off_gen = offsetof(struct ext4_inode, i_generation); memcpy(&fc_inode, val, sizeof(fc_inode)); ino = le32_to_cpu(fc_inode.fc_ino); trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } inode = NULL; ret = ext4_fc_record_modified_inode(sb, ino); if (ret) goto out; raw_fc_inode = (struct ext4_inode *) (val + offsetof(struct ext4_fc_inode, fc_raw_inode)); ret = ext4_get_fc_inode_loc(sb, ino, &iloc); if (ret) goto out; inode_len = tl->fc_len - sizeof(struct ext4_fc_inode); raw_inode = ext4_raw_inode(&iloc); memcpy(raw_inode, raw_fc_inode, offsetof(struct ext4_inode, i_block)); memcpy((u8 *)raw_inode + off_gen, (u8 *)raw_fc_inode + off_gen, inode_len - off_gen); if (le32_to_cpu(raw_inode->i_flags) & EXT4_EXTENTS_FL) { eh = (struct ext4_extent_header *)(&raw_inode->i_block[0]); if (eh->eh_magic != EXT4_EXT_MAGIC) { memset(eh, 0, sizeof(*eh)); eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16( (sizeof(raw_inode->i_block) - sizeof(struct ext4_extent_header)) / sizeof(struct ext4_extent)); } } else if (le32_to_cpu(raw_inode->i_flags) & EXT4_INLINE_DATA_FL) { memcpy(raw_inode->i_block, raw_fc_inode->i_block, sizeof(raw_inode->i_block)); } /* Immediately update the inode on disk. */ ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); if (ret) goto out; ret = sync_dirty_buffer(iloc.bh); if (ret) goto out; ret = ext4_mark_inode_used(sb, ino); if (ret) goto out; /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return -EFSCORRUPTED; } /* * Our allocator could have made different decisions than before * crashing. This should be fixed but until then, we calculate * the number of blocks the inode. */ if (!ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) ext4_ext_replay_set_iblocks(inode); inode->i_generation = le32_to_cpu(ext4_raw_inode(&iloc)->i_generation); ext4_reset_inode_seed(inode); ext4_inode_csum_set(inode, ext4_raw_inode(&iloc), EXT4_I(inode)); ret = ext4_handle_dirty_metadata(NULL, NULL, iloc.bh); sync_dirty_buffer(iloc.bh); brelse(iloc.bh); out: iput(inode); if (!ret) blkdev_issue_flush(sb->s_bdev); return 0; } /* * Dentry create replay function. * * EXT4_FC_TAG_CREAT is preceded by EXT4_FC_TAG_INODE_FULL. Which means, the * inode for which we are trying to create a dentry here, should already have * been replayed before we start here. */ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { int ret = 0; struct inode *inode = NULL; struct inode *dir = NULL; struct dentry_info_args darg; tl_to_darg(&darg, tl, val); trace_ext4_fc_replay(sb, EXT4_FC_TAG_CREAT, darg.ino, darg.parent_ino, darg.dname_len); /* This takes care of update group descriptor and other metadata */ ret = ext4_mark_inode_used(sb, darg.ino); if (ret) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; goto out; } if (S_ISDIR(inode->i_mode)) { /* * If we are creating a directory, we need to make sure that the * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); if (IS_ERR(dir)) { ext4_debug("Dir %d not found.", darg.ino); goto out; } ret = ext4_init_new_dir(NULL, dir, inode); iput(dir); if (ret) { ret = 0; goto out; } } ret = ext4_fc_replay_link_internal(sb, &darg, inode); if (ret) goto out; set_nlink(inode, 1); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return ret; } /* * Record physical disk regions which are in use as per fast commit area, * and used by inodes during replay phase. Our simple replay phase * allocator excludes these regions from allocation. */ int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay) { struct ext4_fc_replay_state *state; struct ext4_fc_alloc_region *region; state = &EXT4_SB(sb)->s_fc_replay_state; /* * during replay phase, the fc_regions_valid may not same as * fc_regions_used, update it when do new additions. */ if (replay && state->fc_regions_used != state->fc_regions_valid) state->fc_regions_used = state->fc_regions_valid; if (state->fc_regions_used == state->fc_regions_size) { struct ext4_fc_alloc_region *fc_regions; fc_regions = krealloc(state->fc_regions, sizeof(struct ext4_fc_alloc_region) * (state->fc_regions_size + EXT4_FC_REPLAY_REALLOC_INCREMENT), GFP_KERNEL); if (!fc_regions) return -ENOMEM; state->fc_regions_size += EXT4_FC_REPLAY_REALLOC_INCREMENT; state->fc_regions = fc_regions; } region = &state->fc_regions[state->fc_regions_used++]; region->ino = ino; region->lblk = lblk; region->pblk = pblk; region->len = len; if (replay) state->fc_regions_valid++; return 0; } /* Replay add range tag */ static int ext4_fc_replay_add_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct ext4_fc_add_range fc_add_ex; struct ext4_extent newex, *ex; struct inode *inode; ext4_lblk_t start, cur; int remaining, len; ext4_fsblk_t start_pblk; struct ext4_map_blocks map; struct ext4_ext_path *path = NULL; int ret; memcpy(&fc_add_ex, val, sizeof(fc_add_ex)); ex = (struct ext4_extent *)&fc_add_ex.fc_ex; trace_ext4_fc_replay(sb, EXT4_FC_TAG_ADD_RANGE, le32_to_cpu(fc_add_ex.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); inode = ext4_iget(sb, le32_to_cpu(fc_add_ex.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode not found."); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; start = le32_to_cpu(ex->ee_block); start_pblk = ext4_ext_pblock(ex); len = ext4_ext_get_actual_len(ex); cur = start; remaining = len; ext4_debug("ADD_RANGE, lblk %d, pblk %lld, len %d, unwritten %d, inode %ld\n", start, start_pblk, len, ext4_ext_is_unwritten(ex), inode->i_ino); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; map.m_pblk = 0; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret == 0) { /* Range is not mapped */ path = ext4_find_extent(inode, cur, NULL, 0); if (IS_ERR(path)) goto out; memset(&newex, 0, sizeof(newex)); newex.ee_block = cpu_to_le32(cur); ext4_ext_store_pblock( &newex, start_pblk + cur - start); newex.ee_len = cpu_to_le16(map.m_len); if (ext4_ext_is_unwritten(ex)) ext4_ext_mark_unwritten(&newex); down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_insert_extent( NULL, inode, &path, &newex, 0); up_write((&EXT4_I(inode)->i_data_sem)); ext4_free_ext_path(path); if (ret) goto out; goto next; } if (start_pblk + cur - start != map.m_pblk) { /* * Logical to physical mapping changed. This can happen * if this range was removed and then reallocated to * map to new physical blocks during a fast commit. */ ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), start_pblk + cur - start); if (ret) goto out; /* * Mark the old blocks as free since they aren't used * anymore. We maintain an array of all the modified * inodes. In case these blocks are still used at either * a different logical range in the same inode or in * some different inode, we will mark them as allocated * at the end of the FC replay using our array of * modified inodes. */ ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); goto next; } /* Range is mapped and needs a state change */ ext4_debug("Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, ext4_ext_is_unwritten(ex), map.m_pblk); if (ret) goto out; /* * We may have split the extent tree while toggling the state. * Try to shrink the extent tree now. */ ext4_ext_replay_shrink_inode(inode, start + len); next: cur += map.m_len; remaining -= map.m_len; } ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); out: iput(inode); return 0; } /* Replay DEL_RANGE tag */ static int ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl_mem *tl, u8 *val) { struct inode *inode; struct ext4_fc_del_range lrange; struct ext4_map_blocks map; ext4_lblk_t cur, remaining; int ret; memcpy(&lrange, val, sizeof(lrange)); cur = le32_to_cpu(lrange.fc_lblk); remaining = le32_to_cpu(lrange.fc_len); trace_ext4_fc_replay(sb, EXT4_FC_TAG_DEL_RANGE, le32_to_cpu(lrange.fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange.fc_ino), EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found", le32_to_cpu(lrange.fc_ino)); return 0; } ret = ext4_fc_record_modified_inode(sb, inode->i_ino); if (ret) goto out; ext4_debug("DEL_RANGE, inode %ld, lblk %d, len %d\n", inode->i_ino, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_len)); while (remaining > 0) { map.m_lblk = cur; map.m_len = remaining; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) goto out; if (ret > 0) { remaining -= ret; cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, false); } else { remaining -= map.m_len; cur += map.m_len; } } down_write(&EXT4_I(inode)->i_data_sem); ret = ext4_ext_remove_space(inode, le32_to_cpu(lrange.fc_lblk), le32_to_cpu(lrange.fc_lblk) + le32_to_cpu(lrange.fc_len) - 1); up_write(&EXT4_I(inode)->i_data_sem); if (ret) goto out; ext4_ext_replay_shrink_inode(inode, i_size_read(inode) >> sb->s_blocksize_bits); ext4_mark_inode_dirty(NULL, inode); out: iput(inode); return 0; } static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) { struct ext4_fc_replay_state *state; struct inode *inode; struct ext4_ext_path *path = NULL; struct ext4_map_blocks map; int i, ret, j; ext4_lblk_t cur, end; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); if (IS_ERR(inode)) { ext4_debug("Inode %d not found.", state->fc_modified_inodes[i]); continue; } cur = 0; end = EXT_MAX_BLOCKS; if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA)) { iput(inode); continue; } while (cur < end) { map.m_lblk = cur; map.m_len = end - cur; ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) break; if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, NULL, 0); if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, true); ext4_free_ext_path(path); } cur += ret; ext4_mb_mark_bb(inode->i_sb, map.m_pblk, map.m_len, true); } else { cur = cur + (map.m_len ? map.m_len : 1); } } iput(inode); } } /* * Check if block is in excluded regions for block allocation. The simple * allocator that runs during replay phase is calls this function to see * if it is okay to use a block. */ bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t blk) { int i; struct ext4_fc_replay_state *state; state = &EXT4_SB(sb)->s_fc_replay_state; for (i = 0; i < state->fc_regions_valid; i++) { if (state->fc_regions[i].ino == 0 || state->fc_regions[i].len == 0) continue; if (in_range(blk, state->fc_regions[i].pblk, state->fc_regions[i].len)) return true; } return false; } /* Cleanup function called after replay */ void ext4_fc_replay_cleanup(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); sbi->s_mount_state &= ~EXT4_FC_REPLAY; kfree(sbi->s_fc_replay_state.fc_regions); kfree(sbi->s_fc_replay_state.fc_modified_inodes); } static bool ext4_fc_value_len_isvalid(struct ext4_sb_info *sbi, int tag, int len) { switch (tag) { case EXT4_FC_TAG_ADD_RANGE: return len == sizeof(struct ext4_fc_add_range); case EXT4_FC_TAG_DEL_RANGE: return len == sizeof(struct ext4_fc_del_range); case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: len -= sizeof(struct ext4_fc_dentry_info); return len >= 1 && len <= EXT4_NAME_LEN; case EXT4_FC_TAG_INODE: len -= sizeof(struct ext4_fc_inode); return len >= EXT4_GOOD_OLD_INODE_SIZE && len <= sbi->s_inode_size; case EXT4_FC_TAG_PAD: return true; /* padding can have any length */ case EXT4_FC_TAG_TAIL: return len >= sizeof(struct ext4_fc_tail); case EXT4_FC_TAG_HEAD: return len == sizeof(struct ext4_fc_head); } return false; } /* * Recovery Scan phase handler * * This function is called during the scan phase and is responsible * for doing following things: * - Make sure the fast commit area has valid tags for replay * - Count number of tags that need to be replayed by the replay handler * - Verify CRC * - Create a list of excluded blocks for allocation during replay phase * * This function returns JBD2_FC_REPLAY_CONTINUE to indicate that SCAN is * incomplete and JBD2 should send more blocks. It returns JBD2_FC_REPLAY_STOP * to indicate that scan has finished and JBD2 can now start replay phase. * It returns a negative error to indicate that there was an error. At the end * of a successful scan phase, sbi->s_fc_replay_state.fc_replay_num_tags is set * to indicate the number of tags that need to replayed during the replay phase. */ static int ext4_fc_replay_scan(journal_t *journal, struct buffer_head *bh, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_replay_state *state; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_add_range ext; struct ext4_fc_tl_mem tl; struct ext4_fc_tail tail; __u8 *start, *end, *cur, *val; struct ext4_fc_head head; struct ext4_extent *ex; state = &sbi->s_fc_replay_state; start = (u8 *)bh->b_data; end = start + journal->j_blocksize; if (state->fc_replay_expected_off == 0) { state->fc_cur_tag = 0; state->fc_replay_num_tags = 0; state->fc_crc = 0; state->fc_regions = NULL; state->fc_regions_valid = state->fc_regions_used = state->fc_regions_size = 0; /* Check if we can stop early */ if (le16_to_cpu(((struct ext4_fc_tl *)start)->fc_tag) != EXT4_FC_TAG_HEAD) return 0; } if (off != state->fc_replay_expected_off) { ret = -EFSCORRUPTED; goto out_err; } state->fc_replay_expected_off++; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (tl.fc_len > end - val || !ext4_fc_value_len_isvalid(sbi, tl.fc_tag, tl.fc_len)) { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; goto out_err; } ext4_debug("Scan phase, tag:%s, blk %lld\n", tag2str(tl.fc_tag), bh->b_blocknr); switch (tl.fc_tag) { case EXT4_FC_TAG_ADD_RANGE: memcpy(&ext, val, sizeof(ext)); ex = (struct ext4_extent *)&ext.fc_ex; ret = ext4_fc_record_regions(sb, le32_to_cpu(ext.fc_ino), le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_get_actual_len(ex), 0); if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; fallthrough; case EXT4_FC_TAG_DEL_RANGE: case EXT4_FC_TAG_LINK: case EXT4_FC_TAG_UNLINK: case EXT4_FC_TAG_CREAT: case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); if (le32_to_cpu(tail.fc_tid) == expected_tid && le32_to_cpu(tail.fc_crc) == state->fc_crc) { state->fc_replay_num_tags = state->fc_cur_tag; state->fc_regions_valid = state->fc_regions_used; } else { ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -EFSBADCRC; } state->fc_crc = 0; break; case EXT4_FC_TAG_HEAD: memcpy(&head, val, sizeof(head)); if (le32_to_cpu(head.fc_features) & ~EXT4_FC_SUPPORTED_FEATURES) { ret = -EOPNOTSUPP; break; } if (le32_to_cpu(head.fc_tid) != expected_tid) { ret = JBD2_FC_REPLAY_STOP; break; } state->fc_cur_tag++; state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: ret = state->fc_replay_num_tags ? JBD2_FC_REPLAY_STOP : -ECANCELED; } if (ret < 0 || ret == JBD2_FC_REPLAY_STOP) break; } out_err: trace_ext4_fc_replay_scan(sb, ret, off); return ret; } /* * Main recovery path entry point. * The meaning of return codes is similar as above. */ static int ext4_fc_replay(journal_t *journal, struct buffer_head *bh, enum passtype pass, int off, tid_t expected_tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_fc_tl_mem tl; __u8 *start, *end, *cur, *val; int ret = JBD2_FC_REPLAY_CONTINUE; struct ext4_fc_replay_state *state = &sbi->s_fc_replay_state; struct ext4_fc_tail tail; if (pass == PASS_SCAN) { state->fc_current_pass = PASS_SCAN; return ext4_fc_replay_scan(journal, bh, off, expected_tid); } if (state->fc_current_pass != pass) { state->fc_current_pass = pass; sbi->s_mount_state |= EXT4_FC_REPLAY; } if (!sbi->s_fc_replay_state.fc_replay_num_tags) { ext4_debug("Replay stops\n"); ext4_fc_set_bitmaps_and_counters(sb); return 0; } #ifdef CONFIG_EXT4_DEBUG if (sbi->s_fc_debug_max_replay && off >= sbi->s_fc_debug_max_replay) { pr_warn("Dropping fc block %d because max_replay set\n", off); return JBD2_FC_REPLAY_STOP; } #endif start = (u8 *)bh->b_data; end = start + journal->j_blocksize; for (cur = start; cur <= end - EXT4_FC_TAG_BASE_LEN; cur = cur + EXT4_FC_TAG_BASE_LEN + tl.fc_len) { ext4_fc_get_tl(&tl, cur); val = cur + EXT4_FC_TAG_BASE_LEN; if (state->fc_replay_num_tags == 0) { ret = JBD2_FC_REPLAY_STOP; ext4_fc_set_bitmaps_and_counters(sb); break; } ext4_debug("Replay phase, tag:%s\n", tag2str(tl.fc_tag)); state->fc_replay_num_tags--; switch (tl.fc_tag) { case EXT4_FC_TAG_LINK: ret = ext4_fc_replay_link(sb, &tl, val); break; case EXT4_FC_TAG_UNLINK: ret = ext4_fc_replay_unlink(sb, &tl, val); break; case EXT4_FC_TAG_ADD_RANGE: ret = ext4_fc_replay_add_range(sb, &tl, val); break; case EXT4_FC_TAG_CREAT: ret = ext4_fc_replay_create(sb, &tl, val); break; case EXT4_FC_TAG_DEL_RANGE: ret = ext4_fc_replay_del_range(sb, &tl, val); break; case EXT4_FC_TAG_INODE: ret = ext4_fc_replay_inode(sb, &tl, val); break; case EXT4_FC_TAG_PAD: trace_ext4_fc_replay(sb, EXT4_FC_TAG_PAD, 0, tl.fc_len, 0); break; case EXT4_FC_TAG_TAIL: trace_ext4_fc_replay(sb, EXT4_FC_TAG_TAIL, 0, tl.fc_len, 0); memcpy(&tail, val, sizeof(tail)); WARN_ON(le32_to_cpu(tail.fc_tid) != expected_tid); break; case EXT4_FC_TAG_HEAD: break; default: trace_ext4_fc_replay(sb, tl.fc_tag, 0, tl.fc_len, 0); ret = -ECANCELED; break; } if (ret < 0) break; ret = JBD2_FC_REPLAY_CONTINUE; } return ret; } void ext4_fc_init(struct super_block *sb, journal_t *journal) { /* * We set replay callback even if fast commit disabled because we may * could still have fast commit blocks that need to be replayed even if * fast commit has now been turned off. */ journal->j_fc_replay_callback = ext4_fc_replay; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return; journal->j_fc_cleanup_callback = ext4_fc_cleanup; } static const char * const fc_ineligible_reasons[] = { [EXT4_FC_REASON_XATTR] = "Extended attributes changed", [EXT4_FC_REASON_CROSS_RENAME] = "Cross rename", [EXT4_FC_REASON_JOURNAL_FLAG_CHANGE] = "Journal flag changed", [EXT4_FC_REASON_NOMEM] = "Insufficient memory", [EXT4_FC_REASON_SWAP_BOOT] = "Swap boot", [EXT4_FC_REASON_RESIZE] = "Resize", [EXT4_FC_REASON_RENAME_DIR] = "Dir renamed", [EXT4_FC_REASON_FALLOC_RANGE] = "Falloc range op", [EXT4_FC_REASON_INODE_JOURNAL_DATA] = "Data journalling", [EXT4_FC_REASON_ENCRYPTED_FILENAME] = "Encrypted filename", }; int ext4_fc_info_show(struct seq_file *seq, void *v) { struct ext4_sb_info *sbi = EXT4_SB((struct super_block *)seq->private); struct ext4_fc_stats *stats = &sbi->s_fc_stats; int i; if (v != SEQ_START_TOKEN) return 0; seq_printf(seq, "fc stats:\n%ld commits\n%ld ineligible\n%ld numblks\n%lluus avg_commit_time\n", stats->fc_num_commits, stats->fc_ineligible_commits, stats->fc_numblks, div_u64(stats->s_fc_avg_commit_time, 1000)); seq_puts(seq, "Ineligible reasons:\n"); for (i = 0; i < EXT4_FC_REASON_MAX; i++) seq_printf(seq, "\"%s\":\t%d\n", fc_ineligible_reasons[i], stats->fc_ineligible_reason_count[i]); return 0; } int __init ext4_fc_init_dentry_cache(void) { ext4_fc_dentry_cachep = KMEM_CACHE(ext4_fc_dentry_update, SLAB_RECLAIM_ACCOUNT); if (ext4_fc_dentry_cachep == NULL) return -ENOMEM; return 0; } void ext4_fc_destroy_dentry_cache(void) { kmem_cache_destroy(ext4_fc_dentry_cachep); } |
3169 3214 2875 2874 2 1 3 3 4 1 10 1 2879 2879 2868 12 2876 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or * 2) we hold the reference to the module. * The latter can be guaranteed by call of try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; struct filename *name; int err, index; name = getname(__name); err = PTR_ERR(name); if (IS_ERR(name)) return err; err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name->name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); putname(name); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) if (index <= 0 && try_module_get(tmp->owner)) break; read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type); |
2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 2 2 1 1 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 | // SPDX-License-Identifier: GPL-2.0-or-later /* * uvc_driver.c -- USB Video Class driver * * Copyright (C) 2005-2010 * Laurent Pinchart (laurent.pinchart@ideasonboard.com) */ #include <linux/atomic.h> #include <linux/bits.h> #include <linux/gpio/consumer.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/usb.h> #include <linux/usb/uvc.h> #include <linux/videodev2.h> #include <linux/vmalloc.h> #include <linux/wait.h> #include <asm/unaligned.h> #include <media/v4l2-common.h> #include <media/v4l2-ioctl.h> #include "uvcvideo.h" #define DRIVER_AUTHOR "Laurent Pinchart " \ "<laurent.pinchart@ideasonboard.com>" #define DRIVER_DESC "USB Video Class driver" unsigned int uvc_clock_param = CLOCK_MONOTONIC; unsigned int uvc_hw_timestamps_param; unsigned int uvc_no_drop_param; static unsigned int uvc_quirks_param = -1; unsigned int uvc_dbg_param; unsigned int uvc_timeout_param = UVC_CTRL_STREAMING_TIMEOUT; /* ------------------------------------------------------------------------ * Utility functions */ struct usb_host_endpoint *uvc_find_endpoint(struct usb_host_interface *alts, u8 epaddr) { struct usb_host_endpoint *ep; unsigned int i; for (i = 0; i < alts->desc.bNumEndpoints; ++i) { ep = &alts->endpoint[i]; if (ep->desc.bEndpointAddress == epaddr) return ep; } return NULL; } static enum v4l2_colorspace uvc_colorspace(const u8 primaries) { static const enum v4l2_colorspace colorprimaries[] = { V4L2_COLORSPACE_SRGB, /* Unspecified */ V4L2_COLORSPACE_SRGB, V4L2_COLORSPACE_470_SYSTEM_M, V4L2_COLORSPACE_470_SYSTEM_BG, V4L2_COLORSPACE_SMPTE170M, V4L2_COLORSPACE_SMPTE240M, }; if (primaries < ARRAY_SIZE(colorprimaries)) return colorprimaries[primaries]; return V4L2_COLORSPACE_SRGB; /* Reserved */ } static enum v4l2_xfer_func uvc_xfer_func(const u8 transfer_characteristics) { /* * V4L2 does not currently have definitions for all possible values of * UVC transfer characteristics. If v4l2_xfer_func is extended with new * values, the mapping below should be updated. * * Substitutions are taken from the mapping given for * V4L2_XFER_FUNC_DEFAULT documented in videodev2.h. */ static const enum v4l2_xfer_func xfer_funcs[] = { V4L2_XFER_FUNC_DEFAULT, /* Unspecified */ V4L2_XFER_FUNC_709, V4L2_XFER_FUNC_709, /* Substitution for BT.470-2 M */ V4L2_XFER_FUNC_709, /* Substitution for BT.470-2 B, G */ V4L2_XFER_FUNC_709, /* Substitution for SMPTE 170M */ V4L2_XFER_FUNC_SMPTE240M, V4L2_XFER_FUNC_NONE, V4L2_XFER_FUNC_SRGB, }; if (transfer_characteristics < ARRAY_SIZE(xfer_funcs)) return xfer_funcs[transfer_characteristics]; return V4L2_XFER_FUNC_DEFAULT; /* Reserved */ } static enum v4l2_ycbcr_encoding uvc_ycbcr_enc(const u8 matrix_coefficients) { /* * V4L2 does not currently have definitions for all possible values of * UVC matrix coefficients. If v4l2_ycbcr_encoding is extended with new * values, the mapping below should be updated. * * Substitutions are taken from the mapping given for * V4L2_YCBCR_ENC_DEFAULT documented in videodev2.h. * * FCC is assumed to be close enough to 601. */ static const enum v4l2_ycbcr_encoding ycbcr_encs[] = { V4L2_YCBCR_ENC_DEFAULT, /* Unspecified */ V4L2_YCBCR_ENC_709, V4L2_YCBCR_ENC_601, /* Substitution for FCC */ V4L2_YCBCR_ENC_601, /* Substitution for BT.470-2 B, G */ V4L2_YCBCR_ENC_601, V4L2_YCBCR_ENC_SMPTE240M, }; if (matrix_coefficients < ARRAY_SIZE(ycbcr_encs)) return ycbcr_encs[matrix_coefficients]; return V4L2_YCBCR_ENC_DEFAULT; /* Reserved */ } /* ------------------------------------------------------------------------ * Terminal and unit management */ struct uvc_entity *uvc_entity_by_id(struct uvc_device *dev, int id) { struct uvc_entity *entity; list_for_each_entry(entity, &dev->entities, list) { if (entity->id == id) return entity; } return NULL; } static struct uvc_entity *uvc_entity_by_reference(struct uvc_device *dev, int id, struct uvc_entity *entity) { unsigned int i; if (entity == NULL) entity = list_entry(&dev->entities, struct uvc_entity, list); list_for_each_entry_continue(entity, &dev->entities, list) { for (i = 0; i < entity->bNrInPins; ++i) if (entity->baSourceID[i] == id) return entity; } return NULL; } static struct uvc_streaming *uvc_stream_by_id(struct uvc_device *dev, int id) { struct uvc_streaming *stream; list_for_each_entry(stream, &dev->streams, list) { if (stream->header.bTerminalLink == id) return stream; } return NULL; } /* ------------------------------------------------------------------------ * Streaming Object Management */ static void uvc_stream_delete(struct uvc_streaming *stream) { if (stream->async_wq) destroy_workqueue(stream->async_wq); mutex_destroy(&stream->mutex); usb_put_intf(stream->intf); kfree(stream->formats); kfree(stream->header.bmaControls); kfree(stream); } static struct uvc_streaming *uvc_stream_new(struct uvc_device *dev, struct usb_interface *intf) { struct uvc_streaming *stream; stream = kzalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) return NULL; mutex_init(&stream->mutex); stream->dev = dev; stream->intf = usb_get_intf(intf); stream->intfnum = intf->cur_altsetting->desc.bInterfaceNumber; /* Allocate a stream specific work queue for asynchronous tasks. */ stream->async_wq = alloc_workqueue("uvcvideo", WQ_UNBOUND | WQ_HIGHPRI, 0); if (!stream->async_wq) { uvc_stream_delete(stream); return NULL; } return stream; } /* ------------------------------------------------------------------------ * Descriptors parsing */ static int uvc_parse_format(struct uvc_device *dev, struct uvc_streaming *streaming, struct uvc_format *format, struct uvc_frame *frames, u32 **intervals, const unsigned char *buffer, int buflen) { struct usb_interface *intf = streaming->intf; struct usb_host_interface *alts = intf->cur_altsetting; const struct uvc_format_desc *fmtdesc; struct uvc_frame *frame; const unsigned char *start = buffer; unsigned int width_multiplier = 1; unsigned int interval; unsigned int i, n; u8 ftype; format->type = buffer[2]; format->index = buffer[3]; format->frames = frames; switch (buffer[2]) { case UVC_VS_FORMAT_UNCOMPRESSED: case UVC_VS_FORMAT_FRAME_BASED: n = buffer[2] == UVC_VS_FORMAT_UNCOMPRESSED ? 27 : 28; if (buflen < n) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } /* Find the format descriptor from its GUID. */ fmtdesc = uvc_format_by_guid(&buffer[5]); if (!fmtdesc) { /* * Unknown video formats are not fatal errors, the * caller will skip this descriptor. */ dev_info(&streaming->intf->dev, "Unknown video format %pUl\n", &buffer[5]); return 0; } format->fcc = fmtdesc->fcc; format->bpp = buffer[21]; /* * Some devices report a format that doesn't match what they * really send. */ if (dev->quirks & UVC_QUIRK_FORCE_Y8) { if (format->fcc == V4L2_PIX_FMT_YUYV) { format->fcc = V4L2_PIX_FMT_GREY; format->bpp = 8; width_multiplier = 2; } } /* Some devices report bpp that doesn't match the format. */ if (dev->quirks & UVC_QUIRK_FORCE_BPP) { const struct v4l2_format_info *info = v4l2_format_info(format->fcc); if (info) { unsigned int div = info->hdiv * info->vdiv; n = info->bpp[0] * div; for (i = 1; i < info->comp_planes; i++) n += info->bpp[i]; format->bpp = DIV_ROUND_UP(8 * n, div); } } if (buffer[2] == UVC_VS_FORMAT_UNCOMPRESSED) { ftype = UVC_VS_FRAME_UNCOMPRESSED; } else { ftype = UVC_VS_FRAME_FRAME_BASED; if (buffer[27]) format->flags = UVC_FMT_FLAG_COMPRESSED; } break; case UVC_VS_FORMAT_MJPEG: if (buflen < 11) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } format->fcc = V4L2_PIX_FMT_MJPEG; format->flags = UVC_FMT_FLAG_COMPRESSED; format->bpp = 0; ftype = UVC_VS_FRAME_MJPEG; break; case UVC_VS_FORMAT_DV: if (buflen < 9) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } if ((buffer[8] & 0x7f) > 2) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d: unknown DV format %u\n", dev->udev->devnum, alts->desc.bInterfaceNumber, buffer[8]); return -EINVAL; } format->fcc = V4L2_PIX_FMT_DV; format->flags = UVC_FMT_FLAG_COMPRESSED | UVC_FMT_FLAG_STREAM; format->bpp = 0; ftype = 0; /* Create a dummy frame descriptor. */ frame = &frames[0]; memset(frame, 0, sizeof(*frame)); frame->bFrameIntervalType = 1; frame->dwDefaultFrameInterval = 1; frame->dwFrameInterval = *intervals; *(*intervals)++ = 1; format->nframes = 1; break; case UVC_VS_FORMAT_MPEG2TS: case UVC_VS_FORMAT_STREAM_BASED: /* Not supported yet. */ default: uvc_dbg(dev, DESCR, "device %d videostreaming interface %d unsupported format %u\n", dev->udev->devnum, alts->desc.bInterfaceNumber, buffer[2]); return -EINVAL; } uvc_dbg(dev, DESCR, "Found format %p4cc", &format->fcc); buflen -= buffer[0]; buffer += buffer[0]; /* * Parse the frame descriptors. Only uncompressed, MJPEG and frame * based formats have frame descriptors. */ while (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE && buffer[2] == ftype) { unsigned int maxIntervalIndex; frame = &frames[format->nframes]; if (ftype != UVC_VS_FRAME_FRAME_BASED) n = buflen > 25 ? buffer[25] : 0; else n = buflen > 21 ? buffer[21] : 0; n = n ? n : 3; if (buflen < 26 + 4*n) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FRAME error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } frame->bFrameIndex = buffer[3]; frame->bmCapabilities = buffer[4]; frame->wWidth = get_unaligned_le16(&buffer[5]) * width_multiplier; frame->wHeight = get_unaligned_le16(&buffer[7]); frame->dwMinBitRate = get_unaligned_le32(&buffer[9]); frame->dwMaxBitRate = get_unaligned_le32(&buffer[13]); if (ftype != UVC_VS_FRAME_FRAME_BASED) { frame->dwMaxVideoFrameBufferSize = get_unaligned_le32(&buffer[17]); frame->dwDefaultFrameInterval = get_unaligned_le32(&buffer[21]); frame->bFrameIntervalType = buffer[25]; } else { frame->dwMaxVideoFrameBufferSize = 0; frame->dwDefaultFrameInterval = get_unaligned_le32(&buffer[17]); frame->bFrameIntervalType = buffer[21]; } /* * Copy the frame intervals. * * Some bogus devices report dwMinFrameInterval equal to * dwMaxFrameInterval and have dwFrameIntervalStep set to * zero. Setting all null intervals to 1 fixes the problem and * some other divisions by zero that could happen. */ frame->dwFrameInterval = *intervals; for (i = 0; i < n; ++i) { interval = get_unaligned_le32(&buffer[26+4*i]); (*intervals)[i] = interval ? interval : 1; } /* * Apply more fixes, quirks and workarounds to handle incorrect * or broken descriptors. */ /* * Several UVC chipsets screw up dwMaxVideoFrameBufferSize * completely. Observed behaviours range from setting the * value to 1.1x the actual frame size to hardwiring the * 16 low bits to 0. This results in a higher than necessary * memory usage as well as a wrong image size information. For * uncompressed formats this can be fixed by computing the * value from the frame size. */ if (!(format->flags & UVC_FMT_FLAG_COMPRESSED)) frame->dwMaxVideoFrameBufferSize = format->bpp * frame->wWidth * frame->wHeight / 8; /* * Clamp the default frame interval to the boundaries. A zero * bFrameIntervalType value indicates a continuous frame * interval range, with dwFrameInterval[0] storing the minimum * value and dwFrameInterval[1] storing the maximum value. */ maxIntervalIndex = frame->bFrameIntervalType ? n - 1 : 1; frame->dwDefaultFrameInterval = clamp(frame->dwDefaultFrameInterval, frame->dwFrameInterval[0], frame->dwFrameInterval[maxIntervalIndex]); /* * Some devices report frame intervals that are not functional. * If the corresponding quirk is set, restrict operation to the * first interval only. */ if (dev->quirks & UVC_QUIRK_RESTRICT_FRAME_RATE) { frame->bFrameIntervalType = 1; (*intervals)[0] = frame->dwDefaultFrameInterval; } uvc_dbg(dev, DESCR, "- %ux%u (%u.%u fps)\n", frame->wWidth, frame->wHeight, 10000000 / frame->dwDefaultFrameInterval, (100000000 / frame->dwDefaultFrameInterval) % 10); format->nframes++; *intervals += n; buflen -= buffer[0]; buffer += buffer[0]; } if (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE && buffer[2] == UVC_VS_STILL_IMAGE_FRAME) { buflen -= buffer[0]; buffer += buffer[0]; } if (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE && buffer[2] == UVC_VS_COLORFORMAT) { if (buflen < 6) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d COLORFORMAT error\n", dev->udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } format->colorspace = uvc_colorspace(buffer[3]); format->xfer_func = uvc_xfer_func(buffer[4]); format->ycbcr_enc = uvc_ycbcr_enc(buffer[5]); buflen -= buffer[0]; buffer += buffer[0]; } else { format->colorspace = V4L2_COLORSPACE_SRGB; } return buffer - start; } static int uvc_parse_streaming(struct uvc_device *dev, struct usb_interface *intf) { struct uvc_streaming *streaming = NULL; struct uvc_format *format; struct uvc_frame *frame; struct usb_host_interface *alts = &intf->altsetting[0]; const unsigned char *_buffer, *buffer = alts->extra; int _buflen, buflen = alts->extralen; unsigned int nformats = 0, nframes = 0, nintervals = 0; unsigned int size, i, n, p; u32 *interval; u16 psize; int ret = -EINVAL; if (intf->cur_altsetting->desc.bInterfaceSubClass != UVC_SC_VIDEOSTREAMING) { uvc_dbg(dev, DESCR, "device %d interface %d isn't a video streaming interface\n", dev->udev->devnum, intf->altsetting[0].desc.bInterfaceNumber); return -EINVAL; } if (usb_driver_claim_interface(&uvc_driver.driver, intf, dev)) { uvc_dbg(dev, DESCR, "device %d interface %d is already claimed\n", dev->udev->devnum, intf->altsetting[0].desc.bInterfaceNumber); return -EINVAL; } streaming = uvc_stream_new(dev, intf); if (streaming == NULL) { usb_driver_release_interface(&uvc_driver.driver, intf); return -ENOMEM; } /* * The Pico iMage webcam has its class-specific interface descriptors * after the endpoint descriptors. */ if (buflen == 0) { for (i = 0; i < alts->desc.bNumEndpoints; ++i) { struct usb_host_endpoint *ep = &alts->endpoint[i]; if (ep->extralen == 0) continue; if (ep->extralen > 2 && ep->extra[1] == USB_DT_CS_INTERFACE) { uvc_dbg(dev, DESCR, "trying extra data from endpoint %u\n", i); buffer = alts->endpoint[i].extra; buflen = alts->endpoint[i].extralen; break; } } } /* Skip the standard interface descriptors. */ while (buflen > 2 && buffer[1] != USB_DT_CS_INTERFACE) { buflen -= buffer[0]; buffer += buffer[0]; } if (buflen <= 2) { uvc_dbg(dev, DESCR, "no class-specific streaming interface descriptors found\n"); goto error; } /* Parse the header descriptor. */ switch (buffer[2]) { case UVC_VS_OUTPUT_HEADER: streaming->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; size = 9; break; case UVC_VS_INPUT_HEADER: streaming->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; size = 13; break; default: uvc_dbg(dev, DESCR, "device %d videostreaming interface %d HEADER descriptor not found\n", dev->udev->devnum, alts->desc.bInterfaceNumber); goto error; } p = buflen >= 4 ? buffer[3] : 0; n = buflen >= size ? buffer[size-1] : 0; if (buflen < size + p*n) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d HEADER descriptor is invalid\n", dev->udev->devnum, alts->desc.bInterfaceNumber); goto error; } streaming->header.bNumFormats = p; streaming->header.bEndpointAddress = buffer[6]; if (buffer[2] == UVC_VS_INPUT_HEADER) { streaming->header.bmInfo = buffer[7]; streaming->header.bTerminalLink = buffer[8]; streaming->header.bStillCaptureMethod = buffer[9]; streaming->header.bTriggerSupport = buffer[10]; streaming->header.bTriggerUsage = buffer[11]; } else { streaming->header.bTerminalLink = buffer[7]; } streaming->header.bControlSize = n; streaming->header.bmaControls = kmemdup(&buffer[size], p * n, GFP_KERNEL); if (streaming->header.bmaControls == NULL) { ret = -ENOMEM; goto error; } buflen -= buffer[0]; buffer += buffer[0]; _buffer = buffer; _buflen = buflen; /* Count the format and frame descriptors. */ while (_buflen > 2 && _buffer[1] == USB_DT_CS_INTERFACE) { switch (_buffer[2]) { case UVC_VS_FORMAT_UNCOMPRESSED: case UVC_VS_FORMAT_MJPEG: case UVC_VS_FORMAT_FRAME_BASED: nformats++; break; case UVC_VS_FORMAT_DV: /* * DV format has no frame descriptor. We will create a * dummy frame descriptor with a dummy frame interval. */ nformats++; nframes++; nintervals++; break; case UVC_VS_FORMAT_MPEG2TS: case UVC_VS_FORMAT_STREAM_BASED: uvc_dbg(dev, DESCR, "device %d videostreaming interface %d FORMAT %u is not supported\n", dev->udev->devnum, alts->desc.bInterfaceNumber, _buffer[2]); break; case UVC_VS_FRAME_UNCOMPRESSED: case UVC_VS_FRAME_MJPEG: nframes++; if (_buflen > 25) nintervals += _buffer[25] ? _buffer[25] : 3; break; case UVC_VS_FRAME_FRAME_BASED: nframes++; if (_buflen > 21) nintervals += _buffer[21] ? _buffer[21] : 3; break; } _buflen -= _buffer[0]; _buffer += _buffer[0]; } if (nformats == 0) { uvc_dbg(dev, DESCR, "device %d videostreaming interface %d has no supported formats defined\n", dev->udev->devnum, alts->desc.bInterfaceNumber); goto error; } size = nformats * sizeof(*format) + nframes * sizeof(*frame) + nintervals * sizeof(*interval); format = kzalloc(size, GFP_KERNEL); if (format == NULL) { ret = -ENOMEM; goto error; } frame = (struct uvc_frame *)&format[nformats]; interval = (u32 *)&frame[nframes]; streaming->formats = format; streaming->nformats = 0; /* Parse the format descriptors. */ while (buflen > 2 && buffer[1] == USB_DT_CS_INTERFACE) { switch (buffer[2]) { case UVC_VS_FORMAT_UNCOMPRESSED: case UVC_VS_FORMAT_MJPEG: case UVC_VS_FORMAT_DV: case UVC_VS_FORMAT_FRAME_BASED: ret = uvc_parse_format(dev, streaming, format, frame, &interval, buffer, buflen); if (ret < 0) goto error; if (!ret) break; streaming->nformats++; frame += format->nframes; format++; buflen -= ret; buffer += ret; continue; default: break; } buflen -= buffer[0]; buffer += buffer[0]; } if (buflen) uvc_dbg(dev, DESCR, "device %d videostreaming interface %d has %u bytes of trailing descriptor garbage\n", dev->udev->devnum, alts->desc.bInterfaceNumber, buflen); /* Parse the alternate settings to find the maximum bandwidth. */ for (i = 0; i < intf->num_altsetting; ++i) { struct usb_host_endpoint *ep; alts = &intf->altsetting[i]; ep = uvc_find_endpoint(alts, streaming->header.bEndpointAddress); if (ep == NULL) continue; psize = uvc_endpoint_max_bpi(dev->udev, ep); if (psize > streaming->maxpsize) streaming->maxpsize = psize; } list_add_tail(&streaming->list, &dev->streams); return 0; error: usb_driver_release_interface(&uvc_driver.driver, intf); uvc_stream_delete(streaming); return ret; } static const u8 uvc_camera_guid[16] = UVC_GUID_UVC_CAMERA; static const u8 uvc_gpio_guid[16] = UVC_GUID_EXT_GPIO_CONTROLLER; static const u8 uvc_media_transport_input_guid[16] = UVC_GUID_UVC_MEDIA_TRANSPORT_INPUT; static const u8 uvc_processing_guid[16] = UVC_GUID_UVC_PROCESSING; static struct uvc_entity *uvc_alloc_entity(u16 type, u16 id, unsigned int num_pads, unsigned int extra_size) { struct uvc_entity *entity; unsigned int num_inputs; unsigned int size; unsigned int i; extra_size = roundup(extra_size, sizeof(*entity->pads)); if (num_pads) num_inputs = type & UVC_TERM_OUTPUT ? num_pads : num_pads - 1; else num_inputs = 0; size = sizeof(*entity) + extra_size + sizeof(*entity->pads) * num_pads + num_inputs; entity = kzalloc(size, GFP_KERNEL); if (entity == NULL) return NULL; entity->id = id; entity->type = type; /* * Set the GUID for standard entity types. For extension units, the GUID * is initialized by the caller. */ switch (type) { case UVC_EXT_GPIO_UNIT: memcpy(entity->guid, uvc_gpio_guid, 16); break; case UVC_ITT_CAMERA: memcpy(entity->guid, uvc_camera_guid, 16); break; case UVC_ITT_MEDIA_TRANSPORT_INPUT: memcpy(entity->guid, uvc_media_transport_input_guid, 16); break; case UVC_VC_PROCESSING_UNIT: memcpy(entity->guid, uvc_processing_guid, 16); break; } entity->num_links = 0; entity->num_pads = num_pads; entity->pads = ((void *)(entity + 1)) + extra_size; for (i = 0; i < num_inputs; ++i) entity->pads[i].flags = MEDIA_PAD_FL_SINK; if (!UVC_ENTITY_IS_OTERM(entity) && num_pads) entity->pads[num_pads-1].flags = MEDIA_PAD_FL_SOURCE; entity->bNrInPins = num_inputs; entity->baSourceID = (u8 *)(&entity->pads[num_pads]); return entity; } static void uvc_entity_set_name(struct uvc_device *dev, struct uvc_entity *entity, const char *type_name, u8 string_id) { int ret; /* * First attempt to read the entity name from the device. If the entity * has no associated string, or if reading the string fails (most * likely due to a buggy firmware), fall back to default names based on * the entity type. */ if (string_id) { ret = usb_string(dev->udev, string_id, entity->name, sizeof(entity->name)); if (!ret) return; } sprintf(entity->name, "%s %u", type_name, entity->id); } /* Parse vendor-specific extensions. */ static int uvc_parse_vendor_control(struct uvc_device *dev, const unsigned char *buffer, int buflen) { struct usb_device *udev = dev->udev; struct usb_host_interface *alts = dev->intf->cur_altsetting; struct uvc_entity *unit; unsigned int n, p; int handled = 0; switch (le16_to_cpu(dev->udev->descriptor.idVendor)) { case 0x046d: /* Logitech */ if (buffer[1] != 0x41 || buffer[2] != 0x01) break; /* * Logitech implements several vendor specific functions * through vendor specific extension units (LXU). * * The LXU descriptors are similar to XU descriptors * (see "USB Device Video Class for Video Devices", section * 3.7.2.6 "Extension Unit Descriptor") with the following * differences: * * ---------------------------------------------------------- * 0 bLength 1 Number * Size of this descriptor, in bytes: 24+p+n*2 * ---------------------------------------------------------- * 23+p+n bmControlsType N Bitmap * Individual bits in the set are defined: * 0: Absolute * 1: Relative * * This bitset is mapped exactly the same as bmControls. * ---------------------------------------------------------- * 23+p+n*2 bReserved 1 Boolean * ---------------------------------------------------------- * 24+p+n*2 iExtension 1 Index * Index of a string descriptor that describes this * extension unit. * ---------------------------------------------------------- */ p = buflen >= 22 ? buffer[21] : 0; n = buflen >= 25 + p ? buffer[22+p] : 0; if (buflen < 25 + p + 2*n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d EXTENSION_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); break; } unit = uvc_alloc_entity(UVC_VC_EXTENSION_UNIT, buffer[3], p + 1, 2*n); if (unit == NULL) return -ENOMEM; memcpy(unit->guid, &buffer[4], 16); unit->extension.bNumControls = buffer[20]; memcpy(unit->baSourceID, &buffer[22], p); unit->extension.bControlSize = buffer[22+p]; unit->extension.bmControls = (u8 *)unit + sizeof(*unit); unit->extension.bmControlsType = (u8 *)unit + sizeof(*unit) + n; memcpy(unit->extension.bmControls, &buffer[23+p], 2*n); uvc_entity_set_name(dev, unit, "Extension", buffer[24+p+2*n]); list_add_tail(&unit->list, &dev->entities); handled = 1; break; } return handled; } static int uvc_parse_standard_control(struct uvc_device *dev, const unsigned char *buffer, int buflen) { struct usb_device *udev = dev->udev; struct uvc_entity *unit, *term; struct usb_interface *intf; struct usb_host_interface *alts = dev->intf->cur_altsetting; unsigned int i, n, p, len; const char *type_name; u16 type; switch (buffer[2]) { case UVC_VC_HEADER: n = buflen >= 12 ? buffer[11] : 0; if (buflen < 12 + n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d HEADER error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } dev->uvc_version = get_unaligned_le16(&buffer[3]); dev->clock_frequency = get_unaligned_le32(&buffer[7]); /* Parse all USB Video Streaming interfaces. */ for (i = 0; i < n; ++i) { intf = usb_ifnum_to_if(udev, buffer[12+i]); if (intf == NULL) { uvc_dbg(dev, DESCR, "device %d interface %d doesn't exists\n", udev->devnum, i); continue; } uvc_parse_streaming(dev, intf); } break; case UVC_VC_INPUT_TERMINAL: if (buflen < 8) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d INPUT_TERMINAL error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } /* * Reject invalid terminal types that would cause issues: * * - The high byte must be non-zero, otherwise it would be * confused with a unit. * * - Bit 15 must be 0, as we use it internally as a terminal * direction flag. * * Other unknown types are accepted. */ type = get_unaligned_le16(&buffer[4]); if ((type & 0x7f00) == 0 || (type & 0x8000) != 0) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d INPUT_TERMINAL %d has invalid type 0x%04x, skipping\n", udev->devnum, alts->desc.bInterfaceNumber, buffer[3], type); return 0; } n = 0; p = 0; len = 8; if (type == UVC_ITT_CAMERA) { n = buflen >= 15 ? buffer[14] : 0; len = 15; } else if (type == UVC_ITT_MEDIA_TRANSPORT_INPUT) { n = buflen >= 9 ? buffer[8] : 0; p = buflen >= 10 + n ? buffer[9+n] : 0; len = 10; } if (buflen < len + n + p) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d INPUT_TERMINAL error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } term = uvc_alloc_entity(type | UVC_TERM_INPUT, buffer[3], 1, n + p); if (term == NULL) return -ENOMEM; if (UVC_ENTITY_TYPE(term) == UVC_ITT_CAMERA) { term->camera.bControlSize = n; term->camera.bmControls = (u8 *)term + sizeof(*term); term->camera.wObjectiveFocalLengthMin = get_unaligned_le16(&buffer[8]); term->camera.wObjectiveFocalLengthMax = get_unaligned_le16(&buffer[10]); term->camera.wOcularFocalLength = get_unaligned_le16(&buffer[12]); memcpy(term->camera.bmControls, &buffer[15], n); } else if (UVC_ENTITY_TYPE(term) == UVC_ITT_MEDIA_TRANSPORT_INPUT) { term->media.bControlSize = n; term->media.bmControls = (u8 *)term + sizeof(*term); term->media.bTransportModeSize = p; term->media.bmTransportModes = (u8 *)term + sizeof(*term) + n; memcpy(term->media.bmControls, &buffer[9], n); memcpy(term->media.bmTransportModes, &buffer[10+n], p); } if (UVC_ENTITY_TYPE(term) == UVC_ITT_CAMERA) type_name = "Camera"; else if (UVC_ENTITY_TYPE(term) == UVC_ITT_MEDIA_TRANSPORT_INPUT) type_name = "Media"; else type_name = "Input"; uvc_entity_set_name(dev, term, type_name, buffer[7]); list_add_tail(&term->list, &dev->entities); break; case UVC_VC_OUTPUT_TERMINAL: if (buflen < 9) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d OUTPUT_TERMINAL error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } /* * Make sure the terminal type MSB is not null, otherwise it * could be confused with a unit. */ type = get_unaligned_le16(&buffer[4]); if ((type & 0xff00) == 0) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d OUTPUT_TERMINAL %d has invalid type 0x%04x, skipping\n", udev->devnum, alts->desc.bInterfaceNumber, buffer[3], type); return 0; } term = uvc_alloc_entity(type | UVC_TERM_OUTPUT, buffer[3], 1, 0); if (term == NULL) return -ENOMEM; memcpy(term->baSourceID, &buffer[7], 1); uvc_entity_set_name(dev, term, "Output", buffer[8]); list_add_tail(&term->list, &dev->entities); break; case UVC_VC_SELECTOR_UNIT: p = buflen >= 5 ? buffer[4] : 0; if (buflen < 5 || buflen < 6 + p) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d SELECTOR_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } unit = uvc_alloc_entity(buffer[2], buffer[3], p + 1, 0); if (unit == NULL) return -ENOMEM; memcpy(unit->baSourceID, &buffer[5], p); uvc_entity_set_name(dev, unit, "Selector", buffer[5+p]); list_add_tail(&unit->list, &dev->entities); break; case UVC_VC_PROCESSING_UNIT: n = buflen >= 8 ? buffer[7] : 0; p = dev->uvc_version >= 0x0110 ? 10 : 9; if (buflen < p + n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d PROCESSING_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } unit = uvc_alloc_entity(buffer[2], buffer[3], 2, n); if (unit == NULL) return -ENOMEM; memcpy(unit->baSourceID, &buffer[4], 1); unit->processing.wMaxMultiplier = get_unaligned_le16(&buffer[5]); unit->processing.bControlSize = buffer[7]; unit->processing.bmControls = (u8 *)unit + sizeof(*unit); memcpy(unit->processing.bmControls, &buffer[8], n); if (dev->uvc_version >= 0x0110) unit->processing.bmVideoStandards = buffer[9+n]; uvc_entity_set_name(dev, unit, "Processing", buffer[8+n]); list_add_tail(&unit->list, &dev->entities); break; case UVC_VC_EXTENSION_UNIT: p = buflen >= 22 ? buffer[21] : 0; n = buflen >= 24 + p ? buffer[22+p] : 0; if (buflen < 24 + p + n) { uvc_dbg(dev, DESCR, "device %d videocontrol interface %d EXTENSION_UNIT error\n", udev->devnum, alts->desc.bInterfaceNumber); return -EINVAL; } unit = uvc_alloc_entity(buffer[2], buffer[3], p + 1, n); if (unit == NULL) return -ENOMEM; memcpy(unit->guid, &buffer[4], 16); unit->extension.bNumControls = buffer[20]; memcpy(unit->baSourceID, &buffer[22], p); unit->extension.bControlSize = buffer[22+p]; unit->extension.bmControls = (u8 *)unit + sizeof(*unit); memcpy(unit->extension.bmControls, &buffer[23+p], n); uvc_entity_set_name(dev, unit, "Extension", buffer[23+p+n]); list_add_tail(&unit->list, &dev->entities); break; default: uvc_dbg(dev, DESCR, "Found an unknown CS_INTERFACE descriptor (%u)\n", buffer[2]); break; } return 0; } static int uvc_parse_control(struct uvc_device *dev) { struct usb_host_interface *alts = dev->intf->cur_altsetting; const unsigned char *buffer = alts->extra; int buflen = alts->extralen; int ret; /* * Parse the default alternate setting only, as the UVC specification * defines a single alternate setting, the default alternate setting * zero. */ while (buflen > 2) { if (uvc_parse_vendor_control(dev, buffer, buflen) || buffer[1] != USB_DT_CS_INTERFACE) goto next_descriptor; ret = uvc_parse_standard_control(dev, buffer, buflen); if (ret < 0) return ret; next_descriptor: buflen -= buffer[0]; buffer += buffer[0]; } /* * Check if the optional status endpoint is present. Built-in iSight * webcams have an interrupt endpoint but spit proprietary data that * don't conform to the UVC status endpoint messages. Don't try to * handle the interrupt endpoint for those cameras. */ if (alts->desc.bNumEndpoints == 1 && !(dev->quirks & UVC_QUIRK_BUILTIN_ISIGHT)) { struct usb_host_endpoint *ep = &alts->endpoint[0]; struct usb_endpoint_descriptor *desc = &ep->desc; if (usb_endpoint_is_int_in(desc) && le16_to_cpu(desc->wMaxPacketSize) >= 8 && desc->bInterval != 0) { uvc_dbg(dev, DESCR, "Found a Status endpoint (addr %02x)\n", desc->bEndpointAddress); dev->int_ep = ep; } } return 0; } /* ----------------------------------------------------------------------------- * Privacy GPIO */ static void uvc_gpio_event(struct uvc_device *dev) { struct uvc_entity *unit = dev->gpio_unit; struct uvc_video_chain *chain; u8 new_val; if (!unit) return; new_val = gpiod_get_value_cansleep(unit->gpio.gpio_privacy); /* GPIO entities are always on the first chain. */ chain = list_first_entry(&dev->chains, struct uvc_video_chain, list); uvc_ctrl_status_event(chain, unit->controls, &new_val); } static int uvc_gpio_get_cur(struct uvc_device *dev, struct uvc_entity *entity, u8 cs, void *data, u16 size) { if (cs != UVC_CT_PRIVACY_CONTROL || size < 1) return -EINVAL; *(u8 *)data = gpiod_get_value_cansleep(entity->gpio.gpio_privacy); return 0; } static int uvc_gpio_get_info(struct uvc_device *dev, struct uvc_entity *entity, u8 cs, u8 *caps) { if (cs != UVC_CT_PRIVACY_CONTROL) return -EINVAL; *caps = UVC_CONTROL_CAP_GET | UVC_CONTROL_CAP_AUTOUPDATE; return 0; } static irqreturn_t uvc_gpio_irq(int irq, void *data) { struct uvc_device *dev = data; uvc_gpio_event(dev); return IRQ_HANDLED; } static int uvc_gpio_parse(struct uvc_device *dev) { struct uvc_entity *unit; struct gpio_desc *gpio_privacy; int irq; gpio_privacy = devm_gpiod_get_optional(&dev->udev->dev, "privacy", GPIOD_IN); if (IS_ERR_OR_NULL(gpio_privacy)) return PTR_ERR_OR_ZERO(gpio_privacy); irq = gpiod_to_irq(gpio_privacy); if (irq < 0) return dev_err_probe(&dev->udev->dev, irq, "No IRQ for privacy GPIO\n"); unit = uvc_alloc_entity(UVC_EXT_GPIO_UNIT, UVC_EXT_GPIO_UNIT_ID, 0, 1); if (!unit) return -ENOMEM; unit->gpio.gpio_privacy = gpio_privacy; unit->gpio.irq = irq; unit->gpio.bControlSize = 1; unit->gpio.bmControls = (u8 *)unit + sizeof(*unit); unit->gpio.bmControls[0] = 1; unit->get_cur = uvc_gpio_get_cur; unit->get_info = uvc_gpio_get_info; strscpy(unit->name, "GPIO", sizeof(unit->name)); list_add_tail(&unit->list, &dev->entities); dev->gpio_unit = unit; return 0; } static int uvc_gpio_init_irq(struct uvc_device *dev) { struct uvc_entity *unit = dev->gpio_unit; if (!unit || unit->gpio.irq < 0) return 0; return devm_request_threaded_irq(&dev->udev->dev, unit->gpio.irq, NULL, uvc_gpio_irq, IRQF_ONESHOT | IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING, "uvc_privacy_gpio", dev); } /* ------------------------------------------------------------------------ * UVC device scan */ /* * Scan the UVC descriptors to locate a chain starting at an Output Terminal * and containing the following units: * * - one or more Output Terminals (USB Streaming or Display) * - zero or one Processing Unit * - zero, one or more single-input Selector Units * - zero or one multiple-input Selector Units, provided all inputs are * connected to input terminals * - zero, one or mode single-input Extension Units * - one or more Input Terminals (Camera, External or USB Streaming) * * The terminal and units must match on of the following structures: * * ITT_*(0) -> +---------+ +---------+ +---------+ -> TT_STREAMING(0) * ... | SU{0,1} | -> | PU{0,1} | -> | XU{0,n} | ... * ITT_*(n) -> +---------+ +---------+ +---------+ -> TT_STREAMING(n) * * +---------+ +---------+ -> OTT_*(0) * TT_STREAMING -> | PU{0,1} | -> | XU{0,n} | ... * +---------+ +---------+ -> OTT_*(n) * * The Processing Unit and Extension Units can be in any order. Additional * Extension Units connected to the main chain as single-unit branches are * also supported. Single-input Selector Units are ignored. */ static int uvc_scan_chain_entity(struct uvc_video_chain *chain, struct uvc_entity *entity) { switch (UVC_ENTITY_TYPE(entity)) { case UVC_VC_EXTENSION_UNIT: uvc_dbg_cont(PROBE, " <- XU %d", entity->id); if (entity->bNrInPins != 1) { uvc_dbg(chain->dev, DESCR, "Extension unit %d has more than 1 input pin\n", entity->id); return -1; } break; case UVC_VC_PROCESSING_UNIT: uvc_dbg_cont(PROBE, " <- PU %d", entity->id); if (chain->processing != NULL) { uvc_dbg(chain->dev, DESCR, "Found multiple Processing Units in chain\n"); return -1; } chain->processing = entity; break; case UVC_VC_SELECTOR_UNIT: uvc_dbg_cont(PROBE, " <- SU %d", entity->id); /* Single-input selector units are ignored. */ if (entity->bNrInPins == 1) break; if (chain->selector != NULL) { uvc_dbg(chain->dev, DESCR, "Found multiple Selector Units in chain\n"); return -1; } chain->selector = entity; break; case UVC_ITT_VENDOR_SPECIFIC: case UVC_ITT_CAMERA: case UVC_ITT_MEDIA_TRANSPORT_INPUT: uvc_dbg_cont(PROBE, " <- IT %d\n", entity->id); break; case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: uvc_dbg_cont(PROBE, " OT %d", entity->id); break; case UVC_TT_STREAMING: if (UVC_ENTITY_IS_ITERM(entity)) uvc_dbg_cont(PROBE, " <- IT %d\n", entity->id); else uvc_dbg_cont(PROBE, " OT %d", entity->id); break; default: uvc_dbg(chain->dev, DESCR, "Unsupported entity type 0x%04x found in chain\n", UVC_ENTITY_TYPE(entity)); return -1; } list_add_tail(&entity->chain, &chain->entities); return 0; } static int uvc_scan_chain_forward(struct uvc_video_chain *chain, struct uvc_entity *entity, struct uvc_entity *prev) { struct uvc_entity *forward; int found; /* Forward scan */ forward = NULL; found = 0; while (1) { forward = uvc_entity_by_reference(chain->dev, entity->id, forward); if (forward == NULL) break; if (forward == prev) continue; if (forward->chain.next || forward->chain.prev) { uvc_dbg(chain->dev, DESCR, "Found reference to entity %d already in chain\n", forward->id); return -EINVAL; } switch (UVC_ENTITY_TYPE(forward)) { case UVC_VC_EXTENSION_UNIT: if (forward->bNrInPins != 1) { uvc_dbg(chain->dev, DESCR, "Extension unit %d has more than 1 input pin\n", forward->id); return -EINVAL; } /* * Some devices reference an output terminal as the * source of extension units. This is incorrect, as * output terminals only have an input pin, and thus * can't be connected to any entity in the forward * direction. The resulting topology would cause issues * when registering the media controller graph. To * avoid this problem, connect the extension unit to * the source of the output terminal instead. */ if (UVC_ENTITY_IS_OTERM(entity)) { struct uvc_entity *source; source = uvc_entity_by_id(chain->dev, entity->baSourceID[0]); if (!source) { uvc_dbg(chain->dev, DESCR, "Can't connect extension unit %u in chain\n", forward->id); break; } forward->baSourceID[0] = source->id; } list_add_tail(&forward->chain, &chain->entities); if (!found) uvc_dbg_cont(PROBE, " (->"); uvc_dbg_cont(PROBE, " XU %d", forward->id); found = 1; break; case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: case UVC_TT_STREAMING: if (UVC_ENTITY_IS_ITERM(forward)) { uvc_dbg(chain->dev, DESCR, "Unsupported input terminal %u\n", forward->id); return -EINVAL; } if (UVC_ENTITY_IS_OTERM(entity)) { uvc_dbg(chain->dev, DESCR, "Unsupported connection between output terminals %u and %u\n", entity->id, forward->id); break; } list_add_tail(&forward->chain, &chain->entities); if (!found) uvc_dbg_cont(PROBE, " (->"); uvc_dbg_cont(PROBE, " OT %d", forward->id); found = 1; break; } } if (found) uvc_dbg_cont(PROBE, ")"); return 0; } static int uvc_scan_chain_backward(struct uvc_video_chain *chain, struct uvc_entity **_entity) { struct uvc_entity *entity = *_entity; struct uvc_entity *term; int id = -EINVAL, i; switch (UVC_ENTITY_TYPE(entity)) { case UVC_VC_EXTENSION_UNIT: case UVC_VC_PROCESSING_UNIT: id = entity->baSourceID[0]; break; case UVC_VC_SELECTOR_UNIT: /* Single-input selector units are ignored. */ if (entity->bNrInPins == 1) { id = entity->baSourceID[0]; break; } uvc_dbg_cont(PROBE, " <- IT"); chain->selector = entity; for (i = 0; i < entity->bNrInPins; ++i) { id = entity->baSourceID[i]; term = uvc_entity_by_id(chain->dev, id); if (term == NULL || !UVC_ENTITY_IS_ITERM(term)) { uvc_dbg(chain->dev, DESCR, "Selector unit %d input %d isn't connected to an input terminal\n", entity->id, i); return -1; } if (term->chain.next || term->chain.prev) { uvc_dbg(chain->dev, DESCR, "Found reference to entity %d already in chain\n", term->id); return -EINVAL; } uvc_dbg_cont(PROBE, " %d", term->id); list_add_tail(&term->chain, &chain->entities); uvc_scan_chain_forward(chain, term, entity); } uvc_dbg_cont(PROBE, "\n"); id = 0; break; case UVC_ITT_VENDOR_SPECIFIC: case UVC_ITT_CAMERA: case UVC_ITT_MEDIA_TRANSPORT_INPUT: case UVC_OTT_VENDOR_SPECIFIC: case UVC_OTT_DISPLAY: case UVC_OTT_MEDIA_TRANSPORT_OUTPUT: case UVC_TT_STREAMING: id = UVC_ENTITY_IS_OTERM(entity) ? entity->baSourceID[0] : 0; break; } if (id <= 0) { *_entity = NULL; return id; } entity = uvc_entity_by_id(chain->dev, id); if (entity == NULL) { uvc_dbg(chain->dev, DESCR, "Found reference to unknown entity %d\n", id); return -EINVAL; } *_entity = entity; return 0; } static int uvc_scan_chain(struct uvc_video_chain *chain, struct uvc_entity *term) { struct uvc_entity *entity, *prev; uvc_dbg(chain->dev, PROBE, "Scanning UVC chain:"); entity = term; prev = NULL; while (entity != NULL) { /* Entity must not be part of an existing chain */ if (entity->chain.next || entity->chain.prev) { uvc_dbg(chain->dev, DESCR, "Found reference to entity %d already in chain\n", entity->id); return -EINVAL; } /* Process entity */ if (uvc_scan_chain_entity(chain, entity) < 0) return -EINVAL; /* Forward scan */ if (uvc_scan_chain_forward(chain, entity, prev) < 0) return -EINVAL; /* Backward scan */ prev = entity; if (uvc_scan_chain_backward(chain, &entity) < 0) return -EINVAL; } return 0; } static unsigned int uvc_print_terms(struct list_head *terms, u16 dir, char *buffer) { struct uvc_entity *term; unsigned int nterms = 0; char *p = buffer; list_for_each_entry(term, terms, chain) { if (!UVC_ENTITY_IS_TERM(term) || UVC_TERM_DIRECTION(term) != dir) continue; if (nterms) p += sprintf(p, ","); if (++nterms >= 4) { p += sprintf(p, "..."); break; } p += sprintf(p, "%u", term->id); } return p - buffer; } static const char *uvc_print_chain(struct uvc_video_chain *chain) { static char buffer[43]; char *p = buffer; p += uvc_print_terms(&chain->entities, UVC_TERM_INPUT, p); p += sprintf(p, " -> "); uvc_print_terms(&chain->entities, UVC_TERM_OUTPUT, p); return buffer; } static struct uvc_video_chain *uvc_alloc_chain(struct uvc_device *dev) { struct uvc_video_chain *chain; chain = kzalloc(sizeof(*chain), GFP_KERNEL); if (chain == NULL) return NULL; INIT_LIST_HEAD(&chain->entities); mutex_init(&chain->ctrl_mutex); chain->dev = dev; v4l2_prio_init(&chain->prio); return chain; } /* * Fallback heuristic for devices that don't connect units and terminals in a * valid chain. * * Some devices have invalid baSourceID references, causing uvc_scan_chain() * to fail, but if we just take the entities we can find and put them together * in the most sensible chain we can think of, turns out they do work anyway. * Note: This heuristic assumes there is a single chain. * * At the time of writing, devices known to have such a broken chain are * - Acer Integrated Camera (5986:055a) * - Realtek rtl157a7 (0bda:57a7) */ static int uvc_scan_fallback(struct uvc_device *dev) { struct uvc_video_chain *chain; struct uvc_entity *iterm = NULL; struct uvc_entity *oterm = NULL; struct uvc_entity *entity; struct uvc_entity *prev; /* * Start by locating the input and output terminals. We only support * devices with exactly one of each for now. */ list_for_each_entry(entity, &dev->entities, list) { if (UVC_ENTITY_IS_ITERM(entity)) { if (iterm) return -EINVAL; iterm = entity; } if (UVC_ENTITY_IS_OTERM(entity)) { if (oterm) return -EINVAL; oterm = entity; } } if (iterm == NULL || oterm == NULL) return -EINVAL; /* Allocate the chain and fill it. */ chain = uvc_alloc_chain(dev); if (chain == NULL) return -ENOMEM; if (uvc_scan_chain_entity(chain, oterm) < 0) goto error; prev = oterm; /* * Add all Processing and Extension Units with two pads. The order * doesn't matter much, use reverse list traversal to connect units in * UVC descriptor order as we build the chain from output to input. This * leads to units appearing in the order meant by the manufacturer for * the cameras known to require this heuristic. */ list_for_each_entry_reverse(entity, &dev->entities, list) { if (entity->type != UVC_VC_PROCESSING_UNIT && entity->type != UVC_VC_EXTENSION_UNIT) continue; if (entity->num_pads != 2) continue; if (uvc_scan_chain_entity(chain, entity) < 0) goto error; prev->baSourceID[0] = entity->id; prev = entity; } if (uvc_scan_chain_entity(chain, iterm) < 0) goto error; prev->baSourceID[0] = iterm->id; list_add_tail(&chain->list, &dev->chains); uvc_dbg(dev, PROBE, "Found a video chain by fallback heuristic (%s)\n", uvc_print_chain(chain)); return 0; error: kfree(chain); return -EINVAL; } /* * Scan the device for video chains and register video devices. * * Chains are scanned starting at their output terminals and walked backwards. */ static int uvc_scan_device(struct uvc_device *dev) { struct uvc_video_chain *chain; struct uvc_entity *term; list_for_each_entry(term, &dev->entities, list) { if (!UVC_ENTITY_IS_OTERM(term)) continue; /* * If the terminal is already included in a chain, skip it. * This can happen for chains that have multiple output * terminals, where all output terminals beside the first one * will be inserted in the chain in forward scans. */ if (term->chain.next || term->chain.prev) continue; chain = uvc_alloc_chain(dev); if (chain == NULL) return -ENOMEM; term->flags |= UVC_ENTITY_FLAG_DEFAULT; if (uvc_scan_chain(chain, term) < 0) { kfree(chain); continue; } uvc_dbg(dev, PROBE, "Found a valid video chain (%s)\n", uvc_print_chain(chain)); list_add_tail(&chain->list, &dev->chains); } if (list_empty(&dev->chains)) uvc_scan_fallback(dev); if (list_empty(&dev->chains)) { dev_info(&dev->udev->dev, "No valid video chain found.\n"); return -1; } /* Add GPIO entity to the first chain. */ if (dev->gpio_unit) { chain = list_first_entry(&dev->chains, struct uvc_video_chain, list); list_add_tail(&dev->gpio_unit->chain, &chain->entities); } return 0; } /* ------------------------------------------------------------------------ * Video device registration and unregistration */ /* * Delete the UVC device. * * Called by the kernel when the last reference to the uvc_device structure * is released. * * As this function is called after or during disconnect(), all URBs have * already been cancelled by the USB core. There is no need to kill the * interrupt URB manually. */ static void uvc_delete(struct kref *kref) { struct uvc_device *dev = container_of(kref, struct uvc_device, ref); struct list_head *p, *n; uvc_status_cleanup(dev); uvc_ctrl_cleanup_device(dev); usb_put_intf(dev->intf); usb_put_dev(dev->udev); #ifdef CONFIG_MEDIA_CONTROLLER media_device_cleanup(&dev->mdev); #endif list_for_each_safe(p, n, &dev->chains) { struct uvc_video_chain *chain; chain = list_entry(p, struct uvc_video_chain, list); kfree(chain); } list_for_each_safe(p, n, &dev->entities) { struct uvc_entity *entity; entity = list_entry(p, struct uvc_entity, list); #ifdef CONFIG_MEDIA_CONTROLLER uvc_mc_cleanup_entity(entity); #endif kfree(entity); } list_for_each_safe(p, n, &dev->streams) { struct uvc_streaming *streaming; streaming = list_entry(p, struct uvc_streaming, list); usb_driver_release_interface(&uvc_driver.driver, streaming->intf); uvc_stream_delete(streaming); } kfree(dev); } static void uvc_release(struct video_device *vdev) { struct uvc_streaming *stream = video_get_drvdata(vdev); struct uvc_device *dev = stream->dev; kref_put(&dev->ref, uvc_delete); } /* * Unregister the video devices. */ static void uvc_unregister_video(struct uvc_device *dev) { struct uvc_streaming *stream; list_for_each_entry(stream, &dev->streams, list) { if (!video_is_registered(&stream->vdev)) continue; video_unregister_device(&stream->vdev); video_unregister_device(&stream->meta.vdev); uvc_debugfs_cleanup_stream(stream); } uvc_status_unregister(dev); if (dev->vdev.dev) v4l2_device_unregister(&dev->vdev); #ifdef CONFIG_MEDIA_CONTROLLER if (media_devnode_is_registered(dev->mdev.devnode)) media_device_unregister(&dev->mdev); #endif } int uvc_register_video_device(struct uvc_device *dev, struct uvc_streaming *stream, struct video_device *vdev, struct uvc_video_queue *queue, enum v4l2_buf_type type, const struct v4l2_file_operations *fops, const struct v4l2_ioctl_ops *ioctl_ops) { int ret; /* Initialize the video buffers queue. */ ret = uvc_queue_init(queue, type, !uvc_no_drop_param); if (ret) return ret; /* Register the device with V4L. */ /* * We already hold a reference to dev->udev. The video device will be * unregistered before the reference is released, so we don't need to * get another one. */ vdev->v4l2_dev = &dev->vdev; vdev->fops = fops; vdev->ioctl_ops = ioctl_ops; vdev->release = uvc_release; vdev->prio = &stream->chain->prio; if (type == V4L2_BUF_TYPE_VIDEO_OUTPUT) vdev->vfl_dir = VFL_DIR_TX; else vdev->vfl_dir = VFL_DIR_RX; switch (type) { case V4L2_BUF_TYPE_VIDEO_CAPTURE: default: vdev->device_caps = V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_STREAMING; break; case V4L2_BUF_TYPE_VIDEO_OUTPUT: vdev->device_caps = V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_STREAMING; break; case V4L2_BUF_TYPE_META_CAPTURE: vdev->device_caps = V4L2_CAP_META_CAPTURE | V4L2_CAP_STREAMING; break; } strscpy(vdev->name, dev->name, sizeof(vdev->name)); /* * Set the driver data before calling video_register_device, otherwise * the file open() handler might race us. */ video_set_drvdata(vdev, stream); ret = video_register_device(vdev, VFL_TYPE_VIDEO, -1); if (ret < 0) { dev_err(&stream->intf->dev, "Failed to register %s device (%d).\n", v4l2_type_names[type], ret); return ret; } kref_get(&dev->ref); return 0; } static int uvc_register_video(struct uvc_device *dev, struct uvc_streaming *stream) { int ret; /* Initialize the streaming interface with default parameters. */ ret = uvc_video_init(stream); if (ret < 0) { dev_err(&stream->intf->dev, "Failed to initialize the device (%d).\n", ret); return ret; } if (stream->type == V4L2_BUF_TYPE_VIDEO_CAPTURE) stream->chain->caps |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_META_CAPTURE; else stream->chain->caps |= V4L2_CAP_VIDEO_OUTPUT; uvc_debugfs_init_stream(stream); /* Register the device with V4L. */ return uvc_register_video_device(dev, stream, &stream->vdev, &stream->queue, stream->type, &uvc_fops, &uvc_ioctl_ops); } /* * Register all video devices in all chains. */ static int uvc_register_terms(struct uvc_device *dev, struct uvc_video_chain *chain) { struct uvc_streaming *stream; struct uvc_entity *term; int ret; list_for_each_entry(term, &chain->entities, chain) { if (UVC_ENTITY_TYPE(term) != UVC_TT_STREAMING) continue; stream = uvc_stream_by_id(dev, term->id); if (stream == NULL) { dev_info(&dev->udev->dev, "No streaming interface found for terminal %u.", term->id); continue; } stream->chain = chain; ret = uvc_register_video(dev, stream); if (ret < 0) return ret; /* * Register a metadata node, but ignore a possible failure, * complete registration of video nodes anyway. */ uvc_meta_register(stream); term->vdev = &stream->vdev; } return 0; } static int uvc_register_chains(struct uvc_device *dev) { struct uvc_video_chain *chain; int ret; list_for_each_entry(chain, &dev->chains, list) { ret = uvc_register_terms(dev, chain); if (ret < 0) return ret; #ifdef CONFIG_MEDIA_CONTROLLER ret = uvc_mc_register_entities(chain); if (ret < 0) dev_info(&dev->udev->dev, "Failed to register entities (%d).\n", ret); #endif } return 0; } /* ------------------------------------------------------------------------ * USB probe, disconnect, suspend and resume */ static const struct uvc_device_info uvc_quirk_none = { 0 }; static int uvc_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *udev = interface_to_usbdev(intf); struct uvc_device *dev; const struct uvc_device_info *info = (const struct uvc_device_info *)id->driver_info; int function; int ret; /* Allocate memory for the device and initialize it. */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (dev == NULL) return -ENOMEM; INIT_LIST_HEAD(&dev->entities); INIT_LIST_HEAD(&dev->chains); INIT_LIST_HEAD(&dev->streams); kref_init(&dev->ref); atomic_set(&dev->nmappings, 0); mutex_init(&dev->lock); dev->udev = usb_get_dev(udev); dev->intf = usb_get_intf(intf); dev->intfnum = intf->cur_altsetting->desc.bInterfaceNumber; dev->info = info ? info : &uvc_quirk_none; dev->quirks = uvc_quirks_param == -1 ? dev->info->quirks : uvc_quirks_param; if (id->idVendor && id->idProduct) uvc_dbg(dev, PROBE, "Probing known UVC device %s (%04x:%04x)\n", udev->devpath, id->idVendor, id->idProduct); else uvc_dbg(dev, PROBE, "Probing generic UVC device %s\n", udev->devpath); if (udev->product != NULL) strscpy(dev->name, udev->product, sizeof(dev->name)); else snprintf(dev->name, sizeof(dev->name), "UVC Camera (%04x:%04x)", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); /* * Add iFunction or iInterface to names when available as additional * distinguishers between interfaces. iFunction is prioritized over * iInterface which matches Windows behavior at the point of writing. */ if (intf->intf_assoc && intf->intf_assoc->iFunction != 0) function = intf->intf_assoc->iFunction; else function = intf->cur_altsetting->desc.iInterface; if (function != 0) { size_t len; strlcat(dev->name, ": ", sizeof(dev->name)); len = strlen(dev->name); usb_string(udev, function, dev->name + len, sizeof(dev->name) - len); } /* Initialize the media device. */ #ifdef CONFIG_MEDIA_CONTROLLER dev->mdev.dev = &intf->dev; strscpy(dev->mdev.model, dev->name, sizeof(dev->mdev.model)); if (udev->serial) strscpy(dev->mdev.serial, udev->serial, sizeof(dev->mdev.serial)); usb_make_path(udev, dev->mdev.bus_info, sizeof(dev->mdev.bus_info)); dev->mdev.hw_revision = le16_to_cpu(udev->descriptor.bcdDevice); media_device_init(&dev->mdev); dev->vdev.mdev = &dev->mdev; #endif /* Parse the Video Class control descriptor. */ if (uvc_parse_control(dev) < 0) { uvc_dbg(dev, PROBE, "Unable to parse UVC descriptors\n"); goto error; } /* Parse the associated GPIOs. */ if (uvc_gpio_parse(dev) < 0) { uvc_dbg(dev, PROBE, "Unable to parse UVC GPIOs\n"); goto error; } dev_info(&dev->udev->dev, "Found UVC %u.%02x device %s (%04x:%04x)\n", dev->uvc_version >> 8, dev->uvc_version & 0xff, udev->product ? udev->product : "<unnamed>", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct)); if (dev->quirks != dev->info->quirks) { dev_info(&dev->udev->dev, "Forcing device quirks to 0x%x by module parameter for testing purpose.\n", dev->quirks); dev_info(&dev->udev->dev, "Please report required quirks to the linux-media mailing list.\n"); } if (dev->info->uvc_version) { dev->uvc_version = dev->info->uvc_version; dev_info(&dev->udev->dev, "Forcing UVC version to %u.%02x\n", dev->uvc_version >> 8, dev->uvc_version & 0xff); } /* Register the V4L2 device. */ if (v4l2_device_register(&intf->dev, &dev->vdev) < 0) goto error; /* Scan the device for video chains. */ if (uvc_scan_device(dev) < 0) goto error; /* Initialize controls. */ if (uvc_ctrl_init_device(dev) < 0) goto error; /* Register video device nodes. */ if (uvc_register_chains(dev) < 0) goto error; #ifdef CONFIG_MEDIA_CONTROLLER /* Register the media device node */ if (media_device_register(&dev->mdev) < 0) goto error; #endif /* Save our data pointer in the interface data. */ usb_set_intfdata(intf, dev); /* Initialize the interrupt URB. */ ret = uvc_status_init(dev); if (ret < 0) { dev_info(&dev->udev->dev, "Unable to initialize the status endpoint (%d), status interrupt will not be supported.\n", ret); } ret = uvc_gpio_init_irq(dev); if (ret < 0) { dev_err(&dev->udev->dev, "Unable to request privacy GPIO IRQ (%d)\n", ret); goto error; } uvc_dbg(dev, PROBE, "UVC device initialized\n"); usb_enable_autosuspend(udev); return 0; error: uvc_unregister_video(dev); kref_put(&dev->ref, uvc_delete); return -ENODEV; } static void uvc_disconnect(struct usb_interface *intf) { struct uvc_device *dev = usb_get_intfdata(intf); /* * Set the USB interface data to NULL. This can be done outside the * lock, as there's no other reader. */ usb_set_intfdata(intf, NULL); if (intf->cur_altsetting->desc.bInterfaceSubClass == UVC_SC_VIDEOSTREAMING) return; uvc_unregister_video(dev); kref_put(&dev->ref, uvc_delete); } static int uvc_suspend(struct usb_interface *intf, pm_message_t message) { struct uvc_device *dev = usb_get_intfdata(intf); struct uvc_streaming *stream; uvc_dbg(dev, SUSPEND, "Suspending interface %u\n", intf->cur_altsetting->desc.bInterfaceNumber); /* Controls are cached on the fly so they don't need to be saved. */ if (intf->cur_altsetting->desc.bInterfaceSubClass == UVC_SC_VIDEOCONTROL) { mutex_lock(&dev->lock); if (dev->users) uvc_status_stop(dev); mutex_unlock(&dev->lock); return 0; } list_for_each_entry(stream, &dev->streams, list) { if (stream->intf == intf) return uvc_video_suspend(stream); } uvc_dbg(dev, SUSPEND, "Suspend: video streaming USB interface mismatch\n"); return -EINVAL; } static int __uvc_resume(struct usb_interface *intf, int reset) { struct uvc_device *dev = usb_get_intfdata(intf); struct uvc_streaming *stream; int ret = 0; uvc_dbg(dev, SUSPEND, "Resuming interface %u\n", intf->cur_altsetting->desc.bInterfaceNumber); if (intf->cur_altsetting->desc.bInterfaceSubClass == UVC_SC_VIDEOCONTROL) { if (reset) { ret = uvc_ctrl_restore_values(dev); if (ret < 0) return ret; } mutex_lock(&dev->lock); if (dev->users) ret = uvc_status_start(dev, GFP_NOIO); mutex_unlock(&dev->lock); return ret; } list_for_each_entry(stream, &dev->streams, list) { if (stream->intf == intf) { ret = uvc_video_resume(stream, reset); if (ret < 0) uvc_queue_streamoff(&stream->queue, stream->queue.queue.type); return ret; } } uvc_dbg(dev, SUSPEND, "Resume: video streaming USB interface mismatch\n"); return -EINVAL; } static int uvc_resume(struct usb_interface *intf) { return __uvc_resume(intf, 0); } static int uvc_reset_resume(struct usb_interface *intf) { return __uvc_resume(intf, 1); } /* ------------------------------------------------------------------------ * Module parameters */ static int uvc_clock_param_get(char *buffer, const struct kernel_param *kp) { if (uvc_clock_param == CLOCK_MONOTONIC) return sprintf(buffer, "CLOCK_MONOTONIC"); else return sprintf(buffer, "CLOCK_REALTIME"); } static int uvc_clock_param_set(const char *val, const struct kernel_param *kp) { if (strncasecmp(val, "clock_", strlen("clock_")) == 0) val += strlen("clock_"); if (strcasecmp(val, "monotonic") == 0) uvc_clock_param = CLOCK_MONOTONIC; else if (strcasecmp(val, "realtime") == 0) uvc_clock_param = CLOCK_REALTIME; else return -EINVAL; return 0; } module_param_call(clock, uvc_clock_param_set, uvc_clock_param_get, &uvc_clock_param, 0644); MODULE_PARM_DESC(clock, "Video buffers timestamp clock"); module_param_named(hwtimestamps, uvc_hw_timestamps_param, uint, 0644); MODULE_PARM_DESC(hwtimestamps, "Use hardware timestamps"); module_param_named(nodrop, uvc_no_drop_param, uint, 0644); MODULE_PARM_DESC(nodrop, "Don't drop incomplete frames"); module_param_named(quirks, uvc_quirks_param, uint, 0644); MODULE_PARM_DESC(quirks, "Forced device quirks"); module_param_named(trace, uvc_dbg_param, uint, 0644); MODULE_PARM_DESC(trace, "Trace level bitmask"); module_param_named(timeout, uvc_timeout_param, uint, 0644); MODULE_PARM_DESC(timeout, "Streaming control requests timeout"); /* ------------------------------------------------------------------------ * Driver initialization and cleanup */ static const struct uvc_device_info uvc_ctrl_power_line_limited = { .mappings = (const struct uvc_control_mapping *[]) { &uvc_ctrl_power_line_mapping_limited, NULL, /* Sentinel */ }, }; static const struct uvc_device_info uvc_ctrl_power_line_uvc11 = { .mappings = (const struct uvc_control_mapping *[]) { &uvc_ctrl_power_line_mapping_uvc11, NULL, /* Sentinel */ }, }; static const struct uvc_device_info uvc_quirk_probe_minmax = { .quirks = UVC_QUIRK_PROBE_MINMAX, }; static const struct uvc_device_info uvc_quirk_fix_bandwidth = { .quirks = UVC_QUIRK_FIX_BANDWIDTH, }; static const struct uvc_device_info uvc_quirk_probe_def = { .quirks = UVC_QUIRK_PROBE_DEF, }; static const struct uvc_device_info uvc_quirk_stream_no_fid = { .quirks = UVC_QUIRK_STREAM_NO_FID, }; static const struct uvc_device_info uvc_quirk_force_y8 = { .quirks = UVC_QUIRK_FORCE_Y8, }; #define UVC_INFO_QUIRK(q) (kernel_ulong_t)&(struct uvc_device_info){.quirks = q} #define UVC_INFO_META(m) (kernel_ulong_t)&(struct uvc_device_info) \ {.meta_format = m} /* * The Logitech cameras listed below have their interface class set to * VENDOR_SPEC because they don't announce themselves as UVC devices, even * though they are compliant. */ static const struct usb_device_id uvc_ids[] = { /* Quanta USB2.0 HD UVC Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0408, .idProduct = 0x3090, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Quanta USB2.0 HD UVC Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0408, .idProduct = 0x4030, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Quanta USB2.0 HD UVC Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0408, .idProduct = 0x4034, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = UVC_PC_PROTOCOL_15, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* LogiLink Wireless Webcam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0416, .idProduct = 0xa91a, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Genius eFace 2025 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0458, .idProduct = 0x706e, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Microsoft Lifecam NX-6000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x045e, .idProduct = 0x00f8, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Microsoft Lifecam NX-3000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x045e, .idProduct = 0x0721, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Microsoft Lifecam VX-7000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x045e, .idProduct = 0x0723, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Logitech, Webcam C910 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x0821, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_WAKE_AUTOSUSPEND)}, /* Logitech, Webcam B910 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x0823, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_WAKE_AUTOSUSPEND)}, /* Logitech Quickcam Fusion */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c1, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam Orbit MP */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c2, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam Pro for Notebook */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c3, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam Pro 5000 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c5, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam OEM Dell Notebook */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c6, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech Quickcam OEM Cisco VT Camera II */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x08c7, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Logitech HD Pro Webcam C920 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x046d, .idProduct = 0x082d, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_RESTORE_CTRLS_ON_INIT) }, /* Chicony CNF7129 (Asus EEE 100HE) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb071, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_RESTRICT_FRAME_RATE) }, /* Chicony EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb5eb, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Chicony EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb6ba, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Chicony EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x04f2, .idProduct = 0xb746, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Alcor Micro AU3820 (Future Boy PC USB Webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x058f, .idProduct = 0x3820, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Dell XPS m1530 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x2640, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell SP2008WFP Monitor */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x2641, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell Alienware X51 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x2643, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell Studio Hybrid 140g (OmniVision webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x264a, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Dell XPS M1330 (OmniVision OV7670 webcam) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05a9, .idProduct = 0x7670, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Apple Built-In iSight */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05ac, .idProduct = 0x8501, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_BUILTIN_ISIGHT) }, /* Apple FaceTime HD Camera (Built-In) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05ac, .idProduct = 0x8514, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Apple Built-In iSight via iBridge */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05ac, .idProduct = 0x8600, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* Foxlink ("HP Webcam" on HP Mini 5103) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05c8, .idProduct = 0x0403, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* Genesys Logic USB 2.0 PC Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x05e3, .idProduct = 0x0505, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Hercules Classic Silver */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x06f8, .idProduct = 0x300c, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* ViMicro Vega */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0ac8, .idProduct = 0x332d, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* ViMicro - Minoru3D */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0ac8, .idProduct = 0x3410, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* ViMicro Venus - Minoru3D */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0ac8, .idProduct = 0x3420, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_fix_bandwidth }, /* Ophir Optronics - SPCAM 620U */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0bd3, .idProduct = 0x0555, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* MT6227 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x0e8d, .idProduct = 0x0004, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_PROBE_DEF) }, /* IMC Networks (Medion Akoya) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x13d3, .idProduct = 0x5103, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* JMicron USB2.0 XGA WebCam */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x152d, .idProduct = 0x0310, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Syntek (HP Spartan) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x5212, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Samsung Q310) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x5931, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Packard Bell EasyNote MX52 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a12, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Asus F9SG) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a31, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (Asus U3S) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a33, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Syntek (JAOtech Smart Terminal) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x174f, .idProduct = 0x8a34, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Miricle 307K */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x17dc, .idProduct = 0x0202, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Lenovo Thinkpad SL400/SL500 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x17ef, .idProduct = 0x480b, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_stream_no_fid }, /* Aveo Technology USB 2.0 Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1871, .idProduct = 0x0306, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_PROBE_EXTRAFIELDS) }, /* Aveo Technology USB 2.0 Camera (Tasco USB Microscope) */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1871, .idProduct = 0x0516, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Ecamm Pico iMage */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18cd, .idProduct = 0xcafe, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_EXTRAFIELDS) }, /* Manta MM-353 Plako */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18ec, .idProduct = 0x3188, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* FSC WebCam V30S */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18ec, .idProduct = 0x3288, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Arkmicro unbranded */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x18ec, .idProduct = 0x3290, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_def }, /* The Imaging Source USB CCD cameras */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x199e, .idProduct = 0x8102, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0 }, /* Bodelin ProScopeHR */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_DEV_HI | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x19ab, .idProduct = 0x1000, .bcdDevice_hi = 0x0126, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_STATUS_INTERVAL) }, /* MSI StarCam 370i */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1b3b, .idProduct = 0x2951, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Generalplus Technology Inc. 808 Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1b3f, .idProduct = 0x2002, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_probe_minmax }, /* Shenzhen Aoni Electronic Co.,Ltd 2K FHD camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1bcf, .idProduct = 0x0b40, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&(const struct uvc_device_info){ .uvc_version = 0x010a, } }, /* SiGma Micro USB Web Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x1c4f, .idProduct = 0x3000, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_PROBE_MINMAX | UVC_QUIRK_IGNORE_SELECTOR_UNIT) }, /* Oculus VR Positional Tracker DK2 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x2833, .idProduct = 0x0201, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_force_y8 }, /* Oculus VR Rift Sensor */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x2833, .idProduct = 0x0211, .bInterfaceClass = USB_CLASS_VENDOR_SPEC, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_quirk_force_y8 }, /* GEO Semiconductor GC6500 */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x29fe, .idProduct = 0x4d53, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_QUIRK(UVC_QUIRK_FORCE_BPP) }, /* Lenovo Integrated Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x30c9, .idProduct = 0x0093, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = UVC_PC_PROTOCOL_15, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_uvc11 }, /* Sonix Technology USB 2.0 Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x3277, .idProduct = 0x0072, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Acer EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x5986, .idProduct = 0x1172, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Acer EasyCamera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x5986, .idProduct = 0x1180, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = (kernel_ulong_t)&uvc_ctrl_power_line_limited }, /* Intel D410/ASR depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0ad2, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D415/ASRC depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0ad3, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D430/AWG depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0ad4, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel RealSense D4M */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b03, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D435/AWGC depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b07, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D435i depth camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b3a, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D405 Depth Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b5b, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Intel D455 Depth Camera */ { .match_flags = USB_DEVICE_ID_MATCH_DEVICE | USB_DEVICE_ID_MATCH_INT_INFO, .idVendor = 0x8086, .idProduct = 0x0b5c, .bInterfaceClass = USB_CLASS_VIDEO, .bInterfaceSubClass = 1, .bInterfaceProtocol = 0, .driver_info = UVC_INFO_META(V4L2_META_FMT_D4XX) }, /* Generic USB Video Class */ { USB_INTERFACE_INFO(USB_CLASS_VIDEO, 1, UVC_PC_PROTOCOL_UNDEFINED) }, { USB_INTERFACE_INFO(USB_CLASS_VIDEO, 1, UVC_PC_PROTOCOL_15) }, {} }; MODULE_DEVICE_TABLE(usb, uvc_ids); struct uvc_driver uvc_driver = { .driver = { .name = "uvcvideo", .probe = uvc_probe, .disconnect = uvc_disconnect, .suspend = uvc_suspend, .resume = uvc_resume, .reset_resume = uvc_reset_resume, .id_table = uvc_ids, .supports_autosuspend = 1, }, }; static int __init uvc_init(void) { int ret; uvc_debugfs_init(); ret = usb_register(&uvc_driver.driver); if (ret < 0) { uvc_debugfs_cleanup(); return ret; } return 0; } static void __exit uvc_cleanup(void) { usb_deregister(&uvc_driver.driver); uvc_debugfs_cleanup(); } module_init(uvc_init); module_exit(uvc_cleanup); MODULE_AUTHOR(DRIVER_AUTHOR); MODULE_DESCRIPTION(DRIVER_DESC); MODULE_LICENSE("GPL"); MODULE_VERSION(DRIVER_VERSION); |
140 139 8 5 311 4 2 297 2 2 175 5 128 4 243 6 2 53 59 296 1 15 5 3 1 16 277 75 59 173 214 5 15 3 202 3 216 2 2 42 42 42 42 1 42 42 38 57 16 29 1 28 3 9 19 21 3 3 5 2 3 6 4 6 26 18 44 44 44 23 7 2 23 23 22 16 7 16 6 19 4 22 23 7 2 5 28 3 1 5 17 5 16 1 6 15 5 5 16 2 2 45 8 25 4 20 1 282 268 14 129 73 53 6 212 6 2 2 6 11 7 1 5 3 7 1 8 1 7 80 80 94 2 1 1 2 78 67 60 8 9 9 2 5 55 55 1 23 45 5 99 3 1 2 82 15 285 149 4 4 5 5 1 4 197 177 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 | // SPDX-License-Identifier: GPL-2.0-or-later /* * TCP over IPv6 * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Based on: * linux/net/ipv4/tcp.c * linux/net/ipv4/tcp_input.c * linux/net/ipv4/tcp_output.c * * Fixes: * Hideaki YOSHIFUJI : sin6_scope_id support * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind * a single port at the same time. * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. */ #include <linux/bottom_half.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/jiffies.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/netdevice.h> #include <linux/init.h> #include <linux/jhash.h> #include <linux/ipsec.h> #include <linux/times.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/ipv6.h> #include <linux/icmpv6.h> #include <linux/random.h> #include <linux/indirect_call_wrapper.h> #include <net/tcp.h> #include <net/ndisc.h> #include <net/inet6_hashtables.h> #include <net/inet6_connection_sock.h> #include <net/ipv6.h> #include <net/transp_v6.h> #include <net/addrconf.h> #include <net/ip6_route.h> #include <net/ip6_checksum.h> #include <net/inet_ecn.h> #include <net/protocol.h> #include <net/xfrm.h> #include <net/snmp.h> #include <net/dsfield.h> #include <net/timewait_sock.h> #include <net/inet_common.h> #include <net/secure_seq.h> #include <net/busy_poll.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <crypto/hash.h> #include <linux/scatterlist.h> #include <trace/events/tcp.h> static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req); INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static const struct inet_connection_sock_af_ops ipv6_mapped; const struct inet_connection_sock_af_ops ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific; static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific; #endif /* Helper returning the inet6 address from a given tcp socket. * It can be used in TCP stack instead of inet6_sk(sk). * This avoids a dereference and allow compiler optimizations. * It is a specialized version of inet6_sk_generic(). */ #define tcp_inet6_sk(sk) (&container_of_const(tcp_sk(sk), \ struct tcp6_sock, tcp)->inet6) static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); if (dst && dst_hold_safe(dst)) { const struct rt6_info *rt = (const struct rt6_info *)dst; rcu_assign_pointer(sk->sk_rx_dst, dst); sk->sk_rx_dst_ifindex = skb->skb_iif; sk->sk_rx_dst_cookie = rt6_get_cookie(rt); } } static u32 tcp_v6_init_seq(const struct sk_buff *skb) { return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32, tcp_hdr(skb)->dest, tcp_hdr(skb)->source); } static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) { return secure_tcpv6_ts_off(net, ipv6_hdr(skb)->daddr.s6_addr32, ipv6_hdr(skb)->saddr.s6_addr32); } static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { /* This check is replicated from tcp_v6_connect() and intended to * prevent BPF program called below from accessing bytes that are out * of the bound specified by user in addr_len. */ if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; sock_owned_by_me(sk); return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; struct flowi6 fl6; int addr_type; int err; if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; memset(&fl6, 0, sizeof(fl6)); if (inet6_test_bit(SNDFLOW, sk)) { fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; IP6_ECN_flow_init(fl6.flowlabel); if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); } } /* * connect() to INADDR_ANY means loopback (BSD'ism). */ if (ipv6_addr_any(&usin->sin6_addr)) { if (ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) ipv6_addr_set_v4mapped(htonl(INADDR_LOOPBACK), &usin->sin6_addr); else usin->sin6_addr = in6addr_loopback; } addr_type = ipv6_addr_type(&usin->sin6_addr); if (addr_type & IPV6_ADDR_MULTICAST) return -ENETUNREACH; if (addr_type&IPV6_ADDR_LINKLOCAL) { if (addr_len >= sizeof(struct sockaddr_in6) && usin->sin6_scope_id) { /* If interface is set while binding, indices * must coincide. */ if (!sk_dev_equal_l3scope(sk, usin->sin6_scope_id)) return -EINVAL; sk->sk_bound_dev_if = usin->sin6_scope_id; } /* Connect to link-local address requires an interface */ if (!sk->sk_bound_dev_if) return -EINVAL; } if (tp->rx_opt.ts_recent_stamp && !ipv6_addr_equal(&sk->sk_v6_daddr, &usin->sin6_addr)) { tp->rx_opt.ts_recent = 0; tp->rx_opt.ts_recent_stamp = 0; WRITE_ONCE(tp->write_seq, 0); } sk->sk_v6_daddr = usin->sin6_addr; np->flow_label = fl6.flowlabel; /* * TCP over IPv4 */ if (addr_type & IPV6_ADDR_MAPPED) { u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; if (ipv6_only_sock(sk)) return -ENETUNREACH; sin.sin_family = AF_INET; sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { icsk->icsk_ext_hdr_len = exthdrlen; /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tp->af_specific = &tcp_sock_ipv6_specific; #endif goto failure; } np->saddr = sk->sk_v6_rcv_saddr; return err; } if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; fl6.flowi6_proto = IPPROTO_TCP; fl6.daddr = sk->sk_v6_daddr; fl6.saddr = saddr ? *saddr : np->saddr; fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); fl6.flowi6_oif = sk->sk_bound_dev_if; fl6.flowi6_mark = sk->sk_mark; fl6.fl6_dport = usin->sin6_port; fl6.fl6_sport = inet->inet_sport; fl6.flowi6_uid = sk->sk_uid; opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); final_p = fl6_update_dst(&fl6, opt, &final); security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } tp->tcp_usec_ts = dst_tcp_usec_ts(dst); tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { saddr = &fl6.saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) goto failure; } /* set the source address */ np->saddr = *saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; sk_set_txhash(sk); if (likely(!tp->repair)) { if (!tp->write_seq) WRITE_ONCE(tp->write_seq, secure_tcpv6_seq(np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport)); tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); } if (tcp_fastopen_defer_connect(sk, &err)) return err; if (err) goto late_failure; err = tcp_connect(sk); if (err) goto late_failure; return 0; late_failure: tcp_set_state(sk, TCP_CLOSE); inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; return err; } static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; u32 mtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; mtu = READ_ONCE(tcp_sk(sk)->mtu_info); /* Drop requests trying to increase our current mss. * Check done in __ip6_rt_update_pmtu() is too late. */ if (tcp_mtu_to_mss(sk, mtu) >= tcp_sk(sk)->mss_cache) return; dst = inet6_csk_update_pmtu(sk, mtu); if (!dst) return; if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } } static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; const struct tcphdr *th = (struct tcphdr *)(skb->data+offset); struct net *net = dev_net(skb->dev); struct request_sock *fastopen; struct ipv6_pinfo *np; struct tcp_sock *tp; __u32 seq, snd_una; struct sock *sk; bool fatal; int err; sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); return -ENOENT; } if (sk->sk_state == TCP_TIME_WAIT) { /* To increase the counter of ignored icmps for TCP-AO */ tcp_ao_ignore_icmp(sk, AF_INET6, type, code); inet_twsk_put(inet_twsk(sk)); return 0; } seq = ntohl(th->seq); fatal = icmpv6_err_convert(type, code, &err); if (sk->sk_state == TCP_NEW_SYN_RECV) { tcp_req_err(sk, seq, fatal); return 0; } if (tcp_ao_ignore_icmp(sk, AF_INET6, type, code)) { sock_put(sk); return 0; } bh_lock_sock(sk); if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG) __NET_INC_STATS(net, LINUX_MIB_LOCKDROPPEDICMPS); if (sk->sk_state == TCP_CLOSE) goto out; if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (ipv6_hdr(skb)->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount)) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); goto out; } } tp = tcp_sk(sk); /* XXX (TFO) - tp->snd_una should be ISN (tcp_create_openreq_child() */ fastopen = rcu_dereference(tp->fastopen_rsk); snd_una = fastopen ? tcp_rsk(fastopen)->snt_isn : tp->snd_una; if (sk->sk_state != TCP_LISTEN && !between(seq, snd_una, tp->snd_nxt)) { __NET_INC_STATS(net, LINUX_MIB_OUTOFWINDOWICMPS); goto out; } np = tcp_inet6_sk(sk); if (type == NDISC_REDIRECT) { if (!sock_owned_by_user(sk)) { struct dst_entry *dst = __sk_dst_check(sk, np->dst_cookie); if (dst) dst->ops->redirect(dst, sk, skb); } goto out; } if (type == ICMPV6_PKT_TOOBIG) { u32 mtu = ntohl(info); /* We are not interested in TCP_LISTEN and open_requests * (SYN-ACKs send out by Linux are always <576bytes so * they should go through unfragmented). */ if (sk->sk_state == TCP_LISTEN) goto out; if (!ip6_sk_accept_pmtu(sk)) goto out; if (mtu < IPV6_MIN_MTU) goto out; WRITE_ONCE(tp->mtu_info, mtu); if (!sock_owned_by_user(sk)) tcp_v6_mtu_reduced(sk); else if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &sk->sk_tsq_flags)) sock_hold(sk); goto out; } /* Might be for an request_sock */ switch (sk->sk_state) { case TCP_SYN_SENT: case TCP_SYN_RECV: /* Only in fast or simultaneous open. If a fast open socket is * already accepted it is treated as a connected one below. */ if (fastopen && !fastopen->sk) break; ipv6_icmp_error(sk, skb, err, th->dest, ntohl(info), (u8 *)th); if (!sock_owned_by_user(sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ tcp_done(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } goto out; case TCP_LISTEN: break; default: /* check if this ICMP message allows revert of backoff. * (see RFC 6069) */ if (!fastopen && type == ICMPV6_DEST_UNREACH && code == ICMPV6_NOROUTE) tcp_ld_RTO_revert(sk, seq); } if (!sock_owned_by_user(sk) && inet6_test_bit(RECVERR6, sk)) { WRITE_ONCE(sk->sk_err, err); sk_error_report(sk); } else { WRITE_ONCE(sk->sk_err_soft, err); } out: bh_unlock_sock(sk); sock_put(sk); return 0; } static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct flowi6 *fl6 = &fl->u.ip6; struct sk_buff *skb; int err = -ENOMEM; u8 tclass; /* First, grab a route. */ if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, IPPROTO_TCP)) == NULL) goto done; skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); fl6->daddr = ireq->ir_v6_rmt_addr; if (inet6_test_bit(REPFLOW, sk) && ireq->pktopts) fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts)); tclass = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos) ? (tcp_rsk(req)->syn_tos & ~INET_ECN_MASK) | (np->tclass & INET_ECN_MASK) : np->tclass; if (!INET_ECN_is_capable(tclass) && tcp_bpf_ca_needs_ecn((struct sock *)req)) tclass |= INET_ECN_ECT_0; rcu_read_lock(); opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); err = ip6_xmit(sk, skb, fl6, skb->mark ? : READ_ONCE(sk->sk_mark), opt, tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); err = net_xmit_eval(err); } done: return err; } static void tcp_v6_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->ipv6_opt); consume_skb(inet_rsk(req)->pktopts); } #ifdef CONFIG_TCP_MD5SIG static struct tcp_md5sig_key *tcp_v6_md5_do_lookup(const struct sock *sk, const struct in6_addr *addr, int l3index) { return tcp_md5_do_lookup(sk, l3index, (union tcp_md5_addr *)addr, AF_INET6); } static struct tcp_md5sig_key *tcp_v6_md5_lookup(const struct sock *sk, const struct sock *addr_sk) { int l3index; l3index = l3mdev_master_ifindex_by_index(sock_net(sk), addr_sk->sk_bound_dev_if); return tcp_v6_md5_do_lookup(sk, &addr_sk->sk_v6_daddr, l3index); } static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, sockptr_t optval, int optlen) { struct tcp_md5sig cmd; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&cmd.tcpm_addr; union tcp_ao_addr *addr; int l3index = 0; u8 prefixlen; bool l3flag; u8 flags; if (optlen < sizeof(cmd)) return -EINVAL; if (copy_from_sockptr(&cmd, optval, sizeof(cmd))) return -EFAULT; if (sin6->sin6_family != AF_INET6) return -EINVAL; flags = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; l3flag = cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX; if (optname == TCP_MD5SIG_EXT && cmd.tcpm_flags & TCP_MD5SIG_FLAG_PREFIX) { prefixlen = cmd.tcpm_prefixlen; if (prefixlen > 128 || (ipv6_addr_v4mapped(&sin6->sin6_addr) && prefixlen > 32)) return -EINVAL; } else { prefixlen = ipv6_addr_v4mapped(&sin6->sin6_addr) ? 32 : 128; } if (optname == TCP_MD5SIG_EXT && cmd.tcpm_ifindex && cmd.tcpm_flags & TCP_MD5SIG_FLAG_IFINDEX) { struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), cmd.tcpm_ifindex); if (dev && netif_is_l3_master(dev)) l3index = dev->ifindex; rcu_read_unlock(); /* ok to reference set/not set outside of rcu; * right now device MUST be an L3 master */ if (!dev || !l3index) return -EINVAL; } if (!cmd.tcpm_keylen) { if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags); return tcp_md5_do_del(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags); } if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN) return -EINVAL; if (ipv6_addr_v4mapped(&sin6->sin6_addr)) { addr = (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3]; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } addr = (union tcp_md5_addr *)&sin6->sin6_addr; /* Don't allow keys for peers that have a matching TCP-AO key. * See the comment in tcp_ao_add_cmd() */ if (tcp_ao_required(sk, addr, AF_INET6, l3flag ? l3index : -1, false)) return -EKEYREJECTED; return tcp_md5_do_add(sk, addr, AF_INET6, prefixlen, l3index, flags, cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_sigpool *hp, const struct in6_addr *daddr, const struct in6_addr *saddr, const struct tcphdr *th, int nbytes) { struct tcp6_pseudohdr *bp; struct scatterlist sg; struct tcphdr *_th; bp = hp->scratch; /* 1. TCP pseudo-header (RFC2460) */ bp->saddr = *saddr; bp->daddr = *daddr; bp->protocol = cpu_to_be32(IPPROTO_TCP); bp->len = cpu_to_be32(nbytes); _th = (struct tcphdr *)(bp + 1); memcpy(_th, th, sizeof(*th)); _th->check = 0; sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th)); ahash_request_set_crypt(hp->req, &sg, NULL, sizeof(*bp) + sizeof(*th)); return crypto_ahash_update(hp->req); } static int tcp_v6_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, const struct in6_addr *daddr, struct in6_addr *saddr, const struct tcphdr *th) { struct tcp_sigpool hp; if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } static int tcp_v6_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); const struct in6_addr *saddr, *daddr; struct tcp_sigpool hp; if (sk) { /* valid for establish/request sockets */ saddr = &sk->sk_v6_rcv_saddr; daddr = &sk->sk_v6_daddr; } else { const struct ipv6hdr *ip6h = ipv6_hdr(skb); saddr = &ip6h->saddr; daddr = &ip6h->daddr; } if (tcp_sigpool_start(tcp_md5_sigpool_id, &hp)) goto clear_hash_nostart; if (crypto_ahash_init(hp.req)) goto clear_hash; if (tcp_v6_md5_hash_headers(&hp, daddr, saddr, th, skb->len)) goto clear_hash; if (tcp_sigpool_hash_skb_data(&hp, skb, th->doff << 2)) goto clear_hash; if (tcp_md5_hash_key(&hp, key)) goto clear_hash; ahash_request_set_crypt(hp.req, NULL, md5_hash, 0); if (crypto_ahash_final(hp.req)) goto clear_hash; tcp_sigpool_end(&hp); return 0; clear_hash: tcp_sigpool_end(&hp); clear_hash_nostart: memset(md5_hash, 0, 16); return 1; } #endif static void tcp_v6_init_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = tcp_inet6_sk(sk_listener); ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; /* So that link locals have meaning */ if ((!sk_listener->sk_bound_dev_if || l3_slave) && ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL) ireq->ir_iif = tcp_v6_iif(skb); if (!TCP_SKB_CB(skb)->tcp_tw_isn && (ipv6_opt_accepted(sk_listener, skb, &TCP_SKB_CB(skb)->header.h6) || np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim || inet6_test_bit(REPFLOW, sk_listener))) { refcount_inc(&skb->users); ireq->pktopts = skb; } } static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct sk_buff *skb, struct flowi *fl, struct request_sock *req) { tcp_v6_init_req(req, sk, skb); if (security_inet_conn_request(sk, skb, req)) return NULL; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { .family = AF_INET6, .obj_size = sizeof(struct tcp6_request_sock), .rtx_syn_ack = tcp_rtx_synack, .send_ack = tcp_v6_reqsk_send_ack, .destructor = tcp_v6_reqsk_destructor, .send_reset = tcp_v6_send_reset, .syn_ack_timeout = tcp_syn_ack_timeout, }; const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr), #ifdef CONFIG_TCP_MD5SIG .req_md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup_rsk, .ao_calc_key = tcp_v6_ao_calc_key_rsk, .ao_synack_hash = tcp_v6_ao_synack_hash, #endif #ifdef CONFIG_SYN_COOKIES .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, .init_seq = tcp_v6_init_seq, .init_ts_off = tcp_v6_init_ts_off, .send_synack = tcp_v6_send_synack, }; static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, int rst, u8 tclass, __be32 label, u32 priority, u32 txhash, struct tcp_key *key) { const struct tcphdr *th = tcp_hdr(skb); struct tcphdr *t1; struct sk_buff *buff; struct flowi6 fl6; struct net *net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); struct sock *ctl_sk = net->ipv6.tcp_sk; unsigned int tot_len = sizeof(struct tcphdr); __be32 mrst = 0, *topt; struct dst_entry *dst; __u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; if (tcp_key_is_md5(key)) tot_len += TCPOLEN_MD5SIG_ALIGNED; if (tcp_key_is_ao(key)) tot_len += tcp_ao_len_aligned(key->ao_key); #ifdef CONFIG_MPTCP if (rst && !tcp_key_is_md5(key)) { mrst = mptcp_reset_option(skb); if (mrst) tot_len += sizeof(__be32); } #endif buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); if (!buff) return; skb_reserve(buff, MAX_TCP_HEADER); t1 = skb_push(buff, tot_len); skb_reset_transport_header(buff); /* Swap the send and the receive. */ memset(t1, 0, sizeof(*t1)); t1->dest = th->source; t1->source = th->dest; t1->doff = tot_len / 4; t1->seq = htonl(seq); t1->ack_seq = htonl(ack); t1->ack = !rst || !th->ack; t1->rst = rst; t1->window = htons(win); topt = (__be32 *)(t1 + 1); if (tsecr) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); *topt++ = htonl(tsval); *topt++ = htonl(tsecr); } if (mrst) *topt++ = mrst; #ifdef CONFIG_TCP_MD5SIG if (tcp_key_is_md5(key)) { *topt++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); tcp_v6_md5_hash_hdr((__u8 *)topt, key->md5_key, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, t1); } #endif #ifdef CONFIG_TCP_AO if (tcp_key_is_ao(key)) { *topt++ = htonl((TCPOPT_AO << 24) | (tcp_ao_len(key->ao_key) << 16) | (key->ao_key->sndid << 8) | (key->rcv_next)); tcp_ao_hash_hdr(AF_INET6, (char *)topt, key->ao_key, key->traffic_key, (union tcp_ao_addr *)&ipv6_hdr(skb)->saddr, (union tcp_ao_addr *)&ipv6_hdr(skb)->daddr, t1, key->sne); } #endif memset(&fl6, 0, sizeof(fl6)); fl6.daddr = ipv6_hdr(skb)->saddr; fl6.saddr = ipv6_hdr(skb)->daddr; fl6.flowlabel = label; buff->ip_summed = CHECKSUM_PARTIAL; __tcp_v6_send_check(buff, &fl6.saddr, &fl6.daddr); fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); else { if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) oif = skb->skb_iif; fl6.flowi6_oif = oif; } if (sk) { if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; else mark = READ_ONCE(sk->sk_mark); skb_set_delivery_time(buff, tcp_transmit_time(sk), true); } if (txhash) { /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ skb_set_hash(buff, txhash, PKT_HASH_TYPE_L4); } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); security_skb_classify_flow(skb, flowi6_to_flowi_common(&fl6)); /* Pass a socket to ip6_dst_lookup either it is for RST * Underlying function will use this to retrieve the network * namespace */ if (sk && sk->sk_state != TCP_TIME_WAIT) dst = ip6_dst_lookup_flow(net, sk, &fl6, NULL); /*sk's xfrm_policy can be referred*/ else dst = ip6_dst_lookup_flow(net, ctl_sk, &fl6, NULL); if (!IS_ERR(dst)) { skb_dst_set(buff, dst); ip6_xmit(ctl_sk, buff, &fl6, fl6.flowi6_mark, NULL, tclass & ~INET_ECN_MASK, priority); TCP_INC_STATS(net, TCP_MIB_OUTSEGS); if (rst) TCP_INC_STATS(net, TCP_MIB_OUTRSTS); return; } kfree_skb(buff); } static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) { const struct tcphdr *th = tcp_hdr(skb); struct ipv6hdr *ipv6h = ipv6_hdr(skb); const __u8 *md5_hash_location = NULL; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) bool allocated_traffic_key = false; #endif const struct tcp_ao_hdr *aoh; struct tcp_key key = {}; u32 seq = 0, ack_seq = 0; __be32 label = 0; u32 priority = 0; struct net *net; u32 txhash = 0; int oif = 0; #ifdef CONFIG_TCP_MD5SIG unsigned char newhash[16]; int genhash; struct sock *sk1 = NULL; #endif if (th->rst) return; /* If sk not NULL, it means we did a successful lookup and incoming * route had to be correct. prequeue might have dropped our dst. */ if (!sk && !ipv6_unicast_destination(skb)) return; net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev); /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(th, &md5_hash_location, &aoh)) return; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) rcu_read_lock(); #endif #ifdef CONFIG_TCP_MD5SIG if (sk && sk_fullsock(sk)) { int l3index; /* sdif set, means packet ingressed via a device * in an L3 domain and inet_iif is set to it. */ l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; } else if (md5_hash_location) { int dif = tcp_v6_iif_l3_slave(skb); int sdif = tcp_v6_sdif(skb); int l3index; /* * active side is lost. Try to find listening socket through * source port, and then find md5 key through listening socket. * we are not loose security here: * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) goto out; /* sdif set, means packet ingressed via a device * in an L3 domain and dif is set to it. */ l3index = tcp_v6_sdif(skb) ? dif : 0; key.md5_key = tcp_v6_md5_do_lookup(sk1, &ipv6h->saddr, l3index); if (!key.md5_key) goto out; key.type = TCP_KEY_MD5; genhash = tcp_v6_md5_hash_skb(newhash, key.md5_key, NULL, skb); if (genhash || memcmp(md5_hash_location, newhash, 16) != 0) goto out; } #endif if (th->ack) seq = ntohl(th->ack_seq); else ack_seq = ntohl(th->seq) + th->syn + th->fin + skb->len - (th->doff << 2); #ifdef CONFIG_TCP_AO if (aoh) { int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; if (tcp_ao_prepare_reset(sk, skb, aoh, l3index, seq, &key.ao_key, &key.traffic_key, &allocated_traffic_key, &key.rcv_next, &key.sne)) goto out; key.type = TCP_KEY_AO; } #endif if (sk) { oif = sk->sk_bound_dev_if; if (sk_fullsock(sk)) { trace_tcp_send_reset(sk, skb); if (inet6_test_bit(REPFLOW, sk)) label = ip6_flowlabel(ipv6h); priority = READ_ONCE(sk->sk_priority); txhash = sk->sk_txhash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; txhash = inet_twsk(sk)->tw_txhash; } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, 1, ipv6_get_dsfield(ipv6h), label, priority, txhash, &key); #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) out: if (allocated_traffic_key) kfree(key.traffic_key); rcu_read_unlock(); #endif } static void tcp_v6_send_ack(const struct sock *sk, struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 tsval, u32 tsecr, int oif, struct tcp_key *key, u8 tclass, __be32 label, u32 priority, u32 txhash) { tcp_v6_send_response(sk, skb, seq, ack, win, tsval, tsecr, oif, 0, tclass, label, priority, txhash, key); } static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) { struct inet_timewait_sock *tw = inet_twsk(sk); struct tcp_timewait_sock *tcptw = tcp_twsk(sk); struct tcp_key key = {}; #ifdef CONFIG_TCP_AO struct tcp_ao_info *ao_info; if (static_branch_unlikely(&tcp_ao_needed.key)) { /* FIXME: the segment to-be-acked is not verified yet */ ao_info = rcu_dereference(tcptw->ao_info); if (ao_info) { const struct tcp_ao_hdr *aoh; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) goto out; if (aoh) key.ao_key = tcp_ao_established_key(ao_info, aoh->rnext_keyid, -1); } } if (key.ao_key) { struct tcp_ao_key *rnext_key; key.traffic_key = snd_other_key(key.ao_key); /* rcv_next switches to our rcv_next */ rnext_key = READ_ONCE(ao_info->rnext_key); key.rcv_next = rnext_key->rcvid; key.sne = READ_ONCE(ao_info->snd_sne); key.type = TCP_KEY_AO; #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { key.md5_key = tcp_twsk_md5_key(tcptw); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } tcp_v6_send_ack(sk, skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt, tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale, tcp_tw_tsval(tcptw), tcptw->tw_ts_recent, tw->tw_bound_dev_if, &key, tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel), tw->tw_priority, tw->tw_txhash); #ifdef CONFIG_TCP_AO out: #endif inet_twsk_put(tw); } static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, struct request_sock *req) { struct tcp_key key = {}; #ifdef CONFIG_TCP_AO if (static_branch_unlikely(&tcp_ao_needed.key) && tcp_rsk_used_ao(req)) { const struct in6_addr *addr = &ipv6_hdr(skb)->saddr; const struct tcp_ao_hdr *aoh; int l3index; l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; /* Invalid TCP option size or twice included auth */ if (tcp_parse_auth_options(tcp_hdr(skb), NULL, &aoh)) return; if (!aoh) return; key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, aoh->rnext_keyid, -1); if (unlikely(!key.ao_key)) { /* Send ACK with any matching MKT for the peer */ key.ao_key = tcp_ao_do_lookup(sk, l3index, (union tcp_ao_addr *)addr, AF_INET6, -1, -1); /* Matching key disappeared (user removed the key?) * let the handshake timeout. */ if (!key.ao_key) { net_info_ratelimited("TCP-AO key for (%pI6, %d)->(%pI6, %d) suddenly disappeared, won't ACK new connection\n", addr, ntohs(tcp_hdr(skb)->source), &ipv6_hdr(skb)->daddr, ntohs(tcp_hdr(skb)->dest)); return; } } key.traffic_key = kmalloc(tcp_ao_digest_size(key.ao_key), GFP_ATOMIC); if (!key.traffic_key) return; key.type = TCP_KEY_AO; key.rcv_next = aoh->keyid; tcp_v6_ao_calc_key_rsk(key.ao_key, key.traffic_key, req); #else if (0) { #endif #ifdef CONFIG_TCP_MD5SIG } else if (static_branch_unlikely(&tcp_md5_needed.key)) { int l3index = tcp_v6_sdif(skb) ? tcp_v6_iif_l3_slave(skb) : 0; key.md5_key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index); if (key.md5_key) key.type = TCP_KEY_MD5; #endif } /* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV * sk->sk_state == TCP_SYN_RECV -> for Fast Open. */ /* RFC 7323 2.3 * The window field (SEG.WND) of every outgoing segment, with the * exception of <SYN> segments, MUST be right-shifted by * Rcv.Wind.Shift bits: */ tcp_v6_send_ack(sk, skb, (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_rsk_tsval(tcp_rsk(req)), READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, &key, ipv6_get_dsfield(ipv6_hdr(skb)), 0, READ_ONCE(sk->sk_priority), READ_ONCE(tcp_rsk(req)->txhash)); if (tcp_key_is_ao(&key)) kfree(key.traffic_key); } static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_SYN_COOKIES const struct tcphdr *th = tcp_hdr(skb); if (!th->syn) sk = cookie_v6_check(sk, skb); #endif return sk; } u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie) { u16 mss = 0; #ifdef CONFIG_SYN_COOKIES mss = tcp_get_syncookie_mss(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, th); if (mss) { *cookie = __cookie_v6_init_sequence(iph, th, &mss); tcp_synq_overflow(sk); } #endif return mss; } static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_conn_request(sk, skb); if (!ipv6_unicast_destination(skb)) goto drop; if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) { __IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS); return 0; } return tcp_conn_request(&tcp6_request_sock_ops, &tcp_request_sock_ipv6_ops, sk, skb); drop: tcp_listendrop(sk); return 0; /* don't send reset */ } static void tcp_v6_restore_cb(struct sk_buff *skb) { /* We need to move header back to the beginning if xfrm6_policy_check() * and tcp_v6_fill_cb() are going to be called again. * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there. */ memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6, sizeof(struct inet6_skb_parm)); } static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req) { struct inet_request_sock *ireq; struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *key; int l3index; #endif struct flowi6 fl6; if (skb->protocol == htons(ETH_P_IP)) { /* * v6 mapped */ newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst, req_unhash, own_req); if (!newsk) return NULL; inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newnp = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newnp->saddr = newsk->sk_v6_rcv_saddr; inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; if (sk_is_mptcp(newsk)) mptcpv6_handle_mapped(newsk, true); newsk->sk_backlog_rcv = tcp_v4_do_rcv; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) newtp->af_specific = &tcp_sock_ipv6_mapped_specific; #endif newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = inet_iif(skb); newnp->mcast_hops = ip_hdr(skb)->ttl; newnp->rcv_flowinfo = 0; if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = 0; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks count * here, tcp_create_openreq_child now does this for us, see the comment in * that function for the gory details. -acme */ /* It is tricky place. Until this moment IPv4 tcp worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } ireq = inet_rsk(req); if (sk_acceptq_is_full(sk)) goto out_overflow; if (!dst) { dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); if (!dst) goto out; } newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks * count here, tcp_create_openreq_child now does this for us, see the * comment in that function for the gory details. -acme */ newsk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(newsk, dst, NULL, NULL); inet6_sk_rx_dst_set(newsk, skb); inet_sk(newsk)->pinet6 = tcp_inet6_sk(newsk); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); newnp = tcp_inet6_sk(newsk); memcpy(newnp, np, sizeof(struct ipv6_pinfo)); newsk->sk_v6_daddr = ireq->ir_v6_rmt_addr; newnp->saddr = ireq->ir_v6_loc_addr; newsk->sk_v6_rcv_saddr = ireq->ir_v6_loc_addr; newsk->sk_bound_dev_if = ireq->ir_iif; /* Now IPv6 options... First: no IPv4 options. */ newinet->inet_opt = NULL; newnp->ipv6_mc_list = NULL; newnp->ipv6_ac_list = NULL; newnp->ipv6_fl_list = NULL; /* Clone RX bits */ newnp->rxopt.all = np->rxopt.all; newnp->pktoptions = NULL; newnp->opt = NULL; newnp->mcast_oif = tcp_v6_iif(skb); newnp->mcast_hops = ipv6_hdr(skb)->hop_limit; newnp->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(skb)); if (inet6_test_bit(REPFLOW, sk)) newnp->flow_label = ip6_flowlabel(ipv6_hdr(skb)); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). */ if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_reflect_tos)) newnp->tclass = tcp_rsk(req)->syn_tos & ~INET_ECN_MASK; /* Clone native IPv6 options from listening socket (if any) Yes, keeping reference count would be much more clever, but we make one more one thing there: reattach optmem to newsk. */ opt = ireq->ipv6_opt; if (!opt) opt = rcu_dereference(np->opt); if (opt) { opt = ipv6_dup_options(newsk, opt); RCU_INIT_POINTER(newnp->opt, opt); } inet_csk(newsk)->icsk_ext_hdr_len = 0; if (opt) inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen + opt->opt_flen; tcp_ca_openreq_child(newsk, dst); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; #ifdef CONFIG_TCP_MD5SIG l3index = l3mdev_master_ifindex_by_index(sock_net(sk), ireq->ir_iif); if (!tcp_rsk_used_ao(req)) { /* Copy over the MD5 key from the original socket */ key = tcp_v6_md5_do_lookup(sk, &newsk->sk_v6_daddr, l3index); if (key) { const union tcp_md5_addr *addr; addr = (union tcp_md5_addr *)&newsk->sk_v6_daddr; if (tcp_md5_key_copy(newsk, addr, AF_INET6, 128, l3index, key)) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } } } #endif #ifdef CONFIG_TCP_AO /* Copy over tcp_ao_info if any */ if (tcp_ao_copy_all_matching(sk, newsk, req, skb, AF_INET6)) goto out; /* OOM */ #endif if (__inet_inherit_port(sk, newsk) < 0) { inet_csk_prepare_forced_close(newsk); tcp_done(newsk); goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash), &found_dup_sk); if (*own_req) { tcp_move_syn(newtp, req); /* Clone pktoptions received with SYN, if we own the req */ if (ireq->pktopts) { newnp->pktoptions = skb_clone_and_charge_r(ireq->pktopts, newsk); consume_skb(ireq->pktopts); ireq->pktopts = NULL; if (newnp->pktoptions) tcp_v6_restore_cb(newnp->pktoptions); } } else { if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only */ bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; } } return newsk; out_overflow: __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); out_nonewsk: dst_release(dst); out: tcp_listendrop(sk); return NULL; } INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, u32)); /* The socket must have it's spinlock held when we get * here, unless it is a TCP_LISTEN socket. * * We have a potential double-lock case here, so even when * doing backlog processing we use the BH locking scheme. * This is because we cannot sleep with the original spinlock * held. */ INDIRECT_CALLABLE_SCOPE int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) { struct ipv6_pinfo *np = tcp_inet6_sk(sk); struct sk_buff *opt_skb = NULL; enum skb_drop_reason reason; struct tcp_sock *tp; /* Imagine: socket is IPv6. IPv4 packet arrives, goes to IPv4 receive handler and backlogged. From backlog it always goes here. Kerboom... Fortunately, tcp_rcv_established and rcv_established handle them correctly, but it is not case with tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK */ if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. */ /* Do Stevens' IPV6_PKTOPTIONS. Yes, guys, it is the only place in our code, where we may make it not affecting IPv4. The rest of code is protocol independent, and I do not like idea to uglify IPv4. Actually, all the idea behind IPV6_PKTOPTIONS looks not very well thought. For now we latch options, received in the last packet, enqueued by tcp. Feel free to propose better solution. --ANK (980728) */ if (np->rxopt.all) opt_skb = skb_clone_and_charge_r(skb, sk); reason = SKB_DROP_REASON_NOT_SPECIFIED; if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; dst = rcu_dereference_protected(sk->sk_rx_dst, lockdep_sock_is_held(sk)); sock_rps_save_rxhash(sk, skb); sk_mark_napi_id(sk, skb); if (dst) { if (sk->sk_rx_dst_ifindex != skb->skb_iif || INDIRECT_CALL_1(dst->ops->check, ip6_dst_check, dst, sk->sk_rx_dst_cookie) == NULL) { RCU_INIT_POINTER(sk->sk_rx_dst, NULL); dst_release(dst); } } tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; } if (tcp_checksum_complete(skb)) goto csum_err; if (sk->sk_state == TCP_LISTEN) { struct sock *nsk = tcp_v6_cookie_check(sk, skb); if (!nsk) goto discard; if (nsk != sk) { if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) __kfree_skb(opt_skb); return 0; } } else sock_rps_save_rxhash(sk, skb); if (tcp_rcv_state_process(sk, skb)) goto reset; if (opt_skb) goto ipv6_pktoptions; return 0; reset: tcp_v6_send_reset(sk, skb); discard: if (opt_skb) __kfree_skb(opt_skb); kfree_skb_reason(skb, reason); return 0; csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; ipv6_pktoptions: /* Do you ask, what is it? 1. skb was enqueued by tcp. 2. skb is added to tail of read queue, rather than out of order. 3. socket is not in passive state. 4. Finally, it really contains options, which user wants to receive. */ tp = tcp_sk(sk); if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { if (np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo) np->mcast_oif = tcp_v6_iif(opt_skb); if (np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) WRITE_ONCE(np->mcast_hops, ipv6_hdr(opt_skb)->hop_limit); if (np->rxopt.bits.rxflow || np->rxopt.bits.rxtclass) np->rcv_flowinfo = ip6_flowinfo(ipv6_hdr(opt_skb)); if (inet6_test_bit(REPFLOW, sk)) np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb)); if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) { tcp_v6_restore_cb(opt_skb); opt_skb = xchg(&np->pktoptions, opt_skb); } else { __kfree_skb(opt_skb); opt_skb = xchg(&np->pktoptions, NULL); } } consume_skb(opt_skb); return 0; } static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr, const struct tcphdr *th) { /* This is tricky: we move IP6CB at its correct location into * TCP_SKB_CB(). It must be done after xfrm6_policy_check(), because * _decode_session6() uses IP6CB(). * barrier() makes sure compiler won't play aliasing games. */ memmove(&TCP_SKB_CB(skb)->header.h6, IP6CB(skb), sizeof(struct inet6_skb_parm)); barrier(); TCP_SKB_CB(skb)->seq = ntohl(th->seq); TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + skb->len - th->doff*4); TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); TCP_SKB_CB(skb)->tcp_flags = tcp_flag_byte(th); TCP_SKB_CB(skb)->tcp_tw_isn = 0; TCP_SKB_CB(skb)->ip_dsfield = ipv6_get_dsfield(hdr); TCP_SKB_CB(skb)->sacked = 0; TCP_SKB_CB(skb)->has_rxtstamp = skb->tstamp || skb_hwtstamps(skb)->hwtstamp; } INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) { enum skb_drop_reason drop_reason; int sdif = inet6_sdif(skb); int dif = inet6_iif(skb); const struct tcphdr *th; const struct ipv6hdr *hdr; bool refcounted; struct sock *sk; int ret; struct net *net = dev_net(skb->dev); drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (skb->pkt_type != PACKET_HOST) goto discard_it; /* * Count it even if it's bad. */ __TCP_INC_STATS(net, TCP_MIB_INSEGS); if (!pskb_may_pull(skb, sizeof(struct tcphdr))) goto discard_it; th = (const struct tcphdr *)skb->data; if (unlikely(th->doff < sizeof(struct tcphdr) / 4)) { drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL; goto bad_packet; } if (!pskb_may_pull(skb, th->doff*4)) goto discard_it; if (skb_checksum_init(skb, IPPROTO_TCP, ip6_compute_pseudo)) goto csum_error; th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); lookup: sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) goto no_tcp_socket; process: if (sk->sk_state == TCP_TIME_WAIT) goto do_time_wait; if (sk->sk_state == TCP_NEW_SYN_RECV) { struct request_sock *req = inet_reqsk(sk); bool req_stolen = false; struct sock *nsk; sk = req->rsk_listener; if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) drop_reason = SKB_DROP_REASON_XFRM_POLICY; else drop_reason = tcp_inbound_hash(sk, req, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { sk_drops_add(sk, skb); reqsk_put(req); goto discard_it; } if (tcp_checksum_complete(skb)) { reqsk_put(req); goto csum_error; } if (unlikely(sk->sk_state != TCP_LISTEN)) { nsk = reuseport_migrate_sock(sk, req_to_sk(req), skb); if (!nsk) { inet_csk_reqsk_queue_drop_and_put(sk, req); goto lookup; } sk = nsk; /* reuseport_migrate_sock() has already held one sk_refcnt * before returning. */ } else { sock_hold(sk); } refcounted = true; nsk = NULL; if (!tcp_filter(sk, skb)) { th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); nsk = tcp_check_req(sk, skb, req, false, &req_stolen); } else { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; } if (!nsk) { reqsk_put(req); if (req_stolen) { /* Another cpu got exclusive access to req * and created a full blown socket. * Try to feed this packet to this socket * instead of discarding it. */ tcp_v6_restore_cb(skb); sock_put(sk); goto lookup; } goto discard_and_relse; } nf_reset_ct(skb); if (nsk == sk) { reqsk_put(req); tcp_v6_restore_cb(skb); } else if (tcp_child_process(sk, nsk, skb)) { tcp_v6_send_reset(nsk, skb); goto discard_and_relse; } else { sock_put(sk); return 0; } } if (static_branch_unlikely(&ip6_min_hopcount)) { /* min_hopcount can be changed concurrently from do_ipv6_setsockopt() */ if (unlikely(hdr->hop_limit < READ_ONCE(tcp_inet6_sk(sk)->min_hopcount))) { __NET_INC_STATS(net, LINUX_MIB_TCPMINTTLDROP); drop_reason = SKB_DROP_REASON_TCP_MINTTL; goto discard_and_relse; } } if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto discard_and_relse; } drop_reason = tcp_inbound_hash(sk, NULL, skb, &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) goto discard_and_relse; nf_reset_ct(skb); if (tcp_filter(sk, skb)) { drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto discard_and_relse; } th = (const struct tcphdr *)skb->data; hdr = ipv6_hdr(skb); tcp_v6_fill_cb(skb, hdr, th); skb->dev = NULL; if (sk->sk_state == TCP_LISTEN) { ret = tcp_v6_do_rcv(sk, skb); goto put_and_return; } sk_incoming_cpu_update(sk); bh_lock_sock_nested(sk); tcp_segs_in(tcp_sk(sk), skb); ret = 0; if (!sock_owned_by_user(sk)) { ret = tcp_v6_do_rcv(sk, skb); } else { if (tcp_add_backlog(sk, skb, &drop_reason)) goto discard_and_relse; } bh_unlock_sock(sk); put_and_return: if (refcounted) sock_put(sk); return ret ? -1 : 0; no_tcp_socket: drop_reason = SKB_DROP_REASON_NO_SOCKET; if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) goto discard_it; tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { csum_error: drop_reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); __TCP_INC_STATS(net, TCP_MIB_CSUMERRORS); bad_packet: __TCP_INC_STATS(net, TCP_MIB_INERRS); } else { tcp_v6_send_reset(NULL, skb); } discard_it: SKB_DR_OR(drop_reason, NOT_SPECIFIED); kfree_skb_reason(skb, drop_reason); return 0; discard_and_relse: sk_drops_add(sk, skb); if (refcounted) sock_put(sk); goto discard_it; do_time_wait: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { drop_reason = SKB_DROP_REASON_XFRM_POLICY; inet_twsk_put(inet_twsk(sk)); goto discard_it; } tcp_v6_fill_cb(skb, hdr, th); if (tcp_checksum_complete(skb)) { inet_twsk_put(inet_twsk(sk)); goto csum_error; } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { struct sock *sk2; sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), tcp_v6_iif_l3_slave(skb), sdif); if (sk2) { struct inet_timewait_sock *tw = inet_twsk(sk); inet_twsk_deschedule_put(tw); sk = sk2; tcp_v6_restore_cb(skb); refcounted = false; goto process; } } /* to ACK */ fallthrough; case TCP_TW_ACK: tcp_v6_timewait_ack(sk, skb); break; case TCP_TW_RST: tcp_v6_send_reset(sk, skb); inet_twsk_deschedule_put(inet_twsk(sk)); goto discard_it; case TCP_TW_SUCCESS: ; } goto discard_it; } void tcp_v6_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; if (skb->pkt_type != PACKET_HOST) return; if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct tcphdr))) return; hdr = ipv6_hdr(skb); th = tcp_hdr(skb); if (th->doff < sizeof(struct tcphdr) / 4) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { skb->sk = sk; skb->destructor = sock_edemux; if (sk_fullsock(sk)) { struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst); if (dst) dst = dst_check(dst, sk->sk_rx_dst_cookie); if (dst && sk->sk_rx_dst_ifindex == skb->skb_iif) skb_dst_set_noref(skb, dst); } } } static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), .twsk_unique = tcp_twsk_unique, .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) { __tcp_v6_send_check(skb, &sk->sk_v6_rcv_saddr, &sk->sk_v6_daddr); } const struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, .rebuild_header = inet6_sk_rebuild_header, .sk_rx_dst_set = inet6_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct ipv6hdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v6_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v6_md5_lookup, .calc_md5_hash = tcp_v6_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v6_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v6_ao_calc_key_sk, #endif }; #endif /* * TCP over IPv4 via INET6 API */ static const struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .net_header_len = sizeof(struct iphdr), .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), .mtu_reduced = tcp_v4_mtu_reduced, }; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { #ifdef CONFIG_TCP_MD5SIG .md5_lookup = tcp_v4_md5_lookup, .calc_md5_hash = tcp_v4_md5_hash_skb, .md5_parse = tcp_v6_parse_md5_keys, #endif #ifdef CONFIG_TCP_AO .ao_lookup = tcp_v6_ao_lookup, .calc_ao_hash = tcp_v4_ao_hash_skb, .ao_parse = tcp_v6_parse_ao, .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; #endif /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ static int tcp_v6_init_sock(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_init_sock(sk); icsk->icsk_af_ops = &ipv6_specific; #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; #endif return 0; } #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, const struct request_sock *req, int i) { long ttd = req->rsk_timer.expires - jiffies; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *dest = &inet_rsk(req)->ir_v6_rmt_addr; if (ttd < 0) ttd = 0; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], inet_rsk(req)->ir_num, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], ntohs(inet_rsk(req)->ir_rmt_port), TCP_SYN_RECV, 0, 0, /* could print option size, but that is af dependent. */ 1, /* timers active (only the expire timer) */ jiffies_to_clock_t(ttd), req->num_timeout, from_kuid_munged(seq_user_ns(seq), sock_i_uid(req->rsk_listener)), 0, /* non standard timer */ 0, /* open_requests have no inode */ 0, req); } static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) { const struct in6_addr *dest, *src; __u16 destp, srcp; int timer_active; unsigned long timer_expires; const struct inet_sock *inet = inet_sk(sp); const struct tcp_sock *tp = tcp_sk(sp); const struct inet_connection_sock *icsk = inet_csk(sp); const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq; int rx_queue; int state; dest = &sp->sk_v6_daddr; src = &sp->sk_v6_rcv_saddr; destp = ntohs(inet->inet_dport); srcp = ntohs(inet->inet_sport); if (icsk->icsk_pending == ICSK_TIME_RETRANS || icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { timer_active = 4; timer_expires = icsk->icsk_timeout; } else if (timer_pending(&sp->sk_timer)) { timer_active = 2; timer_expires = sp->sk_timer.expires; } else { timer_active = 0; timer_expires = jiffies; } state = inet_sk_state_load(sp); if (state == TCP_LISTEN) rx_queue = READ_ONCE(sp->sk_ack_backlog); else /* Because we don't lock the socket, * we might find a transient negative value. */ rx_queue = max_t(int, READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq), 0); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, state, READ_ONCE(tp->write_seq) - tp->snd_una, rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), icsk->icsk_retransmits, from_kuid_munged(seq_user_ns(seq), sock_i_uid(sp)), icsk->icsk_probes_out, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), jiffies_to_clock_t(icsk->icsk_ack.ato), (icsk->icsk_ack.quick << 1) | inet_csk_in_pingpong_mode(sp), tcp_snd_cwnd(tp), state == TCP_LISTEN ? fastopenq->max_qlen : (tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh) ); } static void get_timewait6_sock(struct seq_file *seq, struct inet_timewait_sock *tw, int i) { long delta = tw->tw_timer.expires - jiffies; const struct in6_addr *dest, *src; __u16 destp, srcp; dest = &tw->tw_v6_daddr; src = &tw->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %pK\n", i, src->s6_addr32[0], src->s6_addr32[1], src->s6_addr32[2], src->s6_addr32[3], srcp, dest->s6_addr32[0], dest->s6_addr32[1], dest->s6_addr32[2], dest->s6_addr32[3], destp, tw->tw_substate, 0, 0, 3, jiffies_delta_to_clock_t(delta), 0, 0, 0, 0, refcount_read(&tw->tw_refcnt), tw); } static int tcp6_seq_show(struct seq_file *seq, void *v) { struct tcp_iter_state *st; struct sock *sk = v; if (v == SEQ_START_TOKEN) { seq_puts(seq, " sl " "local_address " "remote_address " "st tx_queue rx_queue tr tm->when retrnsmt" " uid timeout inode\n"); goto out; } st = seq->private; if (sk->sk_state == TCP_TIME_WAIT) get_timewait6_sock(seq, v, st->num); else if (sk->sk_state == TCP_NEW_SYN_RECV) get_openreq6(seq, v, st->num); else get_tcp6_sock(seq, v, st->num); out: return 0; } static const struct seq_operations tcp6_seq_ops = { .show = tcp6_seq_show, .start = tcp_seq_start, .next = tcp_seq_next, .stop = tcp_seq_stop, }; static struct tcp_seq_afinfo tcp6_seq_afinfo = { .family = AF_INET6, }; int __net_init tcp6_proc_init(struct net *net) { if (!proc_create_net_data("tcp6", 0444, net->proc_net, &tcp6_seq_ops, sizeof(struct tcp_iter_state), &tcp6_seq_afinfo)) return -ENOMEM; return 0; } void tcp6_proc_exit(struct net *net) { remove_proc_entry("tcp6", net->proc_net); } #endif struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, .hash = inet6_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = tcp_bpf_update_proto, #endif .enter_memory_pressure = tcp_enter_memory_pressure, .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, .memory_allocated = &tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; EXPORT_SYMBOL_GPL(tcpv6_prot); static const struct inet6_protocol tcpv6_protocol = { .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, }; static struct inet_protosw tcpv6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcpv6_prot, .ops = &inet6_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }; static int __net_init tcpv6_net_init(struct net *net) { return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, SOCK_RAW, IPPROTO_TCP, net); } static void __net_exit tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); } static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) { tcp_twsk_purge(net_exit_list, AF_INET6); } static struct pernet_operations tcpv6_net_ops = { .init = tcpv6_net_init, .exit = tcpv6_net_exit, .exit_batch = tcpv6_net_exit_batch, }; int __init tcpv6_init(void) { int ret; ret = inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP); if (ret) goto out; /* register inet6 protocol */ ret = inet6_register_protosw(&tcpv6_protosw); if (ret) goto out_tcpv6_protocol; ret = register_pernet_subsys(&tcpv6_net_ops); if (ret) goto out_tcpv6_protosw; ret = mptcpv6_init(); if (ret) goto out_tcpv6_pernet_subsys; out: return ret; out_tcpv6_pernet_subsys: unregister_pernet_subsys(&tcpv6_net_ops); out_tcpv6_protosw: inet6_unregister_protosw(&tcpv6_protosw); out_tcpv6_protocol: inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); goto out; } void tcpv6_exit(void) { unregister_pernet_subsys(&tcpv6_net_ops); inet6_unregister_protosw(&tcpv6_protosw); inet6_del_protocol(&tcpv6_protocol, IPPROTO_TCP); } |
17 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 | /* -*- linux-c -*- */ /* fs/reiserfs/procfs.c */ /* * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README */ /* proc info support a la one created by Sizif@Botik.RU for PGC */ #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> #include <linux/uaccess.h> #include "reiserfs.h" #include <linux/init.h> #include <linux/proc_fs.h> #include <linux/blkdev.h> /* * LOCKING: * * These guys are evicted from procfs as the very first step in ->kill_sb(). * */ static int show_version(struct seq_file *m, void *unused) { struct super_block *sb = m->private; char *format; if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_6)) { format = "3.6"; } else if (REISERFS_SB(sb)->s_properties & (1 << REISERFS_3_5)) { format = "3.5"; } else { format = "unknown"; } seq_printf(m, "%s format\twith checks %s\n", format, #if defined( CONFIG_REISERFS_CHECK ) "on" #else "off" #endif ); return 0; } #define SF( x ) ( r -> x ) #define SFP( x ) SF( s_proc_info_data.x ) #define SFPL( x ) SFP( x[ level ] ) #define SFPF( x ) SFP( scan_bitmap.x ) #define SFPJ( x ) SFP( journal.x ) #define D2C( x ) le16_to_cpu( x ) #define D4C( x ) le32_to_cpu( x ) #define DF( x ) D2C( rs -> s_v1.x ) #define DFL( x ) D4C( rs -> s_v1.x ) #define objectid_map( s, rs ) (old_format_only (s) ? \ (__le32 *)((struct reiserfs_super_block_v1 *)rs + 1) : \ (__le32 *)(rs + 1)) #define MAP( i ) D4C( objectid_map( sb, rs )[ i ] ) #define DJF( x ) le32_to_cpu( rs -> x ) #define DJP( x ) le32_to_cpu( jp -> x ) #define JF( x ) ( r -> s_journal -> x ) static int show_super(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "state: \t%s\n" "mount options: \t%s%s%s%s%s%s%s%s%s%s%s\n" "gen. counter: \t%i\n" "s_disk_reads: \t%i\n" "s_disk_writes: \t%i\n" "s_fix_nodes: \t%i\n" "s_do_balance: \t%i\n" "s_unneeded_left_neighbor: \t%i\n" "s_good_search_by_key_reada: \t%i\n" "s_bmaps: \t%i\n" "s_bmaps_without_search: \t%i\n" "s_direct2indirect: \t%i\n" "s_indirect2direct: \t%i\n" "\n" "max_hash_collisions: \t%i\n" "breads: \t%lu\n" "bread_misses: \t%lu\n" "search_by_key: \t%lu\n" "search_by_key_fs_changed: \t%lu\n" "search_by_key_restarted: \t%lu\n" "insert_item_restarted: \t%lu\n" "paste_into_item_restarted: \t%lu\n" "cut_from_item_restarted: \t%lu\n" "delete_solid_item_restarted: \t%lu\n" "delete_item_restarted: \t%lu\n" "leaked_oid: \t%lu\n" "leaves_removable: \t%lu\n", SF(s_mount_state) == REISERFS_VALID_FS ? "REISERFS_VALID_FS" : "REISERFS_ERROR_FS", reiserfs_r5_hash(sb) ? "FORCE_R5 " : "", reiserfs_rupasov_hash(sb) ? "FORCE_RUPASOV " : "", reiserfs_tea_hash(sb) ? "FORCE_TEA " : "", reiserfs_hash_detect(sb) ? "DETECT_HASH " : "", reiserfs_no_border(sb) ? "NO_BORDER " : "BORDER ", reiserfs_no_unhashed_relocation(sb) ? "NO_UNHASHED_RELOCATION " : "", reiserfs_hashed_relocation(sb) ? "UNHASHED_RELOCATION " : "", reiserfs_test4(sb) ? "TEST4 " : "", have_large_tails(sb) ? "TAILS " : have_small_tails(sb) ? "SMALL_TAILS " : "NO_TAILS ", replay_only(sb) ? "REPLAY_ONLY " : "", convert_reiserfs(sb) ? "CONV " : "", atomic_read(&r->s_generation_counter), SF(s_disk_reads), SF(s_disk_writes), SF(s_fix_nodes), SF(s_do_balance), SF(s_unneeded_left_neighbor), SF(s_good_search_by_key_reada), SF(s_bmaps), SF(s_bmaps_without_search), SF(s_direct2indirect), SF(s_indirect2direct), SFP(max_hash_collisions), SFP(breads), SFP(bread_miss), SFP(search_by_key), SFP(search_by_key_fs_changed), SFP(search_by_key_restarted), SFP(insert_item_restarted), SFP(paste_into_item_restarted), SFP(cut_from_item_restarted), SFP(delete_solid_item_restarted), SFP(delete_item_restarted), SFP(leaked_oid), SFP(leaves_removable)); return 0; } static int show_per_level(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); int level; seq_printf(m, "level\t" " balances" " [sbk: reads" " fs_changed" " restarted]" " free space" " items" " can_remove" " lnum" " rnum" " lbytes" " rbytes" " get_neig" " get_neig_res" " need_l_neig" " need_r_neig" "\n"); for (level = 0; level < MAX_HEIGHT; ++level) { seq_printf(m, "%i\t" " %12lu" " %12lu" " %12lu" " %12lu" " %12lu" " %12lu" " %12lu" " %12li" " %12li" " %12li" " %12li" " %12lu" " %12lu" " %12lu" " %12lu" "\n", level, SFPL(balance_at), SFPL(sbk_read_at), SFPL(sbk_fs_changed), SFPL(sbk_restarted), SFPL(free_at), SFPL(items_at), SFPL(can_node_be_removed), SFPL(lnum), SFPL(rnum), SFPL(lbytes), SFPL(rbytes), SFPL(get_neighbors), SFPL(get_neighbors_restart), SFPL(need_l_neighbor), SFPL(need_r_neighbor) ); } return 0; } static int show_bitmap(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); seq_printf(m, "free_block: %lu\n" " scan_bitmap:" " wait" " bmap" " retry" " stolen" " journal_hint" "journal_nohint" "\n" " %14lu" " %14lu" " %14lu" " %14lu" " %14lu" " %14lu" " %14lu" "\n", SFP(free_block), SFPF(call), SFPF(wait), SFPF(bmap), SFPF(retry), SFPF(stolen), SFPF(in_journal_hint), SFPF(in_journal_nohint)); return 0; } static int show_on_disk_super(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; int hash_code = DFL(s_hash_function_code); __u32 flags = DJF(s_flags); seq_printf(m, "block_count: \t%i\n" "free_blocks: \t%i\n" "root_block: \t%i\n" "blocksize: \t%i\n" "oid_maxsize: \t%i\n" "oid_cursize: \t%i\n" "umount_state: \t%i\n" "magic: \t%10.10s\n" "fs_state: \t%i\n" "hash: \t%s\n" "tree_height: \t%i\n" "bmap_nr: \t%i\n" "version: \t%i\n" "flags: \t%x[%s]\n" "reserved_for_journal: \t%i\n", DFL(s_block_count), DFL(s_free_blocks), DFL(s_root_block), DF(s_blocksize), DF(s_oid_maxsize), DF(s_oid_cursize), DF(s_umount_state), rs->s_v1.s_magic, DF(s_fs_state), hash_code == TEA_HASH ? "tea" : (hash_code == YURA_HASH) ? "rupasov" : (hash_code == R5_HASH) ? "r5" : (hash_code == UNSET_HASH) ? "unset" : "unknown", DF(s_tree_height), DF(s_bmap_nr), DF(s_version), flags, (flags & reiserfs_attrs_cleared) ? "attrs_cleared" : "", DF(s_reserved_for_journal)); return 0; } static int show_oidmap(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *sb_info = REISERFS_SB(sb); struct reiserfs_super_block *rs = sb_info->s_rs; unsigned int mapsize = le16_to_cpu(rs->s_v1.s_oid_cursize); unsigned long total_used = 0; int i; for (i = 0; i < mapsize; ++i) { __u32 right; right = (i == mapsize - 1) ? MAX_KEY_OBJECTID : MAP(i + 1); seq_printf(m, "%s: [ %x .. %x )\n", (i & 1) ? "free" : "used", MAP(i), right); if (!(i & 1)) { total_used += right - MAP(i); } } #if defined( REISERFS_USE_OIDMAPF ) if (sb_info->oidmap.use_file && (sb_info->oidmap.mapf != NULL)) { loff_t size = file_inode(sb_info->oidmap.mapf)->i_size; total_used += size / sizeof(reiserfs_oidinterval_d_t); } #endif seq_printf(m, "total: \t%i [%i/%i] used: %lu [exact]\n", mapsize, mapsize, le16_to_cpu(rs->s_v1.s_oid_maxsize), total_used); return 0; } static time64_t ktime_mono_to_real_seconds(time64_t mono) { ktime_t kt = ktime_set(mono, NSEC_PER_SEC/2); return ktime_divns(ktime_mono_to_real(kt), NSEC_PER_SEC); } static int show_journal(struct seq_file *m, void *unused) { struct super_block *sb = m->private; struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; seq_printf(m, /* on-disk fields */ "jp_journal_1st_block: \t%i\n" "jp_journal_dev: \t%pg[%x]\n" "jp_journal_size: \t%i\n" "jp_journal_trans_max: \t%i\n" "jp_journal_magic: \t%i\n" "jp_journal_max_batch: \t%i\n" "jp_journal_max_commit_age: \t%i\n" "jp_journal_max_trans_age: \t%i\n" /* incore fields */ "j_1st_reserved_block: \t%i\n" "j_state: \t%li\n" "j_trans_id: \t%u\n" "j_mount_id: \t%lu\n" "j_start: \t%lu\n" "j_len: \t%lu\n" "j_len_alloc: \t%lu\n" "j_wcount: \t%i\n" "j_bcount: \t%lu\n" "j_first_unflushed_offset: \t%lu\n" "j_last_flush_trans_id: \t%u\n" "j_trans_start_time: \t%lli\n" "j_list_bitmap_index: \t%i\n" "j_must_wait: \t%i\n" "j_next_full_flush: \t%i\n" "j_next_async_flush: \t%i\n" "j_cnode_used: \t%i\n" "j_cnode_free: \t%i\n" "\n" /* reiserfs_proc_info_data_t.journal fields */ "in_journal: \t%12lu\n" "in_journal_bitmap: \t%12lu\n" "in_journal_reusable: \t%12lu\n" "lock_journal: \t%12lu\n" "lock_journal_wait: \t%12lu\n" "journal_begin: \t%12lu\n" "journal_relock_writers: \t%12lu\n" "journal_relock_wcount: \t%12lu\n" "mark_dirty: \t%12lu\n" "mark_dirty_already: \t%12lu\n" "mark_dirty_notjournal: \t%12lu\n" "restore_prepared: \t%12lu\n" "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), SB_JOURNAL(sb)->j_bdev_handle->bdev, DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), DJP(jp_journal_magic), DJP(jp_journal_max_batch), SB_JOURNAL(sb)->j_max_commit_age, DJP(jp_journal_max_trans_age), JF(j_1st_reserved_block), JF(j_state), JF(j_trans_id), JF(j_mount_id), JF(j_start), JF(j_len), JF(j_len_alloc), atomic_read(&r->s_journal->j_wcount), JF(j_bcount), JF(j_first_unflushed_offset), JF(j_last_flush_trans_id), ktime_mono_to_real_seconds(JF(j_trans_start_time)), JF(j_list_bitmap_index), JF(j_must_wait), JF(j_next_full_flush), JF(j_next_async_flush), JF(j_cnode_used), JF(j_cnode_free), SFPJ(in_journal), SFPJ(in_journal_bitmap), SFPJ(in_journal_reusable), SFPJ(lock_journal), SFPJ(lock_journal_wait), SFPJ(journal_being), SFPJ(journal_relock_writers), SFPJ(journal_relock_wcount), SFPJ(mark_dirty), SFPJ(mark_dirty_already), SFPJ(mark_dirty_notjournal), SFPJ(restore_prepared), SFPJ(prepare), SFPJ(prepare_retry) ); return 0; } static struct proc_dir_entry *proc_info_root = NULL; static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, void *)) { proc_create_single_data(name, 0, REISERFS_SB(sb)->procdir, func, sb); } int reiserfs_proc_info_init(struct super_block *sb) { char b[BDEVNAME_SIZE]; char *s; /* Some block devices use /'s */ strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; spin_lock_init(&__PINFO(sb).lock); REISERFS_SB(sb)->procdir = proc_mkdir_data(b, 0, proc_info_root, sb); if (REISERFS_SB(sb)->procdir) { add_file(sb, "version", show_version); add_file(sb, "super", show_super); add_file(sb, "per-level", show_per_level); add_file(sb, "bitmap", show_bitmap); add_file(sb, "on-disk-super", show_on_disk_super); add_file(sb, "oidmap", show_oidmap); add_file(sb, "journal", show_journal); return 0; } reiserfs_warning(sb, "cannot create /proc/%s/%s", proc_info_root_name, b); return 1; } int reiserfs_proc_info_done(struct super_block *sb) { struct proc_dir_entry *de = REISERFS_SB(sb)->procdir; if (de) { char b[BDEVNAME_SIZE]; char *s; /* Some block devices use /'s */ strscpy(b, sb->s_id, BDEVNAME_SIZE); s = strchr(b, '/'); if (s) *s = '!'; remove_proc_subtree(b, proc_info_root); REISERFS_SB(sb)->procdir = NULL; } return 0; } int reiserfs_proc_info_global_init(void) { if (proc_info_root == NULL) { proc_info_root = proc_mkdir(proc_info_root_name, NULL); if (!proc_info_root) { reiserfs_warning(NULL, "cannot create /proc/%s", proc_info_root_name); return 1; } } return 0; } int reiserfs_proc_info_global_done(void) { if (proc_info_root != NULL) { proc_info_root = NULL; remove_proc_entry(proc_info_root_name, NULL); } return 0; } /* * Revision 1.1.8.2 2001/07/15 17:08:42 god * . use get_super() in procfs.c * . remove remove_save_link() from reiserfs_do_truncate() * * I accept terms and conditions stated in the Legal Agreement * (available at http://www.namesys.com/legalese.html) * * Revision 1.1.8.1 2001/07/11 16:48:50 god * proc info support * * I accept terms and conditions stated in the Legal Agreement * (available at http://www.namesys.com/legalese.html) * */ |
20 18 2 2 2 48 48 48 3 3 17 17 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 | // SPDX-License-Identifier: GPL-2.0+ /* * NILFS inode file * * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. * * Written by Amagai Yoshiji. * Revised by Ryusuke Konishi. * */ #include <linux/types.h> #include <linux/buffer_head.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" #include "ifile.h" /** * struct nilfs_ifile_info - on-memory private data of ifile * @mi: on-memory private data of metadata file * @palloc_cache: persistent object allocator cache of ifile */ struct nilfs_ifile_info { struct nilfs_mdt_info mi; struct nilfs_palloc_cache palloc_cache; }; static inline struct nilfs_ifile_info *NILFS_IFILE_I(struct inode *ifile) { return (struct nilfs_ifile_info *)NILFS_MDT(ifile); } /** * nilfs_ifile_create_inode - create a new disk inode * @ifile: ifile inode * @out_ino: pointer to a variable to store inode number * @out_bh: buffer_head contains newly allocated disk inode * * Return Value: On success, 0 is returned and the newly allocated inode * number is stored in the place pointed by @ino, and buffer_head pointer * that contains newly allocated disk inode structure is stored in the * place pointed by @out_bh * On error, one of the following negative error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-ENOSPC - No inode left. */ int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino, struct buffer_head **out_bh) { struct nilfs_palloc_req req; int ret; req.pr_entry_nr = 0; /* * 0 says find free inode from beginning * of a group. dull code!! */ req.pr_entry_bh = NULL; ret = nilfs_palloc_prepare_alloc_entry(ifile, &req); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1, &req.pr_entry_bh); if (ret < 0) nilfs_palloc_abort_alloc_entry(ifile, &req); } if (ret < 0) { brelse(req.pr_entry_bh); return ret; } nilfs_palloc_commit_alloc_entry(ifile, &req); mark_buffer_dirty(req.pr_entry_bh); nilfs_mdt_mark_dirty(ifile); *out_ino = (ino_t)req.pr_entry_nr; *out_bh = req.pr_entry_bh; return 0; } /** * nilfs_ifile_delete_inode - delete a disk inode * @ifile: ifile inode * @ino: inode number * * Return Value: On success, 0 is returned. On error, one of the following * negative error codes is returned. * * %-EIO - I/O error. * * %-ENOMEM - Insufficient amount of memory available. * * %-ENOENT - The inode number @ino have not been allocated. */ int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino) { struct nilfs_palloc_req req = { .pr_entry_nr = ino, .pr_entry_bh = NULL }; struct nilfs_inode *raw_inode; void *kaddr; int ret; ret = nilfs_palloc_prepare_free_entry(ifile, &req); if (!ret) { ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0, &req.pr_entry_bh); if (ret < 0) nilfs_palloc_abort_free_entry(ifile, &req); } if (ret < 0) { brelse(req.pr_entry_bh); return ret; } kaddr = kmap_atomic(req.pr_entry_bh->b_page); raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr, req.pr_entry_bh, kaddr); raw_inode->i_flags = 0; kunmap_atomic(kaddr); mark_buffer_dirty(req.pr_entry_bh); brelse(req.pr_entry_bh); nilfs_palloc_commit_free_entry(ifile, &req); return 0; } int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino, struct buffer_head **out_bh) { struct super_block *sb = ifile->i_sb; int err; if (unlikely(!NILFS_VALID_INODE(sb, ino))) { nilfs_error(sb, "bad inode number: %lu", (unsigned long)ino); return -EINVAL; } err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh); if (unlikely(err)) nilfs_warn(sb, "error %d reading inode: ino=%lu", err, (unsigned long)ino); return err; } /** * nilfs_ifile_count_free_inodes - calculate free inodes count * @ifile: ifile inode * @nmaxinodes: current maximum of available inodes count [out] * @nfreeinodes: free inodes count [out] */ int nilfs_ifile_count_free_inodes(struct inode *ifile, u64 *nmaxinodes, u64 *nfreeinodes) { u64 nused; int err; *nmaxinodes = 0; *nfreeinodes = 0; nused = atomic64_read(&NILFS_I(ifile)->i_root->inodes_count); err = nilfs_palloc_count_max_entries(ifile, nused, nmaxinodes); if (likely(!err)) *nfreeinodes = *nmaxinodes - nused; return err; } /** * nilfs_ifile_read - read or get ifile inode * @sb: super block instance * @root: root object * @inode_size: size of an inode * @raw_inode: on-disk ifile inode * @inodep: buffer to store the inode */ int nilfs_ifile_read(struct super_block *sb, struct nilfs_root *root, size_t inode_size, struct nilfs_inode *raw_inode, struct inode **inodep) { struct inode *ifile; int err; ifile = nilfs_iget_locked(sb, root, NILFS_IFILE_INO); if (unlikely(!ifile)) return -ENOMEM; if (!(ifile->i_state & I_NEW)) goto out; err = nilfs_mdt_init(ifile, NILFS_MDT_GFP, sizeof(struct nilfs_ifile_info)); if (err) goto failed; err = nilfs_palloc_init_blockgroup(ifile, inode_size); if (err) goto failed; nilfs_palloc_setup_cache(ifile, &NILFS_IFILE_I(ifile)->palloc_cache); err = nilfs_read_inode_common(ifile, raw_inode); if (err) goto failed; unlock_new_inode(ifile); out: *inodep = ifile; return 0; failed: iget_failed(ifile); return err; } |
44 44 78 78 78 78 3 3 3 3 3 3 1 75 75 75 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Advanced Linux Sound Architecture * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/init.h> #include <linux/slab.h> #include <linux/time.h> #include <linux/device.h> #include <linux/module.h> #include <linux/debugfs.h> #include <sound/core.h> #include <sound/minors.h> #include <sound/info.h> #include <sound/control.h> #include <sound/initval.h> #include <linux/kmod.h> #include <linux/mutex.h> static int major = CONFIG_SND_MAJOR; int snd_major; EXPORT_SYMBOL(snd_major); static int cards_limit = 1; MODULE_AUTHOR("Jaroslav Kysela <perex@perex.cz>"); MODULE_DESCRIPTION("Advanced Linux Sound Architecture driver for soundcards."); MODULE_LICENSE("GPL"); module_param(major, int, 0444); MODULE_PARM_DESC(major, "Major # for sound driver."); module_param(cards_limit, int, 0444); MODULE_PARM_DESC(cards_limit, "Count of auto-loadable soundcards."); MODULE_ALIAS_CHARDEV_MAJOR(CONFIG_SND_MAJOR); /* this one holds the actual max. card number currently available. * as default, it's identical with cards_limit option. when more * modules are loaded manually, this limit number increases, too. */ int snd_ecards_limit; EXPORT_SYMBOL(snd_ecards_limit); #ifdef CONFIG_SND_DEBUG struct dentry *sound_debugfs_root; EXPORT_SYMBOL_GPL(sound_debugfs_root); #endif static struct snd_minor *snd_minors[SNDRV_OS_MINORS]; static DEFINE_MUTEX(sound_mutex); #ifdef CONFIG_MODULES /** * snd_request_card - try to load the card module * @card: the card number * * Tries to load the module "snd-card-X" for the given card number * via request_module. Returns immediately if already loaded. */ void snd_request_card(int card) { if (snd_card_locked(card)) return; if (card < 0 || card >= cards_limit) return; request_module("snd-card-%i", card); } EXPORT_SYMBOL(snd_request_card); static void snd_request_other(int minor) { char *str; switch (minor) { case SNDRV_MINOR_SEQUENCER: str = "snd-seq"; break; case SNDRV_MINOR_TIMER: str = "snd-timer"; break; default: return; } request_module(str); } #endif /* modular kernel */ /** * snd_lookup_minor_data - get user data of a registered device * @minor: the minor number * @type: device type (SNDRV_DEVICE_TYPE_XXX) * * Checks that a minor device with the specified type is registered, and returns * its user data pointer. * * This function increments the reference counter of the card instance * if an associated instance with the given minor number and type is found. * The caller must call snd_card_unref() appropriately later. * * Return: The user data pointer if the specified device is found. %NULL * otherwise. */ void *snd_lookup_minor_data(unsigned int minor, int type) { struct snd_minor *mreg; void *private_data; if (minor >= ARRAY_SIZE(snd_minors)) return NULL; mutex_lock(&sound_mutex); mreg = snd_minors[minor]; if (mreg && mreg->type == type) { private_data = mreg->private_data; if (private_data && mreg->card_ptr) get_device(&mreg->card_ptr->card_dev); } else private_data = NULL; mutex_unlock(&sound_mutex); return private_data; } EXPORT_SYMBOL(snd_lookup_minor_data); #ifdef CONFIG_MODULES static struct snd_minor *autoload_device(unsigned int minor) { int dev; mutex_unlock(&sound_mutex); /* release lock temporarily */ dev = SNDRV_MINOR_DEVICE(minor); if (dev == SNDRV_MINOR_CONTROL) { /* /dev/aloadC? */ int card = SNDRV_MINOR_CARD(minor); struct snd_card *ref = snd_card_ref(card); if (!ref) snd_request_card(card); else snd_card_unref(ref); } else if (dev == SNDRV_MINOR_GLOBAL) { /* /dev/aloadSEQ */ snd_request_other(minor); } mutex_lock(&sound_mutex); /* reacuire lock */ return snd_minors[minor]; } #else /* !CONFIG_MODULES */ #define autoload_device(minor) NULL #endif /* CONFIG_MODULES */ static int snd_open(struct inode *inode, struct file *file) { unsigned int minor = iminor(inode); struct snd_minor *mptr = NULL; const struct file_operations *new_fops; int err = 0; if (minor >= ARRAY_SIZE(snd_minors)) return -ENODEV; mutex_lock(&sound_mutex); mptr = snd_minors[minor]; if (mptr == NULL) { mptr = autoload_device(minor); if (!mptr) { mutex_unlock(&sound_mutex); return -ENODEV; } } new_fops = fops_get(mptr->f_ops); mutex_unlock(&sound_mutex); if (!new_fops) return -ENODEV; replace_fops(file, new_fops); if (file->f_op->open) err = file->f_op->open(inode, file); return err; } static const struct file_operations snd_fops = { .owner = THIS_MODULE, .open = snd_open, .llseek = noop_llseek, }; #ifdef CONFIG_SND_DYNAMIC_MINORS static int snd_find_free_minor(int type, struct snd_card *card, int dev) { int minor; /* static minors for module auto loading */ if (type == SNDRV_DEVICE_TYPE_SEQUENCER) return SNDRV_MINOR_SEQUENCER; if (type == SNDRV_DEVICE_TYPE_TIMER) return SNDRV_MINOR_TIMER; for (minor = 0; minor < ARRAY_SIZE(snd_minors); ++minor) { /* skip static minors still used for module auto loading */ if (SNDRV_MINOR_DEVICE(minor) == SNDRV_MINOR_CONTROL) continue; if (minor == SNDRV_MINOR_SEQUENCER || minor == SNDRV_MINOR_TIMER) continue; if (!snd_minors[minor]) return minor; } return -EBUSY; } #else static int snd_find_free_minor(int type, struct snd_card *card, int dev) { int minor; switch (type) { case SNDRV_DEVICE_TYPE_SEQUENCER: case SNDRV_DEVICE_TYPE_TIMER: minor = type; break; case SNDRV_DEVICE_TYPE_CONTROL: if (snd_BUG_ON(!card)) return -EINVAL; minor = SNDRV_MINOR(card->number, type); break; case SNDRV_DEVICE_TYPE_HWDEP: case SNDRV_DEVICE_TYPE_RAWMIDI: case SNDRV_DEVICE_TYPE_PCM_PLAYBACK: case SNDRV_DEVICE_TYPE_PCM_CAPTURE: case SNDRV_DEVICE_TYPE_COMPRESS: if (snd_BUG_ON(!card)) return -EINVAL; minor = SNDRV_MINOR(card->number, type + dev); break; default: return -EINVAL; } if (snd_BUG_ON(minor < 0 || minor >= SNDRV_OS_MINORS)) return -EINVAL; if (snd_minors[minor]) return -EBUSY; return minor; } #endif /** * snd_register_device - Register the ALSA device file for the card * @type: the device type, SNDRV_DEVICE_TYPE_XXX * @card: the card instance * @dev: the device index * @f_ops: the file operations * @private_data: user pointer for f_ops->open() * @device: the device to register * * Registers an ALSA device file for the given card. * The operators have to be set in reg parameter. * * Return: Zero if successful, or a negative error code on failure. */ int snd_register_device(int type, struct snd_card *card, int dev, const struct file_operations *f_ops, void *private_data, struct device *device) { int minor; int err = 0; struct snd_minor *preg; if (snd_BUG_ON(!device)) return -EINVAL; preg = kmalloc(sizeof *preg, GFP_KERNEL); if (preg == NULL) return -ENOMEM; preg->type = type; preg->card = card ? card->number : -1; preg->device = dev; preg->f_ops = f_ops; preg->private_data = private_data; preg->card_ptr = card; mutex_lock(&sound_mutex); minor = snd_find_free_minor(type, card, dev); if (minor < 0) { err = minor; goto error; } preg->dev = device; device->devt = MKDEV(major, minor); err = device_add(device); if (err < 0) goto error; snd_minors[minor] = preg; error: mutex_unlock(&sound_mutex); if (err < 0) kfree(preg); return err; } EXPORT_SYMBOL(snd_register_device); /** * snd_unregister_device - unregister the device on the given card * @dev: the device instance * * Unregisters the device file already registered via * snd_register_device(). * * Return: Zero if successful, or a negative error code on failure. */ int snd_unregister_device(struct device *dev) { int minor; struct snd_minor *preg; mutex_lock(&sound_mutex); for (minor = 0; minor < ARRAY_SIZE(snd_minors); ++minor) { preg = snd_minors[minor]; if (preg && preg->dev == dev) { snd_minors[minor] = NULL; device_del(dev); kfree(preg); break; } } mutex_unlock(&sound_mutex); if (minor >= ARRAY_SIZE(snd_minors)) return -ENOENT; return 0; } EXPORT_SYMBOL(snd_unregister_device); #ifdef CONFIG_SND_PROC_FS /* * INFO PART */ static const char *snd_device_type_name(int type) { switch (type) { case SNDRV_DEVICE_TYPE_CONTROL: return "control"; case SNDRV_DEVICE_TYPE_HWDEP: return "hardware dependent"; case SNDRV_DEVICE_TYPE_RAWMIDI: return "raw midi"; case SNDRV_DEVICE_TYPE_PCM_PLAYBACK: return "digital audio playback"; case SNDRV_DEVICE_TYPE_PCM_CAPTURE: return "digital audio capture"; case SNDRV_DEVICE_TYPE_SEQUENCER: return "sequencer"; case SNDRV_DEVICE_TYPE_TIMER: return "timer"; case SNDRV_DEVICE_TYPE_COMPRESS: return "compress"; default: return "?"; } } static void snd_minor_info_read(struct snd_info_entry *entry, struct snd_info_buffer *buffer) { int minor; struct snd_minor *mptr; mutex_lock(&sound_mutex); for (minor = 0; minor < SNDRV_OS_MINORS; ++minor) { mptr = snd_minors[minor]; if (!mptr) continue; if (mptr->card >= 0) { if (mptr->device >= 0) snd_iprintf(buffer, "%3i: [%2i-%2i]: %s\n", minor, mptr->card, mptr->device, snd_device_type_name(mptr->type)); else snd_iprintf(buffer, "%3i: [%2i] : %s\n", minor, mptr->card, snd_device_type_name(mptr->type)); } else snd_iprintf(buffer, "%3i: : %s\n", minor, snd_device_type_name(mptr->type)); } mutex_unlock(&sound_mutex); } int __init snd_minor_info_init(void) { struct snd_info_entry *entry; entry = snd_info_create_module_entry(THIS_MODULE, "devices", NULL); if (!entry) return -ENOMEM; entry->c.text.read = snd_minor_info_read; return snd_info_register(entry); /* freed in error path */ } #endif /* CONFIG_SND_PROC_FS */ /* * INIT PART */ static int __init alsa_sound_init(void) { snd_major = major; snd_ecards_limit = cards_limit; if (register_chrdev(major, "alsa", &snd_fops)) { pr_err("ALSA core: unable to register native major device number %d\n", major); return -EIO; } if (snd_info_init() < 0) { unregister_chrdev(major, "alsa"); return -ENOMEM; } #ifdef CONFIG_SND_DEBUG sound_debugfs_root = debugfs_create_dir("sound", NULL); #endif #ifndef MODULE pr_info("Advanced Linux Sound Architecture Driver Initialized.\n"); #endif return 0; } static void __exit alsa_sound_exit(void) { #ifdef CONFIG_SND_DEBUG debugfs_remove(sound_debugfs_root); #endif snd_info_done(); unregister_chrdev(major, "alsa"); } subsys_initcall(alsa_sound_init); module_exit(alsa_sound_exit); |
19 208 13 141 140 142 141 2 6 197 8 1 6 201 5 5 4 4 4 4 3 3 3 3 1 1 10 6 5 1 10 1 9 9 3 2 7 3 6 3 2 7 2 7 9 2 2 1 1 1 11 11 1 1 6 1 3 1 7 3 6 8 3 1 2000 2000 495 496 45 197 197 197 11 1 10 9 3 4 8 2 5 7 7 7 15 4 11 8 2 7 2 13 3 2 5 10 6 19 11 4 1 5 141 63 1 60 19 1 6 4 2 1 1 11 1 1 3 4 1 9 5 3 11 2 1 10 2 1 1 2 1 2 2 1 1 1 5 11 2 2 3 1 1 1 5 32 30 1 1 1 1 3 3 12 12 1 11 11 14 14 5 3 11 1 2 1 20 5 2 1 1 2 2 38 16 23 4 1 2 25 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux IPv6 multicast routing support for BSD pim6sd * Based on net/ipv4/ipmr.c. * * (c) 2004 Mickael Hoerdt, <hoerdt@clarinet.u-strasbg.fr> * LSIIT Laboratory, Strasbourg, France * (c) 2004 Jean-Philippe Andriot, <jean-philippe.andriot@6WIND.com> * 6WIND, Paris, France * Copyright (C)2007,2008 USAGI/WIDE Project * YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org> */ #include <linux/uaccess.h> #include <linux/types.h> #include <linux/sched.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/kernel.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/socket.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/inetdevice.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/rhashtable.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/raw.h> #include <linux/notifier.h> #include <linux/if_arp.h> #include <net/checksum.h> #include <net/netlink.h> #include <net/fib_rules.h> #include <net/ipv6.h> #include <net/ip6_route.h> #include <linux/mroute6.h> #include <linux/pim.h> #include <net/addrconf.h> #include <linux/netfilter_ipv6.h> #include <linux/export.h> #include <net/ip6_checksum.h> #include <linux/netconf.h> #include <net/ip_tunnels.h> #include <linux/nospec.h> struct ip6mr_rule { struct fib_rule common; }; struct ip6mr_result { struct mr_table *mrt; }; /* Big lock, protecting vif table, mrt cache and mroute socket state. Note that the changes are semaphored via rtnl_lock. */ static DEFINE_SPINLOCK(mrt_lock); static struct net_device *vif_dev_read(const struct vif_device *vif) { return rcu_dereference(vif->dev); } /* Multicast router control variables */ /* Special spinlock for queue of unresolved entries */ static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved entries is changed only in process context and protected with weak lock mrt_lock. Queue of unresolved entries is protected with strong spinlock mfc_unres_lock. In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; static struct mr_table *ip6mr_new_table(struct net *net, u32 id); static void ip6mr_free_table(struct mr_table *mrt); static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *cache); static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert); static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd); static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES #define ip6mr_for_each_table(mrt, net) \ list_for_each_entry_rcu(mrt, &net->ipv6.mr6_tables, list, \ lockdep_rtnl_is_held() || \ list_empty(&net->ipv6.mr6_tables)) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { struct mr_table *ret; if (!mrt) ret = list_entry_rcu(net->ipv6.mr6_tables.next, struct mr_table, list); else ret = list_entry_rcu(mrt->list.next, struct mr_table, list); if (&ret->list == &net->ipv6.mr6_tables) return NULL; return ret; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { struct mr_table *mrt; ip6mr_for_each_table(mrt, net) { if (mrt->id == id) return mrt; } return NULL; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { int err; struct ip6mr_result res; struct fib_lookup_arg arg = { .result = &res, .flags = FIB_LOOKUP_NOREF, }; /* update flow if oif or iif point to device enslaved to l3mdev */ l3mdev_update_flow(net, flowi6_to_flowi(flp6)); err = fib_rules_lookup(net->ipv6.mr6_rules_ops, flowi6_to_flowi(flp6), 0, &arg); if (err < 0) return err; *mrt = res.mrt; return 0; } static int ip6mr_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg) { struct ip6mr_result *res = arg->result; struct mr_table *mrt; switch (rule->action) { case FR_ACT_TO_TBL: break; case FR_ACT_UNREACHABLE: return -ENETUNREACH; case FR_ACT_PROHIBIT: return -EACCES; case FR_ACT_BLACKHOLE: default: return -EINVAL; } arg->table = fib_rule_get_table(rule, arg); mrt = ip6mr_get_table(rule->fr_net, arg->table); if (!mrt) return -EAGAIN; res->mrt = mrt; return 0; } static int ip6mr_rule_match(struct fib_rule *rule, struct flowi *flp, int flags) { return 1; } static int ip6mr_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh, struct nlattr **tb, struct netlink_ext_ack *extack) { return 0; } static int ip6mr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, struct nlattr **tb) { return 1; } static int ip6mr_rule_fill(struct fib_rule *rule, struct sk_buff *skb, struct fib_rule_hdr *frh) { frh->dst_len = 0; frh->src_len = 0; frh->tos = 0; return 0; } static const struct fib_rules_ops __net_initconst ip6mr_rules_ops_template = { .family = RTNL_FAMILY_IP6MR, .rule_size = sizeof(struct ip6mr_rule), .addr_size = sizeof(struct in6_addr), .action = ip6mr_rule_action, .match = ip6mr_rule_match, .configure = ip6mr_rule_configure, .compare = ip6mr_rule_compare, .fill = ip6mr_rule_fill, .nlgroup = RTNLGRP_IPV6_RULE, .owner = THIS_MODULE, }; static int __net_init ip6mr_rules_init(struct net *net) { struct fib_rules_ops *ops; struct mr_table *mrt; int err; ops = fib_rules_register(&ip6mr_rules_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); INIT_LIST_HEAD(&net->ipv6.mr6_tables); mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) { err = PTR_ERR(mrt); goto err1; } err = fib_default_rule_add(ops, 0x7fff, RT6_TABLE_DFLT, 0); if (err < 0) goto err2; net->ipv6.mr6_rules_ops = ops; return 0; err2: rtnl_lock(); ip6mr_free_table(mrt); rtnl_unlock(); err1: fib_rules_unregister(ops); return err; } static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return fib_rules_dump(net, nb, RTNL_FAMILY_IP6MR, extack); } static unsigned int ip6mr_rules_seq_read(struct net *net) { return fib_rules_seq_read(net, RTNL_FAMILY_IP6MR); } bool ip6mr_rule_default(const struct fib_rule *rule) { return fib_rule_matchall(rule) && rule->action == FR_ACT_TO_TBL && rule->table == RT6_TABLE_DFLT && !rule->l3mdev; } EXPORT_SYMBOL(ip6mr_rule_default); #else #define ip6mr_for_each_table(mrt, net) \ for (mrt = net->ipv6.mrt6; mrt; mrt = NULL) static struct mr_table *ip6mr_mr_table_iter(struct net *net, struct mr_table *mrt) { if (!mrt) return net->ipv6.mrt6; return NULL; } static struct mr_table *ip6mr_get_table(struct net *net, u32 id) { return net->ipv6.mrt6; } static int ip6mr_fib_lookup(struct net *net, struct flowi6 *flp6, struct mr_table **mrt) { *mrt = net->ipv6.mrt6; return 0; } static int __net_init ip6mr_rules_init(struct net *net) { struct mr_table *mrt; mrt = ip6mr_new_table(net, RT6_TABLE_DFLT); if (IS_ERR(mrt)) return PTR_ERR(mrt); net->ipv6.mrt6 = mrt; return 0; } static void __net_exit ip6mr_rules_exit(struct net *net) { ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static unsigned int ip6mr_rules_seq_read(struct net *net) { return 0; } #endif static int ip6mr_hash_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct mfc6_cache_cmp_arg *cmparg = arg->key; struct mfc6_cache *c = (struct mfc6_cache *)ptr; return !ipv6_addr_equal(&c->mf6c_mcastgrp, &cmparg->mf6c_mcastgrp) || !ipv6_addr_equal(&c->mf6c_origin, &cmparg->mf6c_origin); } static const struct rhashtable_params ip6mr_rht_params = { .head_offset = offsetof(struct mr_mfc, mnode), .key_offset = offsetof(struct mfc6_cache, cmparg), .key_len = sizeof(struct mfc6_cache_cmp_arg), .nelem_hint = 3, .obj_cmpfn = ip6mr_hash_cmp, .automatic_shrinking = true, }; static void ip6mr_new_table_set(struct mr_table *mrt, struct net *net) { #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES list_add_tail_rcu(&mrt->list, &net->ipv6.mr6_tables); #endif } static struct mfc6_cache_cmp_arg ip6mr_mr_table_ops_cmparg_any = { .mf6c_origin = IN6ADDR_ANY_INIT, .mf6c_mcastgrp = IN6ADDR_ANY_INIT, }; static struct mr_table_ops ip6mr_mr_table_ops = { .rht_params = &ip6mr_rht_params, .cmparg_any = &ip6mr_mr_table_ops_cmparg_any, }; static struct mr_table *ip6mr_new_table(struct net *net, u32 id) { struct mr_table *mrt; mrt = ip6mr_get_table(net, id); if (mrt) return mrt; return mr_table_alloc(net, id, &ip6mr_mr_table_ops, ipmr_expire_process, ip6mr_new_table_set); } static void ip6mr_free_table(struct mr_table *mrt) { timer_shutdown_sync(&mrt->ipmr_expire_timer); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } #ifdef CONFIG_PROC_FS /* The /proc interfaces to multicast routing * /proc/ip6_mr_cache /proc/ip6_mr_vif */ static void *ip6mr_vif_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct mr_vif_iter *iter = seq->private; struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); iter->mrt = mrt; rcu_read_lock(); return mr_vif_seq_start(seq, pos); } static void ip6mr_vif_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { rcu_read_unlock(); } static int ip6mr_vif_seq_show(struct seq_file *seq, void *v) { struct mr_vif_iter *iter = seq->private; struct mr_table *mrt = iter->mrt; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Interface BytesIn PktsIn BytesOut PktsOut Flags\n"); } else { const struct vif_device *vif = v; const struct net_device *vif_dev; const char *name; vif_dev = vif_dev_read(vif); name = vif_dev ? vif_dev->name : "none"; seq_printf(seq, "%2td %-10s %8ld %7ld %8ld %7ld %05X\n", vif - mrt->vif_table, name, vif->bytes_in, vif->pkt_in, vif->bytes_out, vif->pkt_out, vif->flags); } return 0; } static const struct seq_operations ip6mr_vif_seq_ops = { .start = ip6mr_vif_seq_start, .next = mr_vif_seq_next, .stop = ip6mr_vif_seq_stop, .show = ip6mr_vif_seq_show, }; static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); struct mr_table *mrt; mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return ERR_PTR(-ENOENT); return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) { int n; if (v == SEQ_START_TOKEN) { seq_puts(seq, "Group " "Origin " "Iif Pkts Bytes Wrong Oifs\n"); } else { const struct mfc6_cache *mfc = v; const struct mr_mfc_iter *it = seq->private; struct mr_table *mrt = it->mrt; seq_printf(seq, "%pI6 %pI6 %-3hd", &mfc->mf6c_mcastgrp, &mfc->mf6c_origin, mfc->_c.mfc_parent); if (it->cache != &mrt->mfc_unres_queue) { seq_printf(seq, " %8lu %8lu %8lu", mfc->_c.mfc_un.res.pkt, mfc->_c.mfc_un.res.bytes, mfc->_c.mfc_un.res.wrong_if); for (n = mfc->_c.mfc_un.res.minvif; n < mfc->_c.mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->_c.mfc_un.res.ttls[n] < 255) seq_printf(seq, " %2d:%-3d", n, mfc->_c.mfc_un.res.ttls[n]); } } else { /* unresolved mfc_caches don't contain * pkt, bytes and wrong_if values */ seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul); } seq_putc(seq, '\n'); } return 0; } static const struct seq_operations ipmr_mfc_seq_ops = { .start = ipmr_mfc_seq_start, .next = mr_mfc_seq_next, .stop = mr_mfc_seq_stop, .show = ipmr_mfc_seq_show, }; #endif #ifdef CONFIG_IPV6_PIMSM_V2 static int pim6_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct ipv6hdr *encap; struct net_device *reg_dev = NULL; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int reg_vif_num; if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_TYPE_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, sizeof(*pim), IPPROTO_PIM, csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; /* check if the inner packet is destined to mcast group */ encap = (struct ipv6hdr *)(skb_transport_header(skb) + sizeof(*pim)); if (!ipv6_addr_is_multicast(&encap->daddr) || encap->payload_len == 0 || ntohs(encap->payload_len) + sizeof(*pim) > skb->len) goto drop; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto drop; /* Pairs with WRITE_ONCE() in mif6_add()/mif6_delete() */ reg_vif_num = READ_ONCE(mrt->mroute_reg_vif_num); if (reg_vif_num >= 0) reg_dev = vif_dev_read(&mrt->vif_table[reg_vif_num]); if (!reg_dev) goto drop; skb->mac_header = skb->network_header; skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = CHECKSUM_NONE; skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev)); netif_rx(skb); return 0; drop: kfree_skb(skb); return 0; } static const struct inet6_protocol pim6_protocol = { .handler = pim6_rcv, }; /* Service routines creating virtual interfaces: PIMREG */ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) { struct net *net = dev_net(dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_oif = dev->ifindex, .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_mark = skb->mark, }; if (!pskb_inet_may_pull(skb)) goto tx_err; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) goto tx_err; DEV_STATS_ADD(dev, tx_bytes, skb->len); DEV_STATS_INC(dev, tx_packets); rcu_read_lock(); ip6mr_cache_report(mrt, skb, READ_ONCE(mrt->mroute_reg_vif_num), MRT6MSG_WHOLEPKT); rcu_read_unlock(); kfree_skb(skb); return NETDEV_TX_OK; tx_err: DEV_STATS_INC(dev, tx_errors); kfree_skb(skb); return NETDEV_TX_OK; } static int reg_vif_get_iflink(const struct net_device *dev) { return 0; } static const struct net_device_ops reg_vif_netdev_ops = { .ndo_start_xmit = reg_vif_xmit, .ndo_get_iflink = reg_vif_get_iflink, }; static void reg_vif_setup(struct net_device *dev) { dev->type = ARPHRD_PIMREG; dev->mtu = 1500 - sizeof(struct ipv6hdr) - 8; dev->flags = IFF_NOARP; dev->netdev_ops = ®_vif_netdev_ops; dev->needs_free_netdev = true; dev->features |= NETIF_F_NETNS_LOCAL; } static struct net_device *ip6mr_reg_vif(struct net *net, struct mr_table *mrt) { struct net_device *dev; char name[IFNAMSIZ]; if (mrt->id == RT6_TABLE_DFLT) sprintf(name, "pim6reg"); else sprintf(name, "pim6reg%u", mrt->id); dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup); if (!dev) return NULL; dev_net_set(dev, net); if (register_netdevice(dev)) { free_netdev(dev); return NULL; } if (dev_open(dev, NULL)) goto failure; dev_hold(dev); return dev; failure: unregister_netdevice(dev); return NULL; } #endif static int call_ip6mr_vif_entry_notifiers(struct net *net, enum fib_event_type event_type, struct vif_device *vif, struct net_device *vif_dev, mifi_t vif_index, u32 tb_id) { return mr_call_vif_notifiers(net, RTNL_FAMILY_IP6MR, event_type, vif, vif_dev, vif_index, tb_id, &net->ipv6.ipmr_seq); } static int call_ip6mr_mfc_entry_notifiers(struct net *net, enum fib_event_type event_type, struct mfc6_cache *mfc, u32 tb_id) { return mr_call_mfc_notifiers(net, RTNL_FAMILY_IP6MR, event_type, &mfc->_c, tb_id, &net->ipv6.ipmr_seq); } /* Delete a VIF entry */ static int mif6_delete(struct mr_table *mrt, int vifi, int notify, struct list_head *head) { struct vif_device *v; struct net_device *dev; struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= mrt->maxvif) return -EADDRNOTAVAIL; v = &mrt->vif_table[vifi]; dev = rtnl_dereference(v->dev); if (!dev) return -EADDRNOTAVAIL; call_ip6mr_vif_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_VIF_DEL, v, dev, vifi, mrt->id); spin_lock(&mrt_lock); RCU_INIT_POINTER(v->dev, NULL); #ifdef CONFIG_IPV6_PIMSM_V2 if (vifi == mrt->mroute_reg_vif_num) { /* Pairs with READ_ONCE() in ip6mr_cache_report() and reg_vif_xmit() */ WRITE_ONCE(mrt->mroute_reg_vif_num, -1); } #endif if (vifi + 1 == mrt->maxvif) { int tmp; for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } WRITE_ONCE(mrt->maxvif, tmp + 1); } spin_unlock(&mrt_lock); dev_set_allmulti(dev, -1); in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_dec(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); netdev_put(dev, &v->dev_tracker); return 0; } static inline void ip6mr_cache_free_rcu(struct rcu_head *head) { struct mr_mfc *c = container_of(head, struct mr_mfc, rcu); kmem_cache_free(mrt_cachep, (struct mfc6_cache *)c); } static inline void ip6mr_cache_free(struct mfc6_cache *c) { call_rcu(&c->_c.rcu, ip6mr_cache_free_rcu); } /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ static void ip6mr_destroy_unres(struct mr_table *mrt, struct mfc6_cache *c) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; atomic_dec(&mrt->cache_resolve_queue_len); while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved)) != NULL) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -ETIMEDOUT; rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else kfree_skb(skb); } ip6mr_cache_free(c); } /* Timer process for all the unresolved queue. */ static void ipmr_do_expire_process(struct mr_table *mrt) { unsigned long now = jiffies; unsigned long expires = 10 * HZ; struct mr_mfc *c, *next; list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) { if (time_after(c->mfc_un.unres.expires, now)) { /* not yet... */ unsigned long interval = c->mfc_un.unres.expires - now; if (interval < expires) expires = interval; continue; } list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } if (!list_empty(&mrt->mfc_unres_queue)) mod_timer(&mrt->ipmr_expire_timer, jiffies + expires); } static void ipmr_expire_process(struct timer_list *t) { struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer); if (!spin_trylock(&mfc_unres_lock)) { mod_timer(&mrt->ipmr_expire_timer, jiffies + 1); return; } if (!list_empty(&mrt->mfc_unres_queue)) ipmr_do_expire_process(mrt); spin_unlock(&mfc_unres_lock); } /* Fill oifs list. It is called under locked mrt_lock. */ static void ip6mr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache, unsigned char *ttls) { int vifi; cache->mfc_un.res.minvif = MAXMIFS; cache->mfc_un.res.maxvif = 0; memset(cache->mfc_un.res.ttls, 255, MAXMIFS); for (vifi = 0; vifi < mrt->maxvif; vifi++) { if (VIF_EXISTS(mrt, vifi) && ttls[vifi] && ttls[vifi] < 255) { cache->mfc_un.res.ttls[vifi] = ttls[vifi]; if (cache->mfc_un.res.minvif > vifi) cache->mfc_un.res.minvif = vifi; if (cache->mfc_un.res.maxvif <= vifi) cache->mfc_un.res.maxvif = vifi + 1; } } cache->mfc_un.res.lastuse = jiffies; } static int mif6_add(struct net *net, struct mr_table *mrt, struct mif6ctl *vifc, int mrtsock) { int vifi = vifc->mif6c_mifi; struct vif_device *v = &mrt->vif_table[vifi]; struct net_device *dev; struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ if (VIF_EXISTS(mrt, vifi)) return -EADDRINUSE; switch (vifc->mif6c_flags) { #ifdef CONFIG_IPV6_PIMSM_V2 case MIFF_REGISTER: /* * Special Purpose VIF in PIM * All the packets will be sent to the daemon */ if (mrt->mroute_reg_vif_num >= 0) return -EADDRINUSE; dev = ip6mr_reg_vif(net, mrt); if (!dev) return -ENOBUFS; err = dev_set_allmulti(dev, 1); if (err) { unregister_netdevice(dev); dev_put(dev); return err; } break; #endif case 0: dev = dev_get_by_index(net, vifc->mif6c_pifi); if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); if (err) { dev_put(dev); return err; } break; default: return -EINVAL; } in6_dev = __in6_dev_get(dev); if (in6_dev) { atomic_inc(&in6_dev->cnf.mc_forwarding); inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } /* Fill in the VIF structures */ vif_device_init(v, dev, vifc->vifc_rate_limit, vifc->vifc_threshold, vifc->mif6c_flags | (!mrtsock ? VIFF_STATIC : 0), MIFF_REGISTER); /* And finish update writing critical data */ spin_lock(&mrt_lock); rcu_assign_pointer(v->dev, dev); netdev_tracker_alloc(dev, &v->dev_tracker, GFP_ATOMIC); #ifdef CONFIG_IPV6_PIMSM_V2 if (v->flags & MIFF_REGISTER) WRITE_ONCE(mrt->mroute_reg_vif_num, vifi); #endif if (vifi + 1 > mrt->maxvif) WRITE_ONCE(mrt->maxvif, vifi + 1); spin_unlock(&mrt_lock); call_ip6mr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, dev, vifi, mrt->id); return 0; } static struct mfc6_cache *ip6mr_cache_find(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find(mrt, &arg); } /* Look for a (*,G) entry */ static struct mfc6_cache *ip6mr_cache_find_any(struct mr_table *mrt, struct in6_addr *mcastgrp, mifi_t mifi) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = in6addr_any, .mf6c_mcastgrp = *mcastgrp, }; if (ipv6_addr_any(mcastgrp)) return mr_mfc_find_any_parent(mrt, mifi); return mr_mfc_find_any(mrt, mifi, &arg); } /* Look for a (S,G,iif) entry if parent != -1 */ static struct mfc6_cache * ip6mr_cache_find_parent(struct mr_table *mrt, const struct in6_addr *origin, const struct in6_addr *mcastgrp, int parent) { struct mfc6_cache_cmp_arg arg = { .mf6c_origin = *origin, .mf6c_mcastgrp = *mcastgrp, }; return mr_mfc_find_parent(mrt, &arg, parent); } /* Allocate a multicast cache entry */ static struct mfc6_cache *ip6mr_cache_alloc(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); if (!c) return NULL; c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1; c->_c.mfc_un.res.minvif = MAXMIFS; c->_c.free = ip6mr_cache_free_rcu; refcount_set(&c->_c.mfc_un.res.refcount, 1); return c; } static struct mfc6_cache *ip6mr_cache_alloc_unres(void) { struct mfc6_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); if (!c) return NULL; skb_queue_head_init(&c->_c.mfc_un.unres.unresolved); c->_c.mfc_un.unres.expires = jiffies + 10 * HZ; return c; } /* * A cache entry has gone into a resolved state from queued */ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, struct mfc6_cache *uc, struct mfc6_cache *c) { struct sk_buff *skb; /* * Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) { if (ipv6_hdr(skb)->version == 0) { struct nlmsghdr *nlh = skb_pull(skb, sizeof(struct ipv6hdr)); if (mr_fill_mroute(mrt, skb, &c->_c, nlmsg_data(nlh)) > 0) { nlh->nlmsg_len = skb_tail_pointer(skb) - (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr)); skb_trim(skb, nlh->nlmsg_len); ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); rcu_read_unlock(); } } } /* * Bounce a cache query up to pim6sd and netlink. * * Called under rcu_read_lock() */ static int ip6mr_cache_report(const struct mr_table *mrt, struct sk_buff *pkt, mifi_t mifi, int assert) { struct sock *mroute6_sk; struct sk_buff *skb; struct mrt6msg *msg; int ret; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) skb = skb_realloc_headroom(pkt, -skb_network_offset(pkt) +sizeof(*msg)); else #endif skb = alloc_skb(sizeof(struct ipv6hdr) + sizeof(*msg), GFP_ATOMIC); if (!skb) return -ENOBUFS; /* I suppose that internal messages * do not require checksums */ skb->ip_summed = CHECKSUM_UNNECESSARY; #ifdef CONFIG_IPV6_PIMSM_V2 if (assert == MRT6MSG_WHOLEPKT || assert == MRT6MSG_WRMIFWHOLE) { /* Ugly, but we have no choice with this interface. Duplicate old header, fix length etc. And all this only to mangle msg->im6_msgtype and to set msg->im6_mbz to "mbz" :-) */ __skb_pull(skb, skb_network_offset(pkt)); skb_push(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; if (assert == MRT6MSG_WRMIFWHOLE) msg->im6_mif = mifi; else msg->im6_mif = READ_ONCE(mrt->mroute_reg_vif_num); msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb->ip_summed = CHECKSUM_UNNECESSARY; } else #endif { /* * Copy the IP header */ skb_put(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); skb_copy_to_linear_data(skb, ipv6_hdr(pkt), sizeof(struct ipv6hdr)); /* * Add our header */ skb_put(skb, sizeof(*msg)); skb_reset_transport_header(skb); msg = (struct mrt6msg *)skb_transport_header(skb); msg->im6_mbz = 0; msg->im6_msgtype = assert; msg->im6_mif = mifi; msg->im6_pad = 0; msg->im6_src = ipv6_hdr(pkt)->saddr; msg->im6_dst = ipv6_hdr(pkt)->daddr; skb_dst_set(skb, dst_clone(skb_dst(pkt))); skb->ip_summed = CHECKSUM_UNNECESSARY; } mroute6_sk = rcu_dereference(mrt->mroute_sk); if (!mroute6_sk) { kfree_skb(skb); return -EINVAL; } mrt6msg_netlink_event(mrt, skb); /* Deliver to user space multicast routing algorithms */ ret = sock_queue_rcv_skb(mroute6_sk, skb); if (ret < 0) { net_warn_ratelimited("mroute6: pending queue full, dropping entries\n"); kfree_skb(skb); } return ret; } /* Queue a packet for resolution. It gets locked cache entry! */ static int ip6mr_cache_unresolved(struct mr_table *mrt, mifi_t mifi, struct sk_buff *skb, struct net_device *dev) { struct mfc6_cache *c; bool found = false; int err; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(c, &mrt->mfc_unres_queue, _c.list) { if (ipv6_addr_equal(&c->mf6c_mcastgrp, &ipv6_hdr(skb)->daddr) && ipv6_addr_equal(&c->mf6c_origin, &ipv6_hdr(skb)->saddr)) { found = true; break; } } if (!found) { /* * Create a new entry if allowable */ c = ip6mr_cache_alloc_unres(); if (!c) { spin_unlock_bh(&mfc_unres_lock); kfree_skb(skb); return -ENOBUFS; } /* Fill in the new cache entry */ c->_c.mfc_parent = -1; c->mf6c_origin = ipv6_hdr(skb)->saddr; c->mf6c_mcastgrp = ipv6_hdr(skb)->daddr; /* * Reflect first query at pim6sd */ err = ip6mr_cache_report(mrt, skb, mifi, MRT6MSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry out - Brad Parker */ spin_unlock_bh(&mfc_unres_lock); ip6mr_cache_free(c); kfree_skb(skb); return err; } atomic_inc(&mrt->cache_resolve_queue_len); list_add(&c->_c.list, &mrt->mfc_unres_queue); mr6_netlink_event(mrt, c, RTM_NEWROUTE); ipmr_do_expire_process(mrt); } /* See if we can append the packet */ if (c->_c.mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { if (dev) { skb->dev = dev; skb->skb_iif = dev->ifindex; } skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb); err = 0; } spin_unlock_bh(&mfc_unres_lock); return err; } /* * MFC6 cache manipulation by user space */ static int ip6mr_mfc_delete(struct mr_table *mrt, struct mf6cctl *mfc, int parent) { struct mfc6_cache *c; /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (!c) return -ENOENT; rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ip6mr_rht_params); list_del_rcu(&c->_c.list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, c, mrt->id); mr6_netlink_event(mrt, c, RTM_DELROUTE); mr_cache_put(&c->_c); return 0; } static int ip6mr_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); struct mr_table *mrt; struct vif_device *v; int ct; if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; ip6mr_for_each_table(mrt, net) { v = &mrt->vif_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (rcu_access_pointer(v->dev) == dev) mif6_delete(mrt, ct, 1, NULL); } } return NOTIFY_DONE; } static unsigned int ip6mr_seq_read(struct net *net) { ASSERT_RTNL(); return net->ipv6.ipmr_seq + ip6mr_rules_seq_read(net); } static int ip6mr_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return mr_dump(net, nb, RTNL_FAMILY_IP6MR, ip6mr_rules_dump, ip6mr_mr_table_iter, extack); } static struct notifier_block ip6_mr_notifier = { .notifier_call = ip6mr_device_event }; static const struct fib_notifier_ops ip6mr_notifier_ops_template = { .family = RTNL_FAMILY_IP6MR, .fib_seq_read = ip6mr_seq_read, .fib_dump = ip6mr_dump, .owner = THIS_MODULE, }; static int __net_init ip6mr_notifier_init(struct net *net) { struct fib_notifier_ops *ops; net->ipv6.ipmr_seq = 0; ops = fib_notifier_ops_register(&ip6mr_notifier_ops_template, net); if (IS_ERR(ops)) return PTR_ERR(ops); net->ipv6.ip6mr_notifier_ops = ops; return 0; } static void __net_exit ip6mr_notifier_exit(struct net *net) { fib_notifier_ops_unregister(net->ipv6.ip6mr_notifier_ops); net->ipv6.ip6mr_notifier_ops = NULL; } /* Setup for IP multicast routing */ static int __net_init ip6mr_net_init(struct net *net) { int err; err = ip6mr_notifier_init(net); if (err) return err; err = ip6mr_rules_init(net); if (err < 0) goto ip6mr_rules_fail; #ifdef CONFIG_PROC_FS err = -ENOMEM; if (!proc_create_net("ip6_mr_vif", 0, net->proc_net, &ip6mr_vif_seq_ops, sizeof(struct mr_vif_iter))) goto proc_vif_fail; if (!proc_create_net("ip6_mr_cache", 0, net->proc_net, &ipmr_mfc_seq_ops, sizeof(struct mr_mfc_iter))) goto proc_cache_fail; #endif return 0; #ifdef CONFIG_PROC_FS proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: rtnl_lock(); ip6mr_rules_exit(net); rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); return err; } static void __net_exit ip6mr_net_exit(struct net *net) { #ifdef CONFIG_PROC_FS remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif ip6mr_notifier_exit(net); } static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) { struct net *net; rtnl_lock(); list_for_each_entry(net, net_list, exit_list) ip6mr_rules_exit(net); rtnl_unlock(); } static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, .exit_batch = ip6mr_net_exit_batch, }; int __init ip6_mr_init(void) { int err; mrt_cachep = kmem_cache_create("ip6_mrt_cache", sizeof(struct mfc6_cache), 0, SLAB_HWCACHE_ALIGN, NULL); if (!mrt_cachep) return -ENOMEM; err = register_pernet_subsys(&ip6mr_net_ops); if (err) goto reg_pernet_fail; err = register_netdevice_notifier(&ip6_mr_notifier); if (err) goto reg_notif_fail; #ifdef CONFIG_IPV6_PIMSM_V2 if (inet6_add_protocol(&pim6_protocol, IPPROTO_PIM) < 0) { pr_err("%s: can't add PIM protocol\n", __func__); err = -EAGAIN; goto add_proto_fail; } #endif err = rtnl_register_module(THIS_MODULE, RTNL_FAMILY_IP6MR, RTM_GETROUTE, ip6mr_rtm_getroute, ip6mr_rtm_dumproute, 0); if (err == 0) return 0; #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); add_proto_fail: unregister_netdevice_notifier(&ip6_mr_notifier); #endif reg_notif_fail: unregister_pernet_subsys(&ip6mr_net_ops); reg_pernet_fail: kmem_cache_destroy(mrt_cachep); return err; } void ip6_mr_cleanup(void) { rtnl_unregister(RTNL_FAMILY_IP6MR, RTM_GETROUTE); #ifdef CONFIG_IPV6_PIMSM_V2 inet6_del_protocol(&pim6_protocol, IPPROTO_PIM); #endif unregister_netdevice_notifier(&ip6_mr_notifier); unregister_pernet_subsys(&ip6mr_net_ops); kmem_cache_destroy(mrt_cachep); } static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, struct mf6cctl *mfc, int mrtsock, int parent) { unsigned char ttls[MAXMIFS]; struct mfc6_cache *uc, *c; struct mr_mfc *_uc; bool found; int i, err; if (mfc->mf6cc_parent >= MAXMIFS) return -ENFILE; memset(ttls, 255, MAXMIFS); for (i = 0; i < MAXMIFS; i++) { if (IF_ISSET(i, &mfc->mf6cc_ifset)) ttls[i] = 1; } /* The entries are added/deleted only under RTNL */ rcu_read_lock(); c = ip6mr_cache_find_parent(mrt, &mfc->mf6cc_origin.sin6_addr, &mfc->mf6cc_mcastgrp.sin6_addr, parent); rcu_read_unlock(); if (c) { spin_lock(&mrt_lock); c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; spin_unlock(&mrt_lock); call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } if (!ipv6_addr_any(&mfc->mf6cc_mcastgrp.sin6_addr) && !ipv6_addr_is_multicast(&mfc->mf6cc_mcastgrp.sin6_addr)) return -EINVAL; c = ip6mr_cache_alloc(); if (!c) return -ENOMEM; c->mf6c_origin = mfc->mf6cc_origin.sin6_addr; c->mf6c_mcastgrp = mfc->mf6cc_mcastgrp.sin6_addr; c->_c.mfc_parent = mfc->mf6cc_parent; ip6mr_update_thresholds(mrt, &c->_c, ttls); if (!mrtsock) c->_c.mfc_flags |= MFC_STATIC; err = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode, ip6mr_rht_params); if (err) { pr_err("ip6mr: rhtable insert error %d\n", err); ip6mr_cache_free(c); return err; } list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list); /* Check to see if we resolved a queued list. If so we * need to send on the frames and tidy up. */ found = false; spin_lock_bh(&mfc_unres_lock); list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) { uc = (struct mfc6_cache *)_uc; if (ipv6_addr_equal(&uc->mf6c_origin, &c->mf6c_origin) && ipv6_addr_equal(&uc->mf6c_mcastgrp, &c->mf6c_mcastgrp)) { list_del(&_uc->list); atomic_dec(&mrt->cache_resolve_queue_len); found = true; break; } } if (list_empty(&mrt->mfc_unres_queue)) del_timer(&mrt->ipmr_expire_timer); spin_unlock_bh(&mfc_unres_lock); if (found) { ip6mr_cache_resolve(net, mrt, uc, c); ip6mr_cache_free(uc); } call_ip6mr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id); mr6_netlink_event(mrt, c, RTM_NEWROUTE); return 0; } /* * Close the multicast socket, and clear the vif tables etc */ static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { for (i = 0; i < mrt->maxvif; i++) { if (((mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS_STATIC)) || (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) continue; mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); } /* Wipe the cache */ if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) continue; rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); list_del_rcu(&c->list); call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), FIB_EVENT_ENTRY_DEL, (struct mfc6_cache *)c, mrt->id); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); mr_cache_put(c); } } if (flags & MRT6_FLUSH_MFC) { if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { spin_lock_bh(&mfc_unres_lock); list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { list_del(&c->list); mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); } spin_unlock_bh(&mfc_unres_lock); } } } static int ip6mr_sk_init(struct mr_table *mrt, struct sock *sk) { int err = 0; struct net *net = sock_net(sk); rtnl_lock(); spin_lock(&mrt_lock); if (rtnl_dereference(mrt->mroute_sk)) { err = -EADDRINUSE; } else { rcu_assign_pointer(mrt->mroute_sk, sk); sock_set_flag(sk, SOCK_RCU_FREE); atomic_inc(&net->ipv6.devconf_all->mc_forwarding); } spin_unlock(&mrt_lock); if (!err) inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); return err; } int ip6mr_sk_done(struct sock *sk) { struct net *net = sock_net(sk); struct ipv6_devconf *devconf; struct mr_table *mrt; int err = -EACCES; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return err; devconf = net->ipv6.devconf_all; if (!devconf || !atomic_read(&devconf->mc_forwarding)) return err; rtnl_lock(); ip6mr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { spin_lock(&mrt_lock); RCU_INIT_POINTER(mrt->mroute_sk, NULL); /* Note that mroute_sk had SOCK_RCU_FREE set, * so the RCU grace period before sk freeing * is guaranteed by sk_destruct() */ atomic_dec(&devconf->mc_forwarding); spin_unlock(&mrt_lock); inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } } rtnl_unlock(); return err; } bool mroute6_is_socket(struct net *net, struct sk_buff *skb) { struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->skb_iif ? : LOOPBACK_IFINDEX, .flowi6_oif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; if (ip6mr_fib_lookup(net, &fl6, &mrt) < 0) return NULL; return rcu_access_pointer(mrt->mroute_sk); } EXPORT_SYMBOL(mroute6_is_socket); /* * Socket options and virtual interface manipulation. The whole * virtual interface system is a complete heap, but unfortunately * that's how BSD mrouted happens to think. Maybe one day with a proper * MOSPF/PIM router set up we can clean this up. */ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, unsigned int optlen) { int ret, parent = 0; struct mif6ctl vif; struct mf6cctl mfc; mifi_t mifi; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; if (optname != MRT6_INIT) { if (sk != rcu_access_pointer(mrt->mroute_sk) && !ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EACCES; } switch (optname) { case MRT6_INIT: if (optlen < sizeof(int)) return -EINVAL; return ip6mr_sk_init(mrt, sk); case MRT6_DONE: return ip6mr_sk_done(sk); case MRT6_ADD_MIF: if (optlen < sizeof(vif)) return -EINVAL; if (copy_from_sockptr(&vif, optval, sizeof(vif))) return -EFAULT; if (vif.mif6c_mifi >= MAXMIFS) return -ENFILE; rtnl_lock(); ret = mif6_add(net, mrt, &vif, sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; case MRT6_DEL_MIF: if (optlen < sizeof(mifi_t)) return -EINVAL; if (copy_from_sockptr(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); ret = mif6_delete(mrt, mifi, 0, NULL); rtnl_unlock(); return ret; /* * Manipulate the forwarding caches. These live * in a sort of kernel/user symbiosis. */ case MRT6_ADD_MFC: case MRT6_DEL_MFC: parent = -1; fallthrough; case MRT6_ADD_MFC_PROXY: case MRT6_DEL_MFC_PROXY: if (optlen < sizeof(mfc)) return -EINVAL; if (copy_from_sockptr(&mfc, optval, sizeof(mfc))) return -EFAULT; if (parent == 0) parent = mfc.mf6cc_parent; rtnl_lock(); if (optname == MRT6_DEL_MFC || optname == MRT6_DEL_MFC_PROXY) ret = ip6mr_mfc_delete(mrt, &mfc, parent); else ret = ip6mr_mfc_add(net, mrt, &mfc, sk == rtnl_dereference(mrt->mroute_sk), parent); rtnl_unlock(); return ret; case MRT6_FLUSH: { int flags; if (optlen != sizeof(flags)) return -EINVAL; if (copy_from_sockptr(&flags, optval, sizeof(flags))) return -EFAULT; rtnl_lock(); mroute_clean_tables(mrt, flags); rtnl_unlock(); return 0; } /* * Control PIM assert (to activate pim will activate assert) */ case MRT6_ASSERT: { int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; mrt->mroute_do_assert = v; return 0; } #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: { bool do_wrmifwhole; int v; if (optlen != sizeof(v)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; do_wrmifwhole = (v == MRT6MSG_WRMIFWHOLE); v = !!v; rtnl_lock(); ret = 0; if (v != mrt->mroute_do_pim) { mrt->mroute_do_pim = v; mrt->mroute_do_assert = v; mrt->mroute_do_wrvifwhole = do_wrmifwhole; } rtnl_unlock(); return ret; } #endif #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES case MRT6_TABLE: { u32 v; if (optlen != sizeof(u32)) return -EINVAL; if (copy_from_sockptr(&v, optval, sizeof(v))) return -EFAULT; /* "pim6reg%u" should not exceed 16 bytes (IFNAMSIZ) */ if (v != RT_TABLE_DEFAULT && v >= 100000000) return -EINVAL; if (sk == rcu_access_pointer(mrt->mroute_sk)) return -EBUSY; rtnl_lock(); ret = 0; mrt = ip6mr_new_table(net, v); if (IS_ERR(mrt)) ret = PTR_ERR(mrt); else raw6_sk(sk)->ip6mr_table = v; rtnl_unlock(); return ret; } #endif /* * Spurious command, or MRT6_VERSION which you cannot * set. */ default: return -ENOPROTOOPT; } } /* * Getsock opt support for the multicast routing system. */ int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, sockptr_t optlen) { int olr; int val; struct net *net = sock_net(sk); struct mr_table *mrt; if (sk->sk_type != SOCK_RAW || inet_sk(sk)->inet_num != IPPROTO_ICMPV6) return -EOPNOTSUPP; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (optname) { case MRT6_VERSION: val = 0x0305; break; #ifdef CONFIG_IPV6_PIMSM_V2 case MRT6_PIM: val = mrt->mroute_do_pim; break; #endif case MRT6_ASSERT: val = mrt->mroute_do_assert; break; default: return -ENOPROTOOPT; } if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } /* * The IP multicast ioctl support routines. */ int ip6mr_ioctl(struct sock *sk, int cmd, void *arg) { struct sioc_sg_req6 *sr; struct sioc_mif_req6 *vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: vr = (struct sioc_mif_req6 *)arg; if (vr->mifi >= mrt->maxvif) return -EINVAL; vr->mifi = array_index_nospec(vr->mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr->mifi]; if (VIF_EXISTS(mrt, vr->mifi)) { vr->icount = READ_ONCE(vif->pkt_in); vr->ocount = READ_ONCE(vif->pkt_out); vr->ibytes = READ_ONCE(vif->bytes_in); vr->obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: sr = (struct sioc_sg_req6 *)arg; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr->src.sin6_addr, &sr->grp.sin6_addr); if (c) { sr->pktcnt = c->_c.mfc_un.res.pkt; sr->bytecnt = c->_c.mfc_un.res.bytes; sr->wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #ifdef CONFIG_COMPAT struct compat_sioc_sg_req6 { struct sockaddr_in6 src; struct sockaddr_in6 grp; compat_ulong_t pktcnt; compat_ulong_t bytecnt; compat_ulong_t wrong_if; }; struct compat_sioc_mif_req6 { mifi_t mifi; compat_ulong_t icount; compat_ulong_t ocount; compat_ulong_t ibytes; compat_ulong_t obytes; }; int ip6mr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) { struct compat_sioc_sg_req6 sr; struct compat_sioc_mif_req6 vr; struct vif_device *vif; struct mfc6_cache *c; struct net *net = sock_net(sk); struct mr_table *mrt; mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT); if (!mrt) return -ENOENT; switch (cmd) { case SIOCGETMIFCNT_IN6: if (copy_from_user(&vr, arg, sizeof(vr))) return -EFAULT; if (vr.mifi >= mrt->maxvif) return -EINVAL; vr.mifi = array_index_nospec(vr.mifi, mrt->maxvif); rcu_read_lock(); vif = &mrt->vif_table[vr.mifi]; if (VIF_EXISTS(mrt, vr.mifi)) { vr.icount = READ_ONCE(vif->pkt_in); vr.ocount = READ_ONCE(vif->pkt_out); vr.ibytes = READ_ONCE(vif->bytes_in); vr.obytes = READ_ONCE(vif->bytes_out); rcu_read_unlock(); if (copy_to_user(arg, &vr, sizeof(vr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; case SIOCGETSGCNT_IN6: if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; rcu_read_lock(); c = ip6mr_cache_find(mrt, &sr.src.sin6_addr, &sr.grp.sin6_addr); if (c) { sr.pktcnt = c->_c.mfc_un.res.pkt; sr.bytecnt = c->_c.mfc_un.res.bytes; sr.wrong_if = c->_c.mfc_un.res.wrong_if; rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; } } #endif static inline int ip6mr_forward2_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_OUTFORWDATAGRAMS); return dst_output(net, sk, skb); } /* * Processing handlers for ip6mr_forward */ static int ip6mr_forward2(struct net *net, struct mr_table *mrt, struct sk_buff *skb, int vifi) { struct vif_device *vif = &mrt->vif_table[vifi]; struct net_device *vif_dev; struct ipv6hdr *ipv6h; struct dst_entry *dst; struct flowi6 fl6; vif_dev = vif_dev_read(vif); if (!vif_dev) goto out_free; #ifdef CONFIG_IPV6_PIMSM_V2 if (vif->flags & MIFF_REGISTER) { WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); DEV_STATS_ADD(vif_dev, tx_bytes, skb->len); DEV_STATS_INC(vif_dev, tx_packets); ip6mr_cache_report(mrt, skb, vifi, MRT6MSG_WHOLEPKT); goto out_free; } #endif ipv6h = ipv6_hdr(skb); fl6 = (struct flowi6) { .flowi6_oif = vif->link, .daddr = ipv6h->daddr, }; dst = ip6_route_output(net, NULL, &fl6); if (dst->error) { dst_release(dst); goto out_free; } skb_dst_drop(skb); skb_dst_set(skb, dst); /* * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output * interfaces. It is clear, if mrouter runs a multicasting * program, it should receive packets not depending to what interface * program is joined. * If we will not make it, the program will have to join on all * interfaces. On the other hand, multihoming host (or router, but * not mrouter) cannot join to more than one interface - it will * result in receiving multiple packets. */ skb->dev = vif_dev; WRITE_ONCE(vif->pkt_out, vif->pkt_out + 1); WRITE_ONCE(vif->bytes_out, vif->bytes_out + skb->len); /* We are about to write */ /* XXX: extension headers? */ if (skb_cow(skb, sizeof(*ipv6h) + LL_RESERVED_SPACE(vif_dev))) goto out_free; ipv6h = ipv6_hdr(skb); ipv6h->hop_limit--; IP6CB(skb)->flags |= IP6SKB_FORWARDED; return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, net, NULL, skb, skb->dev, vif_dev, ip6mr_forward2_finish); out_free: kfree_skb(skb); return 0; } /* Called with rcu_read_lock() */ static int ip6mr_find_vif(struct mr_table *mrt, struct net_device *dev) { int ct; /* Pairs with WRITE_ONCE() in mif6_delete()/mif6_add() */ for (ct = READ_ONCE(mrt->maxvif) - 1; ct >= 0; ct--) { if (rcu_access_pointer(mrt->vif_table[ct].dev) == dev) break; } return ct; } /* Called under rcu_read_lock() */ static void ip6_mr_forward(struct net *net, struct mr_table *mrt, struct net_device *dev, struct sk_buff *skb, struct mfc6_cache *c) { int psend = -1; int vif, ct; int true_vifi = ip6mr_find_vif(mrt, dev); vif = c->_c.mfc_parent; c->_c.mfc_un.res.pkt++; c->_c.mfc_un.res.bytes += skb->len; c->_c.mfc_un.res.lastuse = jiffies; if (ipv6_addr_any(&c->mf6c_origin) && true_vifi >= 0) { struct mfc6_cache *cache_proxy; /* For an (*,G) entry, we only check that the incoming * interface is part of the static tree. */ cache_proxy = mr_mfc_find_any_parent(mrt, vif); if (cache_proxy && cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255) goto forward; } /* * Wrong interface: drop packet and (maybe) send PIM assert. */ if (rcu_access_pointer(mrt->vif_table[vif].dev) != dev) { c->_c.mfc_un.res.wrong_if++; if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, so that we cannot check that packet arrived on an oif. It is bad, but otherwise we would need to move pretty large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || c->_c.mfc_un.res.ttls[true_vifi] < 255) && time_after(jiffies, c->_c.mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { c->_c.mfc_un.res.last_assert = jiffies; ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRONGMIF); if (mrt->mroute_do_wrvifwhole) ip6mr_cache_report(mrt, skb, true_vifi, MRT6MSG_WRMIFWHOLE); } goto dont_forward; } forward: WRITE_ONCE(mrt->vif_table[vif].pkt_in, mrt->vif_table[vif].pkt_in + 1); WRITE_ONCE(mrt->vif_table[vif].bytes_in, mrt->vif_table[vif].bytes_in + skb->len); /* * Forward the frame */ if (ipv6_addr_any(&c->mf6c_origin) && ipv6_addr_any(&c->mf6c_mcastgrp)) { if (true_vifi >= 0 && true_vifi != c->_c.mfc_parent && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) { /* It's an (*,*) entry and the packet is not coming from * the upstream: forward the packet to the upstream * only. */ psend = c->_c.mfc_parent; goto last_forward; } goto dont_forward; } for (ct = c->_c.mfc_un.res.maxvif - 1; ct >= c->_c.mfc_un.res.minvif; ct--) { /* For (*,G) entry, don't forward to the incoming interface */ if ((!ipv6_addr_any(&c->mf6c_origin) || ct != true_vifi) && ipv6_hdr(skb)->hop_limit > c->_c.mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) ip6mr_forward2(net, mrt, skb2, psend); } psend = ct; } } last_forward: if (psend != -1) { ip6mr_forward2(net, mrt, skb, psend); return; } dont_forward: kfree_skb(skb); } /* * Multicast packets for forwarding arrive here */ int ip6_mr_input(struct sk_buff *skb) { struct mfc6_cache *cache; struct net *net = dev_net(skb->dev); struct mr_table *mrt; struct flowi6 fl6 = { .flowi6_iif = skb->dev->ifindex, .flowi6_mark = skb->mark, }; int err; struct net_device *dev; /* skb->dev passed in is the master dev for vrfs. * Get the proper interface that does have a vif associated with it. */ dev = skb->dev; if (netif_is_l3_master(skb->dev)) { dev = dev_get_by_index_rcu(net, IPCB(skb)->iif); if (!dev) { kfree_skb(skb); return -ENODEV; } } err = ip6mr_fib_lookup(net, &fl6, &mrt); if (err < 0) { kfree_skb(skb); return err; } cache = ip6mr_cache_find(mrt, &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr); if (!cache) { int vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &ipv6_hdr(skb)->daddr, vif); } /* * No usable cache entry */ if (!cache) { int vif; vif = ip6mr_find_vif(mrt, dev); if (vif >= 0) { int err = ip6mr_cache_unresolved(mrt, vif, skb, dev); return err; } kfree_skb(skb); return -ENODEV; } ip6_mr_forward(net, mrt, dev, skb, cache); return 0; } int ip6mr_get_route(struct net *net, struct sk_buff *skb, struct rtmsg *rtm, u32 portid) { int err; struct mr_table *mrt; struct mfc6_cache *cache; struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); mrt = ip6mr_get_table(net, RT6_TABLE_DFLT); if (!mrt) return -ENOENT; rcu_read_lock(); cache = ip6mr_cache_find(mrt, &rt->rt6i_src.addr, &rt->rt6i_dst.addr); if (!cache && skb->dev) { int vif = ip6mr_find_vif(mrt, skb->dev); if (vif >= 0) cache = ip6mr_cache_find_any(mrt, &rt->rt6i_dst.addr, vif); } if (!cache) { struct sk_buff *skb2; struct ipv6hdr *iph; struct net_device *dev; int vif; dev = skb->dev; if (!dev || (vif = ip6mr_find_vif(mrt, dev)) < 0) { rcu_read_unlock(); return -ENODEV; } /* really correct? */ skb2 = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); if (!skb2) { rcu_read_unlock(); return -ENOMEM; } NETLINK_CB(skb2).portid = portid; skb_reset_transport_header(skb2); skb_put(skb2, sizeof(struct ipv6hdr)); skb_reset_network_header(skb2); iph = ipv6_hdr(skb2); iph->version = 0; iph->priority = 0; iph->flow_lbl[0] = 0; iph->flow_lbl[1] = 0; iph->flow_lbl[2] = 0; iph->payload_len = 0; iph->nexthdr = IPPROTO_NONE; iph->hop_limit = 0; iph->saddr = rt->rt6i_src.addr; iph->daddr = rt->rt6i_dst.addr; err = ip6mr_cache_unresolved(mrt, vif, skb2, dev); rcu_read_unlock(); return err; } err = mr_fill_mroute(mrt, skb, &cache->_c, rtm); rcu_read_unlock(); return err; } static int ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mfc6_cache *c, int cmd, int flags) { struct nlmsghdr *nlh; struct rtmsg *rtm; int err; nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags); if (!nlh) return -EMSGSIZE; rtm = nlmsg_data(nlh); rtm->rtm_family = RTNL_FAMILY_IP6MR; rtm->rtm_dst_len = 128; rtm->rtm_src_len = 128; rtm->rtm_tos = 0; rtm->rtm_table = mrt->id; if (nla_put_u32(skb, RTA_TABLE, mrt->id)) goto nla_put_failure; rtm->rtm_type = RTN_MULTICAST; rtm->rtm_scope = RT_SCOPE_UNIVERSE; if (c->_c.mfc_flags & MFC_STATIC) rtm->rtm_protocol = RTPROT_STATIC; else rtm->rtm_protocol = RTPROT_MROUTED; rtm->rtm_flags = 0; if (nla_put_in6_addr(skb, RTA_SRC, &c->mf6c_origin) || nla_put_in6_addr(skb, RTA_DST, &c->mf6c_mcastgrp)) goto nla_put_failure; err = mr_fill_mroute(mrt, skb, &c->_c, rtm); /* do not break the dump if cache is unresolved */ if (err < 0 && err != -ENOENT) goto nla_put_failure; nlmsg_end(skb, nlh); return 0; nla_put_failure: nlmsg_cancel(skb, nlh); return -EMSGSIZE; } static int _ip6mr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb, u32 portid, u32 seq, struct mr_mfc *c, int cmd, int flags) { return ip6mr_fill_mroute(mrt, skb, portid, seq, (struct mfc6_cache *)c, cmd, flags); } static int mr6_msgsize(bool unresolved, int maxvif) { size_t len = NLMSG_ALIGN(sizeof(struct rtmsg)) + nla_total_size(4) /* RTA_TABLE */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_SRC */ + nla_total_size(sizeof(struct in6_addr)) /* RTA_DST */ ; if (!unresolved) len = len + nla_total_size(4) /* RTA_IIF */ + nla_total_size(0) /* RTA_MULTIPATH */ + maxvif * NLA_ALIGN(sizeof(struct rtnexthop)) /* RTA_MFC_STATS */ + nla_total_size_64bit(sizeof(struct rta_mfc_stats)) ; return len; } static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, int cmd) { struct net *net = read_pnet(&mrt->net); struct sk_buff *skb; int err = -ENOBUFS; skb = nlmsg_new(mr6_msgsize(mfc->_c.mfc_parent >= MAXMIFS, mrt->maxvif), GFP_ATOMIC); if (!skb) goto errout; err = ip6mr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0); if (err < 0) goto errout; rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE, NULL, GFP_ATOMIC); return; errout: kfree_skb(skb); if (err < 0) rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE, err); } static size_t mrt6msg_netlink_msgsize(size_t payloadlen) { size_t len = NLMSG_ALIGN(sizeof(struct rtgenmsg)) + nla_total_size(1) /* IP6MRA_CREPORT_MSGTYPE */ + nla_total_size(4) /* IP6MRA_CREPORT_MIF_ID */ /* IP6MRA_CREPORT_SRC_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_DST_ADDR */ + nla_total_size(sizeof(struct in6_addr)) /* IP6MRA_CREPORT_PKT */ + nla_total_size(payloadlen) ; return len; } static void mrt6msg_netlink_event(const struct mr_table *mrt, struct sk_buff *pkt) { struct net *net = read_pnet(&mrt->net); struct nlmsghdr *nlh; struct rtgenmsg *rtgenm; struct mrt6msg *msg; struct sk_buff *skb; struct nlattr *nla; int payloadlen; payloadlen = pkt->len - sizeof(struct mrt6msg); msg = (struct mrt6msg *)skb_transport_header(pkt); skb = nlmsg_new(mrt6msg_netlink_msgsize(payloadlen), GFP_ATOMIC); if (!skb) goto errout; nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(struct rtgenmsg), 0); if (!nlh) goto errout; rtgenm = nlmsg_data(nlh); rtgenm->rtgen_family = RTNL_FAMILY_IP6MR; if (nla_put_u8(skb, IP6MRA_CREPORT_MSGTYPE, msg->im6_msgtype) || nla_put_u32(skb, IP6MRA_CREPORT_MIF_ID, msg->im6_mif) || nla_put_in6_addr(skb, IP6MRA_CREPORT_SRC_ADDR, &msg->im6_src) || nla_put_in6_addr(skb, IP6MRA_CREPORT_DST_ADDR, &msg->im6_dst)) goto nla_put_failure; nla = nla_reserve(skb, IP6MRA_CREPORT_PKT, payloadlen); if (!nla || skb_copy_bits(pkt, sizeof(struct mrt6msg), nla_data(nla), payloadlen)) goto nla_put_failure; nlmsg_end(skb, nlh); rtnl_notify(skb, net, 0, RTNLGRP_IPV6_MROUTE_R, NULL, GFP_ATOMIC); return; nla_put_failure: nlmsg_cancel(skb, nlh); errout: kfree_skb(skb); rtnl_set_sk_err(net, RTNLGRP_IPV6_MROUTE_R, -ENOBUFS); } static const struct nla_policy ip6mr_getroute_policy[RTA_MAX + 1] = { [RTA_SRC] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_DST] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [RTA_TABLE] = { .type = NLA_U32 }, }; static int ip6mr_rtm_valid_getroute_req(struct sk_buff *skb, const struct nlmsghdr *nlh, struct nlattr **tb, struct netlink_ext_ack *extack) { struct rtmsg *rtm; int err; err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, ip6mr_getroute_policy, extack); if (err) return err; rtm = nlmsg_data(nlh); if ((rtm->rtm_src_len && rtm->rtm_src_len != 128) || (rtm->rtm_dst_len && rtm->rtm_dst_len != 128) || rtm->rtm_tos || rtm->rtm_table || rtm->rtm_protocol || rtm->rtm_scope || rtm->rtm_type || rtm->rtm_flags) { NL_SET_ERR_MSG_MOD(extack, "Invalid values in header for multicast route get request"); return -EINVAL; } if ((tb[RTA_SRC] && !rtm->rtm_src_len) || (tb[RTA_DST] && !rtm->rtm_dst_len)) { NL_SET_ERR_MSG_MOD(extack, "rtm_src_len and rtm_dst_len must be 128 for IPv6"); return -EINVAL; } return 0; } static int ip6mr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct in6_addr src = {}, grp = {}; struct nlattr *tb[RTA_MAX + 1]; struct mfc6_cache *cache; struct mr_table *mrt; struct sk_buff *skb; u32 tableid; int err; err = ip6mr_rtm_valid_getroute_req(in_skb, nlh, tb, extack); if (err < 0) return err; if (tb[RTA_SRC]) src = nla_get_in6_addr(tb[RTA_SRC]); if (tb[RTA_DST]) grp = nla_get_in6_addr(tb[RTA_DST]); tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0; mrt = ip6mr_get_table(net, tableid ?: RT_TABLE_DEFAULT); if (!mrt) { NL_SET_ERR_MSG_MOD(extack, "MR table does not exist"); return -ENOENT; } /* entries are added/deleted only under RTNL */ rcu_read_lock(); cache = ip6mr_cache_find(mrt, &src, &grp); rcu_read_unlock(); if (!cache) { NL_SET_ERR_MSG_MOD(extack, "MR cache entry not found"); return -ENOENT; } skb = nlmsg_new(mr6_msgsize(false, mrt->maxvif), GFP_KERNEL); if (!skb) return -ENOBUFS; err = ip6mr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, cache, RTM_NEWROUTE, 0); if (err < 0) { kfree_skb(skb); return err; } return rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); } static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) { const struct nlmsghdr *nlh = cb->nlh; struct fib_dump_filter filter = {}; int err; if (cb->strict_check) { err = ip_valid_fib_dump_req(sock_net(skb->sk), nlh, &filter, cb); if (err < 0) return err; } if (filter.table_id) { struct mr_table *mrt; mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { if (rtnl_msg_family(cb->nlh) != RTNL_FAMILY_IP6MR) return skb->len; NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } err = mr_table_dump(mrt, skb, cb, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); return skb->len ? : err; } return mr_rtm_dumproute(skb, cb, ip6mr_mr_table_iter, _ip6mr_fill_mroute, &mfc_unres_lock, &filter); } |
373 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H #include <linux/types.h> enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, BLK_ENCRYPTION_MODE_SM4_XTS, BLK_ENCRYPTION_MODE_MAX, }; #define BLK_CRYPTO_MAX_KEY_SIZE 64 /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for * @data_unit_size: the data unit size for all encryption/decryptions with this * key. This is the size in bytes of each individual plaintext and * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; }; /** * struct blk_crypto_key - an inline encryption key * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this * key * @data_unit_size_bits: log2 of data_unit_size * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) * @raw: the raw bytes of this key. Only the first @size bytes are used. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed * and it has been evicted from all devices on which it may have been used. */ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 #define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) /** * struct bio_crypt_ctx - an inline encryption context * @bc_key: the key, algorithm, and data unit size to use * @bc_dun: the data unit number (starting IV) to use * * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for * write requests) or decrypted (for read requests) inline by the storage device * or controller, or by the crypto API fallback. */ struct bio_crypt_ctx { const struct blk_crypto_key *bc_key; u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; }; #include <linux/blk_types.h> #include <linux/blkdev.h> #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) { return bio->bi_crypt_context; } void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key); void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context * @dst: destination bio * @src: source bio * @gfp_mask: memory allocation flags * * If @src has an encryption context, clone it to @dst. * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { if (bio_has_crypt_ctx(src)) return __bio_crypt_clone(dst, src, gfp_mask); return 0; } #endif /* __LINUX_BLK_CRYPTO_H */ |
5421 1046 2246 2248 168 222 370 405 1597 1598 18 347 329 328 3095 1797 1197 118 243 77 1014 1090 2 327 327 326 5305 3096 646 1286 325 241 2012 8 3 5 19 10 9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 | // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2016 Facebook * Copyright (c) 2018 Covalent IO, Inc. http://covalent.io */ #include <uapi/linux/btf.h> #include <linux/kernel.h> #include <linux/types.h> #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/math64.h> static bool bpf_verifier_log_attr_valid(const struct bpf_verifier_log *log) { /* ubuf and len_total should both be specified (or not) together */ if (!!log->ubuf != !!log->len_total) return false; /* log buf without log_level is meaningless */ if (log->ubuf && log->level == 0) return false; if (log->level & ~BPF_LOG_MASK) return false; if (log->len_total > UINT_MAX >> 2) return false; return true; } int bpf_vlog_init(struct bpf_verifier_log *log, u32 log_level, char __user *log_buf, u32 log_size) { log->level = log_level; log->ubuf = log_buf; log->len_total = log_size; /* log attributes have to be sane */ if (!bpf_verifier_log_attr_valid(log)) return -EINVAL; return 0; } static void bpf_vlog_update_len_max(struct bpf_verifier_log *log, u32 add_len) { /* add_len includes terminal \0, so no need for +1. */ u64 len = log->end_pos + add_len; /* log->len_max could be larger than our current len due to * bpf_vlog_reset() calls, so we maintain the max of any length at any * previous point */ if (len > UINT_MAX) log->len_max = UINT_MAX; else if (len > log->len_max) log->len_max = len; } void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, va_list args) { u64 cur_pos; u32 new_n, n; n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); if (log->level == BPF_LOG_KERNEL) { bool newline = n > 0 && log->kbuf[n - 1] == '\n'; pr_err("BPF: %s%s", log->kbuf, newline ? "" : "\n"); return; } n += 1; /* include terminating zero */ bpf_vlog_update_len_max(log, n); if (log->level & BPF_LOG_FIXED) { /* check if we have at least something to put into user buf */ new_n = 0; if (log->end_pos < log->len_total) { new_n = min_t(u32, log->len_total - log->end_pos, n); log->kbuf[new_n - 1] = '\0'; } cur_pos = log->end_pos; log->end_pos += n - 1; /* don't count terminating '\0' */ if (log->ubuf && new_n && copy_to_user(log->ubuf + cur_pos, log->kbuf, new_n)) goto fail; } else { u64 new_end, new_start; u32 buf_start, buf_end, new_n; new_end = log->end_pos + n; if (new_end - log->start_pos >= log->len_total) new_start = new_end - log->len_total; else new_start = log->start_pos; log->start_pos = new_start; log->end_pos = new_end - 1; /* don't count terminating '\0' */ if (!log->ubuf) return; new_n = min(n, log->len_total); cur_pos = new_end - new_n; div_u64_rem(cur_pos, log->len_total, &buf_start); div_u64_rem(new_end, log->len_total, &buf_end); /* new_end and buf_end are exclusive indices, so if buf_end is * exactly zero, then it actually points right to the end of * ubuf and there is no wrap around */ if (buf_end == 0) buf_end = log->len_total; /* if buf_start > buf_end, we wrapped around; * if buf_start == buf_end, then we fill ubuf completely; we * can't have buf_start == buf_end to mean that there is * nothing to write, because we always write at least * something, even if terminal '\0' */ if (buf_start < buf_end) { /* message fits within contiguous chunk of ubuf */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, buf_end - buf_start)) goto fail; } else { /* message wraps around the end of ubuf, copy in two chunks */ if (copy_to_user(log->ubuf + buf_start, log->kbuf + n - new_n, log->len_total - buf_start)) goto fail; if (copy_to_user(log->ubuf, log->kbuf + n - buf_end, buf_end)) goto fail; } } return; fail: log->ubuf = NULL; } void bpf_vlog_reset(struct bpf_verifier_log *log, u64 new_pos) { char zero = 0; u32 pos; if (WARN_ON_ONCE(new_pos > log->end_pos)) return; if (!bpf_verifier_log_needed(log) || log->level == BPF_LOG_KERNEL) return; /* if position to which we reset is beyond current log window, * then we didn't preserve any useful content and should adjust * start_pos to end up with an empty log (start_pos == end_pos) */ log->end_pos = new_pos; if (log->end_pos < log->start_pos) log->start_pos = log->end_pos; if (!log->ubuf) return; if (log->level & BPF_LOG_FIXED) pos = log->end_pos + 1; else div_u64_rem(new_pos, log->len_total, &pos); if (pos < log->len_total && put_user(zero, log->ubuf + pos)) log->ubuf = NULL; } static void bpf_vlog_reverse_kbuf(char *buf, int len) { int i, j; for (i = 0, j = len - 1; i < j; i++, j--) swap(buf[i], buf[j]); } static int bpf_vlog_reverse_ubuf(struct bpf_verifier_log *log, int start, int end) { /* we split log->kbuf into two equal parts for both ends of array */ int n = sizeof(log->kbuf) / 2, nn; char *lbuf = log->kbuf, *rbuf = log->kbuf + n; /* Read ubuf's section [start, end) two chunks at a time, from left * and right side; within each chunk, swap all the bytes; after that * reverse the order of lbuf and rbuf and write result back to ubuf. * This way we'll end up with swapped contents of specified * [start, end) ubuf segment. */ while (end - start > 1) { nn = min(n, (end - start ) / 2); if (copy_from_user(lbuf, log->ubuf + start, nn)) return -EFAULT; if (copy_from_user(rbuf, log->ubuf + end - nn, nn)) return -EFAULT; bpf_vlog_reverse_kbuf(lbuf, nn); bpf_vlog_reverse_kbuf(rbuf, nn); /* we write lbuf to the right end of ubuf, while rbuf to the * left one to end up with properly reversed overall ubuf */ if (copy_to_user(log->ubuf + start, rbuf, nn)) return -EFAULT; if (copy_to_user(log->ubuf + end - nn, lbuf, nn)) return -EFAULT; start += nn; end -= nn; } return 0; } int bpf_vlog_finalize(struct bpf_verifier_log *log, u32 *log_size_actual) { u32 sublen; int err; *log_size_actual = 0; if (!log || log->level == 0 || log->level == BPF_LOG_KERNEL) return 0; if (!log->ubuf) goto skip_log_rotate; /* If we never truncated log, there is nothing to move around. */ if (log->start_pos == 0) goto skip_log_rotate; /* Otherwise we need to rotate log contents to make it start from the * buffer beginning and be a continuous zero-terminated string. Note * that if log->start_pos != 0 then we definitely filled up entire log * buffer with no gaps, and we just need to shift buffer contents to * the left by (log->start_pos % log->len_total) bytes. * * Unfortunately, user buffer could be huge and we don't want to * allocate temporary kernel memory of the same size just to shift * contents in a straightforward fashion. Instead, we'll be clever and * do in-place array rotation. This is a leetcode-style problem, which * could be solved by three rotations. * * Let's say we have log buffer that has to be shifted left by 7 bytes * (spaces and vertical bar is just for demonstrative purposes): * E F G H I J K | A B C D * * First, we reverse entire array: * D C B A | K J I H G F E * * Then we rotate first 4 bytes (DCBA) and separately last 7 bytes * (KJIHGFE), resulting in a properly rotated array: * A B C D | E F G H I J K * * We'll utilize log->kbuf to read user memory chunk by chunk, swap * bytes, and write them back. Doing it byte-by-byte would be * unnecessarily inefficient. Altogether we are going to read and * write each byte twice, for total 4 memory copies between kernel and * user space. */ /* length of the chopped off part that will be the beginning; * len(ABCD) in the example above */ div_u64_rem(log->start_pos, log->len_total, &sublen); sublen = log->len_total - sublen; err = bpf_vlog_reverse_ubuf(log, 0, log->len_total); err = err ?: bpf_vlog_reverse_ubuf(log, 0, sublen); err = err ?: bpf_vlog_reverse_ubuf(log, sublen, log->len_total); if (err) log->ubuf = NULL; skip_log_rotate: *log_size_actual = log->len_max; /* properly initialized log has either both ubuf!=NULL and len_total>0 * or ubuf==NULL and len_total==0, so if this condition doesn't hold, * we got a fault somewhere along the way, so report it back */ if (!!log->ubuf != !!log->l |