Total coverage: 450249 (23%)of 2026024
89 44 50 50 88 89 81 137 137 138 2 139 89 137 138 138 2 137 137 88 139 139 138 139 130 128 130 96 47 138 128 139 139 138 138 139 99 93 129 138 139 24 136 139 139 139 117 137 93 93 138 24 139 93 110 139 138 139 7 139 138 26 139 138 137 139 130 137 138 139 13 139 139 92 92 92 91 85 90 83 7 44 44 44 44 43 44 44 137 139 137 44 139 57 57 134 138 139 139 139 139 138 139 139 139 92 92 92 85 85 85 85 2 2 84 92 90 82 92 92 88 89 88 89 89 89 88 7 7 2 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved. */ #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_shared.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_extent_busy.h" #include "xfs_trans.h" #include "xfs_trans_priv.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_trace.h" #include "xfs_discard.h" /* * Allocate a new ticket. Failing to get a new ticket makes it really hard to * recover, so we don't allow failure here. Also, we allocate in a context that * we don't want to be issuing transactions from, so we need to tell the * allocation code this as well. * * We don't reserve any space for the ticket - we are going to steal whatever * space we require from transactions as they commit. To ensure we reserve all * the space required, we need to set the current reservation of the ticket to * zero so that we know to steal the initial transaction overhead from the * first transaction commit. */ static struct xlog_ticket * xlog_cil_ticket_alloc( struct xlog *log) { struct xlog_ticket *tic; tic = xlog_ticket_alloc(log, 0, 1, 0); /* * set the current reservation to zero so we know to steal the basic * transaction overhead reservation from the first transaction commit. */ tic->t_curr_res = 0; tic->t_iclog_hdrs = 0; return tic; } static inline void xlog_cil_set_iclog_hdr_count(struct xfs_cil *cil) { struct xlog *log = cil->xc_log; atomic_set(&cil->xc_iclog_hdrs, (XLOG_CIL_BLOCKING_SPACE_LIMIT(log) / (log->l_iclog_size - log->l_iclog_hsize))); } /* * Check if the current log item was first committed in this sequence. * We can't rely on just the log item being in the CIL, we have to check * the recorded commit sequence number. * * Note: for this to be used in a non-racy manner, it has to be called with * CIL flushing locked out. As a result, it should only be used during the * transaction commit process when deciding what to format into the item. */ static bool xlog_item_in_current_chkpt( struct xfs_cil *cil, struct xfs_log_item *lip) { if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) return false; /* * li_seq is written on the first commit of a log item to record the * first checkpoint it is written to. Hence if it is different to the * current sequence, we're in a new checkpoint. */ return lip->li_seq == READ_ONCE(cil->xc_current_sequence); } bool xfs_log_item_in_current_chkpt( struct xfs_log_item *lip) { return xlog_item_in_current_chkpt(lip->li_log->l_cilp, lip); } /* * Unavoidable forward declaration - xlog_cil_push_work() calls * xlog_cil_ctx_alloc() itself. */ static void xlog_cil_push_work(struct work_struct *work); static struct xfs_cil_ctx * xlog_cil_ctx_alloc(void) { struct xfs_cil_ctx *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL | __GFP_NOFAIL); INIT_LIST_HEAD(&ctx->committing); INIT_LIST_HEAD(&ctx->busy_extents.extent_list); INIT_LIST_HEAD(&ctx->log_items); INIT_LIST_HEAD(&ctx->lv_chain); INIT_WORK(&ctx->push_work, xlog_cil_push_work); return ctx; } /* * Aggregate the CIL per cpu structures into global counts, lists, etc and * clear the percpu state ready for the next context to use. This is called * from the push code with the context lock held exclusively, hence nothing else * will be accessing or modifying the per-cpu counters. */ static void xlog_cil_push_pcp_aggregate( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { struct xlog_cil_pcp *cilpcp; int cpu; for_each_cpu(cpu, &ctx->cil_pcpmask) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); ctx->ticket->t_curr_res += cilpcp->space_reserved; cilpcp->space_reserved = 0; if (!list_empty(&cilpcp->busy_extents)) { list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents.extent_list); } if (!list_empty(&cilpcp->log_items)) list_splice_init(&cilpcp->log_items, &ctx->log_items); /* * We're in the middle of switching cil contexts. Reset the * counter we use to detect when the current context is nearing * full. */ cilpcp->space_used = 0; } } /* * Aggregate the CIL per-cpu space used counters into the global atomic value. * This is called when the per-cpu counter aggregation will first pass the soft * limit threshold so we can switch to atomic counter aggregation for accurate * detection of hard limit traversal. */ static void xlog_cil_insert_pcp_aggregate( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { int cpu; int count = 0; /* Trigger atomic updates then aggregate only for the first caller */ if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) return; /* * We can race with other cpus setting cil_pcpmask. However, we've * atomically cleared PCP_SPACE which forces other threads to add to * the global space used count. cil_pcpmask is a superset of cilpcp * structures that could have a nonzero space_used. */ for_each_cpu(cpu, &ctx->cil_pcpmask) { struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); count += xchg(&cilpcp->space_used, 0); } atomic_add(count, &ctx->space_used); } static void xlog_cil_ctx_switch( struct xfs_cil *cil, struct xfs_cil_ctx *ctx) { xlog_cil_set_iclog_hdr_count(cil); set_bit(XLOG_CIL_EMPTY, &cil->xc_flags); set_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags); ctx->sequence = ++cil->xc_current_sequence; ctx->cil = cil; cil->xc_ctx = ctx; } /* * After the first stage of log recovery is done, we know where the head and * tail of the log are. We need this log initialisation done before we can * initialise the first CIL checkpoint context. * * Here we allocate a log ticket to track space usage during a CIL push. This * ticket is passed to xlog_write() directly so that we don't slowly leak log * space by failing to account for space used by log headers and additional * region headers for split regions. */ void xlog_cil_init_post_recovery( struct xlog *log) { log->l_cilp->xc_ctx->ticket = xlog_cil_ticket_alloc(log); log->l_cilp->xc_ctx->sequence = 1; xlog_cil_set_iclog_hdr_count(log->l_cilp); } static inline int xlog_cil_iovec_space( uint niovecs) { return round_up((sizeof(struct xfs_log_vec) + niovecs * sizeof(struct xfs_log_iovec)), sizeof(uint64_t)); } /* * Allocate or pin log vector buffers for CIL insertion. * * The CIL currently uses disposable buffers for copying a snapshot of the * modified items into the log during a push. The biggest problem with this is * the requirement to allocate the disposable buffer during the commit if: * a) does not exist; or * b) it is too small * * If we do this allocation within xlog_cil_insert_format_items(), it is done * under the xc_ctx_lock, which means that a CIL push cannot occur during * the memory allocation. This means that we have a potential deadlock situation * under low memory conditions when we have lots of dirty metadata pinned in * the CIL and we need a CIL commit to occur to free memory. * * To avoid this, we need to move the memory allocation outside the * xc_ctx_lock, but because the log vector buffers are disposable, that opens * up a TOCTOU race condition w.r.t. the CIL committing and removing the log * vector buffers between the check and the formatting of the item into the * log vector buffer within the xc_ctx_lock. * * Because the log vector buffer needs to be unchanged during the CIL push * process, we cannot share the buffer between the transaction commit (which * modifies the buffer) and the CIL push context that is writing the changes * into the log. This means skipping preallocation of buffer space is * unreliable, but we most definitely do not want to be allocating and freeing * buffers unnecessarily during commits when overwrites can be done safely. * * The simplest solution to this problem is to allocate a shadow buffer when a * log item is committed for the second time, and then to only use this buffer * if necessary. The buffer can remain attached to the log item until such time * it is needed, and this is the buffer that is reallocated to match the size of * the incoming modification. Then during the formatting of the item we can swap * the active buffer with the new one if we can't reuse the existing buffer. We * don't free the old buffer as it may be reused on the next modification if * it's size is right, otherwise we'll free and reallocate it at that point. * * This function builds a vector for the changes in each log item in the * transaction. It then works out the length of the buffer needed for each log * item, allocates them and attaches the vector to the log item in preparation * for the formatting step which occurs under the xc_ctx_lock. * * While this means the memory footprint goes up, it avoids the repeated * alloc/free pattern that repeated modifications of an item would otherwise * cause, and hence minimises the CPU overhead of such behaviour. */ static void xlog_cil_alloc_shadow_bufs( struct xlog *log, struct xfs_trans *tp) { struct xfs_log_item *lip; list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; int niovecs = 0; int nbytes = 0; int buf_size; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* get number of vecs and size of data to be stored */ lip->li_ops->iop_size(lip, &niovecs, &nbytes); /* * Ordered items need to be tracked but we do not wish to write * them. We need a logvec to track the object, but we do not * need an iovec or buffer to be allocated for copying data. */ if (niovecs == XFS_LOG_VEC_ORDERED) { ordered = true; niovecs = 0; nbytes = 0; } /* * We 64-bit align the length of each iovec so that the start of * the next one is naturally aligned. We'll need to account for * that slack space here. * * We also add the xlog_op_header to each region when * formatting, but that's not accounted to the size of the item * at this point. Hence we'll need an addition number of bytes * for each vector to hold an opheader. * * Then round nbytes up to 64-bit alignment so that the initial * buffer alignment is easy to calculate and verify. */ nbytes = xlog_item_space(niovecs, nbytes); /* * The data buffer needs to start 64-bit aligned, so round up * that space to ensure we can align it appropriately and not * overrun the buffer. */ buf_size = nbytes + xlog_cil_iovec_space(niovecs); /* * if we have no shadow buffer, or it is too small, we need to * reallocate it. */ if (!lip->li_lv_shadow || buf_size > lip->li_lv_shadow->lv_size) { /* * We free and allocate here as a realloc would copy * unnecessary data. We don't use kvzalloc() for the * same reason - we don't need to zero the data area in * the buffer, only the log vector header and the iovec * storage. */ kvfree(lip->li_lv_shadow); lv = xlog_kvmalloc(buf_size); memset(lv, 0, xlog_cil_iovec_space(niovecs)); INIT_LIST_HEAD(&lv->lv_list); lv->lv_item = lip; lv->lv_size = buf_size; if (ordered) lv->lv_buf_len = XFS_LOG_VEC_ORDERED; else lv->lv_iovecp = (struct xfs_log_iovec *)&lv[1]; lip->li_lv_shadow = lv; } else { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv_shadow; if (ordered) lv->lv_buf_len = XFS_LOG_VEC_ORDERED; else lv->lv_buf_len = 0; lv->lv_bytes = 0; } /* Ensure the lv is set up according to ->iop_size */ lv->lv_niovecs = niovecs; /* The allocated data region lies beyond the iovec region */ lv->lv_buf = (char *)lv + xlog_cil_iovec_space(niovecs); } } /* * Prepare the log item for insertion into the CIL. Calculate the difference in * log space it will consume, and if it is a new item pin it as well. */ STATIC void xfs_cil_prepare_item( struct xlog *log, struct xfs_log_vec *lv, struct xfs_log_vec *old_lv, int *diff_len) { /* Account for the new LV being passed in */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *diff_len += lv->lv_bytes; /* * If there is no old LV, this is the first time we've seen the item in * this CIL context and so we need to pin it. If we are replacing the * old_lv, then remove the space it accounts for and make it the shadow * buffer for later freeing. In both cases we are now switching to the * shadow buffer, so update the pointer to it appropriately. */ if (!old_lv) { if (lv->lv_item->li_ops->iop_pin) lv->lv_item->li_ops->iop_pin(lv->lv_item); lv->lv_item->li_lv_shadow = NULL; } else if (old_lv != lv) { ASSERT(lv->lv_buf_len != XFS_LOG_VEC_ORDERED); *diff_len -= old_lv->lv_bytes; lv->lv_item->li_lv_shadow = old_lv; } /* attach new log vector to log item */ lv->lv_item->li_lv = lv; /* * If this is the first time the item is being committed to the * CIL, store the sequence number on the log item so we can * tell in future commits whether this is the first checkpoint * the item is being committed into. */ if (!lv->lv_item->li_seq) lv->lv_item->li_seq = log->l_cilp->xc_ctx->sequence; } /* * Format log item into a flat buffers * * For delayed logging, we need to hold a formatted buffer containing all the * changes on the log item. This enables us to relog the item in memory and * write it out asynchronously without needing to relock the object that was * modified at the time it gets written into the iclog. * * This function takes the prepared log vectors attached to each log item, and * formats the changes into the log vector buffer. The buffer it uses is * dependent on the current state of the vector in the CIL - the shadow lv is * guaranteed to be large enough for the current modification, but we will only * use that if we can't reuse the existing lv. If we can't reuse the existing * lv, then simple swap it out for the shadow lv. We don't free it - that is * done lazily either by th enext modification or the freeing of the log item. * * We don't set up region headers during this process; we simply copy the * regions into the flat buffer. We can do this because we still have to do a * formatting step to write the regions into the iclog buffer. Writing the * ophdrs during the iclog write means that we can support splitting large * regions across iclog boundares without needing a change in the format of the * item/region encapsulation. * * Hence what we need to do now is change the rewrite the vector array to point * to the copied region inside the buffer we just allocated. This allows us to * format the regions into the iclog as though they are being formatted * directly out of the objects themselves. */ static void xlog_cil_insert_format_items( struct xlog *log, struct xfs_trans *tp, int *diff_len) { struct xfs_log_item *lip; /* Bail out if we didn't find a log item. */ if (list_empty(&tp->t_items)) { ASSERT(0); return; } list_for_each_entry(lip, &tp->t_items, li_trans) { struct xfs_log_vec *lv; struct xfs_log_vec *old_lv = NULL; struct xfs_log_vec *shadow; bool ordered = false; /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; /* * The formatting size information is already attached to * the shadow lv on the log item. */ shadow = lip->li_lv_shadow; if (shadow->lv_buf_len == XFS_LOG_VEC_ORDERED) ordered = true; /* Skip items that do not have any vectors for writing */ if (!shadow->lv_niovecs && !ordered) continue; /* compare to existing item size */ old_lv = lip->li_lv; if (lip->li_lv && shadow->lv_size <= lip->li_lv->lv_size) { /* same or smaller, optimise common overwrite case */ lv = lip->li_lv; if (ordered) goto insert; /* * set the item up as though it is a new insertion so * that the space reservation accounting is correct. */ *diff_len -= lv->lv_bytes; /* Ensure the lv is set up according to ->iop_size */ lv->lv_niovecs = shadow->lv_niovecs; /* reset the lv buffer information for new formatting */ lv->lv_buf_len = 0; lv->lv_bytes = 0; lv->lv_buf = (char *)lv + xlog_cil_iovec_space(lv->lv_niovecs); } else { /* switch to shadow buffer! */ lv = shadow; lv->lv_item = lip; if (ordered) { /* track as an ordered logvec */ ASSERT(lip->li_lv == NULL); goto insert; } } ASSERT(IS_ALIGNED((unsigned long)lv->lv_buf, sizeof(uint64_t))); lip->li_ops->iop_format(lip, lv); insert: xfs_cil_prepare_item(log, lv, old_lv, diff_len); } } /* * The use of lockless waitqueue_active() requires that the caller has * serialised itself against the wakeup call in xlog_cil_push_work(). That * can be done by either holding the push lock or the context lock. */ static inline bool xlog_cil_over_hard_limit( struct xlog *log, int32_t space_used) { if (waitqueue_active(&log->l_cilp->xc_push_wait)) return true; if (space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) return true; return false; } /* * Insert the log items into the CIL and calculate the difference in space * consumed by the item. Add the space to the checkpoint ticket and calculate * if the change requires additional log metadata. If it does, take that space * as well. Remove the amount of space we added to the checkpoint ticket from * the current transaction ticket so that the accounting works out correctly. */ static void xlog_cil_insert_items( struct xlog *log, struct xfs_trans *tp, uint32_t released_space) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx = cil->xc_ctx; struct xfs_log_item *lip; int len = 0; int iovhdr_res = 0, split_res = 0, ctx_res = 0; int space_used; int order; unsigned int cpu_nr; struct xlog_cil_pcp *cilpcp; ASSERT(tp); /* * We can do this safely because the context can't checkpoint until we * are done so it doesn't matter exactly how we update the CIL. */ xlog_cil_insert_format_items(log, tp, &len); /* * Subtract the space released by intent cancelation from the space we * consumed so that we remove it from the CIL space and add it back to * the current transaction reservation context. */ len -= released_space; /* * Grab the per-cpu pointer for the CIL before we start any accounting. * That ensures that we are running with pre-emption disabled and so we * can't be scheduled away between split sample/update operations that * are done without outside locking to serialise them. */ cpu_nr = get_cpu(); cilpcp = this_cpu_ptr(cil->xc_pcp); /* Tell the future push that there was work added by this CPU. */ if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask)) cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask); /* * We need to take the CIL checkpoint unit reservation on the first * commit into the CIL. Test the XLOG_CIL_EMPTY bit first so we don't * unnecessarily do an atomic op in the fast path here. We can clear the * XLOG_CIL_EMPTY bit as we are under the xc_ctx_lock here and that * needs to be held exclusively to reset the XLOG_CIL_EMPTY bit. */ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) && test_and_clear_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) ctx_res = ctx->ticket->t_unit_res; /* * Check if we need to steal iclog headers. atomic_read() is not a * locked atomic operation, so we can check the value before we do any * real atomic ops in the fast path. If we've already taken the CIL unit * reservation from this commit, we've already got one iclog header * space reserved so we have to account for that otherwise we risk * overrunning the reservation on this ticket. * * If the CIL is already at the hard limit, we might need more header * space that originally reserved. So steal more header space from every * commit that occurs once we are over the hard limit to ensure the CIL * push won't run out of reservation space. * * This can steal more than we need, but that's OK. * * The cil->xc_ctx_lock provides the serialisation necessary for safely * calling xlog_cil_over_hard_limit() in this context. */ space_used = atomic_read(&ctx->space_used) + cilpcp->space_used + len; if (atomic_read(&cil->xc_iclog_hdrs) > 0 || xlog_cil_over_hard_limit(log, space_used)) { split_res = log->l_iclog_hsize + sizeof(struct xlog_op_header); if (ctx_res) ctx_res += split_res * (tp->t_ticket->t_iclog_hdrs - 1); else ctx_res = split_res * tp->t_ticket->t_iclog_hdrs; atomic_sub(tp->t_ticket->t_iclog_hdrs, &cil->xc_iclog_hdrs); } cilpcp->space_reserved += ctx_res; /* * Accurately account when over the soft limit, otherwise fold the * percpu count into the global count if over the per-cpu threshold. */ if (!test_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags)) { atomic_add(len, &ctx->space_used); } else if (cilpcp->space_used + len > (XLOG_CIL_SPACE_LIMIT(log) / num_online_cpus())) { space_used = atomic_add_return(cilpcp->space_used + len, &ctx->space_used); cilpcp->space_used = 0; /* * If we just transitioned over the soft limit, we need to * transition to the global atomic counter. */ if (space_used >= XLOG_CIL_SPACE_LIMIT(log)) xlog_cil_insert_pcp_aggregate(cil, ctx); } else { cilpcp->space_used += len; } /* attach the transaction to the CIL if it has any busy extents */ if (!list_empty(&tp->t_busy)) list_splice_init(&tp->t_busy, &cilpcp->busy_extents); /* * Now update the order of everything modified in the transaction * and insert items into the CIL if they aren't already there. * We do this here so we only need to take the CIL lock once during * the transaction commit. */ order = atomic_inc_return(&ctx->order_id); list_for_each_entry(lip, &tp->t_items, li_trans) { /* Skip items which aren't dirty in this transaction. */ if (!test_bit(XFS_LI_DIRTY, &lip->li_flags)) continue; lip->li_order_id = order; if (!list_empty(&lip->li_cil)) continue; list_add_tail(&lip->li_cil, &cilpcp->log_items); } put_cpu(); /* * If we've overrun the reservation, dump the tx details before we move * the log items. Shutdown is imminent... */ tp->t_ticket->t_curr_res -= ctx_res + len; if (WARN_ON(tp->t_ticket->t_curr_res < 0)) { xfs_warn(log->l_mp, "Transaction log reservation overrun:"); xfs_warn(log->l_mp, " log items: %d bytes (iov hdrs: %d bytes)", len, iovhdr_res); xfs_warn(log->l_mp, " split region headers: %d bytes", split_res); xfs_warn(log->l_mp, " ctx ticket: %d bytes", ctx_res); xlog_print_trans(tp); xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); } } static inline void xlog_cil_ail_insert_batch( struct xfs_ail *ailp, struct xfs_ail_cursor *cur, struct xfs_log_item **log_items, int nr_items, xfs_lsn_t commit_lsn) { int i; spin_lock(&ailp->ail_lock); /* xfs_trans_ail_update_bulk drops ailp->ail_lock */ xfs_trans_ail_update_bulk(ailp, cur, log_items, nr_items, commit_lsn); for (i = 0; i < nr_items; i++) { struct xfs_log_item *lip = log_items[i]; if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 0); } } /* * Take the checkpoint's log vector chain of items and insert the attached log * items into the AIL. This uses bulk insertion techniques to minimise AIL lock * traffic. * * The AIL tracks log items via the start record LSN of the checkpoint, * not the commit record LSN. This is because we can pipeline multiple * checkpoints, and so the start record of checkpoint N+1 can be * written before the commit record of checkpoint N. i.e: * * start N commit N * +-------------+------------+----------------+ * start N+1 commit N+1 * * The tail of the log cannot be moved to the LSN of commit N when all * the items of that checkpoint are written back, because then the * start record for N+1 is no longer in the active portion of the log * and recovery will fail/corrupt the filesystem. * * Hence when all the log items in checkpoint N are written back, the * tail of the log most now only move as far forwards as the start LSN * of checkpoint N+1. * * If we are called with the aborted flag set, it is because a log write during * a CIL checkpoint commit has failed. In this case, all the items in the * checkpoint have already gone through iop_committed and iop_committing, which * means that checkpoint commit abort handling is treated exactly the same as an * iclog write error even though we haven't started any IO yet. Hence in this * case all we need to do is iop_committed processing, followed by an * iop_unpin(aborted) call. * * The AIL cursor is used to optimise the insert process. If commit_lsn is not * at the end of the AIL, the insert cursor avoids the need to walk the AIL to * find the insertion point on every xfs_log_item_batch_insert() call. This * saves a lot of needless list walking and is a net win, even though it * slightly increases that amount of AIL lock traffic to set it up and tear it * down. */ static void xlog_cil_ail_insert( struct xfs_cil_ctx *ctx, bool aborted) { #define LOG_ITEM_BATCH_SIZE 32 struct xfs_ail *ailp = ctx->cil->xc_log->l_ailp; struct xfs_log_item *log_items[LOG_ITEM_BATCH_SIZE]; struct xfs_log_vec *lv; struct xfs_ail_cursor cur; xfs_lsn_t old_head; int i = 0; /* * Update the AIL head LSN with the commit record LSN of this * checkpoint. As iclogs are always completed in order, this should * always be the same (as iclogs can contain multiple commit records) or * higher LSN than the current head. We do this before insertion of the * items so that log space checks during insertion will reflect the * space that this checkpoint has already consumed. We call * xfs_ail_update_finish() so that tail space and space-based wakeups * will be recalculated appropriately. */ ASSERT(XFS_LSN_CMP(ctx->commit_lsn, ailp->ail_head_lsn) >= 0 || aborted); spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_last(ailp, &cur, ctx->start_lsn); old_head = ailp->ail_head_lsn; ailp->ail_head_lsn = ctx->commit_lsn; /* xfs_ail_update_finish() drops the ail_lock */ xfs_ail_update_finish(ailp, NULLCOMMITLSN); /* * We move the AIL head forwards to account for the space used in the * log before we remove that space from the grant heads. This prevents a * transient condition where reservation space appears to become * available on return, only for it to disappear again immediately as * the AIL head update accounts in the log tail space. */ smp_wmb(); /* paired with smp_rmb in xlog_grant_space_left */ xlog_grant_return_space(ailp->ail_log, old_head, ailp->ail_head_lsn); /* unpin all the log items */ list_for_each_entry(lv, &ctx->lv_chain, lv_list) { struct xfs_log_item *lip = lv->lv_item; xfs_lsn_t item_lsn; if (aborted) set_bit(XFS_LI_ABORTED, &lip->li_flags); if (lip->li_ops->flags & XFS_ITEM_RELEASE_WHEN_COMMITTED) { lip->li_ops->iop_release(lip); continue; } if (lip->li_ops->iop_committed) item_lsn = lip->li_ops->iop_committed(lip, ctx->start_lsn); else item_lsn = ctx->start_lsn; /* item_lsn of -1 means the item needs no further processing */ if (XFS_LSN_CMP(item_lsn, (xfs_lsn_t)-1) == 0) continue; /* * if we are aborting the operation, no point in inserting the * object into the AIL as we are in a shutdown situation. */ if (aborted) { ASSERT(xlog_is_shutdown(ailp->ail_log)); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 1); continue; } if (item_lsn != ctx->start_lsn) { /* * Not a bulk update option due to unusual item_lsn. * Push into AIL immediately, rechecking the lsn once * we have the ail lock. Then unpin the item. This does * not affect the AIL cursor the bulk insert path is * using. */ spin_lock(&ailp->ail_lock); if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) xfs_trans_ail_update(ailp, lip, item_lsn); else spin_unlock(&ailp->ail_lock); if (lip->li_ops->iop_unpin) lip->li_ops->iop_unpin(lip, 0); continue; } /* Item is a candidate for bulk AIL insert. */ log_items[i++] = lv->lv_item; if (i >= LOG_ITEM_BATCH_SIZE) { xlog_cil_ail_insert_batch(ailp, &cur, log_items, LOG_ITEM_BATCH_SIZE, ctx->start_lsn); i = 0; } } /* make sure we insert the remainder! */ if (i) xlog_cil_ail_insert_batch(ailp, &cur, log_items, i, ctx->start_lsn); spin_lock(&ailp->ail_lock); xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->ail_lock); } static void xlog_cil_free_logvec( struct list_head *lv_chain) { struct xfs_log_vec *lv; while (!list_empty(lv_chain)) { lv = list_first_entry(lv_chain, struct xfs_log_vec, lv_list); list_del_init(&lv->lv_list); kvfree(lv); } } /* * Mark all items committed and clear busy extents. We free the log vector * chains in a separate pass so that we unpin the log items as quickly as * possible. */ static void xlog_cil_committed( struct xfs_cil_ctx *ctx) { struct xfs_mount *mp = ctx->cil->xc_log->l_mp; bool abort = xlog_is_shutdown(ctx->cil->xc_log); /* * If the I/O failed, we're aborting the commit and already shutdown. * Wake any commit waiters before aborting the log items so we don't * block async log pushers on callbacks. Async log pushers explicitly do * not wait on log force completion because they may be holding locks * required to unpin items. */ if (abort) { spin_lock(&ctx->cil->xc_push_lock); wake_up_all(&ctx->cil->xc_start_wait); wake_up_all(&ctx->cil->xc_commit_wait); spin_unlock(&ctx->cil->xc_push_lock); } xlog_cil_ail_insert(ctx, abort); xfs_extent_busy_sort(&ctx->busy_extents.extent_list); xfs_extent_busy_clear(&ctx->busy_extents.extent_list, xfs_has_discard(mp) && !abort); spin_lock(&ctx->cil->xc_push_lock); list_del(&ctx->committing); spin_unlock(&ctx->cil->xc_push_lock); xlog_cil_free_logvec(&ctx->lv_chain); if (!list_empty(&ctx->busy_extents.extent_list)) { ctx->busy_extents.owner = ctx; xfs_discard_extents(mp, &ctx->busy_extents); return; } kfree(ctx); } void xlog_cil_process_committed( struct list_head *list) { struct xfs_cil_ctx *ctx; while ((ctx = list_first_entry_or_null(list, struct xfs_cil_ctx, iclog_entry))) { list_del(&ctx->iclog_entry); xlog_cil_committed(ctx); } } /* * Record the LSN of the iclog we were just granted space to start writing into. * If the context doesn't have a start_lsn recorded, then this iclog will * contain the start record for the checkpoint. Otherwise this write contains * the commit record for the checkpoint. */ void xlog_cil_set_ctx_write_state( struct xfs_cil_ctx *ctx, struct xlog_in_core *iclog) { struct xfs_cil *cil = ctx->cil; xfs_lsn_t lsn = be64_to_cpu(iclog->ic_header.h_lsn); ASSERT(!ctx->commit_lsn); if (!ctx->start_lsn) { spin_lock(&cil->xc_push_lock); /* * The LSN we need to pass to the log items on transaction * commit is the LSN reported by the first log vector write, not * the commit lsn. If we use the commit record lsn then we can * move the grant write head beyond the tail LSN and overwrite * it. */ ctx->start_lsn = lsn; wake_up_all(&cil->xc_start_wait); spin_unlock(&cil->xc_push_lock); /* * Make sure the metadata we are about to overwrite in the log * has been flushed to stable storage before this iclog is * issued. */ spin_lock(&cil->xc_log->l_icloglock); iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; spin_unlock(&cil->xc_log->l_icloglock); return; } /* * Take a reference to the iclog for the context so that we still hold * it when xlog_write is done and has released it. This means the * context controls when the iclog is released for IO. */ atomic_inc(&iclog->ic_refcnt); /* * xlog_state_get_iclog_space() guarantees there is enough space in the * iclog for an entire commit record, so we can attach the context * callbacks now. This needs to be done before we make the commit_lsn * visible to waiters so that checkpoints with commit records in the * same iclog order their IO completion callbacks in the same order that * the commit records appear in the iclog. */ spin_lock(&cil->xc_log->l_icloglock); list_add_tail(&ctx->iclog_entry, &iclog->ic_callbacks); spin_unlock(&cil->xc_log->l_icloglock); /* * Now we can record the commit LSN and wake anyone waiting for this * sequence to have the ordered commit record assigned to a physical * location in the log. */ spin_lock(&cil->xc_push_lock); ctx->commit_iclog = iclog; ctx->commit_lsn = lsn; wake_up_all(&cil->xc_commit_wait); spin_unlock(&cil->xc_push_lock); } /* * Ensure that the order of log writes follows checkpoint sequence order. This * relies on the context LSN being zero until the log write has guaranteed the * LSN that the log write will start at via xlog_state_get_iclog_space(). */ enum _record_type { _START_RECORD, _COMMIT_RECORD, }; static int xlog_cil_order_write( struct xfs_cil *cil, xfs_csn_t sequence, enum _record_type record) { struct xfs_cil_ctx *ctx; restart: spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { /* * Avoid getting stuck in this loop because we were woken by the * shutdown, but then went back to sleep once already in the * shutdown state. */ if (xlog_is_shutdown(cil->xc_log)) { spin_unlock(&cil->xc_push_lock); return -EIO; } /* * Higher sequences will wait for this one so skip them. * Don't wait for our own sequence, either. */ if (ctx->sequence >= sequence) continue; /* Wait until the LSN for the record has been recorded. */ switch (record) { case _START_RECORD: if (!ctx->start_lsn) { xlog_wait(&cil->xc_start_wait, &cil->xc_push_lock); goto restart; } break; case _COMMIT_RECORD: if (!ctx->commit_lsn) { xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } break; } } spin_unlock(&cil->xc_push_lock); return 0; } /* * Write out the log vector change now attached to the CIL context. This will * write a start record that needs to be strictly ordered in ascending CIL * sequence order so that log recovery will always use in-order start LSNs when * replaying checkpoints. */ static int xlog_cil_write_chain( struct xfs_cil_ctx *ctx, uint32_t chain_len) { struct xlog *log = ctx->cil->xc_log; int error; error = xlog_cil_order_write(ctx->cil, ctx->sequence, _START_RECORD); if (error) return error; return xlog_write(log, ctx, &ctx->lv_chain, ctx->ticket, chain_len); } /* * Write out the commit record of a checkpoint transaction to close off a * running log write. These commit records are strictly ordered in ascending CIL * sequence order so that log recovery will always replay the checkpoints in the * correct order. */ static int xlog_cil_write_commit_record( struct xfs_cil_ctx *ctx) { struct xlog *log = ctx->cil->xc_log; struct xlog_op_header ophdr = { .oh_clientid = XFS_TRANSACTION, .oh_tid = cpu_to_be32(ctx->ticket->t_tid), .oh_flags = XLOG_COMMIT_TRANS, }; struct xfs_log_iovec reg = { .i_addr = &ophdr, .i_len = sizeof(struct xlog_op_header), .i_type = XLOG_REG_TYPE_COMMIT, }; struct xfs_log_vec vec = { .lv_niovecs = 1, .lv_iovecp = &reg, }; int error; LIST_HEAD(lv_chain); list_add(&vec.lv_list, &lv_chain); if (xlog_is_shutdown(log)) return -EIO; error = xlog_cil_order_write(ctx->cil, ctx->sequence, _COMMIT_RECORD); if (error) return error; /* account for space used by record data */ ctx->ticket->t_curr_res -= reg.i_len; error = xlog_write(log, ctx, &lv_chain, ctx->ticket, reg.i_len); if (error) xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); return error; } struct xlog_cil_trans_hdr { struct xlog_op_header oph[2]; struct xfs_trans_header thdr; struct xfs_log_iovec lhdr[2]; }; /* * Build a checkpoint transaction header to begin the journal transaction. We * need to account for the space used by the transaction header here as it is * not accounted for in xlog_write(). * * This is the only place we write a transaction header, so we also build the * log opheaders that indicate the start of a log transaction and wrap the * transaction header. We keep the start record in it's own log vector rather * than compacting them into a single region as this ends up making the logic * in xlog_write() for handling empty opheaders for start, commit and unmount * records much simpler. */ static void xlog_cil_build_trans_hdr( struct xfs_cil_ctx *ctx, struct xlog_cil_trans_hdr *hdr, struct xfs_log_vec *lvhdr, int num_iovecs) { struct xlog_ticket *tic = ctx->ticket; __be32 tid = cpu_to_be32(tic->t_tid); memset(hdr, 0, sizeof(*hdr)); /* Log start record */ hdr->oph[0].oh_tid = tid; hdr->oph[0].oh_clientid = XFS_TRANSACTION; hdr->oph[0].oh_flags = XLOG_START_TRANS; /* log iovec region pointer */ hdr->lhdr[0].i_addr = &hdr->oph[0]; hdr->lhdr[0].i_len = sizeof(struct xlog_op_header); hdr->lhdr[0].i_type = XLOG_REG_TYPE_LRHEADER; /* log opheader */ hdr->oph[1].oh_tid = tid; hdr->oph[1].oh_clientid = XFS_TRANSACTION; hdr->oph[1].oh_len = cpu_to_be32(sizeof(struct xfs_trans_header)); /* transaction header in host byte order format */ hdr->thdr.th_magic = XFS_TRANS_HEADER_MAGIC; hdr->thdr.th_type = XFS_TRANS_CHECKPOINT; hdr->thdr.th_tid = tic->t_tid; hdr->thdr.th_num_items = num_iovecs; /* log iovec region pointer */ hdr->lhdr[1].i_addr = &hdr->oph[1]; hdr->lhdr[1].i_len = sizeof(struct xlog_op_header) + sizeof(struct xfs_trans_header); hdr->lhdr[1].i_type = XLOG_REG_TYPE_TRANSHDR; lvhdr->lv_niovecs = 2; lvhdr->lv_iovecp = &hdr->lhdr[0]; lvhdr->lv_bytes = hdr->lhdr[0].i_len + hdr->lhdr[1].i_len; tic->t_curr_res -= lvhdr->lv_bytes; } /* * CIL item reordering compare function. We want to order in ascending ID order, * but we want to leave items with the same ID in the order they were added to * the list. This is important for operations like reflink where we log 4 order * dependent intents in a single transaction when we overwrite an existing * shared extent with a new shared extent. i.e. BUI(unmap), CUI(drop), * CUI (inc), BUI(remap)... */ static int xlog_cil_order_cmp( void *priv, const struct list_head *a, const struct list_head *b) { struct xfs_log_vec *l1 = container_of(a, struct xfs_log_vec, lv_list); struct xfs_log_vec *l2 = container_of(b, struct xfs_log_vec, lv_list); return l1->lv_order_id > l2->lv_order_id; } /* * Pull all the log vectors off the items in the CIL, and remove the items from * the CIL. We don't need the CIL lock here because it's only needed on the * transaction commit side which is currently locked out by the flush lock. * * If a log item is marked with a whiteout, we do not need to write it to the * journal and so we just move them to the whiteout list for the caller to * dispose of appropriately. */ static void xlog_cil_build_lv_chain( struct xfs_cil_ctx *ctx, struct list_head *whiteouts, uint32_t *num_iovecs, uint32_t *num_bytes) { while (!list_empty(&ctx->log_items)) { struct xfs_log_item *item; struct xfs_log_vec *lv; item = list_first_entry(&ctx->log_items, struct xfs_log_item, li_cil); if (test_bit(XFS_LI_WHITEOUT, &item->li_flags)) { list_move(&item->li_cil, whiteouts); trace_xfs_cil_whiteout_skip(item); continue; } lv = item->li_lv; lv->lv_order_id = item->li_order_id; /* we don't write ordered log vectors */ if (lv->lv_buf_len != XFS_LOG_VEC_ORDERED) *num_bytes += lv->lv_bytes; *num_iovecs += lv->lv_niovecs; list_add_tail(&lv->lv_list, &ctx->lv_chain); list_del_init(&item->li_cil); item->li_order_id = 0; item->li_lv = NULL; } } static void xlog_cil_cleanup_whiteouts( struct list_head *whiteouts) { while (!list_empty(whiteouts)) { struct xfs_log_item *item = list_first_entry(whiteouts, struct xfs_log_item, li_cil); list_del_init(&item->li_cil); trace_xfs_cil_whiteout_unpin(item); item->li_ops->iop_unpin(item, 1); } } /* * Push the Committed Item List to the log. * * If the current sequence is the same as xc_push_seq we need to do a flush. If * xc_push_seq is less than the current sequence, then it has already been * flushed and we don't need to do anything - the caller will wait for it to * complete if necessary. * * xc_push_seq is checked unlocked against the sequence number for a match. * Hence we can allow log forces to run racily and not issue pushes for the * same sequence twice. If we get a race between multiple pushes for the same * sequence they will block on the first one and then abort, hence avoiding * needless pushes. * * This runs from a workqueue so it does not inherent any specific memory * allocation context. However, we do not want to block on memory reclaim * recursing back into the filesystem because this push may have been triggered * by memory reclaim itself. Hence we really need to run under full GFP_NOFS * contraints here. */ static void xlog_cil_push_work( struct work_struct *work) { unsigned int nofs_flags = memalloc_nofs_save(); struct xfs_cil_ctx *ctx = container_of(work, struct xfs_cil_ctx, push_work); struct xfs_cil *cil = ctx->cil; struct xlog *log = cil->xc_log; struct xfs_cil_ctx *new_ctx; int num_iovecs = 0; int num_bytes = 0; int error = 0; struct xlog_cil_trans_hdr thdr; struct xfs_log_vec lvhdr = {}; xfs_csn_t push_seq; bool push_commit_stable; LIST_HEAD (whiteouts); struct xlog_ticket *ticket; new_ctx = xlog_cil_ctx_alloc(); new_ctx->ticket = xlog_cil_ticket_alloc(log); down_write(&cil->xc_ctx_lock); spin_lock(&cil->xc_push_lock); push_seq = cil->xc_push_seq; ASSERT(push_seq <= ctx->sequence); push_commit_stable = cil->xc_push_commit_stable; cil->xc_push_commit_stable = false; /* * As we are about to switch to a new, empty CIL context, we no longer * need to throttle tasks on CIL space overruns. Wake any waiters that * the hard push throttle may have caught so they can start committing * to the new context. The ctx->xc_push_lock provides the serialisation * necessary for safely using the lockless waitqueue_active() check in * this context. */ if (waitqueue_active(&cil->xc_push_wait)) wake_up_all(&cil->xc_push_wait); xlog_cil_push_pcp_aggregate(cil, ctx); /* * Check if we've anything to push. If there is nothing, then we don't * move on to a new sequence number and so we have to be able to push * this sequence again later. */ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { cil->xc_push_seq = 0; spin_unlock(&cil->xc_push_lock); goto out_skip; } /* check for a previously pushed sequence */ if (push_seq < ctx->sequence) { spin_unlock(&cil->xc_push_lock); goto out_skip; } /* * We are now going to push this context, so add it to the committing * list before we do anything else. This ensures that anyone waiting on * this push can easily detect the difference between a "push in * progress" and "CIL is empty, nothing to do". * * IOWs, a wait loop can now check for: * the current sequence not being found on the committing list; * an empty CIL; and * an unchanged sequence number * to detect a push that had nothing to do and therefore does not need * waiting on. If the CIL is not empty, we get put on the committing * list before emptying the CIL and bumping the sequence number. Hence * an empty CIL and an unchanged sequence number means we jumped out * above after doing nothing. * * Hence the waiter will either find the commit sequence on the * committing list or the sequence number will be unchanged and the CIL * still dirty. In that latter case, the push has not yet started, and * so the waiter will have to continue trying to check the CIL * committing list until it is found. In extreme cases of delay, the * sequence may fully commit between the attempts the wait makes to wait * on the commit sequence. */ list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); xlog_cil_build_lv_chain(ctx, &whiteouts, &num_iovecs, &num_bytes); /* * Switch the contexts so we can drop the context lock and move out * of a shared context. We can't just go straight to the commit record, * though - we need to synchronise with previous and future commits so * that the commit records are correctly ordered in the log to ensure * that we process items during log IO completion in the correct order. * * For example, if we get an EFI in one checkpoint and the EFD in the * next (e.g. due to log forces), we do not want the checkpoint with * the EFD to be committed before the checkpoint with the EFI. Hence * we must strictly order the commit records of the checkpoints so * that: a) the checkpoint callbacks are attached to the iclogs in the * correct order; and b) the checkpoints are replayed in correct order * in log recovery. * * Hence we need to add this context to the committing context list so * that higher sequences will wait for us to write out a commit record * before they do. * * xfs_log_force_seq requires us to mirror the new sequence into the cil * structure atomically with the addition of this sequence to the * committing list. This also ensures that we can do unlocked checks * against the current sequence in log forces without risking * deferencing a freed context pointer. */ spin_lock(&cil->xc_push_lock); xlog_cil_ctx_switch(cil, new_ctx); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); /* * Sort the log vector chain before we add the transaction headers. * This ensures we always have the transaction headers at the start * of the chain. */ list_sort(NULL, &ctx->lv_chain, xlog_cil_order_cmp); /* * Build a checkpoint transaction header and write it to the log to * begin the transaction. We need to account for the space used by the * transaction header here as it is not accounted for in xlog_write(). * Add the lvhdr to the head of the lv chain we pass to xlog_write() so * it gets written into the iclog first. */ xlog_cil_build_trans_hdr(ctx, &thdr, &lvhdr, num_iovecs); num_bytes += lvhdr.lv_bytes; list_add(&lvhdr.lv_list, &ctx->lv_chain); /* * Take the lvhdr back off the lv_chain immediately after calling * xlog_cil_write_chain() as it should not be passed to log IO * completion. */ error = xlog_cil_write_chain(ctx, num_bytes); list_del(&lvhdr.lv_list); if (error) goto out_abort_free_ticket; error = xlog_cil_write_commit_record(ctx); if (error) goto out_abort_free_ticket; /* * Grab the ticket from the ctx so we can ungrant it after releasing the * commit_iclog. The ctx may be freed by the time we return from * releasing the commit_iclog (i.e. checkpoint has been completed and * callback run) so we can't reference the ctx after the call to * xlog_state_release_iclog(). */ ticket = ctx->ticket; /* * If the checkpoint spans multiple iclogs, wait for all previous iclogs * to complete before we submit the commit_iclog. We can't use state * checks for this - ACTIVE can be either a past completed iclog or a * future iclog being filled, while WANT_SYNC through SYNC_DONE can be a * past or future iclog awaiting IO or ordered IO completion to be run. * In the latter case, if it's a future iclog and we wait on it, the we * will hang because it won't get processed through to ic_force_wait * wakeup until this commit_iclog is written to disk. Hence we use the * iclog header lsn and compare it to the commit lsn to determine if we * need to wait on iclogs or not. */ spin_lock(&log->l_icloglock); if (ctx->start_lsn != ctx->commit_lsn) { xfs_lsn_t plsn; plsn = be64_to_cpu(ctx->commit_iclog->ic_prev->ic_header.h_lsn); if (plsn && XFS_LSN_CMP(plsn, ctx->commit_lsn) < 0) { /* * Waiting on ic_force_wait orders the completion of * iclogs older than ic_prev. Hence we only need to wait * on the most recent older iclog here. */ xlog_wait_on_iclog(ctx->commit_iclog->ic_prev); spin_lock(&log->l_icloglock); } /* * We need to issue a pre-flush so that the ordering for this * checkpoint is correctly preserved down to stable storage. */ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FLUSH; } /* * The commit iclog must be written to stable storage to guarantee * journal IO vs metadata writeback IO is correctly ordered on stable * storage. * * If the push caller needs the commit to be immediately stable and the * commit_iclog is not yet marked as XLOG_STATE_WANT_SYNC to indicate it * will be written when released, switch it's state to WANT_SYNC right * now. */ ctx->commit_iclog->ic_flags |= XLOG_ICL_NEED_FUA; if (push_commit_stable && ctx->commit_iclog->ic_state == XLOG_STATE_ACTIVE) xlog_state_switch_iclogs(log, ctx->commit_iclog, 0); ticket = ctx->ticket; xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); xlog_cil_cleanup_whiteouts(&whiteouts); xfs_log_ticket_ungrant(log, ticket); memalloc_nofs_restore(nofs_flags); return; out_skip: up_write(&cil->xc_ctx_lock); xfs_log_ticket_put(new_ctx->ticket); kfree(new_ctx); memalloc_nofs_restore(nofs_flags); return; out_abort_free_ticket: ASSERT(xlog_is_shutdown(log)); xlog_cil_cleanup_whiteouts(&whiteouts); if (!ctx->commit_iclog) { xfs_log_ticket_ungrant(log, ctx->ticket); xlog_cil_committed(ctx); memalloc_nofs_restore(nofs_flags); return; } spin_lock(&log->l_icloglock); ticket = ctx->ticket; xlog_state_release_iclog(log, ctx->commit_iclog, ticket); /* Not safe to reference ctx now! */ spin_unlock(&log->l_icloglock); xfs_log_ticket_ungrant(log, ticket); memalloc_nofs_restore(nofs_flags); } /* * We need to push CIL every so often so we don't cache more than we can fit in * the log. The limit really is that a checkpoint can't be more than half the * log (the current checkpoint is not allowed to overwrite the previous * checkpoint), but commit latency and memory usage limit this to a smaller * size. */ static void xlog_cil_push_background( struct xlog *log) { struct xfs_cil *cil = log->l_cilp; int space_used = atomic_read(&cil->xc_ctx->space_used); /* * The cil won't be empty because we are called while holding the * context lock so whatever we added to the CIL will still be there. */ ASSERT(!test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); /* * We are done if: * - we haven't used up all the space available yet; or * - we've already queued up a push; and * - we're not over the hard limit; and * - nothing has been over the hard limit. * * If so, we don't need to take the push lock as there's nothing to do. */ if (space_used < XLOG_CIL_SPACE_LIMIT(log) || (cil->xc_push_seq == cil->xc_current_sequence && space_used < XLOG_CIL_BLOCKING_SPACE_LIMIT(log) && !waitqueue_active(&cil->xc_push_wait))) { up_read(&cil->xc_ctx_lock); return; } spin_lock(&cil->xc_push_lock); if (cil->xc_push_seq < cil->xc_current_sequence) { cil->xc_push_seq = cil->xc_current_sequence; queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); } /* * Drop the context lock now, we can't hold that if we need to sleep * because we are over the blocking threshold. The push_lock is still * held, so blocking threshold sleep/wakeup is still correctly * serialised here. */ up_read(&cil->xc_ctx_lock); /* * If we are well over the space limit, throttle the work that is being * done until the push work on this context has begun. Enforce the hard * throttle on all transaction commits once it has been activated, even * if the committing transactions have resulted in the space usage * dipping back down under the hard limit. * * The ctx->xc_push_lock provides the serialisation necessary for safely * calling xlog_cil_over_hard_limit() in this context. */ if (xlog_cil_over_hard_limit(log, space_used)) { trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket); ASSERT(space_used < log->l_logsize); xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock); return; } spin_unlock(&cil->xc_push_lock); } /* * xlog_cil_push_now() is used to trigger an immediate CIL push to the sequence * number that is passed. When it returns, the work will be queued for * @push_seq, but it won't be completed. * * If the caller is performing a synchronous force, we will flush the workqueue * to get previously queued work moving to minimise the wait time they will * undergo waiting for all outstanding pushes to complete. The caller is * expected to do the required waiting for push_seq to complete. * * If the caller is performing an async push, we need to ensure that the * checkpoint is fully flushed out of the iclogs when we finish the push. If we * don't do this, then the commit record may remain sitting in memory in an * ACTIVE iclog. This then requires another full log force to push to disk, * which defeats the purpose of having an async, non-blocking CIL force * mechanism. Hence in this case we need to pass a flag to the push work to * indicate it needs to flush the commit record itself. */ static void xlog_cil_push_now( struct xlog *log, xfs_lsn_t push_seq, bool async) { struct xfs_cil *cil = log->l_cilp; if (!cil) return; ASSERT(push_seq && push_seq <= cil->xc_current_sequence); /* start on any pending background push to minimise wait time on it */ if (!async) flush_workqueue(cil->xc_push_wq); spin_lock(&cil->xc_push_lock); /* * If this is an async flush request, we always need to set the * xc_push_commit_stable flag even if something else has already queued * a push. The flush caller is asking for the CIL to be on stable * storage when the next push completes, so regardless of who has queued * the push, the flush requires stable semantics from it. */ cil->xc_push_commit_stable = async; /* * If the CIL is empty or we've already pushed the sequence then * there's no more work that we need to do. */ if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags) || push_seq <= cil->xc_push_seq) { spin_unlock(&cil->xc_push_lock); return; } cil->xc_push_seq = push_seq; queue_work(cil->xc_push_wq, &cil->xc_ctx->push_work); spin_unlock(&cil->xc_push_lock); } bool xlog_cil_empty( struct xlog *log) { struct xfs_cil *cil = log->l_cilp; bool empty = false; spin_lock(&cil->xc_push_lock); if (test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) empty = true; spin_unlock(&cil->xc_push_lock); return empty; } /* * If there are intent done items in this transaction and the related intent was * committed in the current (same) CIL checkpoint, we don't need to write either * the intent or intent done item to the journal as the change will be * journalled atomically within this checkpoint. As we cannot remove items from * the CIL here, mark the related intent with a whiteout so that the CIL push * can remove it rather than writing it to the journal. Then remove the intent * done item from the current transaction and release it so it doesn't get put * into the CIL at all. */ static uint32_t xlog_cil_process_intents( struct xfs_cil *cil, struct xfs_trans *tp) { struct xfs_log_item *lip, *ilip, *next; uint32_t len = 0; list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { if (!(lip->li_ops->flags & XFS_ITEM_INTENT_DONE)) continue; ilip = lip->li_ops->iop_intent(lip); if (!ilip || !xlog_item_in_current_chkpt(cil, ilip)) continue; set_bit(XFS_LI_WHITEOUT, &ilip->li_flags); trace_xfs_cil_whiteout_mark(ilip); len += ilip->li_lv->lv_bytes; kvfree(ilip->li_lv); ilip->li_lv = NULL; xfs_trans_del_item(lip); lip->li_ops->iop_release(lip); } return len; } /* * Commit a transaction with the given vector to the Committed Item List. * * To do this, we need to format the item, pin it in memory if required and * account for the space used by the transaction. Once we have done that we * need to release the unused reservation for the transaction, attach the * transaction to the checkpoint context so we carry the busy extents through * to checkpoint completion, and then unlock all the items in the transaction. * * Called with the context lock already held in read mode to lock out * background commit, returns without it held once background commits are * allowed again. */ void xlog_cil_commit( struct xlog *log, struct xfs_trans *tp, xfs_csn_t *commit_seq, bool regrant) { struct xfs_cil *cil = log->l_cilp; struct xfs_log_item *lip, *next; uint32_t released_space = 0; /* * Do all necessary memory allocation before we lock the CIL. * This ensures the allocation does not deadlock with a CIL * push in memory reclaim (e.g. from kswapd). */ xlog_cil_alloc_shadow_bufs(log, tp); /* lock out background commit */ down_read(&cil->xc_ctx_lock); if (tp->t_flags & XFS_TRANS_HAS_INTENT_DONE) released_space = xlog_cil_process_intents(cil, tp); xlog_cil_insert_items(log, tp, released_space); if (regrant && !xlog_is_shutdown(log)) xfs_log_ticket_regrant(log, tp->t_ticket); else xfs_log_ticket_ungrant(log, tp->t_ticket); tp->t_ticket = NULL; xfs_trans_unreserve_and_mod_sb(tp); /* * Once all the items of the transaction have been copied to the CIL, * the items can be unlocked and possibly freed. * * This needs to be done before we drop the CIL context lock because we * have to update state in the log items and unlock them before they go * to disk. If we don't, then the CIL checkpoint can race with us and * we can run checkpoint completion before we've updated and unlocked * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ trace_xfs_trans_commit_items(tp, _RET_IP_); list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) { xfs_trans_del_item(lip); if (lip->li_ops->iop_committing) lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence); } if (commit_seq) *commit_seq = cil->xc_ctx->sequence; /* xlog_cil_push_background() releases cil->xc_ctx_lock */ xlog_cil_push_background(log); } /* * Flush the CIL to stable storage but don't wait for it to complete. This * requires the CIL push to ensure the commit record for the push hits the disk, * but otherwise is no different to a push done from a log force. */ void xlog_cil_flush( struct xlog *log) { xfs_csn_t seq = log->l_cilp->xc_current_sequence; trace_xfs_log_force(log->l_mp, seq, _RET_IP_); xlog_cil_push_now(log, seq, true); /* * If the CIL is empty, make sure that any previous checkpoint that may * still be in an active iclog is pushed to stable storage. */ if (test_bit(XLOG_CIL_EMPTY, &log->l_cilp->xc_flags)) xfs_log_force(log->l_mp, 0); } /* * Conditionally push the CIL based on the sequence passed in. * * We only need to push if we haven't already pushed the sequence number given. * Hence the only time we will trigger a push here is if the push sequence is * the same as the current context. * * We return the current commit lsn to allow the callers to determine if a * iclog flush is necessary following this call. */ xfs_lsn_t xlog_cil_force_seq( struct xlog *log, xfs_csn_t sequence) { struct xfs_cil *cil = log->l_cilp; struct xfs_cil_ctx *ctx; xfs_lsn_t commit_lsn = NULLCOMMITLSN; ASSERT(sequence <= cil->xc_current_sequence); if (!sequence) sequence = cil->xc_current_sequence; trace_xfs_log_force(log->l_mp, sequence, _RET_IP_); /* * check to see if we need to force out the current context. * xlog_cil_push() handles racing pushes for the same sequence, * so no need to deal with it here. */ restart: xlog_cil_push_now(log, sequence, false); /* * See if we can find a previous sequence still committing. * We need to wait for all previous sequence commits to complete * before allowing the force of push_seq to go ahead. Hence block * on commits for those as well. */ spin_lock(&cil->xc_push_lock); list_for_each_entry(ctx, &cil->xc_committing, committing) { /* * Avoid getting stuck in this loop because we were woken by the * shutdown, but then went back to sleep once already in the * shutdown state. */ if (xlog_is_shutdown(log)) goto out_shutdown; if (ctx->sequence > sequence) continue; if (!ctx->commit_lsn) { /* * It is still being pushed! Wait for the push to * complete, then start again from the beginning. */ XFS_STATS_INC(log->l_mp, xs_log_force_sleep); xlog_wait(&cil->xc_commit_wait, &cil->xc_push_lock); goto restart; } if (ctx->sequence != sequence) continue; /* found it! */ commit_lsn = ctx->commit_lsn; } /* * The call to xlog_cil_push_now() executes the push in the background. * Hence by the time we have got here it our sequence may not have been * pushed yet. This is true if the current sequence still matches the * push sequence after the above wait loop and the CIL still contains * dirty objects. This is guaranteed by the push code first adding the * context to the committing list before emptying the CIL. * * Hence if we don't find the context in the committing list and the * current sequence number is unchanged then the CIL contents are * significant. If the CIL is empty, if means there was nothing to push * and that means there is nothing to wait for. If the CIL is not empty, * it means we haven't yet started the push, because if it had started * we would have found the context on the committing list. */ if (sequence == cil->xc_current_sequence && !test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)) { spin_unlock(&cil->xc_push_lock); goto restart; } spin_unlock(&cil->xc_push_lock); return commit_lsn; /* * We detected a shutdown in progress. We need to trigger the log force * to pass through it's iclog state machine error handling, even though * we are already in a shutdown state. Hence we can't return * NULLCOMMITLSN here as that has special meaning to log forces (i.e. * LSN is already stable), so we return a zero LSN instead. */ out_shutdown: spin_unlock(&cil->xc_push_lock); return 0; } /* * Perform initial CIL structure initialisation. */ int xlog_cil_init( struct xlog *log) { struct xfs_cil *cil; struct xfs_cil_ctx *ctx; struct xlog_cil_pcp *cilpcp; int cpu; cil = kzalloc(sizeof(*cil), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!cil) return -ENOMEM; /* * Limit the CIL pipeline depth to 4 concurrent works to bound the * concurrency the log spinlocks will be exposed to. */ cil->xc_push_wq = alloc_workqueue("xfs-cil/%s", XFS_WQFLAGS(WQ_FREEZABLE | WQ_MEM_RECLAIM | WQ_UNBOUND), 4, log->l_mp->m_super->s_id); if (!cil->xc_push_wq) goto out_destroy_cil; cil->xc_log = log; cil->xc_pcp = alloc_percpu(struct xlog_cil_pcp); if (!cil->xc_pcp) goto out_destroy_wq; for_each_possible_cpu(cpu) { cilpcp = per_cpu_ptr(cil->xc_pcp, cpu); INIT_LIST_HEAD(&cilpcp->busy_extents); INIT_LIST_HEAD(&cilpcp->log_items); } INIT_LIST_HEAD(&cil->xc_committing); spin_lock_init(&cil->xc_push_lock); init_waitqueue_head(&cil->xc_push_wait); init_rwsem(&cil->xc_ctx_lock); init_waitqueue_head(&cil->xc_start_wait); init_waitqueue_head(&cil->xc_commit_wait); log->l_cilp = cil; ctx = xlog_cil_ctx_alloc(); xlog_cil_ctx_switch(cil, ctx); return 0; out_destroy_wq: destroy_workqueue(cil->xc_push_wq); out_destroy_cil: kfree(cil); return -ENOMEM; } void xlog_cil_destroy( struct xlog *log) { struct xfs_cil *cil = log->l_cilp; if (cil->xc_ctx) { if (cil->xc_ctx->ticket) xfs_log_ticket_put(cil->xc_ctx->ticket); kfree(cil->xc_ctx); } ASSERT(test_bit(XLOG_CIL_EMPTY, &cil->xc_flags)); free_percpu(cil->xc_pcp); destroy_workqueue(cil->xc_push_wq); kfree(cil); }
365 366 105 139 348 345 5 19 19 110 250 151 188 32 32 153 127 111 112 111 125 106 153 247 153 45 45 49 100 100 100 100 10 138 290 52 237 74 225 9 1 73 73 65 65 65 65 10 10 8 8 8 6 2 28 52 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Definitions for the 'struct ptr_ring' datastructure. * * Author: * Michael S. Tsirkin <mst@redhat.com> * * Copyright (C) 2016 Red Hat, Inc. * * This is a limited-size FIFO maintaining pointers in FIFO order, with * one CPU producing entries and another consuming entries from a FIFO. * * This implementation tries to minimize cache-contention when there is a * single producer and a single consumer CPU. */ #ifndef _LINUX_PTR_RING_H #define _LINUX_PTR_RING_H 1 #ifdef __KERNEL__ #include <linux/spinlock.h> #include <linux/cache.h> #include <linux/types.h> #include <linux/compiler.h> #include <linux/slab.h> #include <linux/mm.h> #include <asm/errno.h> #endif struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ int batch; /* number of entries to consume in a batch */ void **queue; }; /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). * * NB: this is unlike __ptr_ring_empty in that callers must hold producer_lock: * see e.g. ptr_ring_full. */ static inline bool __ptr_ring_full(struct ptr_ring *r) { return r->queue[r->producer]; } static inline bool ptr_ring_full(struct ptr_ring *r) { bool ret; spin_lock(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock(&r->producer_lock); return ret; } static inline bool ptr_ring_full_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_irq(&r->producer_lock); return ret; } static inline bool ptr_ring_full_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_full(r); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline bool ptr_ring_full_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_full(r); spin_unlock_bh(&r->producer_lock); return ret; } /* Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). Callers must hold producer_lock. * Callers are responsible for making sure pointer that is being queued * points to a valid data. */ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr) { if (unlikely(!r->size) || r->queue[r->producer]) return -ENOSPC; /* Make sure the pointer we are storing points to a valid data. */ /* Pairs with the dependency ordering in __ptr_ring_consume. */ smp_wmb(); WRITE_ONCE(r->queue[r->producer++], ptr); if (unlikely(r->producer >= r->size)) r->producer = 0; return 0; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * consume in interrupt or BH context, you must disable interrupts/BH when * calling this. */ static inline int ptr_ring_produce(struct ptr_ring *r, void *ptr) { int ret; spin_lock(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock(&r->producer_lock); return ret; } static inline int ptr_ring_produce_irq(struct ptr_ring *r, void *ptr) { int ret; spin_lock_irq(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_irq(&r->producer_lock); return ret; } static inline int ptr_ring_produce_any(struct ptr_ring *r, void *ptr) { unsigned long flags; int ret; spin_lock_irqsave(&r->producer_lock, flags); ret = __ptr_ring_produce(r, ptr); spin_unlock_irqrestore(&r->producer_lock, flags); return ret; } static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) { int ret; spin_lock_bh(&r->producer_lock); ret = __ptr_ring_produce(r, ptr); spin_unlock_bh(&r->producer_lock); return ret; } static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) return READ_ONCE(r->queue[r->consumer_head]); return NULL; } /* * Test ring empty status without taking any locks. * * NB: This is only safe to call if ring is never resized. * * However, if some other CPU consumes ring entries at the same time, the value * returned is not guaranteed to be correct. * * In this case - to avoid incorrectly detecting the ring * as empty - the CPU consuming the ring entries is responsible * for either consuming all ring entries until the ring is empty, * or synchronizing with some other CPU and causing it to * re-test __ptr_ring_empty and/or consume the ring enteries * after the synchronization point. * * Note: callers invoking this in a loop must use a compiler barrier, * for example cpu_relax(). */ static inline bool __ptr_ring_empty(struct ptr_ring *r) { if (likely(r->size)) return !r->queue[READ_ONCE(r->consumer_head)]; return true; } static inline bool ptr_ring_empty(struct ptr_ring *r) { bool ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_irq(struct ptr_ring *r) { bool ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_irq(&r->consumer_lock); return ret; } static inline bool ptr_ring_empty_any(struct ptr_ring *r) { unsigned long flags; bool ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_empty(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline bool ptr_ring_empty_bh(struct ptr_ring *r) { bool ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_empty(r); spin_unlock_bh(&r->consumer_lock); return ret; } /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { /* Fundamentally, what we want to do is update consumer * index and zero out the entry so producer can reuse it. * Doing it naively at each consume would be as simple as: * consumer = r->consumer; * r->queue[consumer++] = NULL; * if (unlikely(consumer >= r->size)) * consumer = 0; * r->consumer = consumer; * but that is suboptimal when the ring is full as producer is writing * out new entries in the same cache line. Defer these updates until a * batch of entries has been consumed. */ /* Note: we must keep consumer_head valid at all times for __ptr_ring_empty * to work correctly. */ int consumer_head = r->consumer_head; int head = consumer_head++; /* Once we have processed enough entries invalidate them in * the ring all at once so producer can reuse their space in the ring. * We also do this when we reach end of the ring - not mandatory * but helps keep the implementation simple. */ if (unlikely(consumer_head - r->consumer_tail >= r->batch || consumer_head >= r->size)) { /* Zero out entries in the reverse order: this way we touch the * cache line that producer might currently be reading the last; * producer won't make progress and touch other cache lines * besides the first one until we write out all entries. */ while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = consumer_head; } if (unlikely(consumer_head >= r->size)) { consumer_head = 0; r->consumer_tail = 0; } /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, consumer_head); } static inline void *__ptr_ring_consume(struct ptr_ring *r) { void *ptr; /* The READ_ONCE in __ptr_ring_peek guarantees that anyone * accessing data through the pointer is up to date. Pairs * with smp_wmb in __ptr_ring_produce. */ ptr = __ptr_ring_peek(r); if (ptr) __ptr_ring_discard_one(r); return ptr; } static inline int __ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { void *ptr; int i; for (i = 0; i < n; i++) { ptr = __ptr_ring_consume(r); if (!ptr) break; array[i] = ptr; } return i; } /* * Note: resize (below) nests producer lock within consumer lock, so if you * call this in interrupt or BH context, you must disable interrupts/BH when * producing. */ static inline void *ptr_ring_consume(struct ptr_ring *r) { void *ptr; spin_lock(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_irq(struct ptr_ring *r) { void *ptr; spin_lock_irq(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_irq(&r->consumer_lock); return ptr; } static inline void *ptr_ring_consume_any(struct ptr_ring *r) { unsigned long flags; void *ptr; spin_lock_irqsave(&r->consumer_lock, flags); ptr = __ptr_ring_consume(r); spin_unlock_irqrestore(&r->consumer_lock, flags); return ptr; } static inline void *ptr_ring_consume_bh(struct ptr_ring *r) { void *ptr; spin_lock_bh(&r->consumer_lock); ptr = __ptr_ring_consume(r); spin_unlock_bh(&r->consumer_lock); return ptr; } static inline int ptr_ring_consume_batched(struct ptr_ring *r, void **array, int n) { int ret; spin_lock(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_irq(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_irq(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irq(&r->consumer_lock); return ret; } static inline int ptr_ring_consume_batched_any(struct ptr_ring *r, void **array, int n) { unsigned long flags; int ret; spin_lock_irqsave(&r->consumer_lock, flags); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_irqrestore(&r->consumer_lock, flags); return ret; } static inline int ptr_ring_consume_batched_bh(struct ptr_ring *r, void **array, int n) { int ret; spin_lock_bh(&r->consumer_lock); ret = __ptr_ring_consume_batched(r, array, n); spin_unlock_bh(&r->consumer_lock); return ret; } /* Cast to structure type and call a function without discarding from FIFO. * Function must return a value. * Callers must take consumer_lock. */ #define __PTR_RING_PEEK_CALL(r, f) ((f)(__ptr_ring_peek(r))) #define PTR_RING_PEEK_CALL(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_IRQ(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irq(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_BH(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ \ spin_lock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_bh(&(r)->consumer_lock); \ __PTR_RING_PEEK_CALL_v; \ }) #define PTR_RING_PEEK_CALL_ANY(r, f) ({ \ typeof((f)(NULL)) __PTR_RING_PEEK_CALL_v; \ unsigned long __PTR_RING_PEEK_CALL_f;\ \ spin_lock_irqsave(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v = __PTR_RING_PEEK_CALL(r, f); \ spin_unlock_irqrestore(&(r)->consumer_lock, __PTR_RING_PEEK_CALL_f); \ __PTR_RING_PEEK_CALL_v; \ }) /* Not all gfp_t flags (besides GFP_KERNEL) are allowed. See * documentation for vmalloc for which of them are legal. */ static inline void **__ptr_ring_init_queue_alloc_noprof(unsigned int size, gfp_t gfp) { if (size > KMALLOC_MAX_SIZE / sizeof(void *)) return NULL; return kvmalloc_array_noprof(size, sizeof(void *), gfp | __GFP_ZERO); } static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) { r->size = size; r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); /* We need to set batch at least to 1 to make logic * in __ptr_ring_discard_one work correctly. * Batching too much (because ring is small) would cause a lot of * burstiness. Needs tuning, for now disable batching. */ if (r->batch > r->size / 2 || !r->batch) r->batch = 1; } static inline int ptr_ring_init_noprof(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!r->queue) return -ENOMEM; __ptr_ring_set_size(r, size); r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); return 0; } #define ptr_ring_init(...) alloc_hooks(ptr_ring_init_noprof(__VA_ARGS__)) /* * Return entries into ring. Destroy entries that don't fit. * * Note: this is expected to be a rare slow path operation. * * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline void ptr_ring_unconsume(struct ptr_ring *r, void **batch, int n, void (*destroy)(void *)) { unsigned long flags; int head; spin_lock_irqsave(&r->consumer_lock, flags); spin_lock(&r->producer_lock); if (!r->size) goto done; /* * Clean out buffered entries (for simplicity). This way following code * can test entries for NULL and if not assume they are valid. */ head = r->consumer_head - 1; while (likely(head >= r->consumer_tail)) r->queue[head--] = NULL; r->consumer_tail = r->consumer_head; /* * Go over entries in batch, start moving head back and copy entries. * Stop when we run into previously unconsumed entries. */ while (n) { head = r->consumer_head - 1; if (head < 0) head = r->size - 1; if (r->queue[head]) { /* This batch entry will have to be destroyed. */ goto done; } r->queue[head] = batch[--n]; r->consumer_tail = head; /* matching READ_ONCE in __ptr_ring_empty for lockless tests */ WRITE_ONCE(r->consumer_head, head); } done: /* Destroy all entries left in the batch. */ while (n) destroy(batch[--n]); spin_unlock(&r->producer_lock); spin_unlock_irqrestore(&r->consumer_lock, flags); } static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, int size, gfp_t gfp, void (*destroy)(void *)) { int producer = 0; void **old; void *ptr; while ((ptr = __ptr_ring_consume(r))) if (producer < size) queue[producer++] = ptr; else if (destroy) destroy(ptr); if (producer >= size) producer = 0; __ptr_ring_set_size(r, size); r->producer = producer; r->consumer_head = 0; r->consumer_tail = 0; old = r->queue; r->queue = queue; return old; } /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in interrupt or BH context, you must * disable interrupts/BH when doing so. */ static inline int ptr_ring_resize_noprof(struct ptr_ring *r, int size, gfp_t gfp, void (*destroy)(void *)) { unsigned long flags; void **queue = __ptr_ring_init_queue_alloc_noprof(size, gfp); void **old; if (!queue) return -ENOMEM; spin_lock_irqsave(&(r)->consumer_lock, flags); spin_lock(&(r)->producer_lock); old = __ptr_ring_swap_queue(r, queue, size, gfp, destroy); spin_unlock(&(r)->producer_lock); spin_unlock_irqrestore(&(r)->consumer_lock, flags); kvfree(old); return 0; } #define ptr_ring_resize(...) alloc_hooks(ptr_ring_resize_noprof(__VA_ARGS__)) /* * Note: producer lock is nested within consumer lock, so if you * resize you must make sure all uses nest correctly. * In particular if you consume ring in BH context, you must * disable BH when doing so. */ static inline int ptr_ring_resize_multiple_bh_noprof(struct ptr_ring **rings, unsigned int nrings, int size, gfp_t gfp, void (*destroy)(void *)) { void ***queues; int i; queues = kmalloc_array_noprof(nrings, sizeof(*queues), gfp); if (!queues) goto noqueues; for (i = 0; i < nrings; ++i) { queues[i] = __ptr_ring_init_queue_alloc_noprof(size, gfp); if (!queues[i]) goto nomem; } for (i = 0; i < nrings; ++i) { spin_lock_bh(&(rings[i])->consumer_lock); spin_lock(&(rings[i])->producer_lock); queues[i] = __ptr_ring_swap_queue(rings[i], queues[i], size, gfp, destroy); spin_unlock(&(rings[i])->producer_lock); spin_unlock_bh(&(rings[i])->consumer_lock); } for (i = 0; i < nrings; ++i) kvfree(queues[i]); kfree(queues); return 0; nomem: while (--i >= 0) kvfree(queues[i]); kfree(queues); noqueues: return -ENOMEM; } #define ptr_ring_resize_multiple_bh(...) \ alloc_hooks(ptr_ring_resize_multiple_bh_noprof(__VA_ARGS__)) static inline void ptr_ring_cleanup(struct ptr_ring *r, void (*destroy)(void *)) { void *ptr; if (destroy) while ((ptr = ptr_ring_consume(r))) destroy(ptr); kvfree(r->queue); } #endif /* _LINUX_PTR_RING_H */
18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 /* * llc_pdu.c - access to PDU internals * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo <acme@conectiva.com.br> * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include <linux/netdevice.h> #include <net/llc_pdu.h> static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type); static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu); void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) { llc_pdu_un_hdr(skb)->ssap |= pdu_type; } /** * llc_pdu_set_pf_bit - sets poll/final bit in LLC header * @skb: Frame to set bit in * @bit_value: poll/final bit (0 or 1). * * This function sets poll/final bit in LLC header (based on type of PDU). * in I or S pdus, p/f bit is right bit of fourth byte in header. in U * pdus p/f bit is fifth bit of third byte. */ void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value) { u8 pdu_type; struct llc_pdu_sn *pdu; llc_pdu_decode_pdu_type(skb, &pdu_type); pdu = llc_pdu_sn_hdr(skb); switch (pdu_type) { case LLC_PDU_TYPE_I: case LLC_PDU_TYPE_S: pdu->ctrl_2 = (pdu->ctrl_2 & 0xFE) | bit_value; break; case LLC_PDU_TYPE_U: pdu->ctrl_1 |= (pdu->ctrl_1 & 0xEF) | (bit_value << 4); break; } } /** * llc_pdu_decode_pf_bit - extracs poll/final bit from LLC header * @skb: input skb that p/f bit must be extracted from it * @pf_bit: poll/final bit (0 or 1) * * This function extracts poll/final bit from LLC header (based on type of * PDU). In I or S pdus, p/f bit is right bit of fourth byte in header. In * U pdus p/f bit is fifth bit of third byte. */ void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit) { u8 pdu_type; struct llc_pdu_sn *pdu; llc_pdu_decode_pdu_type(skb, &pdu_type); pdu = llc_pdu_sn_hdr(skb); switch (pdu_type) { case LLC_PDU_TYPE_I: case LLC_PDU_TYPE_S: *pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; break; case LLC_PDU_TYPE_U: *pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; break; } } /** * llc_pdu_init_as_disc_cmd - Builds DISC PDU * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * * Builds a pdu frame as a DISC command. */ void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_CMD_DISC; pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_i_cmd - builds I pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @ns: The sequence number of the data PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as an I command. */ void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_I; pdu->ctrl_2 = 0; pdu->ctrl_2 |= (p_bit & LLC_I_PF_BIT_MASK); /* p/f bit */ pdu->ctrl_1 |= (ns << 1) & 0xFE; /* set N(S) in bits 2..8 */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rej_cmd - builds REJ PDU * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as a REJ command. */ void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_CMD_REJ; pdu->ctrl_2 = 0; pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rnr_cmd - builds RNR pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as an RNR command. */ void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_CMD_RNR; pdu->ctrl_2 = 0; pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rr_cmd - Builds RR pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * @nr: The seq. number of the expected I PDU from the remote * * Builds a pdu frame as an RR command. */ void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_CMD_RR; pdu->ctrl_2 = p_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_sabme_cmd - builds SABME pdu * @skb: Address of the skb to build * @p_bit: The P bit to set in the PDU * * Builds a pdu frame as an SABME command. */ void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_CMD_SABME; pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_dm_rsp - builds DM response pdu * @skb: Address of the skb to build * @f_bit: The F bit to set in the PDU * * Builds a pdu frame as a DM response. */ void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_RSP_DM; pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_init_as_frmr_rsp - builds FRMR response PDU * @skb: Address of the frame to build * @prev_pdu: The rejected PDU frame * @f_bit: The F bit to set in the PDU * @vs: tx state vari value for the data link conn at the rejecting LLC * @vr: rx state var value for the data link conn at the rejecting LLC * @vzyxw: completely described in the IEEE Std 802.2 document (Pg 55) * * Builds a pdu frame as a FRMR response. */ void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, u8 f_bit, u8 vs, u8 vr, u8 vzyxw) { struct llc_frmr_info *frmr_info; u8 prev_pf = 0; u8 *ctrl; struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_RSP_FRMR; pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; frmr_info = (struct llc_frmr_info *)&pdu->ctrl_2; ctrl = (u8 *)&prev_pdu->ctrl_1; FRMR_INFO_SET_REJ_CNTRL(frmr_info,ctrl); FRMR_INFO_SET_Vs(frmr_info, vs); FRMR_INFO_SET_Vr(frmr_info, vr); prev_pf = llc_pdu_get_pf_bit(prev_pdu); FRMR_INFO_SET_C_R_BIT(frmr_info, prev_pf); FRMR_INFO_SET_INVALID_PDU_CTRL_IND(frmr_info, vzyxw); FRMR_INFO_SET_INVALID_PDU_INFO_IND(frmr_info, vzyxw); FRMR_INFO_SET_PDU_INFO_2LONG_IND(frmr_info, vzyxw); FRMR_INFO_SET_PDU_INVALID_Nr_IND(frmr_info, vzyxw); FRMR_INFO_SET_PDU_INVALID_Ns_IND(frmr_info, vzyxw); skb_put(skb, sizeof(struct llc_frmr_info)); } /** * llc_pdu_init_as_rr_rsp - builds RR response pdu * @skb: Address of the skb to build * @f_bit: The F bit to set in the PDU * @nr: The seq. number of the expected data PDU from the remote * * Builds a pdu frame as an RR response. */ void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_RSP_RR; pdu->ctrl_2 = 0; pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rej_rsp - builds REJ response pdu * @skb: Address of the skb to build * @f_bit: The F bit to set in the PDU * @nr: The seq. number of the expected data PDU from the remote * * Builds a pdu frame as a REJ response. */ void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_RSP_REJ; pdu->ctrl_2 = 0; pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_rnr_rsp - builds RNR response pdu * @skb: Address of the frame to build * @f_bit: The F bit to set in the PDU * @nr: The seq. number of the expected data PDU from the remote * * Builds a pdu frame as an RNR response. */ void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) { struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_S; pdu->ctrl_1 |= LLC_2_PDU_RSP_RNR; pdu->ctrl_2 = 0; pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ } /** * llc_pdu_init_as_ua_rsp - builds UA response pdu * @skb: Address of the frame to build * @f_bit: The F bit to set in the PDU * * Builds a pdu frame as a UA response. */ void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); pdu->ctrl_1 = LLC_PDU_TYPE_U; pdu->ctrl_1 |= LLC_2_PDU_RSP_UA; pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; } /** * llc_pdu_decode_pdu_type - designates PDU type * @skb: input skb that type of it must be designated. * @type: type of PDU (output argument). * * This function designates type of PDU (I, S or U). */ static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type) { struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); if (pdu->ctrl_1 & 1) { if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) *type = LLC_PDU_TYPE_U; else *type = LLC_PDU_TYPE_S; } else *type = LLC_PDU_TYPE_I; } /** * llc_pdu_get_pf_bit - extracts p/f bit of input PDU * @pdu: pointer to LLC header. * * This function extracts p/f bit of input PDU. at first examines type of * PDU and then extracts p/f bit. Returns the p/f bit. */ static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu) { u8 pdu_type; u8 pf_bit = 0; if (pdu->ctrl_1 & 1) { if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) pdu_type = LLC_PDU_TYPE_U; else pdu_type = LLC_PDU_TYPE_S; } else pdu_type = LLC_PDU_TYPE_I; switch (pdu_type) { case LLC_PDU_TYPE_I: case LLC_PDU_TYPE_S: pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; break; case LLC_PDU_TYPE_U: pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; break; } return pf_bit; }
856 859 862 861 860 859 860 115 116 116 76 1 18 18 18 18 18 18 18 18 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Pseudo-driver for the loopback interface. * * Version: @(#)loopback.c 1.0.4b 08/16/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Donald Becker, <becker@scyld.com> * * Alan Cox : Fixed oddments for NET3.014 * Alan Cox : Rejig for NET3.029 snap #3 * Alan Cox : Fixed NET3.029 bugs and sped up * Larry McVoy : Tiny tweak to double performance * Alan Cox : Backed out LMV's tweak - the linux mm * can't take it... * Michael Griffith: Don't bother computing the checksums * on packets received on the loopback * interface. * Alexey Kuznetsov: Potential hang under some extreme * cases removed. */ #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/module.h> #include <linux/interrupt.h> #include <linux/fs.h> #include <linux/types.h> #include <linux/string.h> #include <linux/socket.h> #include <linux/errno.h> #include <linux/fcntl.h> #include <linux/in.h> #include <linux/uaccess.h> #include <linux/io.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/skbuff.h> #include <linux/ethtool.h> #include <net/sch_generic.h> #include <net/sock.h> #include <net/checksum.h> #include <linux/if_ether.h> /* For the statistics structure. */ #include <linux/if_arp.h> /* For ARPHRD_ETHER */ #include <linux/ip.h> #include <linux/tcp.h> #include <linux/percpu.h> #include <linux/net_tstamp.h> #include <net/net_namespace.h> #include <net/netdev_lock.h> #include <linux/u64_stats_sync.h> /* blackhole_netdev - a device used for dsts that are marked expired! * This is global device (instead of per-net-ns) since it's not needed * to be per-ns and gets initialized at boot time. */ struct net_device *blackhole_netdev; EXPORT_SYMBOL(blackhole_netdev); /* The higher levels take care of making this non-reentrant (it's * called with bh's disabled). */ static netdev_tx_t loopback_xmit(struct sk_buff *skb, struct net_device *dev) { int len; skb_tx_timestamp(skb); /* do not fool net_timestamp_check() with various clock bases */ skb_clear_tstamp(skb); skb_orphan(skb); /* Before queueing this packet to __netif_rx(), * make sure dst is refcounted. */ skb_dst_force(skb); skb->protocol = eth_type_trans(skb, dev); len = skb->len; if (likely(__netif_rx(skb) == NET_RX_SUCCESS)) dev_lstats_add(dev, len); return NETDEV_TX_OK; } void dev_lstats_read(struct net_device *dev, u64 *packets, u64 *bytes) { int i; *packets = 0; *bytes = 0; for_each_possible_cpu(i) { const struct pcpu_lstats *lb_stats; u64 tbytes, tpackets; unsigned int start; lb_stats = per_cpu_ptr(dev->lstats, i); do { start = u64_stats_fetch_begin(&lb_stats->syncp); tpackets = u64_stats_read(&lb_stats->packets); tbytes = u64_stats_read(&lb_stats->bytes); } while (u64_stats_fetch_retry(&lb_stats->syncp, start)); *bytes += tbytes; *packets += tpackets; } } EXPORT_SYMBOL(dev_lstats_read); static void loopback_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { u64 packets, bytes; dev_lstats_read(dev, &packets, &bytes); stats->rx_packets = packets; stats->tx_packets = packets; stats->rx_bytes = bytes; stats->tx_bytes = bytes; } static u32 always_on(struct net_device *dev) { return 1; } static const struct ethtool_ops loopback_ethtool_ops = { .get_link = always_on, .get_ts_info = ethtool_op_get_ts_info, }; static int loopback_dev_init(struct net_device *dev) { netdev_lockdep_set_classes(dev); return 0; } static void loopback_dev_free(struct net_device *dev) { dev_net(dev)->loopback_dev = NULL; } static const struct net_device_ops loopback_ops = { .ndo_init = loopback_dev_init, .ndo_start_xmit = loopback_xmit, .ndo_get_stats64 = loopback_get_stats64, .ndo_set_mac_address = eth_mac_addr, }; static void gen_lo_setup(struct net_device *dev, unsigned int mtu, const struct ethtool_ops *eth_ops, const struct header_ops *hdr_ops, const struct net_device_ops *dev_ops, void (*dev_destructor)(struct net_device *dev)) { dev->mtu = mtu; dev->hard_header_len = ETH_HLEN; /* 14 */ dev->min_header_len = ETH_HLEN; /* 14 */ dev->addr_len = ETH_ALEN; /* 6 */ dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; dev->lltx = true; dev->netns_immutable = true; netif_keep_dst(dev); dev->hw_features = NETIF_F_GSO_SOFTWARE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE | NETIF_F_HW_CSUM | NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; dev->ethtool_ops = eth_ops; dev->header_ops = hdr_ops; dev->netdev_ops = dev_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; dev->priv_destructor = dev_destructor; netif_set_tso_max_size(dev, GSO_MAX_SIZE); } /* The loopback device is special. There is only one instance * per network namespace. */ static void loopback_setup(struct net_device *dev) { gen_lo_setup(dev, (64 * 1024), &loopback_ethtool_ops, &eth_header_ops, &loopback_ops, loopback_dev_free); } /* Setup and register the loopback device. */ static __net_init int loopback_net_init(struct net *net) { struct net_device *dev; int err; err = -ENOMEM; dev = alloc_netdev(0, "lo", NET_NAME_PREDICTABLE, loopback_setup); if (!dev) goto out; dev_net_set(dev, net); err = register_netdev(dev); if (err) goto out_free_netdev; BUG_ON(dev->ifindex != LOOPBACK_IFINDEX); net->loopback_dev = dev; return 0; out_free_netdev: free_netdev(dev); out: if (net_eq(net, &init_net)) panic("loopback: Failed to register netdevice: %d\n", err); return err; } /* Registered in net/core/dev.c */ struct pernet_operations __net_initdata loopback_net_ops = { .init = loopback_net_init, }; /* blackhole netdevice */ static netdev_tx_t blackhole_netdev_xmit(struct sk_buff *skb, struct net_device *dev) { kfree_skb(skb); net_warn_ratelimited("%s(): Dropping skb.\n", __func__); return NETDEV_TX_OK; } static int blackhole_neigh_output(struct neighbour *n, struct sk_buff *skb) { kfree_skb(skb); return 0; } static int blackhole_neigh_construct(struct net_device *dev, struct neighbour *n) { n->output = blackhole_neigh_output; return 0; } static const struct net_device_ops blackhole_netdev_ops = { .ndo_start_xmit = blackhole_netdev_xmit, .ndo_neigh_construct = blackhole_neigh_construct, }; /* This is a dst-dummy device used specifically for invalidated * DSTs and unlike loopback, this is not per-ns. */ static void blackhole_netdev_setup(struct net_device *dev) { gen_lo_setup(dev, ETH_MIN_MTU, NULL, NULL, &blackhole_netdev_ops, NULL); } /* Setup and register the blackhole_netdev. */ static int __init blackhole_netdev_init(void) { blackhole_netdev = alloc_netdev(0, "blackhole_dev", NET_NAME_UNKNOWN, blackhole_netdev_setup); if (!blackhole_netdev) return -ENOMEM; rtnl_net_lock(&init_net); dev_init_scheduler(blackhole_netdev); dev_activate(blackhole_netdev); rtnl_net_unlock(&init_net); blackhole_netdev->flags |= IFF_UP | IFF_RUNNING; return 0; } device_initcall(blackhole_netdev_init);
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 // SPDX-License-Identifier: GPL-2.0-or-later // SPI init/core code // // Copyright (C) 2005 David Brownell // Copyright (C) 2008 Secret Lab Technologies Ltd. #include <linux/acpi.h> #include <linux/cache.h> #include <linux/clk/clk-conf.h> #include <linux/delay.h> #include <linux/device.h> #include <linux/dmaengine.h> #include <linux/dma-mapping.h> #include <linux/export.h> #include <linux/gpio/consumer.h> #include <linux/highmem.h> #include <linux/idr.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/kthread.h> #include <linux/mod_devicetable.h> #include <linux/mutex.h> #include <linux/of_device.h> #include <linux/of_irq.h> #include <linux/percpu.h> #include <linux/platform_data/x86/apple.h> #include <linux/pm_domain.h> #include <linux/pm_runtime.h> #include <linux/property.h> #include <linux/ptp_clock_kernel.h> #include <linux/sched/rt.h> #include <linux/slab.h> #include <linux/spi/offload/types.h> #include <linux/spi/spi.h> #include <linux/spi/spi-mem.h> #include <uapi/linux/sched/types.h> #define CREATE_TRACE_POINTS #include <trace/events/spi.h> EXPORT_TRACEPOINT_SYMBOL(spi_transfer_start); EXPORT_TRACEPOINT_SYMBOL(spi_transfer_stop); #include "internals.h" static DEFINE_IDR(spi_controller_idr); static void spidev_release(struct device *dev) { struct spi_device *spi = to_spi_device(dev); spi_controller_put(spi->controller); kfree(spi->driver_override); free_percpu(spi->pcpu_statistics); kfree(spi); } static ssize_t modalias_show(struct device *dev, struct device_attribute *a, char *buf) { const struct spi_device *spi = to_spi_device(dev); int len; len = acpi_device_modalias(dev, buf, PAGE_SIZE - 1); if (len != -ENODEV) return len; return sysfs_emit(buf, "%s%s\n", SPI_MODULE_PREFIX, spi->modalias); } static DEVICE_ATTR_RO(modalias); static ssize_t driver_override_store(struct device *dev, struct device_attribute *a, const char *buf, size_t count) { struct spi_device *spi = to_spi_device(dev); int ret; ret = driver_set_override(dev, &spi->driver_override, buf, count); if (ret) return ret; return count; } static ssize_t driver_override_show(struct device *dev, struct device_attribute *a, char *buf) { const struct spi_device *spi = to_spi_device(dev); ssize_t len; device_lock(dev); len = sysfs_emit(buf, "%s\n", spi->driver_override ? : ""); device_unlock(dev); return len; } static DEVICE_ATTR_RW(driver_override); static struct spi_statistics __percpu *spi_alloc_pcpu_stats(struct device *dev) { struct spi_statistics __percpu *pcpu_stats; if (dev) pcpu_stats = devm_alloc_percpu(dev, struct spi_statistics); else pcpu_stats = alloc_percpu_gfp(struct spi_statistics, GFP_KERNEL); if (pcpu_stats) { int cpu; for_each_possible_cpu(cpu) { struct spi_statistics *stat; stat = per_cpu_ptr(pcpu_stats, cpu); u64_stats_init(&stat->syncp); } } return pcpu_stats; } static ssize_t spi_emit_pcpu_stats(struct spi_statistics __percpu *stat, char *buf, size_t offset) { u64 val = 0; int i; for_each_possible_cpu(i) { const struct spi_statistics *pcpu_stats; u64_stats_t *field; unsigned int start; u64 inc; pcpu_stats = per_cpu_ptr(stat, i); field = (void *)pcpu_stats + offset; do { start = u64_stats_fetch_begin(&pcpu_stats->syncp); inc = u64_stats_read(field); } while (u64_stats_fetch_retry(&pcpu_stats->syncp, start)); val += inc; } return sysfs_emit(buf, "%llu\n", val); } #define SPI_STATISTICS_ATTRS(field, file) \ static ssize_t spi_controller_##field##_show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct spi_controller *ctlr = container_of(dev, \ struct spi_controller, dev); \ return spi_statistics_##field##_show(ctlr->pcpu_statistics, buf); \ } \ static struct device_attribute dev_attr_spi_controller_##field = { \ .attr = { .name = file, .mode = 0444 }, \ .show = spi_controller_##field##_show, \ }; \ static ssize_t spi_device_##field##_show(struct device *dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct spi_device *spi = to_spi_device(dev); \ return spi_statistics_##field##_show(spi->pcpu_statistics, buf); \ } \ static struct device_attribute dev_attr_spi_device_##field = { \ .attr = { .name = file, .mode = 0444 }, \ .show = spi_device_##field##_show, \ } #define SPI_STATISTICS_SHOW_NAME(name, file, field) \ static ssize_t spi_statistics_##name##_show(struct spi_statistics __percpu *stat, \ char *buf) \ { \ return spi_emit_pcpu_stats(stat, buf, \ offsetof(struct spi_statistics, field)); \ } \ SPI_STATISTICS_ATTRS(name, file) #define SPI_STATISTICS_SHOW(field) \ SPI_STATISTICS_SHOW_NAME(field, __stringify(field), \ field) SPI_STATISTICS_SHOW(messages); SPI_STATISTICS_SHOW(transfers); SPI_STATISTICS_SHOW(errors); SPI_STATISTICS_SHOW(timedout); SPI_STATISTICS_SHOW(spi_sync); SPI_STATISTICS_SHOW(spi_sync_immediate); SPI_STATISTICS_SHOW(spi_async); SPI_STATISTICS_SHOW(bytes); SPI_STATISTICS_SHOW(bytes_rx); SPI_STATISTICS_SHOW(bytes_tx); #define SPI_STATISTICS_TRANSFER_BYTES_HISTO(index, number) \ SPI_STATISTICS_SHOW_NAME(transfer_bytes_histo##index, \ "transfer_bytes_histo_" number, \ transfer_bytes_histo[index]) SPI_STATISTICS_TRANSFER_BYTES_HISTO(0, "0-1"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(1, "2-3"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(2, "4-7"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(3, "8-15"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(4, "16-31"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(5, "32-63"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(6, "64-127"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(7, "128-255"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(8, "256-511"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(9, "512-1023"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(10, "1024-2047"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(11, "2048-4095"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(12, "4096-8191"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(13, "8192-16383"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(14, "16384-32767"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(15, "32768-65535"); SPI_STATISTICS_TRANSFER_BYTES_HISTO(16, "65536+"); SPI_STATISTICS_SHOW(transfers_split_maxsize); static struct attribute *spi_dev_attrs[] = { &dev_attr_modalias.attr, &dev_attr_driver_override.attr, NULL, }; static const struct attribute_group spi_dev_group = { .attrs = spi_dev_attrs, }; static struct attribute *spi_device_statistics_attrs[] = { &dev_attr_spi_device_messages.attr, &dev_attr_spi_device_transfers.attr, &dev_attr_spi_device_errors.attr, &dev_attr_spi_device_timedout.attr, &dev_attr_spi_device_spi_sync.attr, &dev_attr_spi_device_spi_sync_immediate.attr, &dev_attr_spi_device_spi_async.attr, &dev_attr_spi_device_bytes.attr, &dev_attr_spi_device_bytes_rx.attr, &dev_attr_spi_device_bytes_tx.attr, &dev_attr_spi_device_transfer_bytes_histo0.attr, &dev_attr_spi_device_transfer_bytes_histo1.attr, &dev_attr_spi_device_transfer_bytes_histo2.attr, &dev_attr_spi_device_transfer_bytes_histo3.attr, &dev_attr_spi_device_transfer_bytes_histo4.attr, &dev_attr_spi_device_transfer_bytes_histo5.attr, &dev_attr_spi_device_transfer_bytes_histo6.attr, &dev_attr_spi_device_transfer_bytes_histo7.attr, &dev_attr_spi_device_transfer_bytes_histo8.attr, &dev_attr_spi_device_transfer_bytes_histo9.attr, &dev_attr_spi_device_transfer_bytes_histo10.attr, &dev_attr_spi_device_transfer_bytes_histo11.attr, &dev_attr_spi_device_transfer_bytes_histo12.attr, &dev_attr_spi_device_transfer_bytes_histo13.attr, &dev_attr_spi_device_transfer_bytes_histo14.attr, &dev_attr_spi_device_transfer_bytes_histo15.attr, &dev_attr_spi_device_transfer_bytes_histo16.attr, &dev_attr_spi_device_transfers_split_maxsize.attr, NULL, }; static const struct attribute_group spi_device_statistics_group = { .name = "statistics", .attrs = spi_device_statistics_attrs, }; static const struct attribute_group *spi_dev_groups[] = { &spi_dev_group, &spi_device_statistics_group, NULL, }; static struct attribute *spi_controller_statistics_attrs[] = { &dev_attr_spi_controller_messages.attr, &dev_attr_spi_controller_transfers.attr, &dev_attr_spi_controller_errors.attr, &dev_attr_spi_controller_timedout.attr, &dev_attr_spi_controller_spi_sync.attr, &dev_attr_spi_controller_spi_sync_immediate.attr, &dev_attr_spi_controller_spi_async.attr, &dev_attr_spi_controller_bytes.attr, &dev_attr_spi_controller_bytes_rx.attr, &dev_attr_spi_controller_bytes_tx.attr, &dev_attr_spi_controller_transfer_bytes_histo0.attr, &dev_attr_spi_controller_transfer_bytes_histo1.attr, &dev_attr_spi_controller_transfer_bytes_histo2.attr, &dev_attr_spi_controller_transfer_bytes_histo3.attr, &dev_attr_spi_controller_transfer_bytes_histo4.attr, &dev_attr_spi_controller_transfer_bytes_histo5.attr, &dev_attr_spi_controller_transfer_bytes_histo6.attr, &dev_attr_spi_controller_transfer_bytes_histo7.attr, &dev_attr_spi_controller_transfer_bytes_histo8.attr, &dev_attr_spi_controller_transfer_bytes_histo9.attr, &dev_attr_spi_controller_transfer_bytes_histo10.attr, &dev_attr_spi_controller_transfer_bytes_histo11.attr, &dev_attr_spi_controller_transfer_bytes_histo12.attr, &dev_attr_spi_controller_transfer_bytes_histo13.attr, &dev_attr_spi_controller_transfer_bytes_histo14.attr, &dev_attr_spi_controller_transfer_bytes_histo15.attr, &dev_attr_spi_controller_transfer_bytes_histo16.attr, &dev_attr_spi_controller_transfers_split_maxsize.attr, NULL, }; static const struct attribute_group spi_controller_statistics_group = { .name = "statistics", .attrs = spi_controller_statistics_attrs, }; static const struct attribute_group *spi_controller_groups[] = { &spi_controller_statistics_group, NULL, }; static void spi_statistics_add_transfer_stats(struct spi_statistics __percpu *pcpu_stats, struct spi_transfer *xfer, struct spi_message *msg) { int l2len = min(fls(xfer->len), SPI_STATISTICS_HISTO_SIZE) - 1; struct spi_statistics *stats; if (l2len < 0) l2len = 0; get_cpu(); stats = this_cpu_ptr(pcpu_stats); u64_stats_update_begin(&stats->syncp); u64_stats_inc(&stats->transfers); u64_stats_inc(&stats->transfer_bytes_histo[l2len]); u64_stats_add(&stats->bytes, xfer->len); if (spi_valid_txbuf(msg, xfer)) u64_stats_add(&stats->bytes_tx, xfer->len); if (spi_valid_rxbuf(msg, xfer)) u64_stats_add(&stats->bytes_rx, xfer->len); u64_stats_update_end(&stats->syncp); put_cpu(); } /* * modalias support makes "modprobe $MODALIAS" new-style hotplug work, * and the sysfs version makes coldplug work too. */ static const struct spi_device_id *spi_match_id(const struct spi_device_id *id, const char *name) { while (id->name[0]) { if (!strcmp(name, id->name)) return id; id++; } return NULL; } const struct spi_device_id *spi_get_device_id(const struct spi_device *sdev) { const struct spi_driver *sdrv = to_spi_driver(sdev->dev.driver); return spi_match_id(sdrv->id_table, sdev->modalias); } EXPORT_SYMBOL_GPL(spi_get_device_id); const void *spi_get_device_match_data(const struct spi_device *sdev) { const void *match; match = device_get_match_data(&sdev->dev); if (match) return match; return (const void *)spi_get_device_id(sdev)->driver_data; } EXPORT_SYMBOL_GPL(spi_get_device_match_data); static int spi_match_device(struct device *dev, const struct device_driver *drv) { const struct spi_device *spi = to_spi_device(dev); const struct spi_driver *sdrv = to_spi_driver(drv); /* Check override first, and if set, only use the named driver */ if (spi->driver_override) return strcmp(spi->driver_override, drv->name) == 0; /* Attempt an OF style match */ if (of_driver_match_device(dev, drv)) return 1; /* Then try ACPI */ if (acpi_driver_match_device(dev, drv)) return 1; if (sdrv->id_table) return !!spi_match_id(sdrv->id_table, spi->modalias); return strcmp(spi->modalias, drv->name) == 0; } static int spi_uevent(const struct device *dev, struct kobj_uevent_env *env) { const struct spi_device *spi = to_spi_device(dev); int rc; rc = acpi_device_uevent_modalias(dev, env); if (rc != -ENODEV) return rc; return add_uevent_var(env, "MODALIAS=%s%s", SPI_MODULE_PREFIX, spi->modalias); } static int spi_probe(struct device *dev) { const struct spi_driver *sdrv = to_spi_driver(dev->driver); struct spi_device *spi = to_spi_device(dev); struct fwnode_handle *fwnode = dev_fwnode(dev); int ret; ret = of_clk_set_defaults(dev->of_node, false); if (ret) return ret; if (is_of_node(fwnode)) spi->irq = of_irq_get(dev->of_node, 0); else if (is_acpi_device_node(fwnode) && spi->irq < 0) spi->irq = acpi_dev_gpio_irq_get(to_acpi_device_node(fwnode), 0); if (spi->irq == -EPROBE_DEFER) return dev_err_probe(dev, spi->irq, "Failed to get irq\n"); if (spi->irq < 0) spi->irq = 0; ret = dev_pm_domain_attach(dev, true); if (ret) return ret; if (sdrv->probe) { ret = sdrv->probe(spi); if (ret) dev_pm_domain_detach(dev, true); } return ret; } static void spi_remove(struct device *dev) { const struct spi_driver *sdrv = to_spi_driver(dev->driver); if (sdrv->remove) sdrv->remove(to_spi_device(dev)); dev_pm_domain_detach(dev, true); } static void spi_shutdown(struct device *dev) { if (dev->driver) { const struct spi_driver *sdrv = to_spi_driver(dev->driver); if (sdrv->shutdown) sdrv->shutdown(to_spi_device(dev)); } } const struct bus_type spi_bus_type = { .name = "spi", .dev_groups = spi_dev_groups, .match = spi_match_device, .uevent = spi_uevent, .probe = spi_probe, .remove = spi_remove, .shutdown = spi_shutdown, }; EXPORT_SYMBOL_GPL(spi_bus_type); /** * __spi_register_driver - register a SPI driver * @owner: owner module of the driver to register * @sdrv: the driver to register * Context: can sleep * * Return: zero on success, else a negative error code. */ int __spi_register_driver(struct module *owner, struct spi_driver *sdrv) { sdrv->driver.owner = owner; sdrv->driver.bus = &spi_bus_type; /* * For Really Good Reasons we use spi: modaliases not of: * modaliases for DT so module autoloading won't work if we * don't have a spi_device_id as well as a compatible string. */ if (sdrv->driver.of_match_table) { const struct of_device_id *of_id; for (of_id = sdrv->driver.of_match_table; of_id->compatible[0]; of_id++) { const char *of_name; /* Strip off any vendor prefix */ of_name = strnchr(of_id->compatible, sizeof(of_id->compatible), ','); if (of_name) of_name++; else of_name = of_id->compatible; if (sdrv->id_table) { const struct spi_device_id *spi_id; spi_id = spi_match_id(sdrv->id_table, of_name); if (spi_id) continue; } else { if (strcmp(sdrv->driver.name, of_name) == 0) continue; } pr_warn("SPI driver %s has no spi_device_id for %s\n", sdrv->driver.name, of_id->compatible); } } return driver_register(&sdrv->driver); } EXPORT_SYMBOL_GPL(__spi_register_driver); /*-------------------------------------------------------------------------*/ /* * SPI devices should normally not be created by SPI device drivers; that * would make them board-specific. Similarly with SPI controller drivers. * Device registration normally goes into like arch/.../mach.../board-YYY.c * with other readonly (flashable) information about mainboard devices. */ struct boardinfo { struct list_head list; struct spi_board_info board_info; }; static LIST_HEAD(board_list); static LIST_HEAD(spi_controller_list); /* * Used to protect add/del operation for board_info list and * spi_controller list, and their matching process also used * to protect object of type struct idr. */ static DEFINE_MUTEX(board_lock); /** * spi_alloc_device - Allocate a new SPI device * @ctlr: Controller to which device is connected * Context: can sleep * * Allows a driver to allocate and initialize a spi_device without * registering it immediately. This allows a driver to directly * fill the spi_device with device parameters before calling * spi_add_device() on it. * * Caller is responsible to call spi_add_device() on the returned * spi_device structure to add it to the SPI controller. If the caller * needs to discard the spi_device without adding it, then it should * call spi_dev_put() on it. * * Return: a pointer to the new device, or NULL. */ struct spi_device *spi_alloc_device(struct spi_controller *ctlr) { struct spi_device *spi; if (!spi_controller_get(ctlr)) return NULL; spi = kzalloc(sizeof(*spi), GFP_KERNEL); if (!spi) { spi_controller_put(ctlr); return NULL; } spi->pcpu_statistics = spi_alloc_pcpu_stats(NULL); if (!spi->pcpu_statistics) { kfree(spi); spi_controller_put(ctlr); return NULL; } spi->controller = ctlr; spi->dev.parent = &ctlr->dev; spi->dev.bus = &spi_bus_type; spi->dev.release = spidev_release; spi->mode = ctlr->buswidth_override_bits; device_initialize(&spi->dev); return spi; } EXPORT_SYMBOL_GPL(spi_alloc_device); static void spi_dev_set_name(struct spi_device *spi) { struct device *dev = &spi->dev; struct fwnode_handle *fwnode = dev_fwnode(dev); if (is_acpi_device_node(fwnode)) { dev_set_name(dev, "spi-%s", acpi_dev_name(to_acpi_device_node(fwnode))); return; } if (is_software_node(fwnode)) { dev_set_name(dev, "spi-%pfwP", fwnode); return; } dev_set_name(&spi->dev, "%s.%u", dev_name(&spi->controller->dev), spi_get_chipselect(spi, 0)); } /* * Zero(0) is a valid physical CS value and can be located at any * logical CS in the spi->chip_select[]. If all the physical CS * are initialized to 0 then It would be difficult to differentiate * between a valid physical CS 0 & an unused logical CS whose physical * CS can be 0. As a solution to this issue initialize all the CS to -1. * Now all the unused logical CS will have -1 physical CS value & can be * ignored while performing physical CS validity checks. */ #define SPI_INVALID_CS ((s8)-1) static inline bool is_valid_cs(s8 chip_select) { return chip_select != SPI_INVALID_CS; } static inline int spi_dev_check_cs(struct device *dev, struct spi_device *spi, u8 idx, struct spi_device *new_spi, u8 new_idx) { u8 cs, cs_new; u8 idx_new; cs = spi_get_chipselect(spi, idx); for (idx_new = new_idx; idx_new < SPI_CS_CNT_MAX; idx_new++) { cs_new = spi_get_chipselect(new_spi, idx_new); if (is_valid_cs(cs) && is_valid_cs(cs_new) && cs == cs_new) { dev_err(dev, "chipselect %u already in use\n", cs_new); return -EBUSY; } } return 0; } static int spi_dev_check(struct device *dev, void *data) { struct spi_device *spi = to_spi_device(dev); struct spi_device *new_spi = data; int status, idx; if (spi->controller == new_spi->controller) { for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { status = spi_dev_check_cs(dev, spi, idx, new_spi, 0); if (status) return status; } } return 0; } static void spi_cleanup(struct spi_device *spi) { if (spi->controller->cleanup) spi->controller->cleanup(spi); } static int __spi_add_device(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; struct device *dev = ctlr->dev.parent; int status, idx; u8 cs; for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { /* Chipselects are numbered 0..max; validate. */ cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs) && cs >= ctlr->num_chipselect) { dev_err(dev, "cs%d >= max %d\n", spi_get_chipselect(spi, idx), ctlr->num_chipselect); return -EINVAL; } } /* * Make sure that multiple logical CS doesn't map to the same physical CS. * For example, spi->chip_select[0] != spi->chip_select[1] and so on. */ if (!spi_controller_is_target(ctlr)) { for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { status = spi_dev_check_cs(dev, spi, idx, spi, idx + 1); if (status) return status; } } /* Set the bus ID string */ spi_dev_set_name(spi); /* * We need to make sure there's no other device with this * chipselect **BEFORE** we call setup(), else we'll trash * its configuration. */ status = bus_for_each_dev(&spi_bus_type, NULL, spi, spi_dev_check); if (status) return status; /* Controller may unregister concurrently */ if (IS_ENABLED(CONFIG_SPI_DYNAMIC) && !device_is_registered(&ctlr->dev)) { return -ENODEV; } if (ctlr->cs_gpiods) { u8 cs; for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) { cs = spi_get_chipselect(spi, idx); if (is_valid_cs(cs)) spi_set_csgpiod(spi, idx, ctlr->cs_gpiods[cs]); } } /* * Drivers may modify this initial i/o setup, but will * normally rely on the device being setup. Devices * using SPI_CS_HIGH can't coexist well otherwise... */ status = spi_setup(spi); if (status < 0) { dev_err(dev, "can't setup %s, status %d\n", dev_name(&spi->dev), status); return status; } /* Device may be bound to an active driver when this returns */ status = device_add(&spi->dev); if (status < 0) { dev_err(dev, "can't add %s, status %d\n", dev_name(&spi->dev), status); spi_cleanup(spi); } else { dev_dbg(dev, "registered child %s\n", dev_name(&spi->dev)); } return status; } /** * spi_add_device - Add spi_device allocated with spi_alloc_device * @spi: spi_device to register * * Companion function to spi_alloc_device. Devices allocated with * spi_alloc_device can be added onto the SPI bus with this function. * * Return: 0 on success; negative errno on failure */ int spi_add_device(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; int status; /* Set the bus ID string */ spi_dev_set_name(spi); mutex_lock(&ctlr->add_lock); status = __spi_add_device(spi); mutex_unlock(&ctlr->add_lock); return status; } EXPORT_SYMBOL_GPL(spi_add_device); static void spi_set_all_cs_unused(struct spi_device *spi) { u8 idx; for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) spi_set_chipselect(spi, idx, SPI_INVALID_CS); } /** * spi_new_device - instantiate one new SPI device * @ctlr: Controller to which device is connected * @chip: Describes the SPI device * Context: can sleep * * On typical mainboards, this is purely internal; and it's not needed * after board init creates the hard-wired devices. Some development * platforms may not be able to use spi_register_board_info though, and * this is exported so that for example a USB or parport based adapter * driver could add devices (which it would learn about out-of-band). * * Return: the new device, or NULL. */ struct spi_device *spi_new_device(struct spi_controller *ctlr, struct spi_board_info *chip) { struct spi_device *proxy; int status; /* * NOTE: caller did any chip->bus_num checks necessary. * * Also, unless we change the return value convention to use * error-or-pointer (not NULL-or-pointer), troubleshootability * suggests syslogged diagnostics are best here (ugh). */ proxy = spi_alloc_device(ctlr); if (!proxy) return NULL; WARN_ON(strlen(chip->modalias) >= sizeof(proxy->modalias)); /* Use provided chip-select for proxy device */ spi_set_all_cs_unused(proxy); spi_set_chipselect(proxy, 0, chip->chip_select); proxy->max_speed_hz = chip->max_speed_hz; proxy->mode = chip->mode; proxy->irq = chip->irq; strscpy(proxy->modalias, chip->modalias, sizeof(proxy->modalias)); proxy->dev.platform_data = (void *) chip->platform_data; proxy->controller_data = chip->controller_data; proxy->controller_state = NULL; /* * By default spi->chip_select[0] will hold the physical CS number, * so set bit 0 in spi->cs_index_mask. */ proxy->cs_index_mask = BIT(0); if (chip->swnode) { status = device_add_software_node(&proxy->dev, chip->swnode); if (status) { dev_err(&ctlr->dev, "failed to add software node to '%s': %d\n", chip->modalias, status); goto err_dev_put; } } status = spi_add_device(proxy); if (status < 0) goto err_dev_put; return proxy; err_dev_put: device_remove_software_node(&proxy->dev); spi_dev_put(proxy); return NULL; } EXPORT_SYMBOL_GPL(spi_new_device); /** * spi_unregister_device - unregister a single SPI device * @spi: spi_device to unregister * * Start making the passed SPI device vanish. Normally this would be handled * by spi_unregister_controller(). */ void spi_unregister_device(struct spi_device *spi) { struct fwnode_handle *fwnode; if (!spi) return; fwnode = dev_fwnode(&spi->dev); if (is_of_node(fwnode)) { of_node_clear_flag(to_of_node(fwnode), OF_POPULATED); of_node_put(to_of_node(fwnode)); } else if (is_acpi_device_node(fwnode)) { acpi_device_clear_enumerated(to_acpi_device_node(fwnode)); } device_remove_software_node(&spi->dev); device_del(&spi->dev); spi_cleanup(spi); put_device(&spi->dev); } EXPORT_SYMBOL_GPL(spi_unregister_device); static void spi_match_controller_to_boardinfo(struct spi_controller *ctlr, struct spi_board_info *bi) { struct spi_device *dev; if (ctlr->bus_num != bi->bus_num) return; dev = spi_new_device(ctlr, bi); if (!dev) dev_err(ctlr->dev.parent, "can't create new device for %s\n", bi->modalias); } /** * spi_register_board_info - register SPI devices for a given board * @info: array of chip descriptors * @n: how many descriptors are provided * Context: can sleep * * Board-specific early init code calls this (probably during arch_initcall) * with segments of the SPI device table. Any device nodes are created later, * after the relevant parent SPI controller (bus_num) is defined. We keep * this table of devices forever, so that reloading a controller driver will * not make Linux forget about these hard-wired devices. * * Other code can also call this, e.g. a particular add-on board might provide * SPI devices through its expansion connector, so code initializing that board * would naturally declare its SPI devices. * * The board info passed can safely be __initdata ... but be careful of * any embedded pointers (platform_data, etc), they're copied as-is. * * Return: zero on success, else a negative error code. */ int spi_register_board_info(struct spi_board_info const *info, unsigned n) { struct boardinfo *bi; int i; if (!n) return 0; bi = kcalloc(n, sizeof(*bi), GFP_KERNEL); if (!bi) return -ENOMEM; for (i = 0; i < n; i++, bi++, info++) { struct spi_controller *ctlr; memcpy(&bi->board_info, info, sizeof(*info)); mutex_lock(&board_lock); list_add_tail(&bi->list, &board_list); list_for_each_entry(ctlr, &spi_controller_list, list) spi_match_controller_to_boardinfo(ctlr, &bi->board_info); mutex_unlock(&board_lock); } return 0; } /*-------------------------------------------------------------------------*/ /* Core methods for SPI resource management */ /** * spi_res_alloc - allocate a spi resource that is life-cycle managed * during the processing of a spi_message while using * spi_transfer_one * @spi: the SPI device for which we allocate memory * @release: the release code to execute for this resource * @size: size to alloc and return * @gfp: GFP allocation flags * * Return: the pointer to the allocated data * * This may get enhanced in the future to allocate from a memory pool * of the @spi_device or @spi_controller to avoid repeated allocations. */ static void *spi_res_alloc(struct spi_device *spi, spi_res_release_t release, size_t size, gfp_t gfp) { struct spi_res *sres; sres = kzalloc(sizeof(*sres) + size, gfp); if (!sres) return NULL; INIT_LIST_HEAD(&sres->entry); sres->release = release; return sres->data; } /** * spi_res_free - free an SPI resource * @res: pointer to the custom data of a resource */ static void spi_res_free(void *res) { struct spi_res *sres = container_of(res, struct spi_res, data); WARN_ON(!list_empty(&sres->entry)); kfree(sres); } /** * spi_res_add - add a spi_res to the spi_message * @message: the SPI message * @res: the spi_resource */ static void spi_res_add(struct spi_message *message, void *res) { struct spi_res *sres = container_of(res, struct spi_res, data); WARN_ON(!list_empty(&sres->entry)); list_add_tail(&sres->entry, &message->resources); } /** * spi_res_release - release all SPI resources for this message * @ctlr: the @spi_controller * @message: the @spi_message */ static void spi_res_release(struct spi_controller *ctlr, struct spi_message *message) { struct spi_res *res, *tmp; list_for_each_entry_safe_reverse(res, tmp, &message->resources, entry) { if (res->release) res->release(ctlr, message, res->data); list_del(&res->entry); kfree(res); } } /*-------------------------------------------------------------------------*/ #define spi_for_each_valid_cs(spi, idx) \ for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) \ if (!(spi->cs_index_mask & BIT(idx))) {} else static inline bool spi_is_last_cs(struct spi_device *spi) { u8 idx; bool last = false; spi_for_each_valid_cs(spi, idx) { if (spi->controller->last_cs[idx] == spi_get_chipselect(spi, idx)) last = true; } return last; } static void spi_toggle_csgpiod(struct spi_device *spi, u8 idx, bool enable, bool activate) { /* * Historically ACPI has no means of the GPIO polarity and * thus the SPISerialBus() resource defines it on the per-chip * basis. In order to avoid a chain of negations, the GPIO * polarity is considered being Active High. Even for the cases * when _DSD() is involved (in the updated versions of ACPI) * the GPIO CS polarity must be defined Active High to avoid * ambiguity. That's why we use enable, that takes SPI_CS_HIGH * into account. */ if (is_acpi_device_node(dev_fwnode(&spi->dev))) gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx), !enable); else /* Polarity handled by GPIO library */ gpiod_set_value_cansleep(spi_get_csgpiod(spi, idx), activate); if (activate) spi_delay_exec(&spi->cs_setup, NULL); else spi_delay_exec(&spi->cs_inactive, NULL); } static void spi_set_cs(struct spi_device *spi, bool enable, bool force) { bool activate = enable; u8 idx; /* * Avoid calling into the driver (or doing delays) if the chip select * isn't actually changing from the last time this was called. */ if (!force && (enable == spi_is_last_cs(spi)) && (spi->controller->last_cs_index_mask == spi->cs_index_mask) && (spi->controller->last_cs_mode_high == (spi->mode & SPI_CS_HIGH))) return; trace_spi_set_cs(spi, activate); spi->controller->last_cs_index_mask = spi->cs_index_mask; for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) spi->controller->last_cs[idx] = enable ? spi_get_chipselect(spi, 0) : SPI_INVALID_CS; spi->controller->last_cs_mode_high = spi->mode & SPI_CS_HIGH; if (spi->controller->last_cs_mode_high) enable = !enable; /* * Handle chip select delays for GPIO based CS or controllers without * programmable chip select timing. */ if ((spi_is_csgpiod(spi) || !spi->controller->set_cs_timing) && !activate) spi_delay_exec(&spi->cs_hold, NULL); if (spi_is_csgpiod(spi)) { if (!(spi->mode & SPI_NO_CS)) { spi_for_each_valid_cs(spi, idx) { if (spi_get_csgpiod(spi, idx)) spi_toggle_csgpiod(spi, idx, enable, activate); } } /* Some SPI controllers need both GPIO CS & ->set_cs() */ if ((spi->controller->flags & SPI_CONTROLLER_GPIO_SS) && spi->controller->set_cs) spi->controller->set_cs(spi, !enable); } else if (spi->controller->set_cs) { spi->controller->set_cs(spi, !enable); } if (spi_is_csgpiod(spi) || !spi->controller->set_cs_timing) { if (activate) spi_delay_exec(&spi->cs_setup, NULL); else spi_delay_exec(&spi->cs_inactive, NULL); } } #ifdef CONFIG_HAS_DMA static int spi_map_buf_attrs(struct spi_controller *ctlr, struct device *dev, struct sg_table *sgt, void *buf, size_t len, enum dma_data_direction dir, unsigned long attrs) { const bool vmalloced_buf = is_vmalloc_addr(buf); unsigned int max_seg_size = dma_get_max_seg_size(dev); #ifdef CONFIG_HIGHMEM const bool kmap_buf = ((unsigned long)buf >= PKMAP_BASE && (unsigned long)buf < (PKMAP_BASE + (LAST_PKMAP * PAGE_SIZE))); #else const bool kmap_buf = false; #endif int desc_len; int sgs; struct page *vm_page; struct scatterlist *sg; void *sg_buf; size_t min; int i, ret; if (vmalloced_buf || kmap_buf) { desc_len = min_t(unsigned long, max_seg_size, PAGE_SIZE); sgs = DIV_ROUND_UP(len + offset_in_page(buf), desc_len); } else if (virt_addr_valid(buf)) { desc_len = min_t(size_t, max_seg_size, ctlr->max_dma_len); sgs = DIV_ROUND_UP(len, desc_len); } else { return -EINVAL; } ret = sg_alloc_table(sgt, sgs, GFP_KERNEL); if (ret != 0) return ret; sg = &sgt->sgl[0]; for (i = 0; i < sgs; i++) { if (vmalloced_buf || kmap_buf) { /* * Next scatterlist entry size is the minimum between * the desc_len and the remaining buffer length that * fits in a page. */ min = min_t(size_t, desc_len, min_t(size_t, len, PAGE_SIZE - offset_in_page(buf))); if (vmalloced_buf) vm_page = vmalloc_to_page(buf); else vm_page = kmap_to_page(buf); if (!vm_page) { sg_free_table(sgt); return -ENOMEM; } sg_set_page(sg, vm_page, min, offset_in_page(buf)); } else { min = min_t(size_t, len, desc_len); sg_buf = buf; sg_set_buf(sg, sg_buf, min); } buf += min; len -= min; sg = sg_next(sg); } ret = dma_map_sgtable(dev, sgt, dir, attrs); if (ret < 0) { sg_free_table(sgt); return ret; } return 0; } int spi_map_buf(struct spi_controller *ctlr, struct device *dev, struct sg_table *sgt, void *buf, size_t len, enum dma_data_direction dir) { return spi_map_buf_attrs(ctlr, dev, sgt, buf, len, dir, 0); } static void spi_unmap_buf_attrs(struct spi_controller *ctlr, struct device *dev, struct sg_table *sgt, enum dma_data_direction dir, unsigned long attrs) { dma_unmap_sgtable(dev, sgt, dir, attrs); sg_free_table(sgt); sgt->orig_nents = 0; sgt->nents = 0; } void spi_unmap_buf(struct spi_controller *ctlr, struct device *dev, struct sg_table *sgt, enum dma_data_direction dir) { spi_unmap_buf_attrs(ctlr, dev, sgt, dir, 0); } static int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) { struct device *tx_dev, *rx_dev; struct spi_transfer *xfer; int ret; if (!ctlr->can_dma) return 0; if (ctlr->dma_tx) tx_dev = ctlr->dma_tx->device->dev; else if (ctlr->dma_map_dev) tx_dev = ctlr->dma_map_dev; else tx_dev = ctlr->dev.parent; if (ctlr->dma_rx) rx_dev = ctlr->dma_rx->device->dev; else if (ctlr->dma_map_dev) rx_dev = ctlr->dma_map_dev; else rx_dev = ctlr->dev.parent; ret = -ENOMSG; list_for_each_entry(xfer, &msg->transfers, transfer_list) { /* The sync is done before each transfer. */ unsigned long attrs = DMA_ATTR_SKIP_CPU_SYNC; if (!ctlr->can_dma(ctlr, msg->spi, xfer)) continue; if (xfer->tx_buf != NULL) { ret = spi_map_buf_attrs(ctlr, tx_dev, &xfer->tx_sg, (void *)xfer->tx_buf, xfer->len, DMA_TO_DEVICE, attrs); if (ret != 0) return ret; xfer->tx_sg_mapped = true; } if (xfer->rx_buf != NULL) { ret = spi_map_buf_attrs(ctlr, rx_dev, &xfer->rx_sg, xfer->rx_buf, xfer->len, DMA_FROM_DEVICE, attrs); if (ret != 0) { spi_unmap_buf_attrs(ctlr, tx_dev, &xfer->tx_sg, DMA_TO_DEVICE, attrs); return ret; } xfer->rx_sg_mapped = true; } } /* No transfer has been mapped, bail out with success */ if (ret) return 0; ctlr->cur_rx_dma_dev = rx_dev; ctlr->cur_tx_dma_dev = tx_dev; return 0; } static int __spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg) { struct device *rx_dev = ctlr->cur_rx_dma_dev; struct device *tx_dev = ctlr->cur_tx_dma_dev; struct spi_transfer *xfer; list_for_each_entry(xfer, &msg->transfers, transfer_list) { /* The sync has already been done after each transfer. */ unsigned long attrs = DMA_ATTR_SKIP_CPU_SYNC; if (xfer->rx_sg_mapped) spi_unmap_buf_attrs(ctlr, rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE, attrs); xfer->rx_sg_mapped = false; if (xfer->tx_sg_mapped) spi_unmap_buf_attrs(ctlr, tx_dev, &xfer->tx_sg, DMA_TO_DEVICE, attrs); xfer->tx_sg_mapped = false; } return 0; } static void spi_dma_sync_for_device(struct spi_controller *ctlr, struct spi_transfer *xfer) { struct device *rx_dev = ctlr->cur_rx_dma_dev; struct device *tx_dev = ctlr->cur_tx_dma_dev; if (xfer->tx_sg_mapped) dma_sync_sgtable_for_device(tx_dev, &xfer->tx_sg, DMA_TO_DEVICE); if (xfer->rx_sg_mapped) dma_sync_sgtable_for_device(rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE); } static void spi_dma_sync_for_cpu(struct spi_controller *ctlr, struct spi_transfer *xfer) { struct device *rx_dev = ctlr->cur_rx_dma_dev; struct device *tx_dev = ctlr->cur_tx_dma_dev; if (xfer->rx_sg_mapped) dma_sync_sgtable_for_cpu(rx_dev, &xfer->rx_sg, DMA_FROM_DEVICE); if (xfer->tx_sg_mapped) dma_sync_sgtable_for_cpu(tx_dev, &xfer->tx_sg, DMA_TO_DEVICE); } #else /* !CONFIG_HAS_DMA */ static inline int __spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) { return 0; } static inline int __spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg) { return 0; } static void spi_dma_sync_for_device(struct spi_controller *ctrl, struct spi_transfer *xfer) { } static void spi_dma_sync_for_cpu(struct spi_controller *ctrl, struct spi_transfer *xfer) { } #endif /* !CONFIG_HAS_DMA */ static inline int spi_unmap_msg(struct spi_controller *ctlr, struct spi_message *msg) { struct spi_transfer *xfer; list_for_each_entry(xfer, &msg->transfers, transfer_list) { /* * Restore the original value of tx_buf or rx_buf if they are * NULL. */ if (xfer->tx_buf == ctlr->dummy_tx) xfer->tx_buf = NULL; if (xfer->rx_buf == ctlr->dummy_rx) xfer->rx_buf = NULL; } return __spi_unmap_msg(ctlr, msg); } static int spi_map_msg(struct spi_controller *ctlr, struct spi_message *msg) { struct spi_transfer *xfer; void *tmp; unsigned int max_tx, max_rx; if ((ctlr->flags & (SPI_CONTROLLER_MUST_RX | SPI_CONTROLLER_MUST_TX)) && !(msg->spi->mode & SPI_3WIRE)) { max_tx = 0; max_rx = 0; list_for_each_entry(xfer, &msg->transfers, transfer_list) { if ((ctlr->flags & SPI_CONTROLLER_MUST_TX) && !xfer->tx_buf) max_tx = max(xfer->len, max_tx); if ((ctlr->flags & SPI_CONTROLLER_MUST_RX) && !xfer->rx_buf) max_rx = max(xfer->len, max_rx); } if (max_tx) { tmp = krealloc(ctlr->dummy_tx, max_tx, GFP_KERNEL | GFP_DMA | __GFP_ZERO); if (!tmp) return -ENOMEM; ctlr->dummy_tx = tmp; } if (max_rx) { tmp = krealloc(ctlr->dummy_rx, max_rx, GFP_KERNEL | GFP_DMA); if (!tmp) return -ENOMEM; ctlr->dummy_rx = tmp; } if (max_tx || max_rx) { list_for_each_entry(xfer, &msg->transfers, transfer_list) { if (!xfer->len) continue; if (!xfer->tx_buf) xfer->tx_buf = ctlr->dummy_tx; if (!xfer->rx_buf) xfer->rx_buf = ctlr->dummy_rx; } } } return __spi_map_msg(ctlr, msg); } static int spi_transfer_wait(struct spi_controller *ctlr, struct spi_message *msg, struct spi_transfer *xfer) { struct spi_statistics __percpu *statm = ctlr->pcpu_statistics; struct spi_statistics __percpu *stats = msg->spi->pcpu_statistics; u32 speed_hz = xfer->speed_hz; unsigned long long ms; if (spi_controller_is_target(ctlr)) { if (wait_for_completion_interruptible(&ctlr->xfer_completion)) { dev_dbg(&msg->spi->dev, "SPI transfer interrupted\n"); return -EINTR; } } else { if (!speed_hz) speed_hz = 100000; /* * For each byte we wait for 8 cycles of the SPI clock. * Since speed is defined in Hz and we want milliseconds, * use respective multiplier, but before the division, * otherwise we may get 0 for short transfers. */ ms = 8LL * MSEC_PER_SEC * xfer->len; do_div(ms, speed_hz); /* * Increase it twice and add 200 ms tolerance, use * predefined maximum in case of overflow. */ ms += ms + 200; if (ms > UINT_MAX) ms = UINT_MAX; ms = wait_for_completion_timeout(&ctlr->xfer_completion, msecs_to_jiffies(ms)); if (ms == 0) { SPI_STATISTICS_INCREMENT_FIELD(statm, timedout); SPI_STATISTICS_INCREMENT_FIELD(stats, timedout); dev_err(&msg->spi->dev, "SPI transfer timed out\n"); return -ETIMEDOUT; } if (xfer->error & SPI_TRANS_FAIL_IO) return -EIO; } return 0; } static void _spi_transfer_delay_ns(u32 ns) { if (!ns) return; if (ns <= NSEC_PER_USEC) { ndelay(ns); } else { u32 us = DIV_ROUND_UP(ns, NSEC_PER_USEC); fsleep(us); } } int spi_delay_to_ns(struct spi_delay *_delay, struct spi_transfer *xfer) { u32 delay = _delay->value; u32 unit = _delay->unit; u32 hz; if (!delay) return 0; switch (unit) { case SPI_DELAY_UNIT_USECS: delay *= NSEC_PER_USEC; break; case SPI_DELAY_UNIT_NSECS: /* Nothing to do here */ break; case SPI_DELAY_UNIT_SCK: /* Clock cycles need to be obtained from spi_transfer */ if (!xfer) return -EINVAL; /* * If there is unknown effective speed, approximate it * by underestimating with half of the requested Hz. */ hz = xfer->effective_speed_hz ?: xfer->speed_hz / 2; if (!hz) return -EINVAL; /* Convert delay to nanoseconds */ delay *= DIV_ROUND_UP(NSEC_PER_SEC, hz); break; default: return -EINVAL; } return delay; } EXPORT_SYMBOL_GPL(spi_delay_to_ns); int spi_delay_exec(struct spi_delay *_delay, struct spi_transfer *xfer) { int delay; might_sleep(); if (!_delay) return -EINVAL; delay = spi_delay_to_ns(_delay, xfer); if (delay < 0) return delay; _spi_transfer_delay_ns(delay); return 0; } EXPORT_SYMBOL_GPL(spi_delay_exec); static void _spi_transfer_cs_change_delay(struct spi_message *msg, struct spi_transfer *xfer) { u32 default_delay_ns = 10 * NSEC_PER_USEC; u32 delay = xfer->cs_change_delay.value; u32 unit = xfer->cs_change_delay.unit; int ret; /* Return early on "fast" mode - for everything but USECS */ if (!delay) { if (unit == SPI_DELAY_UNIT_USECS) _spi_transfer_delay_ns(default_delay_ns); return; } ret = spi_delay_exec(&xfer->cs_change_delay, xfer); if (ret) { dev_err_once(&msg->spi->dev, "Use of unsupported delay unit %i, using default of %luus\n", unit, default_delay_ns / NSEC_PER_USEC); _spi_transfer_delay_ns(default_delay_ns); } } void spi_transfer_cs_change_delay_exec(struct spi_message *msg, struct spi_transfer *xfer) { _spi_transfer_cs_change_delay(msg, xfer); } EXPORT_SYMBOL_GPL(spi_transfer_cs_change_delay_exec); /* * spi_transfer_one_message - Default implementation of transfer_one_message() * * This is a standard implementation of transfer_one_message() for * drivers which implement a transfer_one() operation. It provides * standard handling of delays and chip select management. */ static int spi_transfer_one_message(struct spi_controller *ctlr, struct spi_message *msg) { struct spi_transfer *xfer; bool keep_cs = false; int ret = 0; struct spi_statistics __percpu *statm = ctlr->pcpu_statistics; struct spi_statistics __percpu *stats = msg->spi->pcpu_statistics; xfer = list_first_entry(&msg->transfers, struct spi_transfer, transfer_list); spi_set_cs(msg->spi, !xfer->cs_off, false); SPI_STATISTICS_INCREMENT_FIELD(statm, messages); SPI_STATISTICS_INCREMENT_FIELD(stats, messages); list_for_each_entry(xfer, &msg->transfers, transfer_list) { trace_spi_transfer_start(msg, xfer); spi_statistics_add_transfer_stats(statm, xfer, msg); spi_statistics_add_transfer_stats(stats, xfer, msg); if (!ctlr->ptp_sts_supported) { xfer->ptp_sts_word_pre = 0; ptp_read_system_prets(xfer->ptp_sts); } if ((xfer->tx_buf || xfer->rx_buf) && xfer->len) { reinit_completion(&ctlr->xfer_completion); fallback_pio: spi_dma_sync_for_device(ctlr, xfer); ret = ctlr->transfer_one(ctlr, msg->spi, xfer); if (ret < 0) { spi_dma_sync_for_cpu(ctlr, xfer); if ((xfer->tx_sg_mapped || xfer->rx_sg_mapped) && (xfer->error & SPI_TRANS_FAIL_NO_START)) { __spi_unmap_msg(ctlr, msg); ctlr->fallback = true; xfer->error &= ~SPI_TRANS_FAIL_NO_START; goto fallback_pio; } SPI_STATISTICS_INCREMENT_FIELD(statm, errors); SPI_STATISTICS_INCREMENT_FIELD(stats, errors); dev_err(&msg->spi->dev, "SPI transfer failed: %d\n", ret); goto out; } if (ret > 0) { ret = spi_transfer_wait(ctlr, msg, xfer); if (ret < 0) msg->status = ret; } spi_dma_sync_for_cpu(ctlr, xfer); } else { if (xfer->len) dev_err(&msg->spi->dev, "Bufferless transfer has length %u\n", xfer->len); } if (!ctlr->ptp_sts_supported) { ptp_read_system_postts(xfer->ptp_sts); xfer->ptp_sts_word_post = xfer->len; } trace_spi_transfer_stop(msg, xfer); if (msg->status != -EINPROGRESS) goto out; spi_transfer_delay_exec(xfer); if (xfer->cs_change) { if (list_is_last(&xfer->transfer_list, &msg->transfers)) { keep_cs = true; } else { if (!xfer->cs_off) spi_set_cs(msg->spi, false, false); _spi_transfer_cs_change_delay(msg, xfer); if (!list_next_entry(xfer, transfer_list)->cs_off) spi_set_cs(msg->spi, true, false); } } else if (!list_is_last(&xfer->transfer_list, &msg->transfers) && xfer->cs_off != list_next_entry(xfer, transfer_list)->cs_off) { spi_set_cs(msg->spi, xfer->cs_off, false); } msg->actual_length += xfer->len; } out: if (ret != 0 || !keep_cs) spi_set_cs(msg->spi, false, false); if (msg->status == -EINPROGRESS) msg->status = ret; if (msg->status && ctlr->handle_err) ctlr->handle_err(ctlr, msg); spi_finalize_current_message(ctlr); return ret; } /** * spi_finalize_current_transfer - report completion of a transfer * @ctlr: the controller reporting completion * * Called by SPI drivers using the core transfer_one_message() * implementation to notify it that the current interrupt driven * transfer has finished and the next one may be scheduled. */ void spi_finalize_current_transfer(struct spi_controller *ctlr) { complete(&ctlr->xfer_completion); } EXPORT_SYMBOL_GPL(spi_finalize_current_transfer); static void spi_idle_runtime_pm(struct spi_controller *ctlr) { if (ctlr->auto_runtime_pm) { pm_runtime_mark_last_busy(ctlr->dev.parent); pm_runtime_put_autosuspend(ctlr->dev.parent); } } static int __spi_pump_transfer_message(struct spi_controller *ctlr, struct spi_message *msg, bool was_busy) { struct spi_transfer *xfer; int ret; if (!was_busy && ctlr->auto_runtime_pm) { ret = pm_runtime_get_sync(ctlr->dev.parent); if (ret < 0) { pm_runtime_put_noidle(ctlr->dev.parent); dev_err(&ctlr->dev, "Failed to power device: %d\n", ret); msg->status = ret; spi_finalize_current_message(ctlr); return ret; } } if (!was_busy) trace_spi_controller_busy(ctlr); if (!was_busy && ctlr->prepare_transfer_hardware) { ret = ctlr->prepare_transfer_hardware(ctlr); if (ret) { dev_err(&ctlr->dev, "failed to prepare transfer hardware: %d\n", ret); if (ctlr->auto_runtime_pm) pm_runtime_put(ctlr->dev.parent); msg->status = ret; spi_finalize_current_message(ctlr); return ret; } } trace_spi_message_start(msg); if (ctlr->prepare_message) { ret = ctlr->prepare_message(ctlr, msg); if (ret) { dev_err(&ctlr->dev, "failed to prepare message: %d\n", ret); msg->status = ret; spi_finalize_current_message(ctlr); return ret; } msg->prepared = true; } ret = spi_map_msg(ctlr, msg); if (ret) { msg->status = ret; spi_finalize_current_message(ctlr); return ret; } if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) { list_for_each_entry(xfer, &msg->transfers, transfer_list) { xfer->ptp_sts_word_pre = 0; ptp_read_system_prets(xfer->ptp_sts); } } /* * Drivers implementation of transfer_one_message() must arrange for * spi_finalize_current_message() to get called. Most drivers will do * this in the calling context, but some don't. For those cases, a * completion is used to guarantee that this function does not return * until spi_finalize_current_message() is done accessing * ctlr->cur_msg. * Use of the following two flags enable to opportunistically skip the * use of the completion since its use involves expensive spin locks. * In case of a race with the context that calls * spi_finalize_current_message() the completion will always be used, * due to strict ordering of these flags using barriers. */ WRITE_ONCE(ctlr->cur_msg_incomplete, true); WRITE_ONCE(ctlr->cur_msg_need_completion, false); reinit_completion(&ctlr->cur_msg_completion); smp_wmb(); /* Make these available to spi_finalize_current_message() */ ret = ctlr->transfer_one_message(ctlr, msg); if (ret) { dev_err(&ctlr->dev, "failed to transfer one message from queue\n"); return ret; } WRITE_ONCE(ctlr->cur_msg_need_completion, true); smp_mb(); /* See spi_finalize_current_message()... */ if (READ_ONCE(ctlr->cur_msg_incomplete)) wait_for_completion(&ctlr->cur_msg_completion); return 0; } /** * __spi_pump_messages - function which processes SPI message queue * @ctlr: controller to process queue for * @in_kthread: true if we are in the context of the message pump thread * * This function checks if there is any SPI message in the queue that * needs processing and if so call out to the driver to initialize hardware * and transfer each message. * * Note that it is called both from the kthread itself and also from * inside spi_sync(); the queue extraction handling at the top of the * function should deal with this safely. */ static void __spi_pump_messages(struct spi_controller *ctlr, bool in_kthread) { struct spi_message *msg; bool was_busy = false; unsigned long flags; int ret; /* Take the I/O mutex */ mutex_lock(&ctlr->io_mutex); /* Lock queue */ spin_lock_irqsave(&ctlr->queue_lock, flags); /* Make sure we are not already running a message */ if (ctlr->cur_msg) goto out_unlock; /* Check if the queue is idle */ if (list_empty(&ctlr->queue) || !ctlr->running) { if (!ctlr->busy) goto out_unlock; /* Defer any non-atomic teardown to the thread */ if (!in_kthread) { if (!ctlr->dummy_rx && !ctlr->dummy_tx && !ctlr->unprepare_transfer_hardware) { spi_idle_runtime_pm(ctlr); ctlr->busy = false; ctlr->queue_empty = true; trace_spi_controller_idle(ctlr); } else { kthread_queue_work(ctlr->kworker, &ctlr->pump_messages); } goto out_unlock; } ctlr->busy = false; spin_unlock_irqrestore(&ctlr->queue_lock, flags); kfree(ctlr->dummy_rx); ctlr->dummy_rx = NULL; kfree(ctlr->dummy_tx); ctlr->dummy_tx = NULL; if (ctlr->unprepare_transfer_hardware && ctlr->unprepare_transfer_hardware(ctlr)) dev_err(&ctlr->dev, "failed to unprepare transfer hardware\n"); spi_idle_runtime_pm(ctlr); trace_spi_controller_idle(ctlr); spin_lock_irqsave(&ctlr->queue_lock, flags); ctlr->queue_empty = true; goto out_unlock; } /* Extract head of queue */ msg = list_first_entry(&ctlr->queue, struct spi_message, queue); ctlr->cur_msg = msg; list_del_init(&msg->queue); if (ctlr->busy) was_busy = true; else ctlr->busy = true; spin_unlock_irqrestore(&ctlr->queue_lock, flags); ret = __spi_pump_transfer_message(ctlr, msg, was_busy); kthread_queue_work(ctlr->kworker, &ctlr->pump_messages); ctlr->cur_msg = NULL; ctlr->fallback = false; mutex_unlock(&ctlr->io_mutex); /* Prod the scheduler in case transfer_one() was busy waiting */ if (!ret) cond_resched(); return; out_unlock: spin_unlock_irqrestore(&ctlr->queue_lock, flags); mutex_unlock(&ctlr->io_mutex); } /** * spi_pump_messages - kthread work function which processes spi message queue * @work: pointer to kthread work struct contained in the controller struct */ static void spi_pump_messages(struct kthread_work *work) { struct spi_controller *ctlr = container_of(work, struct spi_controller, pump_messages); __spi_pump_messages(ctlr, true); } /** * spi_take_timestamp_pre - helper to collect the beginning of the TX timestamp * @ctlr: Pointer to the spi_controller structure of the driver * @xfer: Pointer to the transfer being timestamped * @progress: How many words (not bytes) have been transferred so far * @irqs_off: If true, will disable IRQs and preemption for the duration of the * transfer, for less jitter in time measurement. Only compatible * with PIO drivers. If true, must follow up with * spi_take_timestamp_post or otherwise system will crash. * WARNING: for fully predictable results, the CPU frequency must * also be under control (governor). * * This is a helper for drivers to collect the beginning of the TX timestamp * for the requested byte from the SPI transfer. The frequency with which this * function must be called (once per word, once for the whole transfer, once * per batch of words etc) is arbitrary as long as the @tx buffer offset is * greater than or equal to the requested byte at the time of the call. The * timestamp is only taken once, at the first such call. It is assumed that * the driver advances its @tx buffer pointer monotonically. */ void spi_take_timestamp_pre(struct spi_controller *ctlr, struct spi_transfer *xfer, size_t progress, bool irqs_off) { if (!xfer->ptp_sts) return; if (xfer->timestamped) return; if (progress > xfer->ptp_sts_word_pre) return; /* Capture the resolution of the timestamp */ xfer->ptp_sts_word_pre = progress; if (irqs_off) { local_irq_save(ctlr->irq_flags); preempt_disable(); } ptp_read_system_prets(xfer->ptp_sts); } EXPORT_SYMBOL_GPL(spi_take_timestamp_pre); /** * spi_take_timestamp_post - helper to collect the end of the TX timestamp * @ctlr: Pointer to the spi_controller structure of the driver * @xfer: Pointer to the transfer being timestamped * @progress: How many words (not bytes) have been transferred so far * @irqs_off: If true, will re-enable IRQs and preemption for the local CPU. * * This is a helper for drivers to collect the end of the TX timestamp for * the requested byte from the SPI transfer. Can be called with an arbitrary * frequency: only the first call where @tx exceeds or is equal to the * requested word will be timestamped. */ void spi_take_timestamp_post(struct spi_controller *ctlr, struct spi_transfer *xfer, size_t progress, bool irqs_off) { if (!xfer->ptp_sts) return; if (xfer->timestamped) return; if (progress < xfer->ptp_sts_word_post) return; ptp_read_system_postts(xfer->ptp_sts); if (irqs_off) { local_irq_restore(ctlr->irq_flags); preempt_enable(); } /* Capture the resolution of the timestamp */ xfer->ptp_sts_word_post = progress; xfer->timestamped = 1; } EXPORT_SYMBOL_GPL(spi_take_timestamp_post); /** * spi_set_thread_rt - set the controller to pump at realtime priority * @ctlr: controller to boost priority of * * This can be called because the controller requested realtime priority * (by setting the ->rt value before calling spi_register_controller()) or * because a device on the bus said that its transfers needed realtime * priority. * * NOTE: at the moment if any device on a bus says it needs realtime then * the thread will be at realtime priority for all transfers on that * controller. If this eventually becomes a problem we may see if we can * find a way to boost the priority only temporarily during relevant * transfers. */ static void spi_set_thread_rt(struct spi_controller *ctlr) { dev_info(&ctlr->dev, "will run message pump with realtime priority\n"); sched_set_fifo(ctlr->kworker->task); } static int spi_init_queue(struct spi_controller *ctlr) { ctlr->running = false; ctlr->busy = false; ctlr->queue_empty = true; ctlr->kworker = kthread_run_worker(0, dev_name(&ctlr->dev)); if (IS_ERR(ctlr->kworker)) { dev_err(&ctlr->dev, "failed to create message pump kworker\n"); return PTR_ERR(ctlr->kworker); } kthread_init_work(&ctlr->pump_messages, spi_pump_messages); /* * Controller config will indicate if this controller should run the * message pump with high (realtime) priority to reduce the transfer * latency on the bus by minimising the delay between a transfer * request and the scheduling of the message pump thread. Without this * setting the message pump thread will remain at default priority. */ if (ctlr->rt) spi_set_thread_rt(ctlr); return 0; } /** * spi_get_next_queued_message() - called by driver to check for queued * messages * @ctlr: the controller to check for queued messages * * If there are more messages in the queue, the next message is returned from * this call. * * Return: the next message in the queue, else NULL if the queue is empty. */ struct spi_message *spi_get_next_queued_message(struct spi_controller *ctlr) { struct spi_message *next; unsigned long flags; /* Get a pointer to the next message, if any */ spin_lock_irqsave(&ctlr->queue_lock, flags); next = list_first_entry_or_null(&ctlr->queue, struct spi_message, queue); spin_unlock_irqrestore(&ctlr->queue_lock, flags); return next; } EXPORT_SYMBOL_GPL(spi_get_next_queued_message); /* * __spi_unoptimize_message - shared implementation of spi_unoptimize_message() * and spi_maybe_unoptimize_message() * @msg: the message to unoptimize * * Peripheral drivers should use spi_unoptimize_message() and callers inside * core should use spi_maybe_unoptimize_message() rather than calling this * function directly. * * It is not valid to call this on a message that is not currently optimized. */ static void __spi_unoptimize_message(struct spi_message *msg) { struct spi_controller *ctlr = msg->spi->controller; if (ctlr->unoptimize_message) ctlr->unoptimize_message(msg); spi_res_release(ctlr, msg); msg->optimized = false; msg->opt_state = NULL; } /* * spi_maybe_unoptimize_message - unoptimize msg not managed by a peripheral * @msg: the message to unoptimize * * This function is used to unoptimize a message if and only if it was * optimized by the core (via spi_maybe_optimize_message()). */ static void spi_maybe_unoptimize_message(struct spi_message *msg) { if (!msg->pre_optimized && msg->optimized && !msg->spi->controller->defer_optimize_message) __spi_unoptimize_message(msg); } /** * spi_finalize_current_message() - the current message is complete * @ctlr: the controller to return the message to * * Called by the driver to notify the core that the message in the front of the * queue is complete and can be removed from the queue. */ void spi_finalize_current_message(struct spi_controller *ctlr) { struct spi_transfer *xfer; struct spi_message *mesg; int ret; mesg = ctlr->cur_msg; if (!ctlr->ptp_sts_supported && !ctlr->transfer_one) { list_for_each_entry(xfer, &mesg->transfers, transfer_list) { ptp_read_system_postts(xfer->ptp_sts); xfer->ptp_sts_word_post = xfer->len; } } if (unlikely(ctlr->ptp_sts_supported)) list_for_each_entry(xfer, &mesg->transfers, transfer_list) WARN_ON_ONCE(xfer->ptp_sts && !xfer->timestamped); spi_unmap_msg(ctlr, mesg); if (mesg->prepared && ctlr->unprepare_message) { ret = ctlr->unprepare_message(ctlr, mesg); if (ret) { dev_err(&ctlr->dev, "failed to unprepare message: %d\n", ret); } } mesg->prepared = false; spi_maybe_unoptimize_message(mesg); WRITE_ONCE(ctlr->cur_msg_incomplete, false); smp_mb(); /* See __spi_pump_transfer_message()... */ if (READ_ONCE(ctlr->cur_msg_need_completion)) complete(&ctlr->cur_msg_completion); trace_spi_message_done(mesg); mesg->state = NULL; if (mesg->complete) mesg->complete(mesg->context); } EXPORT_SYMBOL_GPL(spi_finalize_current_message); static int spi_start_queue(struct spi_controller *ctlr) { unsigned long flags; spin_lock_irqsave(&ctlr->queue_lock, flags); if (ctlr->running || ctlr->busy) { spin_unlock_irqrestore(&ctlr->queue_lock, flags); return -EBUSY; } ctlr->running = true; ctlr->cur_msg = NULL; spin_unlock_irqrestore(&ctlr->queue_lock, flags); kthread_queue_work(ctlr->kworker, &ctlr->pump_messages); return 0; } static int spi_stop_queue(struct spi_controller *ctlr) { unsigned int limit = 500; unsigned long flags; /* * This is a bit lame, but is optimized for the common execution path. * A wait_queue on the ctlr->busy could be used, but then the common * execution path (pump_messages) would be required to call wake_up or * friends on every SPI message. Do this instead. */ do { spin_lock_irqsave(&ctlr->queue_lock, flags); if (list_empty(&ctlr->queue) && !ctlr->busy) { ctlr->running = false; spin_unlock_irqrestore(&ctlr->queue_lock, flags); return 0; } spin_unlock_irqrestore(&ctlr->queue_lock, flags); usleep_range(10000, 11000); } while (--limit); return -EBUSY; } static int spi_destroy_queue(struct spi_controller *ctlr) { int ret; ret = spi_stop_queue(ctlr); /* * kthread_flush_worker will block until all work is done. * If the reason that stop_queue timed out is that the work will never * finish, then it does no good to call flush/stop thread, so * return anyway. */ if (ret) { dev_err(&ctlr->dev, "problem destroying queue\n"); return ret; } kthread_destroy_worker(ctlr->kworker); return 0; } static int __spi_queued_transfer(struct spi_device *spi, struct spi_message *msg, bool need_pump) { struct spi_controller *ctlr = spi->controller; unsigned long flags; spin_lock_irqsave(&ctlr->queue_lock, flags); if (!ctlr->running) { spin_unlock_irqrestore(&ctlr->queue_lock, flags); return -ESHUTDOWN; } msg->actual_length = 0; msg->status = -EINPROGRESS; list_add_tail(&msg->queue, &ctlr->queue); ctlr->queue_empty = false; if (!ctlr->busy && need_pump) kthread_queue_work(ctlr->kworker, &ctlr->pump_messages); spin_unlock_irqrestore(&ctlr->queue_lock, flags); return 0; } /** * spi_queued_transfer - transfer function for queued transfers * @spi: SPI device which is requesting transfer * @msg: SPI message which is to handled is queued to driver queue * * Return: zero on success, else a negative error code. */ static int spi_queued_transfer(struct spi_device *spi, struct spi_message *msg) { return __spi_queued_transfer(spi, msg, true); } static int spi_controller_initialize_queue(struct spi_controller *ctlr) { int ret; ctlr->transfer = spi_queued_transfer; if (!ctlr->transfer_one_message) ctlr->transfer_one_message = spi_transfer_one_message; /* Initialize and start queue */ ret = spi_init_queue(ctlr); if (ret) { dev_err(&ctlr->dev, "problem initializing queue\n"); goto err_init_queue; } ctlr->queued = true; ret = spi_start_queue(ctlr); if (ret) { dev_err(&ctlr->dev, "problem starting queue\n"); goto err_start_queue; } return 0; err_start_queue: spi_destroy_queue(ctlr); err_init_queue: return ret; } /** * spi_flush_queue - Send all pending messages in the queue from the callers' * context * @ctlr: controller to process queue for * * This should be used when one wants to ensure all pending messages have been * sent before doing something. Is used by the spi-mem code to make sure SPI * memory operations do not preempt regular SPI transfers that have been queued * before the spi-mem operation. */ void spi_flush_queue(struct spi_controller *ctlr) { if (ctlr->transfer == spi_queued_transfer) __spi_pump_messages(ctlr, false); } /*-------------------------------------------------------------------------*/ #if defined(CONFIG_OF) static void of_spi_parse_dt_cs_delay(struct device_node *nc, struct spi_delay *delay, const char *prop) { u32 value; if (!of_property_read_u32(nc, prop, &value)) { if (value > U16_MAX) { delay->value = DIV_ROUND_UP(value, 1000); delay->unit = SPI_DELAY_UNIT_USECS; } else { delay->value = value; delay->unit = SPI_DELAY_UNIT_NSECS; } } } static int of_spi_parse_dt(struct spi_controller *ctlr, struct spi_device *spi, struct device_node *nc) { u32 value, cs[SPI_CS_CNT_MAX]; int rc, idx; /* Mode (clock phase/polarity/etc.) */ if (of_property_read_bool(nc, "spi-cpha")) spi->mode |= SPI_CPHA; if (of_property_read_bool(nc, "spi-cpol")) spi->mode |= SPI_CPOL; if (of_property_read_bool(nc, "spi-3wire")) spi->mode |= SPI_3WIRE; if (of_property_read_bool(nc, "spi-lsb-first")) spi->mode |= SPI_LSB_FIRST; if (of_property_read_bool(nc, "spi-cs-high")) spi->mode |= SPI_CS_HIGH; /* Device DUAL/QUAD mode */ if (!of_property_read_u32(nc, "spi-tx-bus-width", &value)) { switch (value) { case 0: spi->mode |= SPI_NO_TX; break; case 1: break; case 2: spi->mode |= SPI_TX_DUAL; break; case 4: spi->mode |= SPI_TX_QUAD; break; case 8: spi->mode |= SPI_TX_OCTAL; break; default: dev_warn(&ctlr->dev, "spi-tx-bus-width %d not supported\n", value); break; } } if (!of_property_read_u32(nc, "spi-rx-bus-width", &value)) { switch (value) { case 0: spi->mode |= SPI_NO_RX; break; case 1: break; case 2: spi->mode |= SPI_RX_DUAL; break; case 4: spi->mode |= SPI_RX_QUAD; break; case 8: spi->mode |= SPI_RX_OCTAL; break; default: dev_warn(&ctlr->dev, "spi-rx-bus-width %d not supported\n", value); break; } } if (spi_controller_is_target(ctlr)) { if (!of_node_name_eq(nc, "slave")) { dev_err(&ctlr->dev, "%pOF is not called 'slave'\n", nc); return -EINVAL; } return 0; } if (ctlr->num_chipselect > SPI_CS_CNT_MAX) { dev_err(&ctlr->dev, "No. of CS is more than max. no. of supported CS\n"); return -EINVAL; } spi_set_all_cs_unused(spi); /* Device address */ rc = of_property_read_variable_u32_array(nc, "reg", &cs[0], 1, SPI_CS_CNT_MAX); if (rc < 0) { dev_err(&ctlr->dev, "%pOF has no valid 'reg' property (%d)\n", nc, rc); return rc; } if (rc > ctlr->num_chipselect) { dev_err(&ctlr->dev, "%pOF has number of CS > ctlr->num_chipselect (%d)\n", nc, rc); return rc; } if ((of_property_present(nc, "parallel-memories")) && (!(ctlr->flags & SPI_CONTROLLER_MULTI_CS))) { dev_err(&ctlr->dev, "SPI controller doesn't support multi CS\n"); return -EINVAL; } for (idx = 0; idx < rc; idx++) spi_set_chipselect(spi, idx, cs[idx]); /* * By default spi->chip_select[0] will hold the physical CS number, * so set bit 0 in spi->cs_index_mask. */ spi->cs_index_mask = BIT(0); /* Device speed */ if (!of_property_read_u32(nc, "spi-max-frequency", &value)) spi->max_speed_hz = value; /* Device CS delays */ of_spi_parse_dt_cs_delay(nc, &spi->cs_setup, "spi-cs-setup-delay-ns"); of_spi_parse_dt_cs_delay(nc, &spi->cs_hold, "spi-cs-hold-delay-ns"); of_spi_parse_dt_cs_delay(nc, &spi->cs_inactive, "spi-cs-inactive-delay-ns"); return 0; } static struct spi_device * of_register_spi_device(struct spi_controller *ctlr, struct device_node *nc) { struct spi_device *spi; int rc; /* Alloc an spi_device */ spi = spi_alloc_device(ctlr); if (!spi) { dev_err(&ctlr->dev, "spi_device alloc error for %pOF\n", nc); rc = -ENOMEM; goto err_out; } /* Select device driver */ rc = of_alias_from_compatible(nc, spi->modalias, sizeof(spi->modalias)); if (rc < 0) { dev_err(&ctlr->dev, "cannot find modalias for %pOF\n", nc); goto err_out; } rc = of_spi_parse_dt(ctlr, spi, nc); if (rc) goto err_out; /* Store a pointer to the node in the device structure */ of_node_get(nc); device_set_node(&spi->dev, of_fwnode_handle(nc)); /* Register the new device */ rc = spi_add_device(spi); if (rc) { dev_err(&ctlr->dev, "spi_device register error %pOF\n", nc); goto err_of_node_put; } return spi; err_of_node_put: of_node_put(nc); err_out: spi_dev_put(spi); return ERR_PTR(rc); } /** * of_register_spi_devices() - Register child devices onto the SPI bus * @ctlr: Pointer to spi_controller device * * Registers an spi_device for each child node of controller node which * represents a valid SPI target device. */ static void of_register_spi_devices(struct spi_controller *ctlr) { struct spi_device *spi; struct device_node *nc; for_each_available_child_of_node(ctlr->dev.of_node, nc) { if (of_node_test_and_set_flag(nc, OF_POPULATED)) continue; spi = of_register_spi_device(ctlr, nc); if (IS_ERR(spi)) { dev_warn(&ctlr->dev, "Failed to create SPI device for %pOF\n", nc); of_node_clear_flag(nc, OF_POPULATED); } } } #else static void of_register_spi_devices(struct spi_controller *ctlr) { } #endif /** * spi_new_ancillary_device() - Register ancillary SPI device * @spi: Pointer to the main SPI device registering the ancillary device * @chip_select: Chip Select of the ancillary device * * Register an ancillary SPI device; for example some chips have a chip-select * for normal device usage and another one for setup/firmware upload. * * This may only be called from main SPI device's probe routine. * * Return: 0 on success; negative errno on failure */ struct spi_device *spi_new_ancillary_device(struct spi_device *spi, u8 chip_select) { struct spi_controller *ctlr = spi->controller; struct spi_device *ancillary; int rc; /* Alloc an spi_device */ ancillary = spi_alloc_device(ctlr); if (!ancillary) { rc = -ENOMEM; goto err_out; } strscpy(ancillary->modalias, "dummy", sizeof(ancillary->modalias)); /* Use provided chip-select for ancillary device */ spi_set_all_cs_unused(ancillary); spi_set_chipselect(ancillary, 0, chip_select); /* Take over SPI mode/speed from SPI main device */ ancillary->max_speed_hz = spi->max_speed_hz; ancillary->mode = spi->mode; /* * By default spi->chip_select[0] will hold the physical CS number, * so set bit 0 in spi->cs_index_mask. */ ancillary->cs_index_mask = BIT(0); WARN_ON(!mutex_is_locked(&ctlr->add_lock)); /* Register the new device */ rc = __spi_add_device(ancillary); if (rc) { dev_err(&spi->dev, "failed to register ancillary device\n"); goto err_out; } return ancillary; err_out: spi_dev_put(ancillary); return ERR_PTR(rc); } EXPORT_SYMBOL_GPL(spi_new_ancillary_device); #ifdef CONFIG_ACPI struct acpi_spi_lookup { struct spi_controller *ctlr; u32 max_speed_hz; u32 mode; int irq; u8 bits_per_word; u8 chip_select; int n; int index; }; static int acpi_spi_count(struct acpi_resource *ares, void *data) { struct acpi_resource_spi_serialbus *sb; int *count = data; if (ares->type != ACPI_RESOURCE_TYPE_SERIAL_BUS) return 1; sb = &ares->data.spi_serial_bus; if (sb->type != ACPI_RESOURCE_SERIAL_TYPE_SPI) return 1; *count = *count + 1; return 1; } /** * acpi_spi_count_resources - Count the number of SpiSerialBus resources * @adev: ACPI device * * Return: the number of SpiSerialBus resources in the ACPI-device's * resource-list; or a negative error code. */ int acpi_spi_count_resources(struct acpi_device *adev) { LIST_HEAD(r); int count = 0; int ret; ret = acpi_dev_get_resources(adev, &r, acpi_spi_count, &count); if (ret < 0) return ret; acpi_dev_free_resource_list(&r); return count; } EXPORT_SYMBOL_GPL(acpi_spi_count_resources); static void acpi_spi_parse_apple_properties(struct acpi_device *dev, struct acpi_spi_lookup *lookup) { const union acpi_object *obj; if (!x86_apple_machine) return; if (!acpi_dev_get_property(dev, "spiSclkPeriod", ACPI_TYPE_BUFFER, &obj) && obj->buffer.length >= 4) lookup->max_speed_hz = NSEC_PER_SEC / *(u32 *)obj->buffer.pointer; if (!acpi_dev_get_property(dev, "spiWordSize", ACPI_TYPE_BUFFER, &obj) && obj->buffer.length == 8) lookup->bits_per_word = *(u64 *)obj->buffer.pointer; if (!acpi_dev_get_property(dev, "spiBitOrder", ACPI_TYPE_BUFFER, &obj) && obj->buffer.length == 8 && !*(u64 *)obj->buffer.pointer) lookup->mode |= SPI_LSB_FIRST; if (!acpi_dev_get_property(dev, "spiSPO", ACPI_TYPE_BUFFER, &obj) && obj->buffer.length == 8 && *(u64 *)obj->buffer.pointer) lookup->mode |= SPI_CPOL; if (!acpi_dev_get_property(dev, "spiSPH", ACPI_TYPE_BUFFER, &obj) && obj->buffer.length == 8 && *(u64 *)obj->buffer.pointer) lookup->mode |= SPI_CPHA; } static int acpi_spi_add_resource(struct acpi_resource *ares, void *data) { struct acpi_spi_lookup *lookup = data; struct spi_controller *ctlr = lookup->ctlr; if (ares->type == ACPI_RESOURCE_TYPE_SERIAL_BUS) { struct acpi_resource_spi_serialbus *sb; acpi_handle parent_handle; acpi_status status; sb = &ares->data.spi_serial_bus; if (sb->type == ACPI_RESOURCE_SERIAL_TYPE_SPI) { if (lookup->index != -1 && lookup->n++ != lookup->index) return 1; status = acpi_get_handle(NULL, sb->resource_source.string_ptr, &parent_handle); if (ACPI_FAILURE(status)) return -ENODEV; if (ctlr) { if (!device_match_acpi_handle(ctlr->dev.parent, parent_handle)) return -ENODEV; } else { struct acpi_device *adev; adev = acpi_fetch_acpi_dev(parent_handle); if (!adev) return -ENODEV; ctlr = acpi_spi_find_controller_by_adev(adev); if (!ctlr) return -EPROBE_DEFER; lookup->ctlr = ctlr; } /* * ACPI DeviceSelection numbering is handled by the * host controller driver in Windows and can vary * from driver to driver. In Linux we always expect * 0 .. max - 1 so we need to ask the driver to * translate between the two schemes. */ if (ctlr->fw_translate_cs) { int cs = ctlr->fw_translate_cs(ctlr, sb->device_selection); if (cs < 0) return cs; lookup->chip_select = cs; } else { lookup->chip_select = sb->device_selection; } lookup->max_speed_hz = sb->connection_speed; lookup->bits_per_word = sb->data_bit_length; if (sb->clock_phase == ACPI_SPI_SECOND_PHASE) lookup->mode |= SPI_CPHA; if (sb->clock_polarity == ACPI_SPI_START_HIGH) lookup->mode |= SPI_CPOL; if (sb->device_polarity == ACPI_SPI_ACTIVE_HIGH) lookup->mode |= SPI_CS_HIGH; } } else if (lookup->irq < 0) { struct resource r; if (acpi_dev_resource_interrupt(ares, 0, &r)) lookup->irq = r.start; } /* Always tell the ACPI core to skip this resource */ return 1; } /** * acpi_spi_device_alloc - Allocate a spi device, and fill it in with ACPI information * @ctlr: controller to which the spi device belongs * @adev: ACPI Device for the spi device * @index: Index of the spi resource inside the ACPI Node * * This should be used to allocate a new SPI device from and ACPI Device node. * The caller is responsible for calling spi_add_device to register the SPI device. * * If ctlr is set to NULL, the Controller for the SPI device will be looked up * using the resource. * If index is set to -1, index is not used. * Note: If index is -1, ctlr must be set. * * Return: a pointer to the new device, or ERR_PTR on error. */ struct spi_device *acpi_spi_device_alloc(struct spi_controller *ctlr, struct acpi_device *adev, int index) { acpi_handle parent_handle = NULL; struct list_head resource_list; struct acpi_spi_lookup lookup = {}; struct spi_device *spi; int ret; if (!ctlr && index == -1) return ERR_PTR(-EINVAL); lookup.ctlr = ctlr; lookup.irq = -1; lookup.index = index; lookup.n = 0; INIT_LIST_HEAD(&resource_list); ret = acpi_dev_get_resources(adev, &resource_list, acpi_spi_add_resource, &lookup); acpi_dev_free_resource_list(&resource_list); if (ret < 0) /* Found SPI in _CRS but it points to another controller */ return ERR_PTR(ret); if (!lookup.max_speed_hz && ACPI_SUCCESS(acpi_get_parent(adev->handle, &parent_handle)) && device_match_acpi_handle(lookup.ctlr->dev.parent, parent_handle)) { /* Apple does not use _CRS but nested devices for SPI target devices */ acpi_spi_parse_apple_properties(adev, &lookup); } if (!lookup.max_speed_hz) return ERR_PTR(-ENODEV); spi = spi_alloc_device(lookup.ctlr); if (!spi) { dev_err(&lookup.ctlr->dev, "failed to allocate SPI device for %s\n", dev_name(&adev->dev)); return ERR_PTR(-ENOMEM); } spi_set_all_cs_unused(spi); spi_set_chipselect(spi, 0, lookup.chip_select); ACPI_COMPANION_SET(&spi->dev, adev); spi->max_speed_hz = lookup.max_speed_hz; spi->mode |= lookup.mode; spi->irq = lookup.irq; spi->bits_per_word = lookup.bits_per_word; /* * By default spi->chip_select[0] will hold the physical CS number, * so set bit 0 in spi->cs_index_mask. */ spi->cs_index_mask = BIT(0); return spi; } EXPORT_SYMBOL_GPL(acpi_spi_device_alloc); static acpi_status acpi_register_spi_device(struct spi_controller *ctlr, struct acpi_device *adev) { struct spi_device *spi; if (acpi_bus_get_status(adev) || !adev->status.present || acpi_device_enumerated(adev)) return AE_OK; spi = acpi_spi_device_alloc(ctlr, adev, -1); if (IS_ERR(spi)) { if (PTR_ERR(spi) == -ENOMEM) return AE_NO_MEMORY; else return AE_OK; } acpi_set_modalias(adev, acpi_device_hid(adev), spi->modalias, sizeof(spi->modalias)); acpi_device_set_enumerated(adev); adev->power.flags.ignore_parent = true; if (spi_add_device(spi)) { adev->power.flags.ignore_parent = false; dev_err(&ctlr->dev, "failed to add SPI device %s from ACPI\n", dev_name(&adev->dev)); spi_dev_put(spi); } return AE_OK; } static acpi_status acpi_spi_add_device(acpi_handle handle, u32 level, void *data, void **return_value) { struct acpi_device *adev = acpi_fetch_acpi_dev(handle); struct spi_controller *ctlr = data; if (!adev) return AE_OK; return acpi_register_spi_device(ctlr, adev); } #define SPI_ACPI_ENUMERATE_MAX_DEPTH 32 static void acpi_register_spi_devices(struct spi_controller *ctlr) { acpi_status status; acpi_handle handle; handle = ACPI_HANDLE(ctlr->dev.parent); if (!handle) return; status = acpi_walk_namespace(ACPI_TYPE_DEVICE, ACPI_ROOT_OBJECT, SPI_ACPI_ENUMERATE_MAX_DEPTH, acpi_spi_add_device, NULL, ctlr, NULL); if (ACPI_FAILURE(status)) dev_warn(&ctlr->dev, "failed to enumerate SPI target devices\n"); } #else static inline void acpi_register_spi_devices(struct spi_controller *ctlr) {} #endif /* CONFIG_ACPI */ static void spi_controller_release(struct device *dev) { struct spi_controller *ctlr; ctlr = container_of(dev, struct spi_controller, dev); kfree(ctlr); } static const struct class spi_controller_class = { .name = "spi_master", .dev_release = spi_controller_release, .dev_groups = spi_controller_groups, }; #ifdef CONFIG_SPI_SLAVE /** * spi_target_abort - abort the ongoing transfer request on an SPI target controller * @spi: device used for the current transfer */ int spi_target_abort(struct spi_device *spi) { struct spi_controller *ctlr = spi->controller; if (spi_controller_is_target(ctlr) && ctlr->target_abort) return ctlr->target_abort(ctlr); return -ENOTSUPP; } EXPORT_SYMBOL_GPL(spi_target_abort); static ssize_t slave_show(struct device *dev, struct device_attribute *attr, char *buf) { struct spi_controller *ctlr = container_of(dev, struct spi_controller, dev); struct device *child; int ret; child = device_find_any_child(&ctlr->dev); ret = sysfs_emit(buf, "%s\n", child ? to_spi_device(child)->modalias : NULL); put_device(child); return ret; } static ssize_t slave_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct spi_controller *ctlr = container_of(dev, struct spi_controller, dev); struct spi_device *spi; struct device *child; char name[32]; int rc; rc = sscanf(buf, "%31s", name); if (rc != 1 || !name[0]) return -EINVAL; child = device_find_any_child(&ctlr->dev); if (child) { /* Remove registered target device */ device_unregister(child); put_device(child); } if (strcmp(name, "(null)")) { /* Register new target device */ spi = spi_alloc_device(ctlr); if (!spi) return -ENOMEM; strscpy(spi->modalias, name, sizeof(spi->modalias)); rc = spi_add_device(spi); if (rc) { spi_dev_put(spi); return rc; } } return count; } static DEVICE_ATTR_RW(slave); static struct attribute *spi_target_attrs[] = { &dev_attr_slave.attr, NULL, }; static const struct attribute_group spi_target_group = { .attrs = spi_target_attrs, }; static const struct attribute_group *spi_target_groups[] = { &spi_controller_statistics_group, &spi_target_group, NULL, }; static const struct class spi_target_class = { .name = "spi_slave", .dev_release = spi_controller_release, .dev_groups = spi_target_groups, }; #else extern struct class spi_target_class; /* dummy */ #endif /** * __spi_alloc_controller - allocate an SPI host or target controller * @dev: the controller, possibly using the platform_bus * @size: how much zeroed driver-private data to allocate; the pointer to this * memory is in the driver_data field of the returned device, accessible * with spi_controller_get_devdata(); the memory is cacheline aligned; * drivers granting DMA access to portions of their private data need to * round up @size using ALIGN(size, dma_get_cache_alignment()). * @target: flag indicating whether to allocate an SPI host (false) or SPI target (true) * controller * Context: can sleep * * This call is used only by SPI controller drivers, which are the * only ones directly touching chip registers. It's how they allocate * an spi_controller structure, prior to calling spi_register_controller(). * * This must be called from context that can sleep. * * The caller is responsible for assigning the bus number and initializing the * controller's methods before calling spi_register_controller(); and (after * errors adding the device) calling spi_controller_put() to prevent a memory * leak. * * Return: the SPI controller structure on success, else NULL. */ struct spi_controller *__spi_alloc_controller(struct device *dev, unsigned int size, bool target) { struct spi_controller *ctlr; size_t ctlr_size = ALIGN(sizeof(*ctlr), dma_get_cache_alignment()); if (!dev) return NULL; ctlr = kzalloc(size + ctlr_size, GFP_KERNEL); if (!ctlr) return NULL; device_initialize(&ctlr->dev); INIT_LIST_HEAD(&ctlr->queue); spin_lock_init(&ctlr->queue_lock); spin_lock_init(&ctlr->bus_lock_spinlock); mutex_init(&ctlr->bus_lock_mutex); mutex_init(&ctlr->io_mutex); mutex_init(&ctlr->add_lock); ctlr->bus_num = -1; ctlr->num_chipselect = 1; ctlr->target = target; if (IS_ENABLED(CONFIG_SPI_SLAVE) && target) ctlr->dev.class = &spi_target_class; else ctlr->dev.class = &spi_controller_class; ctlr->dev.parent = dev; pm_suspend_ignore_children(&ctlr->dev, true); spi_controller_set_devdata(ctlr, (void *)ctlr + ctlr_size); return ctlr; } EXPORT_SYMBOL_GPL(__spi_alloc_controller); static void devm_spi_release_controller(struct device *dev, void *ctlr) { spi_controller_put(*(struct spi_controller **)ctlr); } /** * __devm_spi_alloc_controller - resource-managed __spi_alloc_controller() * @dev: physical device of SPI controller * @size: how much zeroed driver-private data to allocate * @target: whether to allocate an SPI host (false) or SPI target (true) controller * Context: can sleep * * Allocate an SPI controller and automatically release a reference on it * when @dev is unbound from its driver. Drivers are thus relieved from * having to call spi_controller_put(). * * The arguments to this function are identical to __spi_alloc_controller(). * * Return: the SPI controller structure on success, else NULL. */ struct spi_controller *__devm_spi_alloc_controller(struct device *dev, unsigned int size, bool target) { struct spi_controller **ptr, *ctlr; ptr = devres_alloc(devm_spi_release_controller, sizeof(*ptr), GFP_KERNEL); if (!ptr) return NULL; ctlr = __spi_alloc_controller(dev, size, target); if (ctlr) { ctlr->devm_allocated = true; *ptr = ctlr; devres_add(dev, ptr); } else { devres_free(ptr); } return ctlr; } EXPORT_SYMBOL_GPL(__devm_spi_alloc_controller); /** * spi_get_gpio_descs() - grab chip select GPIOs for the controller * @ctlr: The SPI controller to grab GPIO descriptors for */ static int spi_get_gpio_descs(struct spi_controller *ctlr) { int nb, i; struct gpio_desc **cs; struct device *dev = &ctlr->dev; unsigned long native_cs_mask = 0; unsigned int num_cs_gpios = 0; nb = gpiod_count(dev, "cs"); if (nb < 0) { /* No GPIOs at all is fine, else return the error */ if (nb == -ENOENT) return 0; return nb; } ctlr->num_chipselect = max_t(int, nb, ctlr->num_chipselect); cs = devm_kcalloc(dev, ctlr->num_chipselect, sizeof(*cs), GFP_KERNEL); if (!cs) return -ENOMEM; ctlr->cs_gpiods = cs; for (i = 0; i < nb; i++) { /* * Most chipselects are active low, the inverted * semantics are handled by special quirks in gpiolib, * so initializing them GPIOD_OUT_LOW here means * "unasserted", in most cases this will drive the physical * line high. */ cs[i] = devm_gpiod_get_index_optional(dev, "cs", i, GPIOD_OUT_LOW); if (IS_ERR(cs[i])) return PTR_ERR(cs[i]); if (cs[i]) { /* * If we find a CS GPIO, name it after the device and * chip select line. */ char *gpioname; gpioname = devm_kasprintf(dev, GFP_KERNEL, "%s CS%d", dev_name(dev), i); if (!gpioname) return -ENOMEM; gpiod_set_consumer_name(cs[i], gpioname); num_cs_gpios++; continue; } if (ctlr->max_native_cs && i >= ctlr->max_native_cs) { dev_err(dev, "Invalid native chip select %d\n", i); return -EINVAL; } native_cs_mask |= BIT(i); } ctlr->unused_native_cs = ffs(~native_cs_mask) - 1; if ((ctlr->flags & SPI_CONTROLLER_GPIO_SS) && num_cs_gpios && ctlr->max_native_cs && ctlr->unused_native_cs >= ctlr->max_native_cs) { dev_err(dev, "No unused native chip select available\n"); return -EINVAL; } return 0; } static int spi_controller_check_ops(struct spi_controller *ctlr) { /* * The controller may implement only the high-level SPI-memory like * operations if it does not support regular SPI transfers, and this is * valid use case. * If ->mem_ops or ->mem_ops->exec_op is NULL, we request that at least * one of the ->transfer_xxx() method be implemented. */ if (!ctlr->mem_ops || !ctlr->mem_ops->exec_op) { if (!ctlr->transfer && !ctlr->transfer_one && !ctlr->transfer_one_message) { return -EINVAL; } } return 0; } /* Allocate dynamic bus number using Linux idr */ static int spi_controller_id_alloc(struct spi_controller *ctlr, int start, int end) { int id; mutex_lock(&board_lock); id = idr_alloc(&spi_controller_idr, ctlr, start, end, GFP_KERNEL); mutex_unlock(&board_lock); if (WARN(id < 0, "couldn't get idr")) return id == -ENOSPC ? -EBUSY : id; ctlr->bus_num = id; return 0; } /** * spi_register_controller - register SPI host or target controller * @ctlr: initialized controller, originally from spi_alloc_host() or * spi_alloc_target() * Context: can sleep * * SPI controllers connect to their drivers using some non-SPI bus, * such as the platform bus. The final stage of probe() in that code * includes calling spi_register_controller() to hook up to this SPI bus glue. * * SPI controllers use board specific (often SOC specific) bus numbers, * and board-specific addressing for SPI devices combines those numbers * with chip select numbers. Since SPI does not directly support dynamic * device identification, boards need configuration tables telling which * chip is at which address. * * This must be called from context that can sleep. It returns zero on * success, else a negative error code (dropping the controller's refcount). * After a successful return, the caller is responsible for calling * spi_unregister_controller(). * * Return: zero on success, else a negative error code. */ int spi_register_controller(struct spi_controller *ctlr) { struct device *dev = ctlr->dev.parent; struct boardinfo *bi; int first_dynamic; int status; int idx; if (!dev) return -ENODEV; /* * Make sure all necessary hooks are implemented before registering * the SPI controller. */ status = spi_controller_check_ops(ctlr); if (status) return status; if (ctlr->bus_num < 0) ctlr->bus_num = of_alias_get_id(ctlr->dev.of_node, "spi"); if (ctlr->bus_num >= 0) { /* Devices with a fixed bus num must check-in with the num */ status = spi_controller_id_alloc(ctlr, ctlr->bus_num, ctlr->bus_num + 1); if (status) return status; } if (ctlr->bus_num < 0) { first_dynamic = of_alias_get_highest_id("spi"); if (first_dynamic < 0) first_dynamic = 0; else first_dynamic++; status = spi_controller_id_alloc(ctlr, first_dynamic, 0); if (status) return status; } ctlr->bus_lock_flag = 0; init_completion(&ctlr->xfer_completion); init_completion(&ctlr->cur_msg_completion); if (!ctlr->max_dma_len) ctlr->max_dma_len = INT_MAX; /* * Register the device, then userspace will see it. * Registration fails if the bus ID is in use. */ dev_set_name(&ctlr->dev, "spi%u", ctlr->bus_num); if (!spi_controller_is_target(ctlr) && ctlr->use_gpio_descriptors) { status = spi_get_gpio_descs(ctlr); if (status) goto free_bus_id; /* * A controller using GPIO descriptors always * supports SPI_CS_HIGH if need be. */ ctlr->mode_bits |= SPI_CS_HIGH; } /* * Even if it's just one always-selected device, there must * be at least one chipselect. */ if (!ctlr->num_chipselect) { status = -EINVAL; goto free_bus_id; } /* Setting last_cs to SPI_INVALID_CS means no chip selected */ for (idx = 0; idx < SPI_CS_CNT_MAX; idx++) ctlr->last_cs[idx] = SPI_INVALID_CS; status = device_add(&ctlr->dev); if (status < 0) goto free_bus_id; dev_dbg(dev, "registered %s %s\n", spi_controller_is_target(ctlr) ? "target" : "host", dev_name(&ctlr->dev)); /* * If we're using a queued driver, start the queue. Note that we don't * need the queueing logic if the driver is only supporting high-level * memory operations. */ if (ctlr->transfer) { dev_info(dev, "controller is unqueued, this is deprecated\n"); } else if (ctlr->transfer_one || ctlr->transfer_one_message) { status = spi_controller_initialize_queue(ctlr); if (status) { device_del(&ctlr->dev); goto free_bus_id; } } /* Add statistics */ ctlr->pcpu_statistics = spi_alloc_pcpu_stats(dev); if (!ctlr->pcpu_statistics) { dev_err(dev, "Error allocating per-cpu statistics\n"); status = -ENOMEM; goto destroy_queue; } mutex_lock(&board_lock); list_add_tail(&ctlr->list, &spi_controller_list); list_for_each_entry(bi, &board_list, list) spi_match_controller_to_boardinfo(ctlr, &bi->board_info); mutex_unlock(&board_lock); /* Register devices from the device tree and ACPI */ of_register_spi_devices(ctlr); acpi_register_spi_devices(ctlr); return status; destroy_queue: spi_destroy_queue(ctlr); free_bus_id: mutex_lock(&board_lock); idr_remove(&spi_controller_idr, ctlr->bus_num); mutex_unlock(&board_lock); return status; } EXPORT_SYMBOL_GPL(spi_register_controller); static void devm_spi_unregister(struct device *dev, void *res) { spi_unregister_controller(*(struct spi_controller **)res); } /** * devm_spi_register_controller - register managed SPI host or target controller * @dev: device managing SPI controller * @ctlr: initialized controller, originally from spi_alloc_host() or * spi_alloc_target() * Context: can sleep * * Register a SPI device as with spi_register_controller() which will * automatically be unregistered and freed. * * Return: zero on success, else a negative error code. */ int devm_spi_register_controller(struct device *dev, struct spi_controller *ctlr) { struct spi_controller **ptr; int ret; ptr = devres_alloc(devm_spi_unregister, sizeof(*ptr), GFP_KERNEL); if (!ptr) return -ENOMEM; ret = spi_register_controller(ctlr); if (!ret) { *ptr = ctlr; devres_add(dev, ptr); } else { devres_free(ptr); } return ret; } EXPORT_SYMBOL_GPL(devm_spi_register_controller); static int __unregister(struct device *dev, void *null) { spi_unregister_device(to_spi_device(dev)); return 0; } /** * spi_unregister_controller - unregister SPI host or target controller * @ctlr: the controller being unregistered * Context: can sleep * * This call is used only by SPI controller drivers, which are the * only ones directly touching chip registers. * * This must be called from context that can sleep. * * Note that this function also drops a reference to the controller. */ void spi_unregister_controller(struct spi_controller *ctlr) { struct spi_controller *found; int id = ctlr->bus_num; /* Prevent addition of new devices, unregister existing ones */ if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) mutex_lock(&ctlr->add_lock); device_for_each_child(&ctlr->dev, NULL, __unregister); /* First make sure that this controller was ever added */ mutex_lock(&board_lock); found = idr_find(&spi_controller_idr, id); mutex_unlock(&board_lock); if (ctlr->queued) { if (spi_destroy_queue(ctlr)) dev_err(&ctlr->dev, "queue remove failed\n"); } mutex_lock(&board_lock); list_del(&ctlr->list); mutex_unlock(&board_lock); device_del(&ctlr->dev); /* Free bus id */ mutex_lock(&board_lock); if (found == ctlr) idr_remove(&spi_controller_idr, id); mutex_unlock(&board_lock); if (IS_ENABLED(CONFIG_SPI_DYNAMIC)) mutex_unlock(&ctlr->add_lock); /* * Release the last reference on the controller if its driver * has not yet been converted to devm_spi_alloc_host/target(). */ if (!ctlr->devm_allocated) put_device(&ctlr->dev); } EXPORT_SYMBOL_GPL(spi_unregister_controller); static inline int __spi_check_suspended(const struct spi_controller *ctlr) { return ctlr->flags & SPI_CONTROLLER_SUSPENDED ? -ESHUTDOWN : 0; } static inline void __spi_mark_suspended(struct spi_controller *ctlr) { mutex_lock(&ctlr->bus_lock_mutex); ctlr->flags |= SPI_CONTROLLER_SUSPENDED; mutex_unlock(&ctlr->bus_lock_mutex); } static inline void __spi_mark_resumed(struct spi_controller *ctlr) { mutex_lock(&ctlr->bus_lock_mutex); ctlr->flags &= ~SPI_CONTROLLER_SUSPENDED; mutex_unlock(&ctlr->bus_lock_mutex); } int spi_controller_suspend(struct spi_controller *ctlr) { int ret = 0; /* Basically no-ops for non-queued controllers */ if (ctlr->queued) { ret = spi_stop_queue(ctlr); if (ret) dev_err(&ctlr->dev, "queue stop failed\n"); } __spi_mark_suspended(ctlr); return ret; } EXPORT_SYMBOL_GPL(spi_controller_suspend); int spi_controller_resume(struct spi_controller *ctlr) { int ret = 0; __spi_mark_resumed(ctlr); if (ctlr->queued) { ret = spi_start_queue(ctlr); if (ret) dev_err(&ctlr->dev, "queue restart failed\n"); } return ret; } EXPORT_SYMBOL_GPL(spi_controller_resume); /*-------------------------------------------------------------------------*/ /* Core methods for spi_message alterations */ static void __spi_replace_transfers_release(struct spi_controller *ctlr, struct spi_message *msg, void *res) { struct spi_replaced_transfers *rxfer = res; size_t i; /* Call extra callback if requested */ if (rxfer->release) rxfer->release(ctlr, msg, res); /* Insert replaced transfers back into the message */ list_splice(&rxfer->replaced_transfers, rxfer->replaced_after); /* Remove the formerly inserted entries */ for (i = 0; i < rxfer->inserted; i++) list_del(&rxfer->inserted_transfers[i].transfer_list); } /** * spi_replace_transfers - replace transfers with several transfers * and register change with spi_message.resources * @msg: the spi_message we work upon * @xfer_first: the first spi_transfer we want to replace * @remove: number of transfers to remove * @insert: the number of transfers we want to insert instead * @release: extra release code necessary in some circumstances * @extradatasize: extra data to allocate (with alignment guarantees * of struct @spi_transfer) * @gfp: gfp flags * * Returns: pointer to @spi_replaced_transfers, * PTR_ERR(...) in case of errors. */ static struct spi_replaced_transfers *spi_replace_transfers( struct spi_message *msg, struct spi_transfer *xfer_first, size_t remove, size_t insert, spi_replaced_release_t release, size_t extradatasize, gfp_t gfp) { struct spi_replaced_transfers *rxfer; struct spi_transfer *xfer; size_t i; /* Allocate the structure using spi_res */ rxfer = spi_res_alloc(msg->spi, __spi_replace_transfers_release, struct_size(rxfer, inserted_transfers, insert) + extradatasize, gfp); if (!rxfer) return ERR_PTR(-ENOMEM); /* The release code to invoke before running the generic release */ rxfer->release = release; /* Assign extradata */ if (extradatasize) rxfer->extradata = &rxfer->inserted_transfers[insert]; /* Init the replaced_transfers list */ INIT_LIST_HEAD(&rxfer->replaced_transfers); /* * Assign the list_entry after which we should reinsert * the @replaced_transfers - it may be spi_message.messages! */ rxfer->replaced_after = xfer_first->transfer_list.prev; /* Remove the requested number of transfers */ for (i = 0; i < remove; i++) { /* * If the entry after replaced_after it is msg->transfers * then we have been requested to remove more transfers * than are in the list. */ if (rxfer->replaced_after->next == &msg->transfers) { dev_err(&msg->spi->dev, "requested to remove more spi_transfers than are available\n"); /* Insert replaced transfers back into the message */ list_splice(&rxfer->replaced_transfers, rxfer->replaced_after); /* Free the spi_replace_transfer structure... */ spi_res_free(rxfer); /* ...and return with an error */ return ERR_PTR(-EINVAL); } /* * Remove the entry after replaced_after from list of * transfers and add it to list of replaced_transfers. */ list_move_tail(rxfer->replaced_after->next, &rxfer->replaced_transfers); } /* * Create copy of the given xfer with identical settings * based on the first transfer to get removed. */ for (i = 0; i < insert; i++) { /* We need to run in reverse order */ xfer = &rxfer->inserted_transfers[insert - 1 - i]; /* Copy all spi_transfer data */ memcpy(xfer, xfer_first, sizeof(*xfer)); /* Add to list */ list_add(&xfer->transfer_list, rxfer->replaced_after); /* Clear cs_change and delay for all but the last */ if (i) { xfer->cs_change = false; xfer->delay.value = 0; } } /* Set up inserted... */ rxfer->inserted = insert; /* ...and register it with spi_res/spi_message */ spi_res_add(msg, rxfer); return rxfer; } static int __spi_split_transfer_maxsize(struct spi_controller *ctlr, struct spi_message *msg, struct spi_transfer **xferp, size_t maxsize) { struct spi_transfer *xfer = *xferp, *xfers; struct spi_replaced_transfers *srt; size_t offset; size_t count, i; /* Calculate how many we have to replace */ count = DIV_ROUND_UP(xfer->len, maxsize); /* Create replacement */ srt = spi_replace_transfers(msg, xfer, 1, count, NULL, 0, GFP_KERNEL); if (IS_ERR(srt)) return PTR_ERR(srt); xfers = srt->inserted_transfers; /* * Now handle each of those newly inserted spi_transfers. * Note that the replacements spi_transfers all are preset * to the same values as *xferp, so tx_buf, rx_buf and len * are all identical (as well as most others) * so we just have to fix up len and the pointers. */ /* * The first transfer just needs the length modified, so we * run it outside the loop. */ xfers[0].len = min_t(size_t, maxsize, xfer[0].len); /* All the others need rx_buf/tx_buf also set */ for (i = 1, offset = maxsize; i < count; offset += maxsize, i++) { /* Update rx_buf, tx_buf and DMA */ if (xfers[i].rx_buf) xfers[i].rx_buf += offset; if (xfers[i].tx_buf) xfers[i].tx_buf += offset; /* Update length */ xfers[i].len = min(maxsize, xfers[i].len - offset); } /* * We set up xferp to the last entry we have inserted, * so that we skip those already split transfers. */ *xferp = &xfers[count - 1]; /* Increment statistics counters */ SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, transfers_split_maxsize); SPI_STATISTICS_INCREMENT_FIELD(msg->spi->pcpu_statistics, transfers_split_maxsize); return 0; } /** * spi_split_transfers_maxsize - split spi transfers into multiple transfers * when an individual transfer exceeds a * certain size * @ctlr: the @spi_controller for this transfer * @msg: the @spi_message to transform * @maxsize: the maximum when to apply this * * This function allocates resources that are automatically freed during the * spi message unoptimize phase so this function should only be called from * optimize_message callbacks. * * Return: status of transformation */ int spi_split_transfers_maxsize(struct spi_controller *ctlr, struct spi_message *msg, size_t maxsize) { struct spi_transfer *xfer; int ret; /* * Iterate over the transfer_list, * but note that xfer is advanced to the last transfer inserted * to avoid checking sizes again unnecessarily (also xfer does * potentially belong to a different list by the time the * replacement has happened). */ list_for_each_entry(xfer, &msg->transfers, transfer_list) { if (xfer->len > maxsize) { ret = __spi_split_transfer_maxsize(ctlr, msg, &xfer, maxsize); if (ret) return ret; } } return 0; } EXPORT_SYMBOL_GPL(spi_split_transfers_maxsize); /** * spi_split_transfers_maxwords - split SPI transfers into multiple transfers * when an individual transfer exceeds a * certain number of SPI words * @ctlr: the @spi_controller for this transfer * @msg: the @spi_message to transform * @maxwords: the number of words to limit each transfer to * * This function allocates resources that are automatically freed during the * spi message unoptimize phase so this function should only be called from * optimize_message callbacks. * * Return: status of transformation */ int spi_split_transfers_maxwords(struct spi_controller *ctlr, struct spi_message *msg, size_t maxwords) { struct spi_transfer *xfer; /* * Iterate over the transfer_list, * but note that xfer is advanced to the last transfer inserted * to avoid checking sizes again unnecessarily (also xfer does * potentially belong to a different list by the time the * replacement has happened). */ list_for_each_entry(xfer, &msg->transfers, transfer_list) { size_t maxsize; int ret; maxsize = maxwords * spi_bpw_to_bytes(xfer->bits_per_word); if (xfer->len > maxsize) { ret = __spi_split_transfer_maxsize(ctlr, msg, &xfer, maxsize); if (ret) return ret; } } return 0; } EXPORT_SYMBOL_GPL(spi_split_transfers_maxwords); /*-------------------------------------------------------------------------*/ /* * Core methods for SPI controller protocol drivers. Some of the * other core methods are currently defined as inline functions. */ static int __spi_validate_bits_per_word(struct spi_controller *ctlr, u8 bits_per_word) { if (ctlr->bits_per_word_mask) { /* Only 32 bits fit in the mask */ if (bits_per_word > 32) return -EINVAL; if (!(ctlr->bits_per_word_mask & SPI_BPW_MASK(bits_per_word))) return -EINVAL; } return 0; } /** * spi_set_cs_timing - configure CS setup, hold, and inactive delays * @spi: the device that requires specific CS timing configuration * * Return: zero on success, else a negative error code. */ static int spi_set_cs_timing(struct spi_device *spi) { struct device *parent = spi->controller->dev.parent; int status = 0; if (spi->controller->set_cs_timing && !spi_get_csgpiod(spi, 0)) { if (spi->controller->auto_runtime_pm) { status = pm_runtime_get_sync(parent); if (status < 0) { pm_runtime_put_noidle(parent); dev_err(&spi->controller->dev, "Failed to power device: %d\n", status); return status; } status = spi->controller->set_cs_timing(spi); pm_runtime_mark_last_busy(parent); pm_runtime_put_autosuspend(parent); } else { status = spi->controller->set_cs_timing(spi); } } return status; } /** * spi_setup - setup SPI mode and clock rate * @spi: the device whose settings are being modified * Context: can sleep, and no requests are queued to the device * * SPI protocol drivers may need to update the transfer mode if the * device doesn't work with its default. They may likewise need * to update clock rates or word sizes from initial values. This function * changes those settings, and must be called from a context that can sleep. * Except for SPI_CS_HIGH, which takes effect immediately, the changes take * effect the next time the device is selected and data is transferred to * or from it. When this function returns, the SPI device is deselected. * * Note that this call will fail if the protocol driver specifies an option * that the underlying controller or its driver does not support. For * example, not all hardware supports wire transfers using nine bit words, * LSB-first wire encoding, or active-high chipselects. * * Return: zero on success, else a negative error code. */ int spi_setup(struct spi_device *spi) { unsigned bad_bits, ugly_bits; int status; /* * Check mode to prevent that any two of DUAL, QUAD and NO_MOSI/MISO * are set at the same time. */ if ((hweight_long(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD | SPI_NO_TX)) > 1) || (hweight_long(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD | SPI_NO_RX)) > 1)) { dev_err(&spi->dev, "setup: can not select any two of dual, quad and no-rx/tx at the same time\n"); return -EINVAL; } /* If it is SPI_3WIRE mode, DUAL and QUAD should be forbidden */ if ((spi->mode & SPI_3WIRE) && (spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD | SPI_TX_OCTAL | SPI_RX_DUAL | SPI_RX_QUAD | SPI_RX_OCTAL))) return -EINVAL; /* Check against conflicting MOSI idle configuration */ if ((spi->mode & SPI_MOSI_IDLE_LOW) && (spi->mode & SPI_MOSI_IDLE_HIGH)) { dev_err(&spi->dev, "setup: MOSI configured to idle low and high at the same time.\n"); return -EINVAL; } /* * Help drivers fail *cleanly* when they need options * that aren't supported with their current controller. * SPI_CS_WORD has a fallback software implementation, * so it is ignored here. */ bad_bits = spi->mode & ~(spi->controller->mode_bits | SPI_CS_WORD | SPI_NO_TX | SPI_NO_RX); ugly_bits = bad_bits & (SPI_TX_DUAL | SPI_TX_QUAD | SPI_TX_OCTAL | SPI_RX_DUAL | SPI_RX_QUAD | SPI_RX_OCTAL); if (ugly_bits) { dev_warn(&spi->dev, "setup: ignoring unsupported mode bits %x\n", ugly_bits); spi->mode &= ~ugly_bits; bad_bits &= ~ugly_bits; } if (bad_bits) { dev_err(&spi->dev, "setup: unsupported mode bits %x\n", bad_bits); return -EINVAL; } if (!spi->bits_per_word) { spi->bits_per_word = 8; } else { /* * Some controllers may not support the default 8 bits-per-word * so only perform the check when this is explicitly provided. */ status = __spi_validate_bits_per_word(spi->controller, spi->bits_per_word); if (status) return status; } if (spi->controller->max_speed_hz && (!spi->max_speed_hz || spi->max_speed_hz > spi->controller->max_speed_hz)) spi->max_speed_hz = spi->controller->max_speed_hz; mutex_lock(&spi->controller->io_mutex); if (spi->controller->setup) { status = spi->controller->setup(spi); if (status) { mutex_unlock(&spi->controller->io_mutex); dev_err(&spi->controller->dev, "Failed to setup device: %d\n", status); return status; } } status = spi_set_cs_timing(spi); if (status) { mutex_unlock(&spi->controller->io_mutex); return status; } if (spi->controller->auto_runtime_pm && spi->controller->set_cs) { status = pm_runtime_resume_and_get(spi->controller->dev.parent); if (status < 0) { mutex_unlock(&spi->controller->io_mutex); dev_err(&spi->controller->dev, "Failed to power device: %d\n", status); return status; } /* * We do not want to return positive value from pm_runtime_get, * there are many instances of devices calling spi_setup() and * checking for a non-zero return value instead of a negative * return value. */ status = 0; spi_set_cs(spi, false, true); pm_runtime_mark_last_busy(spi->controller->dev.parent); pm_runtime_put_autosuspend(spi->controller->dev.parent); } else { spi_set_cs(spi, false, true); } mutex_unlock(&spi->controller->io_mutex); if (spi->rt && !spi->controller->rt) { spi->controller->rt = true; spi_set_thread_rt(spi->controller); } trace_spi_setup(spi, status); dev_dbg(&spi->dev, "setup mode %lu, %s%s%s%s%u bits/w, %u Hz max --> %d\n", spi->mode & SPI_MODE_X_MASK, (spi->mode & SPI_CS_HIGH) ? "cs_high, " : "", (spi->mode & SPI_LSB_FIRST) ? "lsb, " : "", (spi->mode & SPI_3WIRE) ? "3wire, " : "", (spi->mode & SPI_LOOP) ? "loopback, " : "", spi->bits_per_word, spi->max_speed_hz, status); return status; } EXPORT_SYMBOL_GPL(spi_setup); static int _spi_xfer_word_delay_update(struct spi_transfer *xfer, struct spi_device *spi) { int delay1, delay2; delay1 = spi_delay_to_ns(&xfer->word_delay, xfer); if (delay1 < 0) return delay1; delay2 = spi_delay_to_ns(&spi->word_delay, xfer); if (delay2 < 0) return delay2; if (delay1 < delay2) memcpy(&xfer->word_delay, &spi->word_delay, sizeof(xfer->word_delay)); return 0; } static int __spi_validate(struct spi_device *spi, struct spi_message *message) { struct spi_controller *ctlr = spi->controller; struct spi_transfer *xfer; int w_size; if (list_empty(&message->transfers)) return -EINVAL; message->spi = spi; /* * Half-duplex links include original MicroWire, and ones with * only one data pin like SPI_3WIRE (switches direction) or where * either MOSI or MISO is missing. They can also be caused by * software limitations. */ if ((ctlr->flags & SPI_CONTROLLER_HALF_DUPLEX) || (spi->mode & SPI_3WIRE)) { unsigned flags = ctlr->flags; list_for_each_entry(xfer, &message->transfers, transfer_list) { if (xfer->rx_buf && xfer->tx_buf) return -EINVAL; if ((flags & SPI_CONTROLLER_NO_TX) && xfer->tx_buf) return -EINVAL; if ((flags & SPI_CONTROLLER_NO_RX) && xfer->rx_buf) return -EINVAL; } } /* * Set transfer bits_per_word and max speed as spi device default if * it is not set for this transfer. * Set transfer tx_nbits and rx_nbits as single transfer default * (SPI_NBITS_SINGLE) if it is not set for this transfer. * Ensure transfer word_delay is at least as long as that required by * device itself. */ message->frame_length = 0; list_for_each_entry(xfer, &message->transfers, transfer_list) { xfer->effective_speed_hz = 0; message->frame_length += xfer->len; if (!xfer->bits_per_word) xfer->bits_per_word = spi->bits_per_word; if (!xfer->speed_hz) xfer->speed_hz = spi->max_speed_hz; if (ctlr->max_speed_hz && xfer->speed_hz > ctlr->max_speed_hz) xfer->speed_hz = ctlr->max_speed_hz; if (__spi_validate_bits_per_word(ctlr, xfer->bits_per_word)) return -EINVAL; /* DDR mode is supported only if controller has dtr_caps=true. * default considered as SDR mode for SPI and QSPI controller. * Note: This is applicable only to QSPI controller. */ if (xfer->dtr_mode && !ctlr->dtr_caps) return -EINVAL; /* * SPI transfer length should be multiple of SPI word size * where SPI word size should be power-of-two multiple. */ if (xfer->bits_per_word <= 8) w_size = 1; else if (xfer->bits_per_word <= 16) w_size = 2; else w_size = 4; /* No partial transfers accepted */ if (xfer->len % w_size) return -EINVAL; if (xfer->speed_hz && ctlr->min_speed_hz && xfer->speed_hz < ctlr->min_speed_hz) return -EINVAL; if (xfer->tx_buf && !xfer->tx_nbits) xfer->tx_nbits = SPI_NBITS_SINGLE; if (xfer->rx_buf && !xfer->rx_nbits) xfer->rx_nbits = SPI_NBITS_SINGLE; /* * Check transfer tx/rx_nbits: * 1. check the value matches one of single, dual and quad * 2. check tx/rx_nbits match the mode in spi_device */ if (xfer->tx_buf) { if (spi->mode & SPI_NO_TX) return -EINVAL; if (xfer->tx_nbits != SPI_NBITS_SINGLE && xfer->tx_nbits != SPI_NBITS_DUAL && xfer->tx_nbits != SPI_NBITS_QUAD && xfer->tx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->tx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_TX_DUAL | SPI_TX_QUAD))) return -EINVAL; if ((xfer->tx_nbits == SPI_NBITS_QUAD) && !(spi->mode & SPI_TX_QUAD)) return -EINVAL; } /* Check transfer rx_nbits */ if (xfer->rx_buf) { if (spi->mode & SPI_NO_RX) return -EINVAL; if (xfer->rx_nbits != SPI_NBITS_SINGLE && xfer->rx_nbits != SPI_NBITS_DUAL && xfer->rx_nbits != SPI_NBITS_QUAD && xfer->rx_nbits != SPI_NBITS_OCTAL) return -EINVAL; if ((xfer->rx_nbits == SPI_NBITS_DUAL) && !(spi->mode & (SPI_RX_DUAL | SPI_RX_QUAD))) return -EINVAL; if ((xfer->rx_nbits == SPI_NBITS_QUAD) && !(spi->mode & SPI_RX_QUAD)) return -EINVAL; } if (_spi_xfer_word_delay_update(xfer, spi)) return -EINVAL; /* Make sure controller supports required offload features. */ if (xfer->offload_flags) { if (!message->offload) return -EINVAL; if (xfer->offload_flags & ~message->offload->xfer_flags) return -EINVAL; } } message->status = -EINPROGRESS; return 0; } /* * spi_split_transfers - generic handling of transfer splitting * @msg: the message to split * * Under certain conditions, a SPI controller may not support arbitrary * transfer sizes or other features required by a peripheral. This function * will split the transfers in the message into smaller transfers that are * supported by the controller. * * Controllers with special requirements not covered here can also split * transfers in the optimize_message() callback. * * Context: can sleep * Return: zero on success, else a negative error code */ static int spi_split_transfers(struct spi_message *msg) { struct spi_controller *ctlr = msg->spi->controller; struct spi_transfer *xfer; int ret; /* * If an SPI controller does not support toggling the CS line on each * transfer (indicated by the SPI_CS_WORD flag) or we are using a GPIO * for the CS line, we can emulate the CS-per-word hardware function by * splitting transfers into one-word transfers and ensuring that * cs_change is set for each transfer. */ if ((msg->spi->mode & SPI_CS_WORD) && (!(ctlr->mode_bits & SPI_CS_WORD) || spi_is_csgpiod(msg->spi))) { ret = spi_split_transfers_maxwords(ctlr, msg, 1); if (ret) return ret; list_for_each_entry(xfer, &msg->transfers, transfer_list) { /* Don't change cs_change on the last entry in the list */ if (list_is_last(&xfer->transfer_list, &msg->transfers)) break; xfer->cs_change = 1; } } else { ret = spi_split_transfers_maxsize(ctlr, msg, spi_max_transfer_size(msg->spi)); if (ret) return ret; } return 0; } /* * __spi_optimize_message - shared implementation for spi_optimize_message() * and spi_maybe_optimize_message() * @spi: the device that will be used for the message * @msg: the message to optimize * * Peripheral drivers will call spi_optimize_message() and the spi core will * call spi_maybe_optimize_message() instead of calling this directly. * * It is not valid to call this on a message that has already been optimized. * * Return: zero on success, else a negative error code */ static int __spi_optimize_message(struct spi_device *spi, struct spi_message *msg) { struct spi_controller *ctlr = spi->controller; int ret; ret = __spi_validate(spi, msg); if (ret) return ret; ret = spi_split_transfers(msg); if (ret) return ret; if (ctlr->optimize_message) { ret = ctlr->optimize_message(msg); if (ret) { spi_res_release(ctlr, msg); return ret; } } msg->optimized = true; return 0; } /* * spi_maybe_optimize_message - optimize message if it isn't already pre-optimized * @spi: the device that will be used for the message * @msg: the message to optimize * Return: zero on success, else a negative error code */ static int spi_maybe_optimize_message(struct spi_device *spi, struct spi_message *msg) { if (spi->controller->defer_optimize_message) { msg->spi = spi; return 0; } if (msg->pre_optimized) return 0; return __spi_optimize_message(spi, msg); } /** * spi_optimize_message - do any one-time validation and setup for a SPI message * @spi: the device that will be used for the message * @msg: the message to optimize * * Peripheral drivers that reuse the same message repeatedly may call this to * perform as much message prep as possible once, rather than repeating it each * time a message transfer is performed to improve throughput and reduce CPU * usage. * * Once a message has been optimized, it cannot be modified with the exception * of updating the contents of any xfer->tx_buf (the pointer can't be changed, * only the data in the memory it points to). * * Calls to this function must be balanced with calls to spi_unoptimize_message() * to avoid leaking resources. * * Context: can sleep * Return: zero on success, else a negative error code */ int spi_optimize_message(struct spi_device *spi, struct spi_message *msg) { int ret; /* * Pre-optimization is not supported and optimization is deferred e.g. * when using spi-mux. */ if (spi->controller->defer_optimize_message) return 0; ret = __spi_optimize_message(spi, msg); if (ret) return ret; /* * This flag indicates that the peripheral driver called spi_optimize_message() * and therefore we shouldn't unoptimize message automatically when finalizing * the message but rather wait until spi_unoptimize_message() is called * by the peripheral driver. */ msg->pre_optimized = true; return 0; } EXPORT_SYMBOL_GPL(spi_optimize_message); /** * spi_unoptimize_message - releases any resources allocated by spi_optimize_message() * @msg: the message to unoptimize * * Calls to this function must be balanced with calls to spi_optimize_message(). * * Context: can sleep */ void spi_unoptimize_message(struct spi_message *msg) { if (msg->spi->controller->defer_optimize_message) return; __spi_unoptimize_message(msg); msg->pre_optimized = false; } EXPORT_SYMBOL_GPL(spi_unoptimize_message); static int __spi_async(struct spi_device *spi, struct spi_message *message) { struct spi_controller *ctlr = spi->controller; struct spi_transfer *xfer; /* * Some controllers do not support doing regular SPI transfers. Return * ENOTSUPP when this is the case. */ if (!ctlr->transfer) return -ENOTSUPP; SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_async); SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_async); trace_spi_message_submit(message); if (!ctlr->ptp_sts_supported) { list_for_each_entry(xfer, &message->transfers, transfer_list) { xfer->ptp_sts_word_pre = 0; ptp_read_system_prets(xfer->ptp_sts); } } return ctlr->transfer(spi, message); } static void devm_spi_unoptimize_message(void *msg) { spi_unoptimize_message(msg); } /** * devm_spi_optimize_message - managed version of spi_optimize_message() * @dev: the device that manages @msg (usually @spi->dev) * @spi: the device that will be used for the message * @msg: the message to optimize * Return: zero on success, else a negative error code * * spi_unoptimize_message() will automatically be called when the device is * removed. */ int devm_spi_optimize_message(struct device *dev, struct spi_device *spi, struct spi_message *msg) { int ret; ret = spi_optimize_message(spi, msg); if (ret) return ret; return devm_add_action_or_reset(dev, devm_spi_unoptimize_message, msg); } EXPORT_SYMBOL_GPL(devm_spi_optimize_message); /** * spi_async - asynchronous SPI transfer * @spi: device with which data will be exchanged * @message: describes the data transfers, including completion callback * Context: any (IRQs may be blocked, etc) * * This call may be used in_irq and other contexts which can't sleep, * as well as from task contexts which can sleep. * * The completion callback is invoked in a context which can't sleep. * Before that invocation, the value of message->status is undefined. * When the callback is issued, message->status holds either zero (to * indicate complete success) or a negative error code. After that * callback returns, the driver which issued the transfer request may * deallocate the associated memory; it's no longer in use by any SPI * core or controller driver code. * * Note that although all messages to a spi_device are handled in * FIFO order, messages may go to different devices in other orders. * Some device might be higher priority, or have various "hard" access * time requirements, for example. * * On detection of any fault during the transfer, processing of * the entire message is aborted, and the device is deselected. * Until returning from the associated message completion callback, * no other spi_message queued to that device will be processed. * (This rule applies equally to all the synchronous transfer calls, * which are wrappers around this core asynchronous primitive.) * * Return: zero on success, else a negative error code. */ int spi_async(struct spi_device *spi, struct spi_message *message) { struct spi_controller *ctlr = spi->controller; int ret; unsigned long flags; ret = spi_maybe_optimize_message(spi, message); if (ret) return ret; spin_lock_irqsave(&ctlr->bus_lock_spinlock, flags); if (ctlr->bus_lock_flag) ret = -EBUSY; else ret = __spi_async(spi, message); spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags); return ret; } EXPORT_SYMBOL_GPL(spi_async); static void __spi_transfer_message_noqueue(struct spi_controller *ctlr, struct spi_message *msg) { bool was_busy; int ret; mutex_lock(&ctlr->io_mutex); was_busy = ctlr->busy; ctlr->cur_msg = msg; ret = __spi_pump_transfer_message(ctlr, msg, was_busy); if (ret) dev_err(&ctlr->dev, "noqueue transfer failed\n"); ctlr->cur_msg = NULL; ctlr->fallback = false; if (!was_busy) { kfree(ctlr->dummy_rx); ctlr->dummy_rx = NULL; kfree(ctlr->dummy_tx); ctlr->dummy_tx = NULL; if (ctlr->unprepare_transfer_hardware && ctlr->unprepare_transfer_hardware(ctlr)) dev_err(&ctlr->dev, "failed to unprepare transfer hardware\n"); spi_idle_runtime_pm(ctlr); } mutex_unlock(&ctlr->io_mutex); } /*-------------------------------------------------------------------------*/ /* * Utility methods for SPI protocol drivers, layered on * top of the core. Some other utility methods are defined as * inline functions. */ static void spi_complete(void *arg) { complete(arg); } static int __spi_sync(struct spi_device *spi, struct spi_message *message) { DECLARE_COMPLETION_ONSTACK(done); unsigned long flags; int status; struct spi_controller *ctlr = spi->controller; if (__spi_check_suspended(ctlr)) { dev_warn_once(&spi->dev, "Attempted to sync while suspend\n"); return -ESHUTDOWN; } status = spi_maybe_optimize_message(spi, message); if (status) return status; SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_sync); SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_sync); /* * Checking queue_empty here only guarantees async/sync message * ordering when coming from the same context. It does not need to * guard against reentrancy from a different context. The io_mutex * will catch those cases. */ if (READ_ONCE(ctlr->queue_empty) && !ctlr->must_async) { message->actual_length = 0; message->status = -EINPROGRESS; trace_spi_message_submit(message); SPI_STATISTICS_INCREMENT_FIELD(ctlr->pcpu_statistics, spi_sync_immediate); SPI_STATISTICS_INCREMENT_FIELD(spi->pcpu_statistics, spi_sync_immediate); __spi_transfer_message_noqueue(ctlr, message); return message->status; } /* * There are messages in the async queue that could have originated * from the same context, so we need to preserve ordering. * Therefor we send the message to the async queue and wait until they * are completed. */ message->complete = spi_complete; message->context = &done; spin_lock_irqsave(&ctlr->bus_lock_spinlock, flags); status = __spi_async(spi, message); spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags); if (status == 0) { wait_for_completion(&done); status = message->status; } message->complete = NULL; message->context = NULL; return status; } /** * spi_sync - blocking/synchronous SPI data transfers * @spi: device with which data will be exchanged * @message: describes the data transfers * Context: can sleep * * This call may only be used from a context that may sleep. The sleep * is non-interruptible, and has no timeout. Low-overhead controller * drivers may DMA directly into and out of the message buffers. * * Note that the SPI device's chip select is active during the message, * and then is normally disabled between messages. Drivers for some * frequently-used devices may want to minimize costs of selecting a chip, * by leaving it selected in anticipation that the next message will go * to the same chip. (That may increase power usage.) * * Also, the caller is guaranteeing that the memory associated with the * message will not be freed before this call returns. * * Return: zero on success, else a negative error code. */ int spi_sync(struct spi_device *spi, struct spi_message *message) { int ret; mutex_lock(&spi->controller->bus_lock_mutex); ret = __spi_sync(spi, message); mutex_unlock(&spi->controller->bus_lock_mutex); return ret; } EXPORT_SYMBOL_GPL(spi_sync); /** * spi_sync_locked - version of spi_sync with exclusive bus usage * @spi: device with which data will be exchanged * @message: describes the data transfers * Context: can sleep * * This call may only be used from a context that may sleep. The sleep * is non-interruptible, and has no timeout. Low-overhead controller * drivers may DMA directly into and out of the message buffers. * * This call should be used by drivers that require exclusive access to the * SPI bus. It has to be preceded by a spi_bus_lock call. The SPI bus must * be released by a spi_bus_unlock call when the exclusive access is over. * * Return: zero on success, else a negative error code. */ int spi_sync_locked(struct spi_device *spi, struct spi_message *message) { return __spi_sync(spi, message); } EXPORT_SYMBOL_GPL(spi_sync_locked); /** * spi_bus_lock - obtain a lock for exclusive SPI bus usage * @ctlr: SPI bus controller that should be locked for exclusive bus access * Context: can sleep * * This call may only be used from a context that may sleep. The sleep * is non-interruptible, and has no timeout. * * This call should be used by drivers that require exclusive access to the * SPI bus. The SPI bus must be released by a spi_bus_unlock call when the * exclusive access is over. Data transfer must be done by spi_sync_locked * and spi_async_locked calls when the SPI bus lock is held. * * Return: always zero. */ int spi_bus_lock(struct spi_controller *ctlr) { unsigned long flags; mutex_lock(&ctlr->bus_lock_mutex); spin_lock_irqsave(&ctlr->bus_lock_spinlock, flags); ctlr->bus_lock_flag = 1; spin_unlock_irqrestore(&ctlr->bus_lock_spinlock, flags); /* Mutex remains locked until spi_bus_unlock() is called */ return 0; } EXPORT_SYMBOL_GPL(spi_bus_lock); /** * spi_bus_unlock - release the lock for exclusive SPI bus usage * @ctlr: SPI bus controller that was locked for exclusive bus access * Context: can sleep * * This call may only be used from a context that may sleep. The sleep * is non-interruptible, and has no timeout. * * This call releases an SPI bus lock previously obtained by an spi_bus_lock * call. * * Return: always zero. */ int spi_bus_unlock(struct spi_controller *ctlr) { ctlr->bus_lock_flag = 0; mutex_unlock(&ctlr->bus_lock_mutex); return 0; } EXPORT_SYMBOL_GPL(spi_bus_unlock); /* Portable code must never pass more than 32 bytes */ #define SPI_BUFSIZ max(32, SMP_CACHE_BYTES) static u8 *buf; /** * spi_write_then_read - SPI synchronous write followed by read * @spi: device with which data will be exchanged * @txbuf: data to be written (need not be DMA-safe) * @n_tx: size of txbuf, in bytes * @rxbuf: buffer into which data will be read (need not be DMA-safe) * @n_rx: size of rxbuf, in bytes * Context: can sleep * * This performs a half duplex MicroWire style transaction with the * device, sending txbuf and then reading rxbuf. The return value * is zero for success, else a negative errno status code. * This call may only be used from a context that may sleep. * * Parameters to this routine are always copied using a small buffer. * Performance-sensitive or bulk transfer code should instead use * spi_{async,sync}() calls with DMA-safe buffers. * * Return: zero on success, else a negative error code. */ int spi_write_then_read(struct spi_device *spi, const void *txbuf, unsigned n_tx, void *rxbuf, unsigned n_rx) { static DEFINE_MUTEX(lock); int status; struct spi_message message; struct spi_transfer x[2]; u8 *local_buf; /* * Use preallocated DMA-safe buffer if we can. We can't avoid * copying here, (as a pure convenience thing), but we can * keep heap costs out of the hot path unless someone else is * using the pre-allocated buffer or the transfer is too large. */ if ((n_tx + n_rx) > SPI_BUFSIZ || !mutex_trylock(&lock)) { local_buf = kmalloc(max((unsigned)SPI_BUFSIZ, n_tx + n_rx), GFP_KERNEL | GFP_DMA); if (!local_buf) return -ENOMEM; } else { local_buf = buf; } spi_message_init(&message); memset(x, 0, sizeof(x)); if (n_tx) { x[0].len = n_tx; spi_message_add_tail(&x[0], &message); } if (n_rx) { x[1].len = n_rx; spi_message_add_tail(&x[1], &message); } memcpy(local_buf, txbuf, n_tx); x[0].tx_buf = local_buf; x[1].rx_buf = local_buf + n_tx; /* Do the I/O */ status = spi_sync(spi, &message); if (status == 0) memcpy(rxbuf, x[1].rx_buf, n_rx); if (x[0].tx_buf == buf) mutex_unlock(&lock); else kfree(local_buf); return status; } EXPORT_SYMBOL_GPL(spi_write_then_read); /*-------------------------------------------------------------------------*/ #if IS_ENABLED(CONFIG_OF_DYNAMIC) /* Must call put_device() when done with returned spi_device device */ static struct spi_device *of_find_spi_device_by_node(struct device_node *node) { struct device *dev = bus_find_device_by_of_node(&spi_bus_type, node); return dev ? to_spi_device(dev) : NULL; } /* The spi controllers are not using spi_bus, so we find it with another way */ static struct spi_controller *of_find_spi_controller_by_node(struct device_node *node) { struct device *dev; dev = class_find_device_by_of_node(&spi_controller_class, node); if (!dev && IS_ENABLED(CONFIG_SPI_SLAVE)) dev = class_find_device_by_of_node(&spi_target_class, node); if (!dev) return NULL; /* Reference got in class_find_device */ return container_of(dev, struct spi_controller, dev); } static int of_spi_notify(struct notifier_block *nb, unsigned long action, void *arg) { struct of_reconfig_data *rd = arg; struct spi_controller *ctlr; struct spi_device *spi; switch (of_reconfig_get_state_change(action, arg)) { case OF_RECONFIG_CHANGE_ADD: ctlr = of_find_spi_controller_by_node(rd->dn->parent); if (ctlr == NULL) return NOTIFY_OK; /* Not for us */ if (of_node_test_and_set_flag(rd->dn, OF_POPULATED)) { put_device(&ctlr->dev); return NOTIFY_OK; } /* * Clear the flag before adding the device so that fw_devlink * doesn't skip adding consumers to this device. */ rd->dn->fwnode.flags &= ~FWNODE_FLAG_NOT_DEVICE; spi = of_register_spi_device(ctlr, rd->dn); put_device(&ctlr->dev); if (IS_ERR(spi)) { pr_err("%s: failed to create for '%pOF'\n", __func__, rd->dn); of_node_clear_flag(rd->dn, OF_POPULATED); return notifier_from_errno(PTR_ERR(spi)); } break; case OF_RECONFIG_CHANGE_REMOVE: /* Already depopulated? */ if (!of_node_check_flag(rd->dn, OF_POPULATED)) return NOTIFY_OK; /* Find our device by node */ spi = of_find_spi_device_by_node(rd->dn); if (spi == NULL) return NOTIFY_OK; /* No? not meant for us */ /* Unregister takes one ref away */ spi_unregister_device(spi); /* And put the reference of the find */ put_device(&spi->dev); break; } return NOTIFY_OK; } static struct notifier_block spi_of_notifier = { .notifier_call = of_spi_notify, }; #else /* IS_ENABLED(CONFIG_OF_DYNAMIC) */ extern struct notifier_block spi_of_notifier; #endif /* IS_ENABLED(CONFIG_OF_DYNAMIC) */ #if IS_ENABLED(CONFIG_ACPI) static int spi_acpi_controller_match(struct device *dev, const void *data) { return device_match_acpi_dev(dev->parent, data); } struct spi_controller *acpi_spi_find_controller_by_adev(struct acpi_device *adev) { struct device *dev; dev = class_find_device(&spi_controller_class, NULL, adev, spi_acpi_controller_match); if (!dev && IS_ENABLED(CONFIG_SPI_SLAVE)) dev = class_find_device(&spi_target_class, NULL, adev, spi_acpi_controller_match); if (!dev) return NULL; return container_of(dev, struct spi_controller, dev); } EXPORT_SYMBOL_GPL(acpi_spi_find_controller_by_adev); static struct spi_device *acpi_spi_find_device_by_adev(struct acpi_device *adev) { struct device *dev; dev = bus_find_device_by_acpi_dev(&spi_bus_type, adev); return to_spi_device(dev); } static int acpi_spi_notify(struct notifier_block *nb, unsigned long value, void *arg) { struct acpi_device *adev = arg; struct spi_controller *ctlr; struct spi_device *spi; switch (value) { case ACPI_RECONFIG_DEVICE_ADD: ctlr = acpi_spi_find_controller_by_adev(acpi_dev_parent(adev)); if (!ctlr) break; acpi_register_spi_device(ctlr, adev); put_device(&ctlr->dev); break; case ACPI_RECONFIG_DEVICE_REMOVE: if (!acpi_device_enumerated(adev)) break; spi = acpi_spi_find_device_by_adev(adev); if (!spi) break; spi_unregister_device(spi); put_device(&spi->dev); break; } return NOTIFY_OK; } static struct notifier_block spi_acpi_notifier = { .notifier_call = acpi_spi_notify, }; #else extern struct notifier_block spi_acpi_notifier; #endif static int __init spi_init(void) { int status; buf = kmalloc(SPI_BUFSIZ, GFP_KERNEL); if (!buf) { status = -ENOMEM; goto err0; } status = bus_register(&spi_bus_type); if (status < 0) goto err1; status = class_register(&spi_controller_class); if (status < 0) goto err2; if (IS_ENABLED(CONFIG_SPI_SLAVE)) { status = class_register(&spi_target_class); if (status < 0) goto err3; } if (IS_ENABLED(CONFIG_OF_DYNAMIC)) WARN_ON(of_reconfig_notifier_register(&spi_of_notifier)); if (IS_ENABLED(CONFIG_ACPI)) WARN_ON(acpi_reconfig_notifier_register(&spi_acpi_notifier)); return 0; err3: class_unregister(&spi_controller_class); err2: bus_unregister(&spi_bus_type); err1: kfree(buf); buf = NULL; err0: return status; } /* * A board_info is normally registered in arch_initcall(), * but even essential drivers wait till later. * * REVISIT only boardinfo really needs static linking. The rest (device and * driver registration) _could_ be dynamically linked (modular) ... Costs * include needing to have boardinfo data structures be much more public. */ postcore_initcall(spi_init);
38 40 46 46 43 41 42 42 42 42 42 43 43 43 40 39 43 3 2 2 2 2 8 8 8 8 8 3 8 8 43 43 5 5 5 5 5 5 43 43 43 8 42 42 3 3 41 19 18 16 15 14 13 8 8 8 3 3 2 3 3 3 8 8 8 8 8 8 8 6 3 3 3 3 3 3 3 3 3 3 3 3 3 7 7 7 7 7 6 7 7 7 5 5 5 5 5 43 43 42 42 42 15 42 43 43 43 43 43 1 1 1 2 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 4 3 1 3 3 2 2 3 1 3 4 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 // SPDX-License-Identifier: GPL-2.0 /* Copyright (C) B.A.T.M.A.N. contributors: * * Simon Wunderlich */ #include "bridge_loop_avoidance.h" #include "main.h" #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc16.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> #include <linux/gfp.h> #include <linux/if_arp.h> #include <linux/if_ether.h> #include <linux/if_vlan.h> #include <linux/jhash.h> #include <linux/jiffies.h> #include <linux/kref.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/netlink.h> #include <linux/rculist.h> #include <linux/rcupdate.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/sprintf.h> #include <linux/stddef.h> #include <linux/string.h> #include <linux/string_choices.h> #include <linux/workqueue.h> #include <net/arp.h> #include <net/genetlink.h> #include <net/netlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> #include "hard-interface.h" #include "hash.h" #include "log.h" #include "netlink.h" #include "originator.h" #include "translation-table.h" static const u8 batadv_announce_mac[4] = {0x43, 0x05, 0x43, 0x05}; static void batadv_bla_periodic_work(struct work_struct *work); static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw); /** * batadv_choose_claim() - choose the right bucket for a claim. * @data: data to hash * @size: size of the hash table * * Return: the hash index of the claim */ static inline u32 batadv_choose_claim(const void *data, u32 size) { const struct batadv_bla_claim *claim = data; u32 hash = 0; hash = jhash(&claim->addr, sizeof(claim->addr), hash); hash = jhash(&claim->vid, sizeof(claim->vid), hash); return hash % size; } /** * batadv_choose_backbone_gw() - choose the right bucket for a backbone gateway. * @data: data to hash * @size: size of the hash table * * Return: the hash index of the backbone gateway */ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size) { const struct batadv_bla_backbone_gw *gw; u32 hash = 0; gw = data; hash = jhash(&gw->orig, sizeof(gw->orig), hash); hash = jhash(&gw->vid, sizeof(gw->vid), hash); return hash % size; } /** * batadv_compare_backbone_gw() - compare address and vid of two backbone gws * @node: list node of the first entry to compare * @data2: pointer to the second backbone gateway * * Return: true if the backbones have the same data, false otherwise */ static bool batadv_compare_backbone_gw(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_bla_backbone_gw, hash_entry); const struct batadv_bla_backbone_gw *gw1 = data1; const struct batadv_bla_backbone_gw *gw2 = data2; if (!batadv_compare_eth(gw1->orig, gw2->orig)) return false; if (gw1->vid != gw2->vid) return false; return true; } /** * batadv_compare_claim() - compare address and vid of two claims * @node: list node of the first entry to compare * @data2: pointer to the second claims * * Return: true if the claim have the same data, 0 otherwise */ static bool batadv_compare_claim(const struct hlist_node *node, const void *data2) { const void *data1 = container_of(node, struct batadv_bla_claim, hash_entry); const struct batadv_bla_claim *cl1 = data1; const struct batadv_bla_claim *cl2 = data2; if (!batadv_compare_eth(cl1->addr, cl2->addr)) return false; if (cl1->vid != cl2->vid) return false; return true; } /** * batadv_backbone_gw_release() - release backbone gw from lists and queue for * free after rcu grace period * @ref: kref pointer of the backbone gw */ static void batadv_backbone_gw_release(struct kref *ref) { struct batadv_bla_backbone_gw *backbone_gw; backbone_gw = container_of(ref, struct batadv_bla_backbone_gw, refcount); kfree_rcu(backbone_gw, rcu); } /** * batadv_backbone_gw_put() - decrement the backbone gw refcounter and possibly * release it * @backbone_gw: backbone gateway to be free'd */ static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw) { if (!backbone_gw) return; kref_put(&backbone_gw->refcount, batadv_backbone_gw_release); } /** * batadv_claim_release() - release claim from lists and queue for free after * rcu grace period * @ref: kref pointer of the claim */ static void batadv_claim_release(struct kref *ref) { struct batadv_bla_claim *claim; struct batadv_bla_backbone_gw *old_backbone_gw; claim = container_of(ref, struct batadv_bla_claim, refcount); spin_lock_bh(&claim->backbone_lock); old_backbone_gw = claim->backbone_gw; claim->backbone_gw = NULL; spin_unlock_bh(&claim->backbone_lock); spin_lock_bh(&old_backbone_gw->crc_lock); old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&old_backbone_gw->crc_lock); batadv_backbone_gw_put(old_backbone_gw); kfree_rcu(claim, rcu); } /** * batadv_claim_put() - decrement the claim refcounter and possibly release it * @claim: claim to be free'd */ static void batadv_claim_put(struct batadv_bla_claim *claim) { if (!claim) return; kref_put(&claim->refcount, batadv_claim_release); } /** * batadv_claim_hash_find() - looks for a claim in the claim hash * @bat_priv: the bat priv with all the mesh interface information * @data: search data (may be local/static data) * * Return: claim if found or NULL otherwise. */ static struct batadv_bla_claim * batadv_claim_hash_find(struct batadv_priv *bat_priv, struct batadv_bla_claim *data) { struct batadv_hashtable *hash = bat_priv->bla.claim_hash; struct hlist_head *head; struct batadv_bla_claim *claim; struct batadv_bla_claim *claim_tmp = NULL; int index; if (!hash) return NULL; index = batadv_choose_claim(data, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { if (!batadv_compare_claim(&claim->hash_entry, data)) continue; if (!kref_get_unless_zero(&claim->refcount)) continue; claim_tmp = claim; break; } rcu_read_unlock(); return claim_tmp; } /** * batadv_backbone_hash_find() - looks for a backbone gateway in the hash * @bat_priv: the bat priv with all the mesh interface information * @addr: the address of the originator * @vid: the VLAN ID * * Return: backbone gateway if found or NULL otherwise */ static struct batadv_bla_backbone_gw * batadv_backbone_hash_find(struct batadv_priv *bat_priv, const u8 *addr, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; struct batadv_bla_backbone_gw search_entry, *backbone_gw; struct batadv_bla_backbone_gw *backbone_gw_tmp = NULL; int index; if (!hash) return NULL; ether_addr_copy(search_entry.orig, addr); search_entry.vid = vid; index = batadv_choose_backbone_gw(&search_entry, hash->size); head = &hash->table[index]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_backbone_gw(&backbone_gw->hash_entry, &search_entry)) continue; if (!kref_get_unless_zero(&backbone_gw->refcount)) continue; backbone_gw_tmp = backbone_gw; break; } rcu_read_unlock(); return backbone_gw_tmp; } /** * batadv_bla_del_backbone_claims() - delete all claims for a backbone * @backbone_gw: backbone gateway where the claims should be removed */ static void batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_hashtable *hash; struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_bla_claim *claim; int i; spinlock_t *list_lock; /* protects write access to the hash lists */ hash = backbone_gw->bat_priv->bla.claim_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(claim, node_tmp, head, hash_entry) { if (claim->backbone_gw != backbone_gw) continue; batadv_claim_put(claim); hlist_del_rcu(&claim->hash_entry); } spin_unlock_bh(list_lock); } /* all claims gone, initialize CRC */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc = BATADV_BLA_CRC_INIT; spin_unlock_bh(&backbone_gw->crc_lock); } /** * batadv_bla_send_claim() - sends a claim frame according to the provided info * @bat_priv: the bat priv with all the mesh interface information * @mac: the mac address to be announced within the claim * @vid: the VLAN ID * @claimtype: the type of the claim (CLAIM, UNCLAIM, ANNOUNCE, ...) */ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, const u8 *mac, unsigned short vid, int claimtype) { struct sk_buff *skb; struct ethhdr *ethhdr; struct batadv_hard_iface *primary_if; struct net_device *mesh_iface; u8 *hw_src; struct batadv_bla_claim_dst local_claim_dest; __be32 zeroip = 0; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return; memcpy(&local_claim_dest, &bat_priv->bla.claim_dest, sizeof(local_claim_dest)); local_claim_dest.type = claimtype; mesh_iface = primary_if->mesh_iface; skb = arp_create(ARPOP_REPLY, ETH_P_ARP, /* IP DST: 0.0.0.0 */ zeroip, primary_if->mesh_iface, /* IP SRC: 0.0.0.0 */ zeroip, /* Ethernet DST: Broadcast */ NULL, /* Ethernet SRC/HW SRC: originator mac */ primary_if->net_dev->dev_addr, /* HW DST: FF:43:05:XX:YY:YY * with XX = claim type * and YY:YY = group id */ (u8 *)&local_claim_dest); if (!skb) goto out; ethhdr = (struct ethhdr *)skb->data; hw_src = (u8 *)ethhdr + ETH_HLEN + sizeof(struct arphdr); /* now we pretend that the client would have sent this ... */ switch (claimtype) { case BATADV_CLAIM_TYPE_CLAIM: /* normal claim frame * set Ethernet SRC to the clients mac */ ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): CLAIM %pM on vid %d\n", __func__, mac, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_UNCLAIM: /* unclaim frame * set HW SRC to the clients mac */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): UNCLAIM %pM on vid %d\n", __func__, mac, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame * set HW SRC to the special mac containing the crc */ ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE of %pM on vid %d\n", __func__, ethhdr->h_source, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_REQUEST: /* request frame * set HW SRC and header destination to the receiving backbone * gws mac */ ether_addr_copy(hw_src, mac); ether_addr_copy(ethhdr->h_dest, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): REQUEST of %pM to %pM on vid %d\n", __func__, ethhdr->h_source, ethhdr->h_dest, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_LOOPDETECT: ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): LOOPDETECT of %pM to %pM on vid %d\n", __func__, ethhdr->h_source, ethhdr->h_dest, batadv_print_vid(vid)); break; } if (vid & BATADV_VLAN_HAS_TAG) { skb = vlan_insert_tag(skb, htons(ETH_P_8021Q), vid & VLAN_VID_MASK); if (!skb) goto out; } skb_reset_mac_header(skb); skb->protocol = eth_type_trans(skb, mesh_iface); batadv_inc_counter(bat_priv, BATADV_CNT_RX); batadv_add_counter(bat_priv, BATADV_CNT_RX_BYTES, skb->len + ETH_HLEN); netif_rx(skb); out: batadv_hardif_put(primary_if); } /** * batadv_bla_loopdetect_report() - worker for reporting the loop * @work: work queue item * * Throws an uevent, as the loopdetect check function can't do that itself * since the kernel may sleep while throwing uevents. */ static void batadv_bla_loopdetect_report(struct work_struct *work) { struct batadv_bla_backbone_gw *backbone_gw; struct batadv_priv *bat_priv; char vid_str[6] = { '\0' }; backbone_gw = container_of(work, struct batadv_bla_backbone_gw, report_work); bat_priv = backbone_gw->bat_priv; batadv_info(bat_priv->mesh_iface, "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", batadv_print_vid(backbone_gw->vid)); snprintf(vid_str, sizeof(vid_str), "%d", batadv_print_vid(backbone_gw->vid)); vid_str[sizeof(vid_str) - 1] = 0; batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT, vid_str); batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_get_backbone_gw() - finds or creates a backbone gateway * @bat_priv: the bat priv with all the mesh interface information * @orig: the mac address of the originator * @vid: the VLAN ID * @own_backbone: set if the requested backbone is local * * Return: the (possibly created) backbone gateway or NULL on error */ static struct batadv_bla_backbone_gw * batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, const u8 *orig, unsigned short vid, bool own_backbone) { struct batadv_bla_backbone_gw *entry; struct batadv_orig_node *orig_node; int hash_added; entry = batadv_backbone_hash_find(bat_priv, orig, vid); if (entry) return entry; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): not found (%pM, %d), creating new entry\n", __func__, orig, batadv_print_vid(vid)); entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return NULL; entry->vid = vid; entry->lasttime = jiffies; entry->crc = BATADV_BLA_CRC_INIT; entry->bat_priv = bat_priv; spin_lock_init(&entry->crc_lock); atomic_set(&entry->request_sent, 0); atomic_set(&entry->wait_periods, 0); ether_addr_copy(entry->orig, orig); INIT_WORK(&entry->report_work, batadv_bla_loopdetect_report); kref_init(&entry->refcount); kref_get(&entry->refcount); hash_added = batadv_hash_add(bat_priv->bla.backbone_hash, batadv_compare_backbone_gw, batadv_choose_backbone_gw, entry, &entry->hash_entry); if (unlikely(hash_added != 0)) { /* hash failed, free the structure */ kfree(entry); return NULL; } /* this is a gateway now, remove any TT entry on this VLAN */ orig_node = batadv_orig_hash_find(bat_priv, orig); if (orig_node) { batadv_tt_global_del_orig(bat_priv, orig_node, vid, "became a backbone gateway"); batadv_orig_node_put(orig_node); } if (own_backbone) { batadv_bla_send_announce(bat_priv, entry); /* this will be decreased in the worker thread */ atomic_inc(&entry->request_sent); atomic_set(&entry->wait_periods, BATADV_BLA_WAIT_PERIODS); atomic_inc(&bat_priv->bla.num_requests); } return entry; } /** * batadv_bla_update_own_backbone_gw() - updates the own backbone gw for a VLAN * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the selected primary interface * @vid: VLAN identifier * * update or add the own backbone gw to make sure we announce * where we receive other backbone gws */ static void batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, vid, true); if (unlikely(!backbone_gw)) return; backbone_gw->lasttime = jiffies; batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_answer_request() - answer a bla request by sending own claims * @bat_priv: the bat priv with all the mesh interface information * @primary_if: interface where the request came on * @vid: the vid where the request came on * * Repeat all of our own claims, and finally send an ANNOUNCE frame * to allow the requester another check if the CRC is correct now. */ static void batadv_bla_answer_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, unsigned short vid) { struct hlist_head *head; struct batadv_hashtable *hash; struct batadv_bla_claim *claim; struct batadv_bla_backbone_gw *backbone_gw; int i; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): received a claim request, send all of our own claims again\n", __func__); backbone_gw = batadv_backbone_hash_find(bat_priv, primary_if->net_dev->dev_addr, vid); if (!backbone_gw) return; hash = bat_priv->bla.claim_hash; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { /* only own claims are interesting */ if (claim->backbone_gw != backbone_gw) continue; batadv_bla_send_claim(bat_priv, claim->addr, claim->vid, BATADV_CLAIM_TYPE_CLAIM); } rcu_read_unlock(); } /* finally, send an announcement frame */ batadv_bla_send_announce(bat_priv, backbone_gw); batadv_backbone_gw_put(backbone_gw); } /** * batadv_bla_send_request() - send a request to repeat claims * @backbone_gw: the backbone gateway from whom we are out of sync * * When the crc is wrong, ask the backbone gateway for a full table update. * After the request, it will repeat all of his own claims and finally * send an announcement claim with which we can check again. */ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw) { /* first, remove all old entries */ batadv_bla_del_backbone_claims(backbone_gw); batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "Sending REQUEST to %pM\n", backbone_gw->orig); /* send request */ batadv_bla_send_claim(backbone_gw->bat_priv, backbone_gw->orig, backbone_gw->vid, BATADV_CLAIM_TYPE_REQUEST); /* no local broadcasts should be sent or received, for now. */ if (!atomic_read(&backbone_gw->request_sent)) { atomic_inc(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 1); } } /** * batadv_bla_send_announce() - Send an announcement frame * @bat_priv: the bat priv with all the mesh interface information * @backbone_gw: our backbone gateway which should be announced */ static void batadv_bla_send_announce(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { u8 mac[ETH_ALEN]; __be16 crc; memcpy(mac, batadv_announce_mac, 4); spin_lock_bh(&backbone_gw->crc_lock); crc = htons(backbone_gw->crc); spin_unlock_bh(&backbone_gw->crc_lock); memcpy(&mac[4], &crc, 2); batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid, BATADV_CLAIM_TYPE_ANNOUNCE); } /** * batadv_bla_add_claim() - Adds a claim in the claim hash * @bat_priv: the bat priv with all the mesh interface information * @mac: the mac address of the claim * @vid: the VLAN ID of the frame * @backbone_gw: the backbone gateway which claims it */ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid, struct batadv_bla_backbone_gw *backbone_gw) { struct batadv_bla_backbone_gw *old_backbone_gw; struct batadv_bla_claim *claim; struct batadv_bla_claim search_claim; bool remove_crc = false; int hash_added; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* create a new claim entry if it does not exist yet. */ if (!claim) { claim = kzalloc(sizeof(*claim), GFP_ATOMIC); if (!claim) return; ether_addr_copy(claim->addr, mac); spin_lock_init(&claim->backbone_lock); claim->vid = vid; claim->lasttime = jiffies; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; kref_init(&claim->refcount); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): adding new entry %pM, vid %d to hash ...\n", __func__, mac, batadv_print_vid(vid)); kref_get(&claim->refcount); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim, &claim->hash_entry); if (unlikely(hash_added != 0)) { /* only local changes happened. */ kfree(claim); return; } } else { claim->lasttime = jiffies; if (claim->backbone_gw == backbone_gw) /* no need to register a new backbone */ goto claim_free_ref; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): changing ownership for %pM, vid %d to gw %pM\n", __func__, mac, batadv_print_vid(vid), backbone_gw->orig); remove_crc = true; } /* replace backbone_gw atomically and adjust reference counters */ spin_lock_bh(&claim->backbone_lock); old_backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); claim->backbone_gw = backbone_gw; spin_unlock_bh(&claim->backbone_lock); if (remove_crc) { /* remove claim address from old backbone_gw */ spin_lock_bh(&old_backbone_gw->crc_lock); old_backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&old_backbone_gw->crc_lock); } batadv_backbone_gw_put(old_backbone_gw); /* add claim address to new backbone_gw */ spin_lock_bh(&backbone_gw->crc_lock); backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN); spin_unlock_bh(&backbone_gw->crc_lock); backbone_gw->lasttime = jiffies; claim_free_ref: batadv_claim_put(claim); } /** * batadv_bla_claim_get_backbone_gw() - Get valid reference for backbone_gw of * claim * @claim: claim whose backbone_gw should be returned * * Return: valid reference to claim::backbone_gw */ static struct batadv_bla_backbone_gw * batadv_bla_claim_get_backbone_gw(struct batadv_bla_claim *claim) { struct batadv_bla_backbone_gw *backbone_gw; spin_lock_bh(&claim->backbone_lock); backbone_gw = claim->backbone_gw; kref_get(&backbone_gw->refcount); spin_unlock_bh(&claim->backbone_lock); return backbone_gw; } /** * batadv_bla_del_claim() - delete a claim from the claim hash * @bat_priv: the bat priv with all the mesh interface information * @mac: mac address of the claim to be removed * @vid: VLAN id for the claim to be removed */ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, const u8 *mac, const unsigned short vid) { struct batadv_bla_claim search_claim, *claim; struct batadv_bla_claim *claim_removed_entry; struct hlist_node *claim_removed_node; ether_addr_copy(search_claim.addr, mac); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) return; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__, mac, batadv_print_vid(vid)); claim_removed_node = batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); if (!claim_removed_node) goto free_claim; /* reference from the hash is gone */ claim_removed_entry = hlist_entry(claim_removed_node, struct batadv_bla_claim, hash_entry); batadv_claim_put(claim_removed_entry); free_claim: /* don't need the reference from hash_find() anymore */ batadv_claim_put(claim); } /** * batadv_handle_announce() - check for ANNOUNCE frame * @bat_priv: the bat priv with all the mesh interface information * @an_addr: announcement mac address (ARP Sender HW address) * @backbone_addr: originator address of the sender (Ethernet source MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, u8 *backbone_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; u16 backbone_crc, crc; if (memcmp(an_addr, batadv_announce_mac, 4) != 0) return false; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) return true; /* handle as ANNOUNCE frame */ backbone_gw->lasttime = jiffies; crc = ntohs(*((__force __be16 *)(&an_addr[4]))); batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", __func__, batadv_print_vid(vid), backbone_gw->orig, crc); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); if (backbone_crc != crc) { batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "%s(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", __func__, backbone_gw->orig, batadv_print_vid(backbone_gw->vid), backbone_crc, crc); batadv_bla_send_request(backbone_gw); } else { /* if we have sent a request and the crc was OK, * we can allow traffic again. */ if (atomic_read(&backbone_gw->request_sent)) { atomic_dec(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 0); } } batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_handle_request() - check for REQUEST frame * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: backbone address to be requested (ARP sender HW MAC) * @ethhdr: ethernet header of a packet * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_request(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *backbone_addr, struct ethhdr *ethhdr, unsigned short vid) { /* check for REQUEST frame */ if (!batadv_compare_eth(backbone_addr, ethhdr->h_dest)) return false; /* sanity check, this should not happen on a normal switch, * we ignore it in this case. */ if (!batadv_compare_eth(ethhdr->h_dest, primary_if->net_dev->dev_addr)) return true; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): REQUEST vid %d (sent by %pM)...\n", __func__, batadv_print_vid(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); return true; } /** * batadv_handle_unclaim() - check for UNCLAIM frame * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: originator address of the backbone (Ethernet source) * @claim_addr: Client to be unclaimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; /* unclaim in any case if it is our own */ if (primary_if && batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) batadv_bla_send_claim(bat_priv, claim_addr, vid, BATADV_CLAIM_TYPE_UNCLAIM); backbone_gw = batadv_backbone_hash_find(bat_priv, backbone_addr, vid); if (!backbone_gw) return true; /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): UNCLAIM %pM on vid %d (sent by %pM)...\n", __func__, claim_addr, batadv_print_vid(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_handle_claim() - check for CLAIM frame * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @backbone_addr: originator address of the backbone (Ethernet Source) * @claim_addr: client mac address to be claimed (ARP sender HW MAC) * @vid: the VLAN ID of the frame * * Return: true if handled */ static bool batadv_handle_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, const u8 *backbone_addr, const u8 *claim_addr, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; /* register the gateway if not yet available, and add the claim. */ backbone_gw = batadv_bla_get_backbone_gw(bat_priv, backbone_addr, vid, false); if (unlikely(!backbone_gw)) return true; /* this must be a CLAIM frame */ batadv_bla_add_claim(bat_priv, claim_addr, vid, backbone_gw); if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) batadv_bla_send_claim(bat_priv, claim_addr, vid, BATADV_CLAIM_TYPE_CLAIM); /* TODO: we could call something like tt_local_del() here. */ batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_check_claim_group() - check for claim group membership * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary interface of this batman interface * @hw_src: the Hardware source in the ARP Header * @hw_dst: the Hardware destination in the ARP Header * @ethhdr: pointer to the Ethernet header of the claim frame * * checks if it is a claim packet and if it's on the same group. * This function also applies the group ID of the sender * if it is in the same mesh. * * Return: * 2 - if it is a claim packet and on the same group * 1 - if is a claim packet from another group * 0 - if it is not a claim packet */ static int batadv_check_claim_group(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, u8 *hw_src, u8 *hw_dst, struct ethhdr *ethhdr) { u8 *backbone_addr; struct batadv_orig_node *orig_node; struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; /* if announcement packet, use the source, * otherwise assume it is in the hw_src */ switch (bla_dst->type) { case BATADV_CLAIM_TYPE_CLAIM: backbone_addr = hw_src; break; case BATADV_CLAIM_TYPE_REQUEST: case BATADV_CLAIM_TYPE_ANNOUNCE: case BATADV_CLAIM_TYPE_UNCLAIM: backbone_addr = ethhdr->h_source; break; default: return 0; } /* don't accept claim frames from ourselves */ if (batadv_compare_eth(backbone_addr, primary_if->net_dev->dev_addr)) return 0; /* if its already the same group, it is fine. */ if (bla_dst->group == bla_dst_own->group) return 2; /* lets see if this originator is in our mesh */ orig_node = batadv_orig_hash_find(bat_priv, backbone_addr); /* don't accept claims from gateways which are not in * the same mesh or group. */ if (!orig_node) return 1; /* if our mesh friends mac is bigger, use it for ourselves. */ if (ntohs(bla_dst->group) > ntohs(bla_dst_own->group)) { batadv_dbg(BATADV_DBG_BLA, bat_priv, "taking other backbones claim group: %#.4x\n", ntohs(bla_dst->group)); bla_dst_own->group = bla_dst->group; } batadv_orig_node_put(orig_node); return 2; } /** * batadv_bla_process_claim() - Check if this is a claim frame, and process it * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the primary hard interface of this batman mesh interface * @skb: the frame to be checked * * Return: true if it was a claim frame, otherwise return false to * tell the callee that it can use the frame on its own. */ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct sk_buff *skb) { struct batadv_bla_claim_dst *bla_dst, *bla_dst_own; u8 *hw_src, *hw_dst; struct vlan_hdr *vhdr, vhdr_buf; struct ethhdr *ethhdr; struct arphdr *arphdr; unsigned short vid; int vlan_depth = 0; __be16 proto; int headlen; int ret; vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); proto = ethhdr->h_proto; headlen = ETH_HLEN; if (vid & BATADV_VLAN_HAS_TAG) { /* Traverse the VLAN/Ethertypes. * * At this point it is known that the first protocol is a VLAN * header, so start checking at the encapsulated protocol. * * The depth of the VLAN headers is recorded to drop BLA claim * frames encapsulated into multiple VLAN headers (QinQ). */ do { vhdr = skb_header_pointer(skb, headlen, VLAN_HLEN, &vhdr_buf); if (!vhdr) return false; proto = vhdr->h_vlan_encapsulated_proto; headlen += VLAN_HLEN; vlan_depth++; } while (proto == htons(ETH_P_8021Q)); } if (proto != htons(ETH_P_ARP)) return false; /* not a claim frame */ /* this must be a ARP frame. check if it is a claim. */ if (unlikely(!pskb_may_pull(skb, headlen + arp_hdr_len(skb->dev)))) return false; /* pskb_may_pull() may have modified the pointers, get ethhdr again */ ethhdr = eth_hdr(skb); arphdr = (struct arphdr *)((u8 *)ethhdr + headlen); /* Check whether the ARP frame carries a valid * IP information */ if (arphdr->ar_hrd != htons(ARPHRD_ETHER)) return false; if (arphdr->ar_pro != htons(ETH_P_IP)) return false; if (arphdr->ar_hln != ETH_ALEN) return false; if (arphdr->ar_pln != 4) return false; hw_src = (u8 *)arphdr + sizeof(struct arphdr); hw_dst = hw_src + ETH_ALEN + 4; bla_dst = (struct batadv_bla_claim_dst *)hw_dst; bla_dst_own = &bat_priv->bla.claim_dest; /* check if it is a claim frame in general */ if (memcmp(bla_dst->magic, bla_dst_own->magic, sizeof(bla_dst->magic)) != 0) return false; /* check if there is a claim frame encapsulated deeper in (QinQ) and * drop that, as this is not supported by BLA but should also not be * sent via the mesh. */ if (vlan_depth > 1) return true; /* Let the loopdetect frames on the mesh in any case. */ if (bla_dst->type == BATADV_CLAIM_TYPE_LOOPDETECT) return false; /* check if it is a claim frame. */ ret = batadv_check_claim_group(bat_priv, primary_if, hw_src, hw_dst, ethhdr); if (ret == 1) batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); if (ret < 2) return !!ret; /* become a backbone gw ourselves on this vlan if not happened yet */ batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); /* check for the different types of claim frames ... */ switch (bla_dst->type) { case BATADV_CLAIM_TYPE_CLAIM: if (batadv_handle_claim(bat_priv, primary_if, hw_src, ethhdr->h_source, vid)) return true; break; case BATADV_CLAIM_TYPE_UNCLAIM: if (batadv_handle_unclaim(bat_priv, primary_if, ethhdr->h_source, hw_src, vid)) return true; break; case BATADV_CLAIM_TYPE_ANNOUNCE: if (batadv_handle_announce(bat_priv, hw_src, ethhdr->h_source, vid)) return true; break; case BATADV_CLAIM_TYPE_REQUEST: if (batadv_handle_request(bat_priv, primary_if, hw_src, ethhdr, vid)) return true; break; } batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): ERROR - this looks like a claim frame, but is useless. eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", __func__, ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); return true; } /** * batadv_bla_purge_backbone_gw() - Remove backbone gateways after a timeout or * immediately * @bat_priv: the bat priv with all the mesh interface information * @now: whether the whole hash shall be wiped now * * Check when we last heard from other nodes, and remove them in case of * a time out, or clean all backbone gws if now is set. */ static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now) { struct batadv_bla_backbone_gw *backbone_gw; struct hlist_node *node_tmp; struct hlist_head *head; struct batadv_hashtable *hash; spinlock_t *list_lock; /* protects write access to the hash lists */ int i; hash = bat_priv->bla.backbone_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; list_lock = &hash->list_locks[i]; spin_lock_bh(list_lock); hlist_for_each_entry_safe(backbone_gw, node_tmp, head, hash_entry) { if (now) goto purge_now; if (!batadv_has_timed_out(backbone_gw->lasttime, BATADV_BLA_BACKBONE_TIMEOUT)) continue; batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "%s(): backbone gw %pM timed out\n", __func__, backbone_gw->orig); purge_now: /* don't wait for the pending request anymore */ if (atomic_read(&backbone_gw->request_sent)) atomic_dec(&bat_priv->bla.num_requests); batadv_bla_del_backbone_claims(backbone_gw); hlist_del_rcu(&backbone_gw->hash_entry); batadv_backbone_gw_put(backbone_gw); } spin_unlock_bh(list_lock); } } /** * batadv_bla_purge_claims() - Remove claims after a timeout or immediately * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the selected primary interface, may be NULL if now is set * @now: whether the whole hash shall be wiped now * * Check when we heard last time from our own claims, and remove them in case of * a time out, or clean all claims if now is set */ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, int now) { struct batadv_bla_backbone_gw *backbone_gw; struct batadv_bla_claim *claim; struct hlist_head *head; struct batadv_hashtable *hash; int i; hash = bat_priv->bla.claim_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(claim, head, hash_entry) { backbone_gw = batadv_bla_claim_get_backbone_gw(claim); if (now) goto purge_now; if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) goto skip; if (!batadv_has_timed_out(claim->lasttime, BATADV_BLA_CLAIM_TIMEOUT)) goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): timed out.\n", __func__); purge_now: batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): %pM, vid %d\n", __func__, claim->addr, claim->vid); batadv_handle_unclaim(bat_priv, primary_if, backbone_gw->orig, claim->addr, claim->vid); skip: batadv_backbone_gw_put(backbone_gw); } rcu_read_unlock(); } } /** * batadv_bla_update_orig_address() - Update the backbone gateways when the own * originator address changes * @bat_priv: the bat priv with all the mesh interface information * @primary_if: the new selected primary_if * @oldif: the old primary interface, may be NULL */ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv, struct batadv_hard_iface *primary_if, struct batadv_hard_iface *oldif) { struct batadv_bla_backbone_gw *backbone_gw; struct hlist_head *head; struct batadv_hashtable *hash; __be16 group; int i; /* reset bridge loop avoidance group id */ group = htons(crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN)); bat_priv->bla.claim_dest.group = group; /* purge everything when bridge loop avoidance is turned off */ if (!atomic_read(&bat_priv->bridge_loop_avoidance)) oldif = NULL; if (!oldif) { batadv_bla_purge_claims(bat_priv, NULL, 1); batadv_bla_purge_backbone_gw(bat_priv, 1); return; } hash = bat_priv->bla.backbone_hash; if (!hash) return; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { /* own orig still holds the old value. */ if (!batadv_compare_eth(backbone_gw->orig, oldif->net_dev->dev_addr)) continue; ether_addr_copy(backbone_gw->orig, primary_if->net_dev->dev_addr); /* send an announce frame so others will ask for our * claims and update their tables. */ batadv_bla_send_announce(bat_priv, backbone_gw); } rcu_read_unlock(); } } /** * batadv_bla_send_loopdetect() - send a loopdetect frame * @bat_priv: the bat priv with all the mesh interface information * @backbone_gw: the backbone gateway for which a loop should be detected * * To detect loops that the bridge loop avoidance can't handle, send a loop * detection packet on the backbone. Unlike other BLA frames, this frame will * be allowed on the mesh by other nodes. If it is received on the mesh, this * indicates that there is a loop. */ static void batadv_bla_send_loopdetect(struct batadv_priv *bat_priv, struct batadv_bla_backbone_gw *backbone_gw) { batadv_dbg(BATADV_DBG_BLA, bat_priv, "Send loopdetect frame for vid %d\n", backbone_gw->vid); batadv_bla_send_claim(bat_priv, bat_priv->bla.loopdetect_addr, backbone_gw->vid, BATADV_CLAIM_TYPE_LOOPDETECT); } /** * batadv_bla_status_update() - purge bla interfaces if necessary * @net_dev: the mesh interface net device */ void batadv_bla_status_update(struct net_device *net_dev) { struct batadv_priv *bat_priv = netdev_priv(net_dev); struct batadv_hard_iface *primary_if; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return; /* this function already purges everything when bla is disabled, * so just call that one. */ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if); batadv_hardif_put(primary_if); } /** * batadv_bla_periodic_work() - performs periodic bla work * @work: kernel work struct * * periodic work to do: * * purge structures when they are too old * * send announcements */ static void batadv_bla_periodic_work(struct work_struct *work) { struct delayed_work *delayed_work; struct batadv_priv *bat_priv; struct batadv_priv_bla *priv_bla; struct hlist_head *head; struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hashtable *hash; struct batadv_hard_iface *primary_if; bool send_loopdetect = false; int i; delayed_work = to_delayed_work(work); priv_bla = container_of(delayed_work, struct batadv_priv_bla, work); bat_priv = container_of(priv_bla, struct batadv_priv, bla); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; batadv_bla_purge_claims(bat_priv, primary_if, 0); batadv_bla_purge_backbone_gw(bat_priv, 0); if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto out; if (atomic_dec_and_test(&bat_priv->bla.loopdetect_next)) { /* set a new random mac address for the next bridge loop * detection frames. Set the locally administered bit to avoid * collisions with users mac addresses. */ eth_random_addr(bat_priv->bla.loopdetect_addr); bat_priv->bla.loopdetect_addr[0] = 0xba; bat_priv->bla.loopdetect_addr[1] = 0xbe; bat_priv->bla.loopdetect_lasttime = jiffies; atomic_set(&bat_priv->bla.loopdetect_next, BATADV_BLA_LOOPDETECT_PERIODS); /* mark for sending loop detect on all VLANs */ send_loopdetect = true; } hash = bat_priv->bla.backbone_hash; if (!hash) goto out; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (!batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr)) continue; backbone_gw->lasttime = jiffies; batadv_bla_send_announce(bat_priv, backbone_gw); if (send_loopdetect) batadv_bla_send_loopdetect(bat_priv, backbone_gw); /* request_sent is only set after creation to avoid * problems when we are not yet known as backbone gw * in the backbone. * * We can reset this now after we waited some periods * to give bridge forward delays and bla group forming * some grace time. */ if (atomic_read(&backbone_gw->request_sent) == 0) continue; if (!atomic_dec_and_test(&backbone_gw->wait_periods)) continue; atomic_dec(&backbone_gw->bat_priv->bla.num_requests); atomic_set(&backbone_gw->request_sent, 0); } rcu_read_unlock(); } out: batadv_hardif_put(primary_if); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); } /* The hash for claim and backbone hash receive the same key because they * are getting initialized by hash_new with the same key. Reinitializing * them with to different keys to allow nested locking without generating * lockdep warnings */ static struct lock_class_key batadv_claim_hash_lock_class_key; static struct lock_class_key batadv_backbone_hash_lock_class_key; /** * batadv_bla_init() - initialize all bla structures * @bat_priv: the bat priv with all the mesh interface information * * Return: 0 on success, < 0 on error. */ int batadv_bla_init(struct batadv_priv *bat_priv) { int i; u8 claim_dest[ETH_ALEN] = {0xff, 0x43, 0x05, 0x00, 0x00, 0x00}; struct batadv_hard_iface *primary_if; u16 crc; unsigned long entrytime; spin_lock_init(&bat_priv->bla.bcast_duplist_lock); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hash registering\n"); /* setting claim destination address */ memcpy(&bat_priv->bla.claim_dest.magic, claim_dest, 3); bat_priv->bla.claim_dest.type = 0; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if) { crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN); bat_priv->bla.claim_dest.group = htons(crc); batadv_hardif_put(primary_if); } else { bat_priv->bla.claim_dest.group = 0; /* will be set later */ } /* initialize the duplicate list */ entrytime = jiffies - msecs_to_jiffies(BATADV_DUPLIST_TIMEOUT); for (i = 0; i < BATADV_DUPLIST_SIZE; i++) bat_priv->bla.bcast_duplist[i].entrytime = entrytime; bat_priv->bla.bcast_duplist_curr = 0; atomic_set(&bat_priv->bla.loopdetect_next, BATADV_BLA_LOOPDETECT_PERIODS); if (bat_priv->bla.claim_hash) return 0; bat_priv->bla.claim_hash = batadv_hash_new(128); if (!bat_priv->bla.claim_hash) return -ENOMEM; bat_priv->bla.backbone_hash = batadv_hash_new(32); if (!bat_priv->bla.backbone_hash) { batadv_hash_destroy(bat_priv->bla.claim_hash); return -ENOMEM; } batadv_hash_set_lock_class(bat_priv->bla.claim_hash, &batadv_claim_hash_lock_class_key); batadv_hash_set_lock_class(bat_priv->bla.backbone_hash, &batadv_backbone_hash_lock_class_key); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla hashes initialized\n"); INIT_DELAYED_WORK(&bat_priv->bla.work, batadv_bla_periodic_work); queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work, msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH)); return 0; } /** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked * @payload_ptr: pointer to position inside the head buffer of the skb * marking the start of the data to be CRC'ed * @orig: originator mac address, NULL if unknown * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * This is performed by checking the CRC, which will tell us * with a good chance that it is the same packet. If it is furthermore * sent by another host, drop it. We allow equal packets from * the same host however as this might be intended. * * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb, u8 *payload_ptr, const u8 *orig) { struct batadv_bcast_duplist_entry *entry; bool ret = false; int i, curr; __be32 crc; /* calculate the crc ... */ crc = batadv_skb_crc32(skb, payload_ptr); spin_lock_bh(&bat_priv->bla.bcast_duplist_lock); for (i = 0; i < BATADV_DUPLIST_SIZE; i++) { curr = (bat_priv->bla.bcast_duplist_curr + i); curr %= BATADV_DUPLIST_SIZE; entry = &bat_priv->bla.bcast_duplist[curr]; /* we can stop searching if the entry is too old ; * later entries will be even older */ if (batadv_has_timed_out(entry->entrytime, BATADV_DUPLIST_TIMEOUT)) break; if (entry->crc != crc) continue; /* are the originators both known and not anonymous? */ if (orig && !is_zero_ether_addr(orig) && !is_zero_ether_addr(entry->orig)) { /* If known, check if the new frame came from * the same originator: * We are safe to take identical frames from the * same orig, if known, as multiplications in * the mesh are detected via the (orig, seqno) pair. * So we can be a bit more liberal here and allow * identical frames from the same orig which the source * host might have sent multiple times on purpose. */ if (batadv_compare_eth(entry->orig, orig)) continue; } /* this entry seems to match: same crc, not too old, * and from another gw. therefore return true to forbid it. */ ret = true; goto out; } /* not found, add a new entry (overwrite the oldest entry) * and allow it, its the first occurrence. */ curr = (bat_priv->bla.bcast_duplist_curr + BATADV_DUPLIST_SIZE - 1); curr %= BATADV_DUPLIST_SIZE; entry = &bat_priv->bla.bcast_duplist[curr]; entry->crc = crc; entry->entrytime = jiffies; /* known originator */ if (orig) ether_addr_copy(entry->orig, orig); /* anonymous originator */ else eth_zero_addr(entry->orig); bat_priv->bla.bcast_duplist_curr = curr; out: spin_unlock_bh(&bat_priv->bla.bcast_duplist_lock); return ret; } /** * batadv_bla_check_ucast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked, decapsulated from a * unicast_packet * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * Return: true if a packet is in the duplicate list, false otherwise. */ static bool batadv_bla_check_ucast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { return batadv_bla_check_duplist(bat_priv, skb, (u8 *)skb->data, NULL); } /** * batadv_bla_check_bcast_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the bcast_packet to be checked * * Check if it is on our broadcast list. Another gateway might have sent the * same packet because it is connected to the same backbone, so we have to * remove this duplicate. * * Return: true if a packet is in the duplicate list, false otherwise. */ bool batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv, struct sk_buff *skb) { struct batadv_bcast_packet *bcast_packet; u8 *payload_ptr; bcast_packet = (struct batadv_bcast_packet *)skb->data; payload_ptr = (u8 *)(bcast_packet + 1); return batadv_bla_check_duplist(bat_priv, skb, payload_ptr, bcast_packet->orig); } /** * batadv_bla_is_backbone_gw_orig() - Check if the originator is a gateway for * the VLAN identified by vid. * @bat_priv: the bat priv with all the mesh interface information * @orig: originator mac address * @vid: VLAN identifier * * Return: true if orig is a backbone for this vid, false otherwise. */ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->bla.backbone_hash; struct hlist_head *head; struct batadv_bla_backbone_gw *backbone_gw; int i; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) return false; if (!hash) return false; for (i = 0; i < hash->size; i++) { head = &hash->table[i]; rcu_read_lock(); hlist_for_each_entry_rcu(backbone_gw, head, hash_entry) { if (batadv_compare_eth(backbone_gw->orig, orig) && backbone_gw->vid == vid) { rcu_read_unlock(); return true; } } rcu_read_unlock(); } return false; } /** * batadv_bla_is_backbone_gw() - check if originator is a backbone gw for a VLAN * @skb: the frame to be checked * @orig_node: the orig_node of the frame * @hdr_size: maximum length of the frame * * Return: true if the orig_node is also a gateway on the mesh interface, * otherwise it returns false. */ bool batadv_bla_is_backbone_gw(struct sk_buff *skb, struct batadv_orig_node *orig_node, int hdr_size) { struct batadv_bla_backbone_gw *backbone_gw; unsigned short vid; if (!atomic_read(&orig_node->bat_priv->bridge_loop_avoidance)) return false; /* first, find out the vid. */ if (!pskb_may_pull(skb, hdr_size + ETH_HLEN)) return false; vid = batadv_get_vid(skb, hdr_size); /* see if this originator is a backbone gw for this VLAN */ backbone_gw = batadv_backbone_hash_find(orig_node->bat_priv, orig_node->orig, vid); if (!backbone_gw) return false; batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_bla_free() - free all bla structures * @bat_priv: the bat priv with all the mesh interface information * * for meshinterface free or module unload */ void batadv_bla_free(struct batadv_priv *bat_priv) { struct batadv_hard_iface *primary_if; cancel_delayed_work_sync(&bat_priv->bla.work); primary_if = batadv_primary_if_get_selected(bat_priv); if (bat_priv->bla.claim_hash) { batadv_bla_purge_claims(bat_priv, primary_if, 1); batadv_hash_destroy(bat_priv->bla.claim_hash); bat_priv->bla.claim_hash = NULL; } if (bat_priv->bla.backbone_hash) { batadv_bla_purge_backbone_gw(bat_priv, 1); batadv_hash_destroy(bat_priv->bla.backbone_hash); bat_priv->bla.backbone_hash = NULL; } batadv_hardif_put(primary_if); } /** * batadv_bla_loopdetect_check() - check and handle a detected loop * @bat_priv: the bat priv with all the mesh interface information * @skb: the packet to check * @primary_if: interface where the request came on * @vid: the VLAN ID of the frame * * Checks if this packet is a loop detect frame which has been sent by us, * throws an uevent and logs the event if that is the case. * * Return: true if it is a loop detect frame which is to be dropped, false * otherwise. */ static bool batadv_bla_loopdetect_check(struct batadv_priv *bat_priv, struct sk_buff *skb, struct batadv_hard_iface *primary_if, unsigned short vid) { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; bool ret; ethhdr = eth_hdr(skb); /* Only check for the MAC address and skip more checks here for * performance reasons - this function is on the hotpath, after all. */ if (!batadv_compare_eth(ethhdr->h_source, bat_priv->bla.loopdetect_addr)) return false; /* If the packet came too late, don't forward it on the mesh * but don't consider that as loop. It might be a coincidence. */ if (batadv_has_timed_out(bat_priv->bla.loopdetect_lasttime, BATADV_BLA_LOOPDETECT_TIMEOUT)) return true; backbone_gw = batadv_bla_get_backbone_gw(bat_priv, primary_if->net_dev->dev_addr, vid, true); if (unlikely(!backbone_gw)) return true; ret = queue_work(batadv_event_workqueue, &backbone_gw->report_work); /* backbone_gw is unreferenced in the report work function * if queue_work() call was successful */ if (!ret) batadv_backbone_gw_put(backbone_gw); return true; } /** * batadv_bla_rx() - check packets coming from the mesh. * @bat_priv: the bat priv with all the mesh interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * @packet_type: the batman packet type this frame came in * * batadv_bla_rx avoidance checks if: * * we have to race for a claim * * if the frame is allowed on the LAN * * In these cases, the skb is further handled by this function * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. */ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid, int packet_type) { struct batadv_bla_backbone_gw *backbone_gw; struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_hard_iface *primary_if; bool own_claim; bool ret; ethhdr = eth_hdr(skb); primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto handled; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; if (batadv_bla_loopdetect_check(bat_priv, skb, primary_if, vid)) goto handled; if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow multicast packets while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest)) /* Both broadcast flooding or multicast-via-unicasts * delivery might send to multiple backbone gateways * sharing the same LAN and therefore need to coordinate * which backbone gateway forwards into the LAN, * by claiming the payload source address. * * Broadcast flooding and multicast-via-unicasts * delivery use the following two batman packet types. * Note: explicitly exclude BATADV_UNICAST_4ADDR, * as the DHCP gateway feature will send explicitly * to only one BLA gateway, so the claiming process * should be avoided there. */ if (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST) goto handled; /* potential duplicates from foreign BLA backbone gateways via * multicast-in-unicast packets */ if (is_multicast_ether_addr(ethhdr->h_dest) && packet_type == BATADV_UNICAST && batadv_bla_check_ucast_duplist(bat_priv, skb)) goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); if (!claim) { bool local = batadv_is_my_client(bat_priv, ethhdr->h_source, vid); /* possible optimization: race for a claim */ /* No claim exists yet, claim it for us! */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Unclaimed MAC %pM found. Claim it. Local: %s\n", __func__, ethhdr->h_source, str_yes_no(local)); batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } /* if it is our own claim ... */ backbone_gw = batadv_bla_claim_get_backbone_gw(claim); own_claim = batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr); batadv_backbone_gw_put(backbone_gw); if (own_claim) { /* ... allow it in any case */ claim->lasttime = jiffies; goto allow; } /* if it is a multicast ... */ if (is_multicast_ether_addr(ethhdr->h_dest) && (packet_type == BATADV_BCAST || packet_type == BATADV_UNICAST)) { /* ... drop it. the responsible gateway is in charge. * * We need to check packet type because with the gateway * feature, broadcasts (like DHCP requests) may be sent * using a unicast 4 address packet type. See comment above. */ goto handled; } else { /* seems the client considers us as its best gateway. * send a claim and update the claim table * immediately. */ batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); ret = false; goto out; handled: kfree_skb(skb); ret = true; out: batadv_hardif_put(primary_if); batadv_claim_put(claim); return ret; } /** * batadv_bla_tx() - check packets going into the mesh * @bat_priv: the bat priv with all the mesh interface information * @skb: the frame to be checked * @vid: the VLAN ID of the frame * * batadv_bla_tx checks if: * * a claim was received which has to be processed * * the frame is allowed on the mesh * * in these cases, the skb is further handled by this function. * * This call might reallocate skb data. * * Return: true if handled, otherwise it returns false and the caller shall * further process the skb. */ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, unsigned short vid) { struct ethhdr *ethhdr; struct batadv_bla_claim search_claim, *claim = NULL; struct batadv_bla_backbone_gw *backbone_gw; struct batadv_hard_iface *primary_if; bool client_roamed; bool ret = false; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) goto out; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) goto allow; if (batadv_bla_process_claim(bat_priv, primary_if, skb)) goto handled; ethhdr = eth_hdr(skb); if (unlikely(atomic_read(&bat_priv->bla.num_requests))) /* don't allow broadcasts while requests are in flight */ if (is_multicast_ether_addr(ethhdr->h_dest)) goto handled; ether_addr_copy(search_claim.addr, ethhdr->h_source); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* if no claim exists, allow it. */ if (!claim) goto allow; /* check if we are responsible. */ backbone_gw = batadv_bla_claim_get_backbone_gw(claim); client_roamed = batadv_compare_eth(backbone_gw->orig, primary_if->net_dev->dev_addr); batadv_backbone_gw_put(backbone_gw); if (client_roamed) { /* if yes, the client has roamed and we have * to unclaim it. */ if (batadv_has_timed_out(claim->lasttime, 100)) { /* only unclaim if the last claim entry is * older than 100 ms to make sure we really * have a roaming client here. */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Roaming client %pM detected. Unclaim it.\n", __func__, ethhdr->h_source); batadv_handle_unclaim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); goto allow; } else { batadv_dbg(BATADV_DBG_BLA, bat_priv, "%s(): Race for claim %pM detected. Drop packet.\n", __func__, ethhdr->h_source); goto handled; } } /* check if it is a multicast/broadcast frame */ if (is_multicast_ether_addr(ethhdr->h_dest)) { /* drop it. the responsible gateway has forwarded it into * the backbone network. */ goto handled; } else { /* we must allow it. at least if we are * responsible for the DESTINATION. */ goto allow; } allow: batadv_bla_update_own_backbone_gw(bat_priv, primary_if, vid); ret = false; goto out; handled: ret = true; out: batadv_hardif_put(primary_if); batadv_claim_put(claim); return ret; } /** * batadv_bla_claim_dump_entry() - dump one entry of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @claim: entry to dump * * Return: 0 or error code. */ static int batadv_bla_claim_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_claim *claim) { const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; void *hdr; int ret = -EINVAL; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_BLA_CLAIM); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); is_own = batadv_compare_eth(claim->backbone_gw->orig, primary_addr); spin_lock_bh(&claim->backbone_gw->crc_lock); backbone_crc = claim->backbone_gw->crc; spin_unlock_bh(&claim->backbone_gw->crc_lock); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_BLA_ADDRESS, ETH_ALEN, claim->addr) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, claim->vid) || nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, claim->backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: return ret; } /** * batadv_bla_claim_dump_bucket() - dump one bucket of the claim table * to a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int batadv_bla_claim_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_bla_claim *claim; int idx = 0; int ret = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(claim, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; ret = batadv_bla_claim_dump_entry(msg, portid, cb, primary_if, claim); if (ret) { *idx_skip = idx - 1; goto unlock; } } *idx_skip = 0; unlock: spin_unlock_bh(&hash->list_locks[bucket]); return ret; } /** * batadv_bla_claim_dump() - dump claim table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hash = bat_priv->bla.claim_hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_bla_claim_dump_bucket(msg, portid, cb, primary_if, hash, bucket, &idx)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(mesh_iface); return ret; } /** * batadv_bla_backbone_dump_entry() - dump one entry of the backbone table to a * netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @backbone_gw: entry to dump * * Return: 0 or error code. */ static int batadv_bla_backbone_dump_entry(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_bla_backbone_gw *backbone_gw) { const u8 *primary_addr = primary_if->net_dev->dev_addr; u16 backbone_crc; bool is_own; int msecs; void *hdr; int ret = -EINVAL; hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, &batadv_netlink_family, NLM_F_MULTI, BATADV_CMD_GET_BLA_BACKBONE); if (!hdr) { ret = -ENOBUFS; goto out; } genl_dump_check_consistent(cb, hdr); is_own = batadv_compare_eth(backbone_gw->orig, primary_addr); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); msecs = jiffies_to_msecs(jiffies - backbone_gw->lasttime); if (is_own) if (nla_put_flag(msg, BATADV_ATTR_BLA_OWN)) { genlmsg_cancel(msg, hdr); goto out; } if (nla_put(msg, BATADV_ATTR_BLA_BACKBONE, ETH_ALEN, backbone_gw->orig) || nla_put_u16(msg, BATADV_ATTR_BLA_VID, backbone_gw->vid) || nla_put_u16(msg, BATADV_ATTR_BLA_CRC, backbone_crc) || nla_put_u32(msg, BATADV_ATTR_LAST_SEEN_MSECS, msecs)) { genlmsg_cancel(msg, hdr); goto out; } genlmsg_end(msg, hdr); ret = 0; out: return ret; } /** * batadv_bla_backbone_dump_bucket() - dump one bucket of the backbone table to * a netlink socket * @msg: buffer for the message * @portid: netlink port * @cb: Control block containing additional options * @primary_if: primary interface * @hash: hash to dump * @bucket: bucket index to dump * @idx_skip: How many entries to skip * * Return: always 0. */ static int batadv_bla_backbone_dump_bucket(struct sk_buff *msg, u32 portid, struct netlink_callback *cb, struct batadv_hard_iface *primary_if, struct batadv_hashtable *hash, unsigned int bucket, int *idx_skip) { struct batadv_bla_backbone_gw *backbone_gw; int idx = 0; int ret = 0; spin_lock_bh(&hash->list_locks[bucket]); cb->seq = atomic_read(&hash->generation) << 1 | 1; hlist_for_each_entry(backbone_gw, &hash->table[bucket], hash_entry) { if (idx++ < *idx_skip) continue; ret = batadv_bla_backbone_dump_entry(msg, portid, cb, primary_if, backbone_gw); if (ret) { *idx_skip = idx - 1; goto unlock; } } *idx_skip = 0; unlock: spin_unlock_bh(&hash->list_locks[bucket]); return ret; } /** * batadv_bla_backbone_dump() - dump backbone table to a netlink socket * @msg: buffer for the message * @cb: callback structure containing arguments * * Return: message length. */ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) { struct batadv_hard_iface *primary_if = NULL; int portid = NETLINK_CB(cb->skb).portid; struct net_device *mesh_iface; struct batadv_hashtable *hash; struct batadv_priv *bat_priv; int bucket = cb->args[0]; int idx = cb->args[1]; int ret = 0; mesh_iface = batadv_netlink_get_meshif(cb); if (IS_ERR(mesh_iface)) return PTR_ERR(mesh_iface); bat_priv = netdev_priv(mesh_iface); hash = bat_priv->bla.backbone_hash; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if || primary_if->if_status != BATADV_IF_ACTIVE) { ret = -ENOENT; goto out; } while (bucket < hash->size) { if (batadv_bla_backbone_dump_bucket(msg, portid, cb, primary_if, hash, bucket, &idx)) break; bucket++; } cb->args[0] = bucket; cb->args[1] = idx; ret = msg->len; out: batadv_hardif_put(primary_if); dev_put(mesh_iface); return ret; } #ifdef CONFIG_BATMAN_ADV_DAT /** * batadv_bla_check_claim() - check if address is claimed * * @bat_priv: the bat priv with all the mesh interface information * @addr: mac address of which the claim status is checked * @vid: the VLAN ID * * addr is checked if this address is claimed by the local device itself. * * Return: true if bla is disabled or the mac is claimed by the device, * false if the device addr is already claimed by another gateway */ bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, unsigned short vid) { struct batadv_bla_claim search_claim; struct batadv_bla_claim *claim = NULL; struct batadv_hard_iface *primary_if = NULL; bool ret = true; if (!atomic_read(&bat_priv->bridge_loop_avoidance)) return ret; primary_if = batadv_primary_if_get_selected(bat_priv); if (!primary_if) return ret; /* First look if the mac address is claimed */ ether_addr_copy(search_claim.addr, addr); search_claim.vid = vid; claim = batadv_claim_hash_find(bat_priv, &search_claim); /* If there is a claim and we are not owner of the claim, * return false. */ if (claim) { if (!batadv_compare_eth(claim->backbone_gw->orig, primary_if->net_dev->dev_addr)) ret = false; batadv_claim_put(claim); } batadv_hardif_put(primary_if); return ret; } #endif
22 42 42 42 42 155 55 55 55 55 55 22 22 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 // SPDX-License-Identifier: GPL-2.0-or-later /* * A generic kernel FIFO implementation * * Copyright (C) 2009/2010 Stefani Seibold <stefani@seibold.net> */ #include <linux/dma-mapping.h> #include <linux/err.h> #include <linux/export.h> #include <linux/kfifo.h> #include <linux/log2.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/uaccess.h> /* * internal helper to calculate the unused elements in a fifo */ static inline unsigned int kfifo_unused(struct __kfifo *fifo) { return (fifo->mask + 1) - (fifo->in - fifo->out); } int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, size_t esize, gfp_t gfp_mask) { /* * round up to the next power of 2, since our 'let the indices * wrap' technique works only in this case. */ size = roundup_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; if (size < 2) { fifo->data = NULL; fifo->mask = 0; return -EINVAL; } fifo->data = kmalloc_array(esize, size, gfp_mask); if (!fifo->data) { fifo->mask = 0; return -ENOMEM; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_alloc); void __kfifo_free(struct __kfifo *fifo) { kfree(fifo->data); fifo->in = 0; fifo->out = 0; fifo->esize = 0; fifo->data = NULL; fifo->mask = 0; } EXPORT_SYMBOL(__kfifo_free); int __kfifo_init(struct __kfifo *fifo, void *buffer, unsigned int size, size_t esize) { size /= esize; if (!is_power_of_2(size)) size = rounddown_pow_of_two(size); fifo->in = 0; fifo->out = 0; fifo->esize = esize; fifo->data = buffer; if (size < 2) { fifo->mask = 0; return -EINVAL; } fifo->mask = size - 1; return 0; } EXPORT_SYMBOL(__kfifo_init); static void kfifo_copy_in(struct __kfifo *fifo, const void *src, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(fifo->data + off, src, l); memcpy(fifo->data, src + l, len - l); /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); } unsigned int __kfifo_in(struct __kfifo *fifo, const void *buf, unsigned int len) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; kfifo_copy_in(fifo, buf, len, fifo->in); fifo->in += len; return len; } EXPORT_SYMBOL(__kfifo_in); static void kfifo_copy_out(struct __kfifo *fifo, void *dst, unsigned int len, unsigned int off) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); memcpy(dst, fifo->data + off, l); memcpy(dst + l, fifo->data, len - l); /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); } unsigned int __kfifo_out_peek(struct __kfifo *fifo, void *buf, unsigned int len) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; kfifo_copy_out(fifo, buf, len, fifo->out); return len; } EXPORT_SYMBOL(__kfifo_out_peek); unsigned int __kfifo_out_linear(struct __kfifo *fifo, unsigned int *tail, unsigned int n) { unsigned int size = fifo->mask + 1; unsigned int off = fifo->out & fifo->mask; if (tail) *tail = off; return min3(n, fifo->in - fifo->out, size - off); } EXPORT_SYMBOL(__kfifo_out_linear); unsigned int __kfifo_out(struct __kfifo *fifo, void *buf, unsigned int len) { len = __kfifo_out_peek(fifo, buf, len); fifo->out += len; return len; } EXPORT_SYMBOL(__kfifo_out); static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, const void __user *from, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int l; unsigned long ret; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_from_user(fifo->data + off, from, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_from_user(fifo->data, from + l, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data in the fifo is up to date before * incrementing the fifo->in index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = kfifo_unused(fifo); if (len > l) len = l; ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->in += len; return err; } EXPORT_SYMBOL(__kfifo_from_user); static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, unsigned int len, unsigned int off, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } l = min(len, size - off); ret = copy_to_user(to, fifo->data + off, l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret + len - l, esize); else { ret = copy_to_user(to + l, fifo->data, len - l); if (unlikely(ret)) ret = DIV_ROUND_UP(ret, esize); } /* * make sure that the data is copied before * incrementing the fifo->out index counter */ smp_wmb(); *copied = len - ret * esize; /* return the number of elements which are not copied */ return ret; } int __kfifo_to_user(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied) { unsigned int l; unsigned long ret; unsigned int esize = fifo->esize; int err; if (esize != 1) len /= esize; l = fifo->in - fifo->out; if (len > l) len = l; ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); if (unlikely(ret)) { len -= ret; err = -EFAULT; } else err = 0; fifo->out += len; return err; } EXPORT_SYMBOL(__kfifo_to_user); static unsigned int setup_sgl_buf(struct __kfifo *fifo, struct scatterlist *sgl, unsigned int data_offset, int nents, unsigned int len, dma_addr_t dma) { const void *buf = fifo->data + data_offset; if (!nents || !len) return 0; sg_set_buf(sgl, buf, len); if (dma != DMA_MAPPING_ERROR) { sg_dma_address(sgl) = dma + data_offset; sg_dma_len(sgl) = len; } return 1; } static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, unsigned int off, dma_addr_t dma) { unsigned int size = fifo->mask + 1; unsigned int esize = fifo->esize; unsigned int len_to_end; unsigned int n; off &= fifo->mask; if (esize != 1) { off *= esize; size *= esize; len *= esize; } len_to_end = min(len, size - off); n = setup_sgl_buf(fifo, sgl, off, nents, len_to_end, dma); n += setup_sgl_buf(fifo, sgl + n, 0, nents - n, len - len_to_end, dma); return n; } unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = kfifo_unused(fifo); if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->in, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare); unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, dma_addr_t dma) { unsigned int l; l = fifo->in - fifo->out; if (len > l) len = l; return setup_sgl(fifo, sgl, nents, len, fifo->out, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare); unsigned int __kfifo_max_r(unsigned int len, size_t recsize) { unsigned int max = (1 << (recsize << 3)) - 1; if (len > max) return max; return len; } EXPORT_SYMBOL(__kfifo_max_r); #define __KFIFO_PEEK(data, out, mask) \ ((data)[(out) & (mask)]) /* * __kfifo_peek_n internal helper function for determinate the length of * the next record in the fifo */ static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) { unsigned int l; unsigned int mask = fifo->mask; unsigned char *data = fifo->data; l = __KFIFO_PEEK(data, fifo->out, mask); if (--recsize) l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; return l; } #define __KFIFO_POKE(data, in, mask, val) \ ( \ (data)[(in) & (mask)] = (unsigned char)(val) \ ) /* * __kfifo_poke_n internal helper function for storing the length of * the record into the fifo */ static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) { unsigned int mask = fifo->mask; unsigned char *data = fifo->data; __KFIFO_POKE(data, fifo->in, mask, n); if (recsize > 1) __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); } unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) { return __kfifo_peek_n(fifo, recsize); } EXPORT_SYMBOL(__kfifo_len_r); unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, unsigned int len, size_t recsize) { if (len + recsize > kfifo_unused(fifo)) return 0; __kfifo_poke_n(fifo, len, recsize); kfifo_copy_in(fifo, buf, len, fifo->in + recsize); fifo->in += len + recsize; return len; } EXPORT_SYMBOL(__kfifo_in_r); static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize, unsigned int *n) { *n = __kfifo_peek_n(fifo, recsize); if (len > *n) len = *n; kfifo_copy_out(fifo, buf, len, fifo->out + recsize); return len; } unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; return kfifo_out_copy_r(fifo, buf, len, recsize, &n); } EXPORT_SYMBOL(__kfifo_out_peek_r); unsigned int __kfifo_out_linear_r(struct __kfifo *fifo, unsigned int *tail, unsigned int n, size_t recsize) { if (fifo->in == fifo->out) return 0; if (tail) *tail = fifo->out + recsize; return min(n, __kfifo_peek_n(fifo, recsize)); } EXPORT_SYMBOL(__kfifo_out_linear_r); unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, unsigned int len, size_t recsize) { unsigned int n; if (fifo->in == fifo->out) return 0; len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); fifo->out += n + recsize; return len; } EXPORT_SYMBOL(__kfifo_out_r); void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) { unsigned int n; n = __kfifo_peek_n(fifo, recsize); fifo->out += n + recsize; } EXPORT_SYMBOL(__kfifo_skip_r); int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) { *copied = 0; return 0; } __kfifo_poke_n(fifo, len, recsize); ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->in += len + recsize; return 0; } EXPORT_SYMBOL(__kfifo_from_user_r); int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, unsigned long len, unsigned int *copied, size_t recsize) { unsigned long ret; unsigned int n; if (fifo->in == fifo->out) { *copied = 0; return 0; } n = __kfifo_peek_n(fifo, recsize); if (len > n) len = n; ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); if (unlikely(ret)) { *copied = 0; return -EFAULT; } fifo->out += n + recsize; return 0; } EXPORT_SYMBOL(__kfifo_to_user_r); unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > kfifo_unused(fifo)) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); void __kfifo_dma_in_finish_r(struct __kfifo *fifo, unsigned int len, size_t recsize) { len = __kfifo_max_r(len, recsize); __kfifo_poke_n(fifo, len, recsize); fifo->in += len + recsize; } EXPORT_SYMBOL(__kfifo_dma_in_finish_r); unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, struct scatterlist *sgl, int nents, unsigned int len, size_t recsize, dma_addr_t dma) { BUG_ON(!nents); len = __kfifo_max_r(len, recsize); if (len + recsize > fifo->in - fifo->out) return 0; return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize, dma); } EXPORT_SYMBOL(__kfifo_dma_out_prepare_r);
156 155 59 157 155 154 155 155 1 154 153 58 230 27 232 1 205 204 11 204 19 3 16 2 1 1 183 173 9 13 1 198 1 197 25 25 18 25 178 178 178 180 178 180 179 5 20 20 20 18 20 10 130 130 26 2 65 62 23 63 183 35 35 34 34 34 28 25 28 25 28 16 16 16 15 15 3 15 14 15 1 1 1 1 20 20 35 26 26 26 26 26 26 22 23 5 9 26 24 26 26 26 26 26 26 26 26 26 14 13 26 27 28 27 27 2 2 27 23 27 5 5 3 3 3 3 1 1 3 2 3 1 1 2 2 1 1 1 1 1 1 1 1 2 1 1 1 1 2 16 113 114 112 110 109 87 87 87 87 86 53 34 87 87 87 87 87 111 130 127 128 128 129 130 129 129 129 130 128 128 130 130 128 127 129 130 129 81 1104 81 244 1109 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 // SPDX-License-Identifier: GPL-2.0-only #include <net/netdev_lock.h> #include <net/netdev_queues.h> #include <net/sock.h> #include <linux/ethtool_netlink.h> #include <linux/phy_link_topology.h> #include <linux/pm_runtime.h> #include "netlink.h" #include "module_fw.h" static struct genl_family ethtool_genl_family; static bool ethnl_ok __read_mostly; static u32 ethnl_bcast_seq; #define ETHTOOL_FLAGS_BASIC (ETHTOOL_FLAG_COMPACT_BITSETS | \ ETHTOOL_FLAG_OMIT_REPLY) #define ETHTOOL_FLAGS_STATS (ETHTOOL_FLAGS_BASIC | ETHTOOL_FLAG_STATS) const struct nla_policy ethnl_header_policy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), }; const struct nla_policy ethnl_header_policy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), }; const struct nla_policy ethnl_header_policy_phy[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_BASIC), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; const struct nla_policy ethnl_header_policy_phy_stats[] = { [ETHTOOL_A_HEADER_DEV_INDEX] = { .type = NLA_U32 }, [ETHTOOL_A_HEADER_DEV_NAME] = { .type = NLA_NUL_STRING, .len = ALTIFNAMSIZ - 1 }, [ETHTOOL_A_HEADER_FLAGS] = NLA_POLICY_MASK(NLA_U32, ETHTOOL_FLAGS_STATS), [ETHTOOL_A_HEADER_PHY_INDEX] = NLA_POLICY_MIN(NLA_U32, 1), }; int ethnl_sock_priv_set(struct sk_buff *skb, struct net_device *dev, u32 portid, enum ethnl_sock_type type) { struct ethnl_sock_priv *sk_priv; sk_priv = genl_sk_priv_get(&ethtool_genl_family, NETLINK_CB(skb).sk); if (IS_ERR(sk_priv)) return PTR_ERR(sk_priv); sk_priv->dev = dev; sk_priv->portid = portid; sk_priv->type = type; return 0; } static void ethnl_sock_priv_destroy(void *priv) { struct ethnl_sock_priv *sk_priv = priv; switch (sk_priv->type) { case ETHTOOL_SOCK_TYPE_MODULE_FW_FLASH: ethnl_module_fw_flash_sock_destroy(sk_priv); break; default: break; } } int ethnl_ops_begin(struct net_device *dev) { int ret; if (!dev) return -ENODEV; if (dev->dev.parent) pm_runtime_get_sync(dev->dev.parent); netdev_ops_assert_locked(dev); if (!netif_device_present(dev) || dev->reg_state >= NETREG_UNREGISTERING) { ret = -ENODEV; goto err; } if (dev->ethtool_ops->begin) { ret = dev->ethtool_ops->begin(dev); if (ret) goto err; } return 0; err: if (dev->dev.parent) pm_runtime_put(dev->dev.parent); return ret; } void ethnl_ops_complete(struct net_device *dev) { if (dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); if (dev->dev.parent) pm_runtime_put(dev->dev.parent); } /** * ethnl_parse_header_dev_get() - parse request header * @req_info: structure to put results into * @header: nest attribute with request header * @net: request netns * @extack: netlink extack for error reporting * @require_dev: fail if no device identified in header * * Parse request header in nested attribute @nest and puts results into * the structure pointed to by @req_info. Extack from @info is used for error * reporting. If req_info->dev is not null on return, reference to it has * been taken. If error is returned, *req_info is null initialized and no * reference is held. * * Return: 0 on success or negative error code */ int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *header, struct net *net, struct netlink_ext_ack *extack, bool require_dev) { struct nlattr *tb[ARRAY_SIZE(ethnl_header_policy_phy)]; const struct nlattr *devname_attr; struct net_device *dev = NULL; u32 flags = 0; int ret; if (!header) { if (!require_dev) return 0; NL_SET_ERR_MSG(extack, "request header missing"); return -EINVAL; } /* No validation here, command policy should have a nested policy set * for the header, therefore validation should have already been done. */ ret = nla_parse_nested(tb, ARRAY_SIZE(ethnl_header_policy_phy) - 1, header, NULL, extack); if (ret < 0) return ret; if (tb[ETHTOOL_A_HEADER_FLAGS]) flags = nla_get_u32(tb[ETHTOOL_A_HEADER_FLAGS]); devname_attr = tb[ETHTOOL_A_HEADER_DEV_NAME]; if (tb[ETHTOOL_A_HEADER_DEV_INDEX]) { u32 ifindex = nla_get_u32(tb[ETHTOOL_A_HEADER_DEV_INDEX]); dev = netdev_get_by_index(net, ifindex, &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, tb[ETHTOOL_A_HEADER_DEV_INDEX], "no device matches ifindex"); return -ENODEV; } /* if both ifindex and ifname are passed, they must match */ if (devname_attr && strncmp(dev->name, nla_data(devname_attr), IFNAMSIZ)) { netdev_put(dev, &req_info->dev_tracker); NL_SET_ERR_MSG_ATTR(extack, header, "ifindex and name do not match"); return -ENODEV; } } else if (devname_attr) { dev = netdev_get_by_name(net, nla_data(devname_attr), &req_info->dev_tracker, GFP_KERNEL); if (!dev) { NL_SET_ERR_MSG_ATTR(extack, devname_attr, "no device matches name"); return -ENODEV; } } else if (require_dev) { NL_SET_ERR_MSG_ATTR(extack, header, "neither ifindex nor name specified"); return -EINVAL; } if (tb[ETHTOOL_A_HEADER_PHY_INDEX]) { if (dev) { req_info->phy_index = nla_get_u32(tb[ETHTOOL_A_HEADER_PHY_INDEX]); } else { NL_SET_ERR_MSG_ATTR(extack, header, "phy_index set without a netdev"); return -EINVAL; } } req_info->dev = dev; req_info->flags = flags; return 0; } struct phy_device *ethnl_req_get_phydev(const struct ethnl_req_info *req_info, struct nlattr **tb, unsigned int header, struct netlink_ext_ack *extack) { struct phy_device *phydev; ASSERT_RTNL(); if (!req_info->dev) return NULL; if (!req_info->phy_index) return req_info->dev->phydev; phydev = phy_link_topo_get_phy(req_info->dev, req_info->phy_index); if (!phydev && tb) { NL_SET_ERR_MSG_ATTR(extack, tb[header], "no phy matching phyindex"); return ERR_PTR(-ENODEV); } return phydev; } /** * ethnl_fill_reply_header() - Put common header into a reply message * @skb: skb with the message * @dev: network device to describe in header * @attrtype: attribute type to use for the nest * * Create a nested attribute with attributes describing given network device. * * Return: 0 on success, error value (-EMSGSIZE only) on error */ int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype) { struct nlattr *nest; if (!dev) return 0; nest = nla_nest_start(skb, attrtype); if (!nest) return -EMSGSIZE; if (nla_put_u32(skb, ETHTOOL_A_HEADER_DEV_INDEX, (u32)dev->ifindex) || nla_put_string(skb, ETHTOOL_A_HEADER_DEV_NAME, dev->name)) goto nla_put_failure; /* If more attributes are put into reply header, ethnl_header_size() * must be updated to account for them. */ nla_nest_end(skb, nest); return 0; nla_put_failure: nla_nest_cancel(skb, nest); return -EMSGSIZE; } /** * ethnl_reply_init() - Create skb for a reply and fill device identification * @payload: payload length (without netlink and genetlink header) * @dev: device the reply is about (may be null) * @cmd: ETHTOOL_MSG_* message type for reply * @hdr_attrtype: attribute type for common header * @info: genetlink info of the received packet we respond to * @ehdrp: place to store payload pointer returned by genlmsg_new() * * Return: pointer to allocated skb on success, NULL on error */ struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp) { struct sk_buff *skb; skb = genlmsg_new(payload, GFP_KERNEL); if (!skb) goto err; *ehdrp = genlmsg_put_reply(skb, info, &ethtool_genl_family, 0, cmd); if (!*ehdrp) goto err_free; if (dev) { int ret; ret = ethnl_fill_reply_header(skb, dev, hdr_attrtype); if (ret < 0) goto err_free; } return skb; err_free: nlmsg_free(skb); err: if (info) GENL_SET_ERR_MSG(info, "failed to setup reply message"); return NULL; } void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd) { return genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &ethtool_genl_family, 0, cmd); } void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd) { return genlmsg_put(skb, 0, ++ethnl_bcast_seq, &ethtool_genl_family, 0, cmd); } void *ethnl_unicast_put(struct sk_buff *skb, u32 portid, u32 seq, u8 cmd) { return genlmsg_put(skb, portid, seq, &ethtool_genl_family, 0, cmd); } int ethnl_multicast(struct sk_buff *skb, struct net_device *dev) { return genlmsg_multicast_netns(&ethtool_genl_family, dev_net(dev), skb, 0, ETHNL_MCGRP_MONITOR, GFP_KERNEL); } /* GET request helpers */ /** * struct ethnl_dump_ctx - context structure for generic dumpit() callback * @ops: request ops of currently processed message type * @req_info: parsed request header of processed request * @reply_data: data needed to compose the reply * @pos_ifindex: saved iteration position - ifindex * * These parameters are kept in struct netlink_callback as context preserved * between iterations. They are initialized by ethnl_default_start() and used * in ethnl_default_dumpit() and ethnl_default_done(). */ struct ethnl_dump_ctx { const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct ethnl_reply_data *reply_data; unsigned long pos_ifindex; }; /** * struct ethnl_perphy_dump_ctx - context for dumpit() PHY-aware callbacks * @ethnl_ctx: generic ethnl context * @ifindex: For Filtered DUMP requests, the ifindex of the targeted netdev * @pos_phyindex: iterator position for multi-msg DUMP */ struct ethnl_perphy_dump_ctx { struct ethnl_dump_ctx ethnl_ctx; unsigned int ifindex; unsigned long pos_phyindex; }; static const struct ethnl_request_ops * ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STRSET_GET] = &ethnl_strset_request_ops, [ETHTOOL_MSG_LINKINFO_GET] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKINFO_SET] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_GET] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKMODES_SET] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_LINKSTATE_GET] = &ethnl_linkstate_request_ops, [ETHTOOL_MSG_DEBUG_GET] = &ethnl_debug_request_ops, [ETHTOOL_MSG_DEBUG_SET] = &ethnl_debug_request_ops, [ETHTOOL_MSG_WOL_GET] = &ethnl_wol_request_ops, [ETHTOOL_MSG_WOL_SET] = &ethnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_GET] = &ethnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_GET] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_PRIVFLAGS_SET] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_GET] = &ethnl_rings_request_ops, [ETHTOOL_MSG_RINGS_SET] = &ethnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_GET] = &ethnl_channels_request_ops, [ETHTOOL_MSG_CHANNELS_SET] = &ethnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_GET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_COALESCE_SET] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = &ethnl_pause_request_ops, [ETHTOOL_MSG_PAUSE_SET] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_EEE_SET] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_GET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_FEC_SET] = &ethnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = &ethnl_tsinfo_request_ops, [ETHTOOL_MSG_MODULE_EEPROM_GET] = &ethnl_module_eeprom_request_ops, [ETHTOOL_MSG_STATS_GET] = &ethnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = &ethnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = &ethnl_module_request_ops, [ETHTOOL_MSG_MODULE_SET] = &ethnl_module_request_ops, [ETHTOOL_MSG_PSE_GET] = &ethnl_pse_request_ops, [ETHTOOL_MSG_PSE_SET] = &ethnl_pse_request_ops, [ETHTOOL_MSG_RSS_GET] = &ethnl_rss_request_ops, [ETHTOOL_MSG_PLCA_GET_CFG] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_SET_CFG] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_PLCA_GET_STATUS] = &ethnl_plca_status_request_ops, [ETHTOOL_MSG_MM_GET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_MM_SET] = &ethnl_mm_request_ops, [ETHTOOL_MSG_TSCONFIG_GET] = &ethnl_tsconfig_request_ops, [ETHTOOL_MSG_TSCONFIG_SET] = &ethnl_tsconfig_request_ops, [ETHTOOL_MSG_PHY_GET] = &ethnl_phy_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) { return (struct ethnl_dump_ctx *)cb->ctx; } static struct ethnl_perphy_dump_ctx * ethnl_perphy_dump_context(struct netlink_callback *cb) { return (struct ethnl_perphy_dump_ctx *)cb->ctx; } /** * ethnl_default_parse() - Parse request message * @req_info: pointer to structure to put data into * @info: genl_info from the request * @request_ops: struct request_ops for request type * @require_dev: fail if no device identified in header * * Parse universal request header and call request specific ->parse_request() * callback (if defined) to parse the rest of the message. * * Return: 0 on success or negative error code */ static int ethnl_default_parse(struct ethnl_req_info *req_info, const struct genl_info *info, const struct ethnl_request_ops *request_ops, bool require_dev) { struct nlattr **tb = info->attrs; int ret; ret = ethnl_parse_header_dev_get(req_info, tb[request_ops->hdr_attr], genl_info_net(info), info->extack, require_dev); if (ret < 0) return ret; if (request_ops->parse_request) { ret = request_ops->parse_request(req_info, tb, info->extack); if (ret < 0) return ret; } return 0; } /** * ethnl_init_reply_data() - Initialize reply data for GET request * @reply_data: pointer to embedded struct ethnl_reply_data * @ops: instance of struct ethnl_request_ops describing the layout * @dev: network device to initialize the reply for * * Fills the reply data part with zeros and sets the dev member. Must be called * before calling the ->fill_reply() callback (for each iteration when handling * dump requests). */ static void ethnl_init_reply_data(struct ethnl_reply_data *reply_data, const struct ethnl_request_ops *ops, struct net_device *dev) { memset(reply_data, 0, ops->reply_data_size); reply_data->dev = dev; } /* default ->doit() handler for GET type requests */ static int ethnl_default_doit(struct sk_buff *skb, struct genl_info *info) { struct ethnl_reply_data *reply_data = NULL; struct ethnl_req_info *req_info = NULL; const u8 cmd = info->genlhdr->cmd; const struct ethnl_request_ops *ops; int hdr_len, reply_len; struct sk_buff *rskb; void *reply_payload; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return -ENOMEM; } ret = ethnl_default_parse(req_info, info, ops, !ops->allow_nodev_do); if (ret < 0) goto err_dev; ethnl_init_reply_data(reply_data, ops, req_info->dev); rtnl_lock(); if (req_info->dev) netdev_lock_ops(req_info->dev); ret = ops->prepare_data(req_info, reply_data, info); if (req_info->dev) netdev_unlock_ops(req_info->dev); rtnl_unlock(); if (ret < 0) goto err_dev; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret; ret = -ENOMEM; rskb = ethnl_reply_init(reply_len + ethnl_reply_header_size(), req_info->dev, ops->reply_cmd, ops->hdr_attr, info, &reply_payload); if (!rskb) goto err_cleanup; hdr_len = rskb->len; ret = ops->fill_reply(rskb, req_info, reply_data); if (ret < 0) goto err_msg; WARN_ONCE(rskb->len - hdr_len > reply_len, "ethnl cmd %d: calculated reply length %d, but consumed %d\n", cmd, reply_len, rskb->len - hdr_len); if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(rskb, reply_payload); netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return genlmsg_reply(rskb, info); err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); nlmsg_free(rskb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_dev: netdev_put(req_info->dev, &req_info->dev_tracker); kfree(reply_data); kfree(req_info); return ret; } static int ethnl_default_dump_one(struct sk_buff *skb, struct net_device *dev, const struct ethnl_dump_ctx *ctx, const struct genl_info *info) { void *ehdr; int ret; ehdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, &ethtool_genl_family, NLM_F_MULTI, ctx->ops->reply_cmd); if (!ehdr) return -EMSGSIZE; ethnl_init_reply_data(ctx->reply_data, ctx->ops, dev); rtnl_lock(); netdev_lock_ops(dev); ret = ctx->ops->prepare_data(ctx->req_info, ctx->reply_data, info); netdev_unlock_ops(dev); rtnl_unlock(); if (ret < 0) goto out_cancel; ret = ethnl_fill_reply_header(skb, dev, ctx->ops->hdr_attr); if (ret < 0) goto out; ret = ctx->ops->fill_reply(skb, ctx->req_info, ctx->reply_data); out: if (ctx->ops->cleanup_data) ctx->ops->cleanup_data(ctx->reply_data); out_cancel: ctx->reply_data->dev = NULL; if (ret < 0) genlmsg_cancel(skb, ehdr); else genlmsg_end(skb, ehdr); return ret; } /* Default ->dumpit() handler for GET requests. */ static int ethnl_default_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct net *net = sock_net(skb->sk); netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ctx->pos_ifindex) { netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); ret = ethnl_default_dump_one(skb, dev, ctx, genl_info_dump(cb)); rcu_read_lock(); netdev_put(dev, &dev_tracker); if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) ret = skb->len; break; } ret = 0; } rcu_read_unlock(); return ret; } /* generic ->start() handler for GET requests */ static int ethnl_default_start(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genlmsghdr *ghdr; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); ghdr = nlmsg_data(cb->nlh); ops = ethnl_default_requests[ghdr->cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) return -EOPNOTSUPP; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { /* We ignore device specification in dump requests but as the * same parser as for non-dump (doit) requests is used, it * would take reference to the device if it finds one */ netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } if (ret < 0) goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } /* per-PHY ->start() handler for GET requests */ static int ethnl_perphy_start(struct netlink_callback *cb) { struct ethnl_perphy_dump_ctx *phy_ctx = ethnl_perphy_dump_context(cb); const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ctx = &phy_ctx->ethnl_ctx; struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genlmsghdr *ghdr; int ret; BUILD_BUG_ON(sizeof(*ctx) > sizeof(cb->ctx)); ghdr = nlmsg_data(cb->nlh); ops = ethnl_default_requests[ghdr->cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", ghdr->cmd)) return -EOPNOTSUPP; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return -ENOMEM; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { ret = -ENOMEM; goto free_req_info; } /* Unlike per-dev dump, don't ignore dev. The dump handler * will notice it and dump PHYs from given dev. We only keep track of * the dev's ifindex, .dumpit() will grab and release the netdev itself. */ ret = ethnl_default_parse(req_info, &info->info, ops, false); if (req_info->dev) { phy_ctx->ifindex = req_info->dev->ifindex; netdev_put(req_info->dev, &req_info->dev_tracker); req_info->dev = NULL; } if (ret < 0) goto free_reply_data; ctx->ops = ops; ctx->req_info = req_info; ctx->reply_data = reply_data; ctx->pos_ifindex = 0; return 0; free_reply_data: kfree(reply_data); free_req_info: kfree(req_info); return ret; } static int ethnl_perphy_dump_one_dev(struct sk_buff *skb, struct ethnl_perphy_dump_ctx *ctx, const struct genl_info *info) { struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; struct net_device *dev = ethnl_ctx->req_info->dev; struct phy_device_node *pdn; int ret; if (!dev->link_topo) return 0; xa_for_each_start(&dev->link_topo->phys, ctx->pos_phyindex, pdn, ctx->pos_phyindex) { ethnl_ctx->req_info->phy_index = ctx->pos_phyindex; /* We can re-use the original dump_one as ->prepare_data in * commands use ethnl_req_get_phydev(), which gets the PHY from * the req_info->phy_index */ ret = ethnl_default_dump_one(skb, dev, ethnl_ctx, info); if (ret) return ret; } ctx->pos_phyindex = 0; return 0; } static int ethnl_perphy_dump_all_dev(struct sk_buff *skb, struct ethnl_perphy_dump_ctx *ctx, const struct genl_info *info) { struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; struct net *net = sock_net(skb->sk); netdevice_tracker dev_tracker; struct net_device *dev; int ret = 0; rcu_read_lock(); for_each_netdev_dump(net, dev, ethnl_ctx->pos_ifindex) { netdev_hold(dev, &dev_tracker, GFP_ATOMIC); rcu_read_unlock(); /* per-PHY commands use ethnl_req_get_phydev(), which needs the * net_device in the req_info */ ethnl_ctx->req_info->dev = dev; ret = ethnl_perphy_dump_one_dev(skb, ctx, info); rcu_read_lock(); netdev_put(dev, &dev_tracker); ethnl_ctx->req_info->dev = NULL; if (ret < 0 && ret != -EOPNOTSUPP) { if (likely(skb->len)) ret = skb->len; break; } ret = 0; } rcu_read_unlock(); return ret; } /* per-PHY ->dumpit() handler for GET requests. */ static int ethnl_perphy_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); const struct genl_dumpit_info *info = genl_dumpit_info(cb); struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; int ret = 0; if (ctx->ifindex) { netdevice_tracker dev_tracker; struct net_device *dev; dev = netdev_get_by_index(genl_info_net(&info->info), ctx->ifindex, &dev_tracker, GFP_KERNEL); if (!dev) return -ENODEV; ethnl_ctx->req_info->dev = dev; ret = ethnl_perphy_dump_one_dev(skb, ctx, genl_info_dump(cb)); if (ret < 0 && ret != -EOPNOTSUPP && likely(skb->len)) ret = skb->len; netdev_put(dev, &dev_tracker); } else { ret = ethnl_perphy_dump_all_dev(skb, ctx, genl_info_dump(cb)); } return ret; } /* per-PHY ->done() handler for GET requests */ static int ethnl_perphy_done(struct netlink_callback *cb) { struct ethnl_perphy_dump_ctx *ctx = ethnl_perphy_dump_context(cb); struct ethnl_dump_ctx *ethnl_ctx = &ctx->ethnl_ctx; kfree(ethnl_ctx->reply_data); kfree(ethnl_ctx->req_info); return 0; } /* default ->done() handler for GET requests */ static int ethnl_default_done(struct netlink_callback *cb) { struct ethnl_dump_ctx *ctx = ethnl_dump_context(cb); kfree(ctx->reply_data); kfree(ctx->req_info); return 0; } static int ethnl_default_set_doit(struct sk_buff *skb, struct genl_info *info) { const struct ethnl_request_ops *ops; struct ethnl_req_info req_info = {}; const u8 cmd = info->genlhdr->cmd; struct net_device *dev; int ret; ops = ethnl_default_requests[cmd]; if (WARN_ONCE(!ops, "cmd %u has no ethnl_request_ops\n", cmd)) return -EOPNOTSUPP; if (GENL_REQ_ATTR_CHECK(info, ops->hdr_attr)) return -EINVAL; ret = ethnl_parse_header_dev_get(&req_info, info->attrs[ops->hdr_attr], genl_info_net(info), info->extack, true); if (ret < 0) return ret; if (ops->set_validate) { ret = ops->set_validate(&req_info, info); /* 0 means nothing to do */ if (ret <= 0) goto out_dev; } dev = req_info.dev; rtnl_lock(); netdev_lock_ops(dev); dev->cfg_pending = kmemdup(dev->cfg, sizeof(*dev->cfg), GFP_KERNEL_ACCOUNT); if (!dev->cfg_pending) { ret = -ENOMEM; goto out_tie_cfg; } ret = ethnl_ops_begin(dev); if (ret < 0) goto out_free_cfg; ret = ops->set(&req_info, info); if (ret < 0) goto out_ops; swap(dev->cfg, dev->cfg_pending); if (!ret) goto out_ops; ethtool_notify(dev, ops->set_ntf_cmd, NULL); ret = 0; out_ops: ethnl_ops_complete(dev); out_free_cfg: kfree(dev->cfg_pending); out_tie_cfg: dev->cfg_pending = dev->cfg; netdev_unlock_ops(dev); rtnl_unlock(); out_dev: ethnl_parse_header_dev_put(&req_info); return ret; } static const struct ethnl_request_ops * ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_LINKINFO_NTF] = &ethnl_linkinfo_request_ops, [ETHTOOL_MSG_LINKMODES_NTF] = &ethnl_linkmodes_request_ops, [ETHTOOL_MSG_DEBUG_NTF] = &ethnl_debug_request_ops, [ETHTOOL_MSG_WOL_NTF] = &ethnl_wol_request_ops, [ETHTOOL_MSG_FEATURES_NTF] = &ethnl_features_request_ops, [ETHTOOL_MSG_PRIVFLAGS_NTF] = &ethnl_privflags_request_ops, [ETHTOOL_MSG_RINGS_NTF] = &ethnl_rings_request_ops, [ETHTOOL_MSG_CHANNELS_NTF] = &ethnl_channels_request_ops, [ETHTOOL_MSG_COALESCE_NTF] = &ethnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = &ethnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = &ethnl_eee_request_ops, [ETHTOOL_MSG_FEC_NTF] = &ethnl_fec_request_ops, [ETHTOOL_MSG_MODULE_NTF] = &ethnl_module_request_ops, [ETHTOOL_MSG_PLCA_NTF] = &ethnl_plca_cfg_request_ops, [ETHTOOL_MSG_MM_NTF] = &ethnl_mm_request_ops, }; /* default notification handler */ static void ethnl_default_notify(struct net_device *dev, unsigned int cmd, const void *data) { struct ethnl_reply_data *reply_data; const struct ethnl_request_ops *ops; struct ethnl_req_info *req_info; struct genl_info info; struct sk_buff *skb; void *reply_payload; int reply_len; int ret; genl_info_init_ntf(&info, &ethtool_genl_family, cmd); if (WARN_ONCE(cmd > ETHTOOL_MSG_KERNEL_MAX || !ethnl_default_notify_ops[cmd], "unexpected notification type %u\n", cmd)) return; ops = ethnl_default_notify_ops[cmd]; req_info = kzalloc(ops->req_info_size, GFP_KERNEL); if (!req_info) return; reply_data = kmalloc(ops->reply_data_size, GFP_KERNEL); if (!reply_data) { kfree(req_info); return; } req_info->dev = dev; req_info->flags |= ETHTOOL_FLAG_COMPACT_BITSETS; netdev_ops_assert_locked(dev); ethnl_init_reply_data(reply_data, ops, dev); ret = ops->prepare_data(req_info, reply_data, &info); if (ret < 0) goto err_rep; ret = ops->reply_size(req_info, reply_data); if (ret < 0) goto err_cleanup; reply_len = ret + ethnl_reply_header_size(); skb = genlmsg_new(reply_len, GFP_KERNEL); if (!skb) goto err_cleanup; reply_payload = ethnl_bcastmsg_put(skb, cmd); if (!reply_payload) goto err_skb; ret = ethnl_fill_reply_header(skb, dev, ops->hdr_attr); if (ret < 0) goto err_msg; ret = ops->fill_reply(skb, req_info, reply_data); if (ret < 0) goto err_msg; if (ops->cleanup_data) ops->cleanup_data(reply_data); genlmsg_end(skb, reply_payload); kfree(reply_data); kfree(req_info); ethnl_multicast(skb, dev); return; err_msg: WARN_ONCE(ret == -EMSGSIZE, "calculated message payload length (%d) not sufficient\n", reply_len); err_skb: nlmsg_free(skb); err_cleanup: if (ops->cleanup_data) ops->cleanup_data(reply_data); err_rep: kfree(reply_data); kfree(req_info); return; } /* notifications */ typedef void (*ethnl_notify_handler_t)(struct net_device *dev, unsigned int cmd, const void *data); static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_LINKINFO_NTF] = ethnl_default_notify, [ETHTOOL_MSG_LINKMODES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_DEBUG_NTF] = ethnl_default_notify, [ETHTOOL_MSG_WOL_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEATURES_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PRIVFLAGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_RINGS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_CHANNELS_NTF] = ethnl_default_notify, [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MODULE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PLCA_NTF] = ethnl_default_notify, [ETHTOOL_MSG_MM_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) { if (unlikely(!ethnl_ok)) return; ASSERT_RTNL(); if (likely(cmd < ARRAY_SIZE(ethnl_notify_handlers) && ethnl_notify_handlers[cmd])) ethnl_notify_handlers[cmd](dev, cmd, data); else WARN_ONCE(1, "notification %u not implemented (dev=%s)\n", cmd, netdev_name(dev)); } EXPORT_SYMBOL(ethtool_notify); static void ethnl_notify_features(struct netdev_notifier_info *info) { struct net_device *dev = netdev_notifier_info_to_dev(info); ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); } static int ethnl_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netdev_notifier_info *info = ptr; struct netlink_ext_ack *extack; struct net_device *dev; dev = netdev_notifier_info_to_dev(info); extack = netdev_notifier_info_to_extack(info); switch (event) { case NETDEV_FEAT_CHANGE: ethnl_notify_features(ptr); break; case NETDEV_PRE_UP: if (dev->ethtool->module_fw_flash_in_progress) { NL_SET_ERR_MSG(extack, "Can't set port up while flashing module firmware"); return NOTIFY_BAD; } } return NOTIFY_DONE; } static struct notifier_block ethnl_netdev_notifier = { .notifier_call = ethnl_netdev_event, }; /* genetlink setup */ static const struct genl_ops ethtool_genl_ops[] = { { .cmd = ETHTOOL_MSG_STRSET_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_strset_get_policy, .maxattr = ARRAY_SIZE(ethnl_strset_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKINFO_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkinfo_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkinfo_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkmodes_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKMODES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_linkmodes_set_policy, .maxattr = ARRAY_SIZE(ethnl_linkmodes_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_LINKSTATE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_linkstate_get_policy, .maxattr = ARRAY_SIZE(ethnl_linkstate_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_debug_get_policy, .maxattr = ARRAY_SIZE(ethnl_debug_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_DEBUG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_debug_set_policy, .maxattr = ARRAY_SIZE(ethnl_debug_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_wol_get_policy, .maxattr = ARRAY_SIZE(ethnl_wol_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_WOL_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_wol_set_policy, .maxattr = ARRAY_SIZE(ethnl_wol_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_features_get_policy, .maxattr = ARRAY_SIZE(ethnl_features_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEATURES_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_set_features, .policy = ethnl_features_set_policy, .maxattr = ARRAY_SIZE(ethnl_features_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_privflags_get_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PRIVFLAGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_privflags_set_policy, .maxattr = ARRAY_SIZE(ethnl_privflags_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_rings_get_policy, .maxattr = ARRAY_SIZE(ethnl_rings_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_RINGS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_rings_set_policy, .maxattr = ARRAY_SIZE(ethnl_rings_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_channels_get_policy, .maxattr = ARRAY_SIZE(ethnl_channels_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CHANNELS_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_channels_set_policy, .maxattr = ARRAY_SIZE(ethnl_channels_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_coalesce_get_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_COALESCE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_coalesce_set_policy, .maxattr = ARRAY_SIZE(ethnl_coalesce_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_pause_get_policy, .maxattr = ARRAY_SIZE(ethnl_pause_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PAUSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pause_set_policy, .maxattr = ARRAY_SIZE(ethnl_pause_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_eee_get_policy, .maxattr = ARRAY_SIZE(ethnl_eee_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_EEE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_eee_set_policy, .maxattr = ARRAY_SIZE(ethnl_eee_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSINFO_GET, .doit = ethnl_default_doit, .start = ethnl_tsinfo_start, .dumpit = ethnl_tsinfo_dumpit, .done = ethnl_tsinfo_done, .policy = ethnl_tsinfo_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsinfo_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test, .policy = ethnl_cable_test_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_CABLE_TEST_TDR_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_cable_test_tdr, .policy = ethnl_cable_test_tdr_act_policy, .maxattr = ARRAY_SIZE(ethnl_cable_test_tdr_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_TUNNEL_INFO_GET, .doit = ethnl_tunnel_info_doit, .start = ethnl_tunnel_info_start, .dumpit = ethnl_tunnel_info_dumpit, .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_fec_get_policy, .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_FEC_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_fec_set_policy, .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_EEPROM_GET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_eeprom_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_eeprom_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_STATS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_stats_get_policy, .maxattr = ARRAY_SIZE(ethnl_stats_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHC_VCLOCKS_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_phc_vclocks_get_policy, .maxattr = ARRAY_SIZE(ethnl_phc_vclocks_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_module_get_policy, .maxattr = ARRAY_SIZE(ethnl_module_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_GET, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_pse_get_policy, .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PSE_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_pse_set_policy, .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_RSS_GET, .doit = ethnl_default_doit, .start = ethnl_rss_dump_start, .dumpit = ethnl_rss_dumpit, .policy = ethnl_rss_get_policy, .maxattr = ARRAY_SIZE(ethnl_rss_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_CFG, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_plca_get_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_SET_CFG, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_plca_set_cfg_policy, .maxattr = ARRAY_SIZE(ethnl_plca_set_cfg_policy) - 1, }, { .cmd = ETHTOOL_MSG_PLCA_GET_STATUS, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_plca_get_status_policy, .maxattr = ARRAY_SIZE(ethnl_plca_get_status_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_mm_get_policy, .maxattr = ARRAY_SIZE(ethnl_mm_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_MM_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_mm_set_policy, .maxattr = ARRAY_SIZE(ethnl_mm_set_policy) - 1, }, { .cmd = ETHTOOL_MSG_MODULE_FW_FLASH_ACT, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_act_module_fw_flash, .policy = ethnl_module_fw_flash_act_policy, .maxattr = ARRAY_SIZE(ethnl_module_fw_flash_act_policy) - 1, }, { .cmd = ETHTOOL_MSG_PHY_GET, .doit = ethnl_default_doit, .start = ethnl_perphy_start, .dumpit = ethnl_perphy_dumpit, .done = ethnl_perphy_done, .policy = ethnl_phy_get_policy, .maxattr = ARRAY_SIZE(ethnl_phy_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSCONFIG_GET, .doit = ethnl_default_doit, .start = ethnl_default_start, .dumpit = ethnl_default_dumpit, .done = ethnl_default_done, .policy = ethnl_tsconfig_get_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_get_policy) - 1, }, { .cmd = ETHTOOL_MSG_TSCONFIG_SET, .flags = GENL_UNS_ADMIN_PERM, .doit = ethnl_default_set_doit, .policy = ethnl_tsconfig_set_policy, .maxattr = ARRAY_SIZE(ethnl_tsconfig_set_policy) - 1, }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { [ETHNL_MCGRP_MONITOR] = { .name = ETHTOOL_MCGRP_MONITOR_NAME }, }; static struct genl_family ethtool_genl_family __ro_after_init = { .name = ETHTOOL_GENL_NAME, .version = ETHTOOL_GENL_VERSION, .netnsok = true, .parallel_ops = true, .ops = ethtool_genl_ops, .n_ops = ARRAY_SIZE(ethtool_genl_ops), .resv_start_op = ETHTOOL_MSG_MODULE_GET + 1, .mcgrps = ethtool_nl_mcgrps, .n_mcgrps = ARRAY_SIZE(ethtool_nl_mcgrps), .sock_priv_size = sizeof(struct ethnl_sock_priv), .sock_priv_destroy = ethnl_sock_priv_destroy, }; /* module setup */ static int __init ethnl_init(void) { int ret; ret = genl_register_family(&ethtool_genl_family); if (WARN(ret < 0, "ethtool: genetlink family registration failed")) return ret; ethnl_ok = true; ret = register_netdevice_notifier(&ethnl_netdev_notifier); WARN(ret < 0, "ethtool: net device notifier registration failed"); return ret; } subsys_initcall(ethnl_init);
240 7 2078 2483 1584 2055 2055 1155 1155 9954 658 1148 2160 154 342 343 343 63 63 62 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_MAPLE_TREE_H #define _LINUX_MAPLE_TREE_H /* * Maple Tree - An RCU-safe adaptive tree for storing ranges * Copyright (c) 2018-2022 Oracle * Authors: Liam R. Howlett <Liam.Howlett@Oracle.com> * Matthew Wilcox <willy@infradead.org> */ #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> /* #define CONFIG_MAPLE_RCU_DISABLED */ /* * Allocated nodes are mutable until they have been inserted into the tree, * at which time they cannot change their type until they have been removed * from the tree and an RCU grace period has passed. * * Removed nodes have their ->parent set to point to themselves. RCU readers * check ->parent before relying on the value that they loaded from the * slots array. This lets us reuse the slots array for the RCU head. * * Nodes in the tree point to their parent unless bit 0 is set. */ #if defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) /* 64bit sizes */ #define MAPLE_NODE_SLOTS 31 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 16 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 10 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 1) #else /* 32bit sizes */ #define MAPLE_NODE_SLOTS 63 /* 256 bytes including ->parent */ #define MAPLE_RANGE64_SLOTS 32 /* 256 bytes */ #define MAPLE_ARANGE64_SLOTS 21 /* 240 bytes */ #define MAPLE_ALLOC_SLOTS (MAPLE_NODE_SLOTS - 2) #endif /* defined(CONFIG_64BIT) || defined(BUILD_VDSO32_64) */ #define MAPLE_NODE_MASK 255UL /* * The node->parent of the root node has bit 0 set and the rest of the pointer * is a pointer to the tree itself. No more bits are available in this pointer * (on m68k, the data structure may only be 2-byte aligned). * * Internal non-root nodes can only have maple_range_* nodes as parents. The * parent pointer is 256B aligned like all other tree nodes. When storing a 32 * or 64 bit values, the offset can fit into 4 bits. The 16 bit values need an * extra bit to store the offset. This extra bit comes from a reuse of the last * bit in the node type. This is possible by using bit 1 to indicate if bit 2 * is part of the type or the slot. * * Once the type is decided, the decision of an allocation range type or a * range type is done by examining the immutable tree flag for the * MT_FLAGS_ALLOC_RANGE flag. * * Node types: * 0x??1 = Root * 0x?00 = 16 bit nodes * 0x010 = 32 bit nodes * 0x110 = 64 bit nodes * * Slot size and location in the parent pointer: * type : slot location * 0x??1 : Root * 0x?00 : 16 bit values, type in 0-1, slot in 2-6 * 0x010 : 32 bit values, type in 0-2, slot in 3-6 * 0x110 : 64 bit values, type in 0-2, slot in 3-6 */ /* * This metadata is used to optimize the gap updating code and in reverse * searching for gaps or any other code that needs to find the end of the data. */ struct maple_metadata { unsigned char end; unsigned char gap; }; /* * Leaf nodes do not store pointers to nodes, they store user data. Users may * store almost any bit pattern. As noted above, the optimisation of storing an * entry at 0 in the root pointer cannot be done for data which have the bottom * two bits set to '10'. We also reserve values with the bottom two bits set to * '10' which are below 4096 (ie 2, 6, 10 .. 4094) for internal use. Some APIs * return errnos as a negative errno shifted right by two bits and the bottom * two bits set to '10', and while choosing to store these values in the array * is not an error, it may lead to confusion if you're testing for an error with * mas_is_err(). * * Non-leaf nodes store the type of the node pointed to (enum maple_type in bits * 3-6), bit 2 is reserved. That leaves bits 0-1 unused for now. * * In regular B-Tree terms, pivots are called keys. The term pivot is used to * indicate that the tree is specifying ranges, Pivots may appear in the * subtree with an entry attached to the value whereas keys are unique to a * specific position of a B-tree. Pivot values are inclusive of the slot with * the same index. */ struct maple_range_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_RANGE64_SLOTS - 1]; union { void __rcu *slot[MAPLE_RANGE64_SLOTS]; struct { void __rcu *pad[MAPLE_RANGE64_SLOTS - 1]; struct maple_metadata meta; }; }; }; /* * At tree creation time, the user can specify that they're willing to trade off * storing fewer entries in a tree in return for storing more information in * each node. * * The maple tree supports recording the largest range of NULL entries available * in this node, also called gaps. This optimises the tree for allocating a * range. */ struct maple_arange_64 { struct maple_pnode *parent; unsigned long pivot[MAPLE_ARANGE64_SLOTS - 1]; void __rcu *slot[MAPLE_ARANGE64_SLOTS]; unsigned long gap[MAPLE_ARANGE64_SLOTS]; struct maple_metadata meta; }; struct maple_alloc { unsigned long total; unsigned char node_count; unsigned int request_count; struct maple_alloc *slot[MAPLE_ALLOC_SLOTS]; }; struct maple_topiary { struct maple_pnode *parent; struct maple_enode *next; /* Overlaps the pivot */ }; enum maple_type { maple_dense, maple_leaf_64, maple_range_64, maple_arange_64, }; enum store_type { wr_invalid, wr_new_root, wr_store_root, wr_exact_fit, wr_spanning_store, wr_split_store, wr_rebalance, wr_append, wr_node_store, wr_slot_store, }; /** * DOC: Maple tree flags * * * MT_FLAGS_ALLOC_RANGE - Track gaps in this tree * * MT_FLAGS_USE_RCU - Operate in RCU mode * * MT_FLAGS_HEIGHT_OFFSET - The position of the tree height in the flags * * MT_FLAGS_HEIGHT_MASK - The mask for the maple tree height value * * MT_FLAGS_LOCK_MASK - How the mt_lock is used * * MT_FLAGS_LOCK_IRQ - Acquired irq-safe * * MT_FLAGS_LOCK_BH - Acquired bh-safe * * MT_FLAGS_LOCK_EXTERN - mt_lock is not used * * MAPLE_HEIGHT_MAX The largest height that can be stored */ #define MT_FLAGS_ALLOC_RANGE 0x01 #define MT_FLAGS_USE_RCU 0x02 #define MT_FLAGS_HEIGHT_OFFSET 0x02 #define MT_FLAGS_HEIGHT_MASK 0x7C #define MT_FLAGS_LOCK_MASK 0x300 #define MT_FLAGS_LOCK_IRQ 0x100 #define MT_FLAGS_LOCK_BH 0x200 #define MT_FLAGS_LOCK_EXTERN 0x300 #define MT_FLAGS_ALLOC_WRAPPED 0x0800 #define MAPLE_HEIGHT_MAX 31 #define MAPLE_NODE_TYPE_MASK 0x0F #define MAPLE_NODE_TYPE_SHIFT 0x03 #define MAPLE_RESERVED_RANGE 4096 #ifdef CONFIG_LOCKDEP typedef struct lockdep_map *lockdep_map_p; #define mt_lock_is_held(mt) \ (!(mt)->ma_external_lock || lock_is_held((mt)->ma_external_lock)) #define mt_write_lock_is_held(mt) \ (!(mt)->ma_external_lock || \ lock_is_held_type((mt)->ma_external_lock, 0)) #define mt_set_external_lock(mt, lock) \ (mt)->ma_external_lock = &(lock)->dep_map #define mt_on_stack(mt) (mt).ma_external_lock = NULL #else typedef struct { /* nothing */ } lockdep_map_p; #define mt_lock_is_held(mt) 1 #define mt_write_lock_is_held(mt) 1 #define mt_set_external_lock(mt, lock) do { } while (0) #define mt_on_stack(mt) do { } while (0) #endif /* * If the tree contains a single entry at index 0, it is usually stored in * tree->ma_root. To optimise for the page cache, an entry which ends in '00', * '01' or '11' is stored in the root, but an entry which ends in '10' will be * stored in a node. Bits 3-6 are used to store enum maple_type. * * The flags are used both to store some immutable information about this tree * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ struct maple_tree { union { spinlock_t ma_lock; lockdep_map_p ma_external_lock; }; unsigned int ma_flags; void __rcu *ma_root; }; /** * MTREE_INIT() - Initialize a maple tree * @name: The maple tree name * @__flags: The maple tree flags * */ #define MTREE_INIT(name, __flags) { \ .ma_lock = __SPIN_LOCK_UNLOCKED((name).ma_lock), \ .ma_flags = __flags, \ .ma_root = NULL, \ } /** * MTREE_INIT_EXT() - Initialize a maple tree with an external lock. * @name: The tree name * @__flags: The maple tree flags * @__lock: The external lock */ #ifdef CONFIG_LOCKDEP #define MTREE_INIT_EXT(name, __flags, __lock) { \ .ma_external_lock = &(__lock).dep_map, \ .ma_flags = (__flags), \ .ma_root = NULL, \ } #else #define MTREE_INIT_EXT(name, __flags, __lock) MTREE_INIT(name, __flags) #endif #define DEFINE_MTREE(name) \ struct maple_tree name = MTREE_INIT(name, 0) #define mtree_lock(mt) spin_lock((&(mt)->ma_lock)) #define mtree_lock_nested(mas, subclass) \ spin_lock_nested((&(mt)->ma_lock), subclass) #define mtree_unlock(mt) spin_unlock((&(mt)->ma_lock)) /* * The Maple Tree squeezes various bits in at various points which aren't * necessarily obvious. Usually, this is done by observing that pointers are * N-byte aligned and thus the bottom log_2(N) bits are available for use. We * don't use the high bits of pointers to store additional information because * we don't know what bits are unused on any given architecture. * * Nodes are 256 bytes in size and are also aligned to 256 bytes, giving us 8 * low bits for our own purposes. Nodes are currently of 4 types: * 1. Single pointer (Range is 0-0) * 2. Non-leaf Allocation Range nodes * 3. Non-leaf Range nodes * 4. Leaf Range nodes All nodes consist of a number of node slots, * pivots, and a parent pointer. */ struct maple_node { union { struct { struct maple_pnode *parent; void __rcu *slot[MAPLE_NODE_SLOTS]; }; struct { void *pad; struct rcu_head rcu; struct maple_enode *piv_parent; unsigned char parent_slot; enum maple_type type; unsigned char slot_len; unsigned int ma_flags; }; struct maple_range_64 mr64; struct maple_arange_64 ma64; struct maple_alloc alloc; }; }; /* * More complicated stores can cause two nodes to become one or three and * potentially alter the height of the tree. Either half of the tree may need * to be rebalanced against the other. The ma_topiary struct is used to track * which nodes have been 'cut' from the tree so that the change can be done * safely at a later date. This is done to support RCU. */ struct ma_topiary { struct maple_enode *head; struct maple_enode *tail; struct maple_tree *mtree; }; void *mtree_load(struct maple_tree *mt, unsigned long index); int mtree_insert(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_alloc_range(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_alloc_cyclic(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); int mtree_alloc_rrange(struct maple_tree *mt, unsigned long *startp, void *entry, unsigned long size, unsigned long min, unsigned long max, gfp_t gfp); int mtree_store_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp); int mtree_store(struct maple_tree *mt, unsigned long index, void *entry, gfp_t gfp); void *mtree_erase(struct maple_tree *mt, unsigned long index); int mtree_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); int __mt_dup(struct maple_tree *mt, struct maple_tree *new, gfp_t gfp); void mtree_destroy(struct maple_tree *mt); void __mt_destroy(struct maple_tree *mt); /** * mtree_empty() - Determine if a tree has any present entries. * @mt: Maple Tree. * * Context: Any context. * Return: %true if the tree contains only NULL pointers. */ static inline bool mtree_empty(const struct maple_tree *mt) { return mt->ma_root == NULL; } /* Advanced API */ /* * Maple State Status * ma_active means the maple state is pointing to a node and offset and can * continue operating on the tree. * ma_start means we have not searched the tree. * ma_root means we have searched the tree and the entry we found lives in * the root of the tree (ie it has index 0, length 1 and is the only entry in * the tree). * ma_none means we have searched the tree and there is no node in the * tree for this entry. For example, we searched for index 1 in an empty * tree. Or we have a tree which points to a full leaf node and we * searched for an entry which is larger than can be contained in that * leaf node. * ma_pause means the data within the maple state may be stale, restart the * operation * ma_overflow means the search has reached the upper limit of the search * ma_underflow means the search has reached the lower limit of the search * ma_error means there was an error, check the node for the error number. */ enum maple_status { ma_active, ma_start, ma_root, ma_none, ma_pause, ma_overflow, ma_underflow, ma_error, }; /* * The maple state is defined in the struct ma_state and is used to keep track * of information during operations, and even between operations when using the * advanced API. * * If state->node has bit 0 set then it references a tree location which is not * a node (eg the root). If bit 1 is set, the rest of the bits are a negative * errno. Bit 2 (the 'unallocated slots' bit) is clear. Bits 3-6 indicate the * node type. * * state->alloc either has a request number of nodes or an allocated node. If * stat->alloc has a requested number of nodes, the first bit will be set (0x1) * and the remaining bits are the value. If state->alloc is a node, then the * node will be of type maple_alloc. maple_alloc has MAPLE_NODE_SLOTS - 1 for * storing more allocated nodes, a total number of nodes allocated, and the * node_count in this node. node_count is the number of allocated nodes in this * node. The scaling beyond MAPLE_NODE_SLOTS - 1 is handled by storing further * nodes into state->alloc->slot[0]'s node. Nodes are taken from state->alloc * by removing a node from the state->alloc node until state->alloc->node_count * is 1, when state->alloc is returned and the state->alloc->slot[0] is promoted * to state->alloc. Nodes are pushed onto state->alloc by putting the current * state->alloc into the pushed node's slot[0]. * * The state also contains the implied min/max of the state->node, the depth of * this search, and the offset. The implied min/max are either from the parent * node or are 0-oo for the root node. The depth is incremented or decremented * every time a node is walked down or up. The offset is the slot/pivot of * interest in the node - either for reading or writing. * * When returning a value the maple state index and last respectively contain * the start and end of the range for the entry. Ranges are inclusive in the * Maple Tree. * * The status of the state is used to determine how the next action should treat * the state. For instance, if the status is ma_start then the next action * should start at the root of the tree and walk down. If the status is * ma_pause then the node may be stale data and should be discarded. If the * status is ma_overflow, then the last action hit the upper limit. * */ struct ma_state { struct maple_tree *tree; /* The tree we're operating in */ unsigned long index; /* The index we're operating on - range start */ unsigned long last; /* The last index we're operating on - range end */ struct maple_enode *node; /* The node containing this entry */ unsigned long min; /* The minimum index of this node - implied pivot min */ unsigned long max; /* The maximum index of this node - implied pivot max */ struct maple_alloc *alloc; /* Allocated nodes for this operation */ enum maple_status status; /* The status of the state (active, start, none, etc) */ unsigned char depth; /* depth of tree descent during write */ unsigned char offset; unsigned char mas_flags; unsigned char end; /* The end of the node */ enum store_type store_type; /* The type of store needed for this operation */ }; struct ma_wr_state { struct ma_state *mas; struct maple_node *node; /* Decoded mas->node */ unsigned long r_min; /* range min */ unsigned long r_max; /* range max */ enum maple_type type; /* mas->node type */ unsigned char offset_end; /* The offset where the write ends */ unsigned long *pivots; /* mas->node->pivots pointer */ unsigned long end_piv; /* The pivot at the offset end */ void __rcu **slots; /* mas->node->slots pointer */ void *entry; /* The entry to write */ void *content; /* The existing entry that is being overwritten */ }; #define mas_lock(mas) spin_lock(&((mas)->tree->ma_lock)) #define mas_lock_nested(mas, subclass) \ spin_lock_nested(&((mas)->tree->ma_lock), subclass) #define mas_unlock(mas) spin_unlock(&((mas)->tree->ma_lock)) /* * Special values for ma_state.node. * MA_ERROR represents an errno. After dropping the lock and attempting * to resolve the error, the walk would have to be restarted from the * top of the tree as the tree may have been modified. */ #define MA_ERROR(err) \ ((struct maple_enode *)(((unsigned long)err << 2) | 2UL)) #define MA_STATE(name, mt, first, end) \ struct ma_state name = { \ .tree = mt, \ .index = first, \ .last = end, \ .node = NULL, \ .status = ma_start, \ .min = 0, \ .max = ULONG_MAX, \ .alloc = NULL, \ .mas_flags = 0, \ .store_type = wr_invalid, \ } #define MA_WR_STATE(name, ma_state, wr_entry) \ struct ma_wr_state name = { \ .mas = ma_state, \ .content = NULL, \ .entry = wr_entry, \ } #define MA_TOPIARY(name, tree) \ struct ma_topiary name = { \ .head = NULL, \ .tail = NULL, \ .mtree = tree, \ } void *mas_walk(struct ma_state *mas); void *mas_store(struct ma_state *mas, void *entry); void *mas_erase(struct ma_state *mas); int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp); void mas_store_prealloc(struct ma_state *mas, void *entry); void *mas_find(struct ma_state *mas, unsigned long max); void *mas_find_range(struct ma_state *mas, unsigned long max); void *mas_find_rev(struct ma_state *mas, unsigned long min); void *mas_find_range_rev(struct ma_state *mas, unsigned long max); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp); int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, void *entry, unsigned long range_lo, unsigned long range_hi, unsigned long *next, gfp_t gfp); bool mas_nomem(struct ma_state *mas, gfp_t gfp); void mas_pause(struct ma_state *mas); void maple_tree_init(void); void mas_destroy(struct ma_state *mas); int mas_expected_entries(struct ma_state *mas, unsigned long nr_entries); void *mas_prev(struct ma_state *mas, unsigned long min); void *mas_prev_range(struct ma_state *mas, unsigned long max); void *mas_next(struct ma_state *mas, unsigned long max); void *mas_next_range(struct ma_state *mas, unsigned long max); int mas_empty_area(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, */ int mas_empty_area_rev(struct ma_state *mas, unsigned long min, unsigned long max, unsigned long size); static inline void mas_init(struct ma_state *mas, struct maple_tree *tree, unsigned long addr) { memset(mas, 0, sizeof(struct ma_state)); mas->tree = tree; mas->index = mas->last = addr; mas->max = ULONG_MAX; mas->status = ma_start; mas->node = NULL; } static inline bool mas_is_active(struct ma_state *mas) { return mas->status == ma_active; } static inline bool mas_is_err(struct ma_state *mas) { return mas->status == ma_error; } /** * mas_reset() - Reset a Maple Tree operation state. * @mas: Maple Tree operation state. * * Resets the error or walk state of the @mas so future walks of the * array will start from the root. Use this if you have dropped the * lock and want to reuse the ma_state. * * Context: Any context. */ static __always_inline void mas_reset(struct ma_state *mas) { mas->status = ma_start; mas->node = NULL; } /** * mas_for_each() - Iterate over a range of the maple tree. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__max: maximum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) /** * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. * @__mas: Maple Tree operation state (maple_state) * @__entry: Entry retrieved from the tree * @__min: minimum index to retrieve from the tree * * When returned, mas->index and mas->last will hold the entire range for the * entry. * * Note: may return the zero entry. */ #define mas_for_each_rev(__mas, __entry, __min) \ while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, mt_dump_hex, }; extern atomic_t maple_tree_tests_run; extern atomic_t maple_tree_tests_passed; void mt_dump(const struct maple_tree *mt, enum mt_dump_format format); void mas_dump(const struct ma_state *mas); void mas_wr_dump(const struct ma_wr_state *wr_mas); void mt_validate(struct maple_tree *mt); void mt_cache_shrink(void); #define MT_BUG_ON(__tree, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_BUG_ON(__mas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MAS_WR_BUG_ON(__wrmas, __x) do { \ atomic_inc(&maple_tree_tests_run); \ if (__x) { \ pr_info("BUG at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ } while (0) #define MT_WARN_ON(__tree, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mt_dump(__tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WARN_ON(__mas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_dump(__mas); \ mt_dump((__mas)->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #define MAS_WR_WARN_ON(__wrmas, __x) ({ \ int ret = !!(__x); \ atomic_inc(&maple_tree_tests_run); \ if (ret) { \ pr_info("WARN at %s:%d (%u)\n", \ __func__, __LINE__, __x); \ mas_wr_dump(__wrmas); \ mas_dump((__wrmas)->mas); \ mt_dump((__wrmas)->mas->tree, mt_dump_hex); \ pr_info("Pass: %u Run:%u\n", \ atomic_read(&maple_tree_tests_passed), \ atomic_read(&maple_tree_tests_run)); \ dump_stack(); \ } else { \ atomic_inc(&maple_tree_tests_passed); \ } \ unlikely(ret); \ }) #else #define MT_BUG_ON(__tree, __x) BUG_ON(__x) #define MAS_BUG_ON(__mas, __x) BUG_ON(__x) #define MAS_WR_BUG_ON(__mas, __x) BUG_ON(__x) #define MT_WARN_ON(__tree, __x) WARN_ON(__x) #define MAS_WARN_ON(__mas, __x) WARN_ON(__x) #define MAS_WR_WARN_ON(__mas, __x) WARN_ON(__x) #endif /* CONFIG_DEBUG_MAPLE_TREE */ /** * __mas_set_range() - Set up Maple Tree operation state to a sub-range of the * current location. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * set the internal maple state values to a sub-range. * Please use mas_set_range() if you do not know where you are in the tree. */ static inline void __mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { /* Ensure the range starts within the current slot */ MAS_WARN_ON(mas, mas_is_active(mas) && (mas->index > start || mas->last < start)); mas->index = start; mas->last = last; } /** * mas_set_range() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @start: New start of range in the Maple Tree. * @last: New end of range in the Maple Tree. * * Move the operation state to refer to a different range. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set_range(struct ma_state *mas, unsigned long start, unsigned long last) { mas_reset(mas); __mas_set_range(mas, start, last); } /** * mas_set() - Set up Maple Tree operation state for a different index. * @mas: Maple Tree operation state. * @index: New index into the Maple Tree. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see mas_next() * to move to an adjacent index. */ static inline void mas_set(struct ma_state *mas, unsigned long index) { mas_set_range(mas, index, index); } static inline bool mt_external_lock(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_LOCK_MASK) == MT_FLAGS_LOCK_EXTERN; } /** * mt_init_flags() - Initialise an empty maple tree with flags. * @mt: Maple Tree * @flags: maple tree flags. * * If you need to initialise a Maple Tree with special flags (eg, an * allocation tree), use this function. * * Context: Any context. */ static inline void mt_init_flags(struct maple_tree *mt, unsigned int flags) { mt->ma_flags = flags; if (!mt_external_lock(mt)) spin_lock_init(&mt->ma_lock); rcu_assign_pointer(mt->ma_root, NULL); } /** * mt_init() - Initialise an empty maple tree. * @mt: Maple Tree * * An empty Maple Tree. * * Context: Any context. */ static inline void mt_init(struct maple_tree *mt) { mt_init_flags(mt, 0); } static inline bool mt_in_rcu(struct maple_tree *mt) { #ifdef CONFIG_MAPLE_RCU_DISABLED return false; #endif return mt->ma_flags & MT_FLAGS_USE_RCU; } /** * mt_clear_in_rcu() - Switch the tree to non-RCU mode. * @mt: The Maple Tree */ static inline void mt_clear_in_rcu(struct maple_tree *mt) { if (!mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags &= ~MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags &= ~MT_FLAGS_USE_RCU; mtree_unlock(mt); } } /** * mt_set_in_rcu() - Switch the tree to RCU safe mode. * @mt: The Maple Tree */ static inline void mt_set_in_rcu(struct maple_tree *mt) { if (mt_in_rcu(mt)) return; if (mt_external_lock(mt)) { WARN_ON(!mt_lock_is_held(mt)); mt->ma_flags |= MT_FLAGS_USE_RCU; } else { mtree_lock(mt); mt->ma_flags |= MT_FLAGS_USE_RCU; mtree_unlock(mt); } } static inline unsigned int mt_height(const struct maple_tree *mt) { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } void *mt_find(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_find_after(struct maple_tree *mt, unsigned long *index, unsigned long max); void *mt_prev(struct maple_tree *mt, unsigned long index, unsigned long min); void *mt_next(struct maple_tree *mt, unsigned long index, unsigned long max); /** * mt_for_each - Iterate over each entry starting at index until max. * @__tree: The Maple Tree * @__entry: The current entry * @__index: The index to start the search from. Subsequently used as iterator. * @__max: The maximum limit for @index * * This iterator skips all entries, which resolve to a NULL pointer, * e.g. entries which has been reserved with XA_ZERO_ENTRY. */ #define mt_for_each(__tree, __entry, __index, __max) \ for (__entry = mt_find(__tree, &(__index), __max); \ __entry; __entry = mt_find_after(__tree, &(__index), __max)) #endif /*_LINUX_MAPLE_TREE_H */
14 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM mctp #if !defined(_TRACE_MCTP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MCTP_H #include <linux/tracepoint.h> #ifndef __TRACE_MCTP_ENUMS #define __TRACE_MCTP_ENUMS enum { MCTP_TRACE_KEY_TIMEOUT, MCTP_TRACE_KEY_REPLIED, MCTP_TRACE_KEY_INVALIDATED, MCTP_TRACE_KEY_CLOSED, MCTP_TRACE_KEY_DROPPED, }; #endif /* __TRACE_MCTP_ENUMS */ TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_TIMEOUT); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_REPLIED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_INVALIDATED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_CLOSED); TRACE_DEFINE_ENUM(MCTP_TRACE_KEY_DROPPED); TRACE_EVENT(mctp_key_acquire, TP_PROTO(const struct mctp_sk_key *key), TP_ARGS(key), TP_STRUCT__entry( __field(__u8, paddr) __field(__u8, laddr) __field(__u8, tag) ), TP_fast_assign( __entry->paddr = key->peer_addr; __entry->laddr = key->local_addr; __entry->tag = key->tag; ), TP_printk("local %d, peer %d, tag %1x", __entry->laddr, __entry->paddr, __entry->tag ) ); TRACE_EVENT(mctp_key_release, TP_PROTO(const struct mctp_sk_key *key, int reason), TP_ARGS(key, reason), TP_STRUCT__entry( __field(__u8, paddr) __field(__u8, laddr) __field(__u8, tag) __field(int, reason) ), TP_fast_assign( __entry->paddr = key->peer_addr; __entry->laddr = key->local_addr; __entry->tag = key->tag; __entry->reason = reason; ), TP_printk("local %d, peer %d, tag %1x %s", __entry->laddr, __entry->paddr, __entry->tag, __print_symbolic(__entry->reason, { MCTP_TRACE_KEY_TIMEOUT, "timeout" }, { MCTP_TRACE_KEY_REPLIED, "replied" }, { MCTP_TRACE_KEY_INVALIDATED, "invalidated" }, { MCTP_TRACE_KEY_CLOSED, "closed" }, { MCTP_TRACE_KEY_DROPPED, "dropped" }) ) ); #endif #include <trace/define_trace.h>
70 1930 729 1931 3 2 2 2 3 357 90 1094 1092 1090 1090 1088 1094 358 357 1107 49 1 1107 36 1102 1094 1094 4 1094 2 1093 1092 565 563 1 756 618 527 1091 7 4 1091 570 567 547 1084 882 609 1082 201 1082 265 177 1093 4 3 390 1092 398 1036 52 62 1863 1861 1927 1928 97 1922 1922 1928 59 59 2 1925 1922 1281 696 690 1920 31 1899 1129 1158 1925 1146 1109 428 1107 428 1188 352 1924 8 1952 1957 24 1952 1927 1935 13 1936 365 1929 1928 4 1922 2 2 1920 569 1924 111 112 59 59 95 1927 1268 8 1 8 8 1264 31 1568 1949 182 1752 24 1751 1924 1922 1920 1925 597 1925 1923 1927 1922 1951 1932 27 5 1924 1931 1926 1924 1957 1923 1925 1925 1925 1923 368 1910 1890 1897 1890 1898 1898 1889 1867 1867 1869 1896 1959 10 1947 4 1946 1956 1956 1955 1957 4 1954 1953 1955 1954 1 1951 55 1954 1957 1957 1953 1960 1914 12 111 111 13 99 99 111 12 12 12 12 1 11 10 2 10 8 3 10 1 9 3 3 2 1 1 1 10 11 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 // SPDX-License-Identifier: GPL-2.0 /* * Released under the GPLv2 only. */ #include <linux/usb.h> #include <linux/usb/ch9.h> #include <linux/usb/hcd.h> #include <linux/usb/quirks.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/string_choices.h> #include <linux/device.h> #include <asm/byteorder.h> #include "usb.h" #define USB_MAXALTSETTING 128 /* Hard limit */ #define USB_MAXCONFIG 8 /* Arbitrary limit */ static int find_next_descriptor(unsigned char *buffer, int size, int dt1, int dt2, int *num_skipped) { struct usb_descriptor_header *h; int n = 0; unsigned char *buffer0 = buffer; /* Find the next descriptor of type dt1 or dt2 */ while (size > 0) { h = (struct usb_descriptor_header *) buffer; if (h->bDescriptorType == dt1 || h->bDescriptorType == dt2) break; buffer += h->bLength; size -= h->bLength; ++n; } /* Store the number of descriptors skipped and return the * number of bytes skipped */ if (num_skipped) *num_skipped = n; return buffer - buffer0; } static void usb_parse_ssp_isoc_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, unsigned char *buffer, int size) { struct usb_ssp_isoc_ep_comp_descriptor *desc; /* * The SuperSpeedPlus Isoc endpoint companion descriptor immediately * follows the SuperSpeed Endpoint Companion descriptor */ desc = (struct usb_ssp_isoc_ep_comp_descriptor *) buffer; if (desc->bDescriptorType != USB_DT_SSP_ISOC_ENDPOINT_COMP || size < USB_DT_SSP_ISOC_EP_COMP_SIZE) { dev_notice(ddev, "Invalid SuperSpeedPlus isoc endpoint companion" "for config %d interface %d altsetting %d ep %d.\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); return; } memcpy(&ep->ssp_isoc_ep_comp, desc, USB_DT_SSP_ISOC_EP_COMP_SIZE); } static void usb_parse_eusb2_isoc_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, unsigned char *buffer, int size) { struct usb_eusb2_isoc_ep_comp_descriptor *desc; struct usb_descriptor_header *h; /* * eUSB2 isochronous endpoint companion descriptor for this endpoint * shall be declared before the next endpoint or interface descriptor */ while (size >= USB_DT_EUSB2_ISOC_EP_COMP_SIZE) { h = (struct usb_descriptor_header *)buffer; if (h->bDescriptorType == USB_DT_EUSB2_ISOC_ENDPOINT_COMP) { desc = (struct usb_eusb2_isoc_ep_comp_descriptor *)buffer; ep->eusb2_isoc_ep_comp = *desc; return; } if (h->bDescriptorType == USB_DT_ENDPOINT || h->bDescriptorType == USB_DT_INTERFACE) break; buffer += h->bLength; size -= h->bLength; } dev_notice(ddev, "No eUSB2 isoc ep %d companion for config %d interface %d altsetting %d\n", ep->desc.bEndpointAddress, cfgno, inum, asnum); } static void usb_parse_ss_endpoint_companion(struct device *ddev, int cfgno, int inum, int asnum, struct usb_host_endpoint *ep, unsigned char *buffer, int size) { struct usb_ss_ep_comp_descriptor *desc; int max_tx; /* The SuperSpeed endpoint companion descriptor is supposed to * be the first thing immediately following the endpoint descriptor. */ desc = (struct usb_ss_ep_comp_descriptor *) buffer; if (desc->bDescriptorType != USB_DT_SS_ENDPOINT_COMP || size < USB_DT_SS_EP_COMP_SIZE) { dev_notice(ddev, "No SuperSpeed endpoint companion for config %d " " interface %d altsetting %d ep %d: " "using minimum values\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); /* Fill in some default values. * Leave bmAttributes as zero, which will mean no streams for * bulk, and isoc won't support multiple bursts of packets. * With bursts of only one packet, and a Mult of 1, the max * amount of data moved per endpoint service interval is one * packet. */ ep->ss_ep_comp.bLength = USB_DT_SS_EP_COMP_SIZE; ep->ss_ep_comp.bDescriptorType = USB_DT_SS_ENDPOINT_COMP; if (usb_endpoint_xfer_isoc(&ep->desc) || usb_endpoint_xfer_int(&ep->desc)) ep->ss_ep_comp.wBytesPerInterval = ep->desc.wMaxPacketSize; return; } buffer += desc->bLength; size -= desc->bLength; memcpy(&ep->ss_ep_comp, desc, USB_DT_SS_EP_COMP_SIZE); /* Check the various values */ if (usb_endpoint_xfer_control(&ep->desc) && desc->bMaxBurst != 0) { dev_notice(ddev, "Control endpoint with bMaxBurst = %d in " "config %d interface %d altsetting %d ep %d: " "setting to zero\n", desc->bMaxBurst, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bMaxBurst = 0; } else if (desc->bMaxBurst > 15) { dev_notice(ddev, "Endpoint with bMaxBurst = %d in " "config %d interface %d altsetting %d ep %d: " "setting to 15\n", desc->bMaxBurst, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bMaxBurst = 15; } if ((usb_endpoint_xfer_control(&ep->desc) || usb_endpoint_xfer_int(&ep->desc)) && desc->bmAttributes != 0) { dev_notice(ddev, "%s endpoint with bmAttributes = %d in " "config %d interface %d altsetting %d ep %d: " "setting to zero\n", usb_endpoint_xfer_control(&ep->desc) ? "Control" : "Bulk", desc->bmAttributes, cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 0; } else if (usb_endpoint_xfer_bulk(&ep->desc) && desc->bmAttributes > 16) { dev_notice(ddev, "Bulk endpoint with more than 65536 streams in " "config %d interface %d altsetting %d ep %d: " "setting to max\n", cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 16; } else if (usb_endpoint_xfer_isoc(&ep->desc) && !USB_SS_SSP_ISOC_COMP(desc->bmAttributes) && USB_SS_MULT(desc->bmAttributes) > 3) { dev_notice(ddev, "Isoc endpoint has Mult of %d in " "config %d interface %d altsetting %d ep %d: " "setting to 3\n", USB_SS_MULT(desc->bmAttributes), cfgno, inum, asnum, ep->desc.bEndpointAddress); ep->ss_ep_comp.bmAttributes = 2; } if (usb_endpoint_xfer_isoc(&ep->desc)) max_tx = (desc->bMaxBurst + 1) * (USB_SS_MULT(desc->bmAttributes)) * usb_endpoint_maxp(&ep->desc); else if (usb_endpoint_xfer_int(&ep->desc)) max_tx = usb_endpoint_maxp(&ep->desc) * (desc->bMaxBurst + 1); else max_tx = 999999; if (le16_to_cpu(desc->wBytesPerInterval) > max_tx) { dev_notice(ddev, "%s endpoint with wBytesPerInterval of %d in " "config %d interface %d altsetting %d ep %d: " "setting to %d\n", usb_endpoint_xfer_isoc(&ep->desc) ? "Isoc" : "Int", le16_to_cpu(desc->wBytesPerInterval), cfgno, inum, asnum, ep->desc.bEndpointAddress, max_tx); ep->ss_ep_comp.wBytesPerInterval = cpu_to_le16(max_tx); } /* Parse a possible SuperSpeedPlus isoc ep companion descriptor */ if (usb_endpoint_xfer_isoc(&ep->desc) && USB_SS_SSP_ISOC_COMP(desc->bmAttributes)) usb_parse_ssp_isoc_endpoint_companion(ddev, cfgno, inum, asnum, ep, buffer, size); } static const unsigned short low_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 8, [USB_ENDPOINT_XFER_ISOC] = 0, [USB_ENDPOINT_XFER_BULK] = 0, [USB_ENDPOINT_XFER_INT] = 8, }; static const unsigned short full_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 64, [USB_ENDPOINT_XFER_ISOC] = 1023, [USB_ENDPOINT_XFER_BULK] = 64, [USB_ENDPOINT_XFER_INT] = 64, }; static const unsigned short high_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 64, [USB_ENDPOINT_XFER_ISOC] = 1024, /* Bulk should be 512, but some devices use 1024: we will warn below */ [USB_ENDPOINT_XFER_BULK] = 1024, [USB_ENDPOINT_XFER_INT] = 1024, }; static const unsigned short super_speed_maxpacket_maxes[4] = { [USB_ENDPOINT_XFER_CONTROL] = 512, [USB_ENDPOINT_XFER_ISOC] = 1024, [USB_ENDPOINT_XFER_BULK] = 1024, [USB_ENDPOINT_XFER_INT] = 1024, }; static bool endpoint_is_duplicate(struct usb_endpoint_descriptor *e1, struct usb_endpoint_descriptor *e2) { if (e1->bEndpointAddress == e2->bEndpointAddress) return true; if (usb_endpoint_xfer_control(e1) || usb_endpoint_xfer_control(e2)) { if (usb_endpoint_num(e1) == usb_endpoint_num(e2)) return true; } return false; } /* * Check for duplicate endpoint addresses in other interfaces and in the * altsetting currently being parsed. */ static bool config_endpoint_is_duplicate(struct usb_host_config *config, int inum, int asnum, struct usb_endpoint_descriptor *d) { struct usb_endpoint_descriptor *epd; struct usb_interface_cache *intfc; struct usb_host_interface *alt; int i, j, k; for (i = 0; i < config->desc.bNumInterfaces; ++i) { intfc = config->intf_cache[i]; for (j = 0; j < intfc->num_altsetting; ++j) { alt = &intfc->altsetting[j]; if (alt->desc.bInterfaceNumber == inum && alt->desc.bAlternateSetting != asnum) continue; for (k = 0; k < alt->desc.bNumEndpoints; ++k) { epd = &alt->endpoint[k].desc; if (endpoint_is_duplicate(epd, d)) return true; } } } return false; } static int usb_parse_endpoint(struct device *ddev, int cfgno, struct usb_host_config *config, int inum, int asnum, struct usb_host_interface *ifp, int num_ep, unsigned char *buffer, int size) { struct usb_device *udev = to_usb_device(ddev); unsigned char *buffer0 = buffer; struct usb_endpoint_descriptor *d; struct usb_host_endpoint *endpoint; int n, i, j, retval; unsigned int maxp; const unsigned short *maxpacket_maxes; u16 bcdUSB; d = (struct usb_endpoint_descriptor *) buffer; bcdUSB = le16_to_cpu(udev->descriptor.bcdUSB); buffer += d->bLength; size -= d->bLength; if (d->bLength >= USB_DT_ENDPOINT_AUDIO_SIZE) n = USB_DT_ENDPOINT_AUDIO_SIZE; else if (d->bLength >= USB_DT_ENDPOINT_SIZE) n = USB_DT_ENDPOINT_SIZE; else { dev_notice(ddev, "config %d interface %d altsetting %d has an " "invalid endpoint descriptor of length %d, skipping\n", cfgno, inum, asnum, d->bLength); goto skip_to_next_endpoint_or_interface_descriptor; } i = d->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK; if (i == 0) { dev_notice(ddev, "config %d interface %d altsetting %d has an " "invalid descriptor for endpoint zero, skipping\n", cfgno, inum, asnum); goto skip_to_next_endpoint_or_interface_descriptor; } /* Only store as many endpoints as we have room for */ if (ifp->desc.bNumEndpoints >= num_ep) goto skip_to_next_endpoint_or_interface_descriptor; /* Save a copy of the descriptor and use it instead of the original */ endpoint = &ifp->endpoint[ifp->desc.bNumEndpoints]; memcpy(&endpoint->desc, d, n); d = &endpoint->desc; /* Clear the reserved bits in bEndpointAddress */ i = d->bEndpointAddress & (USB_ENDPOINT_DIR_MASK | USB_ENDPOINT_NUMBER_MASK); if (i != d->bEndpointAddress) { dev_notice(ddev, "config %d interface %d altsetting %d has an endpoint descriptor with address 0x%X, changing to 0x%X\n", cfgno, inum, asnum, d->bEndpointAddress, i); endpoint->desc.bEndpointAddress = i; } /* Check for duplicate endpoint addresses */ if (config_endpoint_is_duplicate(config, inum, asnum, d)) { dev_notice(ddev, "config %d interface %d altsetting %d has a duplicate endpoint with address 0x%X, skipping\n", cfgno, inum, asnum, d->bEndpointAddress); goto skip_to_next_endpoint_or_interface_descriptor; } /* Ignore some endpoints */ if (udev->quirks & USB_QUIRK_ENDPOINT_IGNORE) { if (usb_endpoint_is_ignored(udev, ifp, d)) { dev_notice(ddev, "config %d interface %d altsetting %d has an ignored endpoint with address 0x%X, skipping\n", cfgno, inum, asnum, d->bEndpointAddress); goto skip_to_next_endpoint_or_interface_descriptor; } } /* Accept this endpoint */ ++ifp->desc.bNumEndpoints; INIT_LIST_HEAD(&endpoint->urb_list); /* * Fix up bInterval values outside the legal range. * Use 10 or 8 ms if no proper value can be guessed. */ i = 0; /* i = min, j = max, n = default */ j = 255; if (usb_endpoint_xfer_int(d)) { i = 1; switch (udev->speed) { case USB_SPEED_SUPER_PLUS: case USB_SPEED_SUPER: case USB_SPEED_HIGH: /* * Many device manufacturers are using full-speed * bInterval values in high-speed interrupt endpoint * descriptors. Try to fix those and fall back to an * 8-ms default value otherwise. */ n = fls(d->bInterval*8); if (n == 0) n = 7; /* 8 ms = 2^(7-1) uframes */ j = 16; /* * Adjust bInterval for quirked devices. */ /* * This quirk fixes bIntervals reported in ms. */ if (udev->quirks & USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL) { n = clamp(fls(d->bInterval) + 3, i, j); i = j = n; } /* * This quirk fixes bIntervals reported in * linear microframes. */ if (udev->quirks & USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL) { n = clamp(fls(d->bInterval), i, j); i = j = n; } break; default: /* USB_SPEED_FULL or _LOW */ /* * For low-speed, 10 ms is the official minimum. * But some "overclocked" devices might want faster * polling so we'll allow it. */ n = 10; break; } } else if (usb_endpoint_xfer_isoc(d)) { i = 1; j = 16; switch (udev->speed) { case USB_SPEED_HIGH: n = 7; /* 8 ms = 2^(7-1) uframes */ break; default: /* USB_SPEED_FULL */ n = 4; /* 8 ms = 2^(4-1) frames */ break; } } if (d->bInterval < i || d->bInterval > j) { dev_notice(ddev, "config %d interface %d altsetting %d " "endpoint 0x%X has an invalid bInterval %d, " "changing to %d\n", cfgno, inum, asnum, d->bEndpointAddress, d->bInterval, n); endpoint->desc.bInterval = n; } /* Some buggy low-speed devices have Bulk endpoints, which is * explicitly forbidden by the USB spec. In an attempt to make * them usable, we will try treating them as Interrupt endpoints. */ if (udev->speed == USB_SPEED_LOW && usb_endpoint_xfer_bulk(d)) { dev_notice(ddev, "config %d interface %d altsetting %d " "endpoint 0x%X is Bulk; changing to Interrupt\n", cfgno, inum, asnum, d->bEndpointAddress); endpoint->desc.bmAttributes = USB_ENDPOINT_XFER_INT; endpoint->desc.bInterval = 1; if (usb_endpoint_maxp(&endpoint->desc) > 8) endpoint->desc.wMaxPacketSize = cpu_to_le16(8); } /* * Validate the wMaxPacketSize field. * eUSB2 devices (see USB 2.0 Double Isochronous IN ECN 9.6.6 Endpoint) * and devices with isochronous endpoints in altsetting 0 (see USB 2.0 * end of section 5.6.3) have wMaxPacketSize = 0. * So don't warn about those. */ maxp = le16_to_cpu(endpoint->desc.wMaxPacketSize); if (maxp == 0 && bcdUSB != 0x0220 && !(usb_endpoint_xfer_isoc(d) && asnum == 0)) dev_notice(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid wMaxPacketSize 0\n", cfgno, inum, asnum, d->bEndpointAddress); /* Find the highest legal maxpacket size for this endpoint */ i = 0; /* additional transactions per microframe */ switch (udev->speed) { case USB_SPEED_LOW: maxpacket_maxes = low_speed_maxpacket_maxes; break; case USB_SPEED_FULL: maxpacket_maxes = full_speed_maxpacket_maxes; break; case USB_SPEED_HIGH: /* Multiple-transactions bits are allowed only for HS periodic endpoints */ if (usb_endpoint_xfer_int(d) || usb_endpoint_xfer_isoc(d)) { i = maxp & USB_EP_MAXP_MULT_MASK; maxp &= ~i; } fallthrough; default: maxpacket_maxes = high_speed_maxpacket_maxes; break; case USB_SPEED_SUPER: case USB_SPEED_SUPER_PLUS: maxpacket_maxes = super_speed_maxpacket_maxes; break; } j = maxpacket_maxes[usb_endpoint_type(&endpoint->desc)]; if (maxp > j) { dev_notice(ddev, "config %d interface %d altsetting %d endpoint 0x%X has invalid maxpacket %d, setting to %d\n", cfgno, inum, asnum, d->bEndpointAddress, maxp, j); maxp = j; endpoint->desc.wMaxPacketSize = cpu_to_le16(i | maxp); } /* * Some buggy high speed devices have bulk endpoints using * maxpacket sizes other than 512. High speed HCDs may not * be able to handle that particular bug, so let's warn... */ if (udev->speed == USB_SPEED_HIGH && usb_endpoint_xfer_bulk(d)) { if (maxp != 512) dev_notice(ddev, "config %d interface %d altsetting %d " "bulk endpoint 0x%X has invalid maxpacket %d\n", cfgno, inum, asnum, d->bEndpointAddress, maxp); } /* Parse a possible eUSB2 periodic endpoint companion descriptor */ if (bcdUSB == 0x0220 && d->wMaxPacketSize == 0 && (usb_endpoint_xfer_isoc(d) || usb_endpoint_xfer_int(d))) usb_parse_eusb2_isoc_endpoint_companion(ddev, cfgno, inum, asnum, endpoint, buffer, size); /* Parse a possible SuperSpeed endpoint companion descriptor */ if (udev->speed >= USB_SPEED_SUPER) usb_parse_ss_endpoint_companion(ddev, cfgno, inum, asnum, endpoint, buffer, size); /* Skip over any Class Specific or Vendor Specific descriptors; * find the next endpoint or interface descriptor */ endpoint->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, &n); endpoint->extralen = i; retval = buffer - buffer0 + i; if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, str_plural(n), "endpoint"); return retval; skip_to_next_endpoint_or_interface_descriptor: i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, NULL); return buffer - buffer0 + i; } void usb_release_interface_cache(struct kref *ref) { struct usb_interface_cache *intfc = ref_to_usb_interface_cache(ref); int j; for (j = 0; j < intfc->num_altsetting; j++) { struct usb_host_interface *alt = &intfc->altsetting[j]; kfree(alt->endpoint); kfree(alt->string); } kfree(intfc); } static int usb_parse_interface(struct device *ddev, int cfgno, struct usb_host_config *config, unsigned char *buffer, int size, u8 inums[], u8 nalts[]) { unsigned char *buffer0 = buffer; struct usb_interface_descriptor *d; int inum, asnum; struct usb_interface_cache *intfc; struct usb_host_interface *alt; int i, n; int len, retval; int num_ep, num_ep_orig; d = (struct usb_interface_descriptor *) buffer; buffer += d->bLength; size -= d->bLength; if (d->bLength < USB_DT_INTERFACE_SIZE) goto skip_to_next_interface_descriptor; /* Which interface entry is this? */ intfc = NULL; inum = d->bInterfaceNumber; for (i = 0; i < config->desc.bNumInterfaces; ++i) { if (inums[i] == inum) { intfc = config->intf_cache[i]; break; } } if (!intfc || intfc->num_altsetting >= nalts[i]) goto skip_to_next_interface_descriptor; /* Check for duplicate altsetting entries */ asnum = d->bAlternateSetting; for ((i = 0, alt = &intfc->altsetting[0]); i < intfc->num_altsetting; (++i, ++alt)) { if (alt->desc.bAlternateSetting == asnum) { dev_notice(ddev, "Duplicate descriptor for config %d " "interface %d altsetting %d, skipping\n", cfgno, inum, asnum); goto skip_to_next_interface_descriptor; } } ++intfc->num_altsetting; memcpy(&alt->desc, d, USB_DT_INTERFACE_SIZE); /* Skip over any Class Specific or Vendor Specific descriptors; * find the first endpoint or interface descriptor */ alt->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_ENDPOINT, USB_DT_INTERFACE, &n); alt->extralen = i; if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, str_plural(n), "interface"); buffer += i; size -= i; /* Allocate space for the right(?) number of endpoints */ num_ep = num_ep_orig = alt->desc.bNumEndpoints; alt->desc.bNumEndpoints = 0; /* Use as a counter */ if (num_ep > USB_MAXENDPOINTS) { dev_notice(ddev, "too many endpoints for config %d interface %d " "altsetting %d: %d, using maximum allowed: %d\n", cfgno, inum, asnum, num_ep, USB_MAXENDPOINTS); num_ep = USB_MAXENDPOINTS; } if (num_ep > 0) { /* Can't allocate 0 bytes */ len = sizeof(struct usb_host_endpoint) * num_ep; alt->endpoint = kzalloc(len, GFP_KERNEL); if (!alt->endpoint) return -ENOMEM; } /* Parse all the endpoint descriptors */ n = 0; while (size > 0) { if (((struct usb_descriptor_header *) buffer)->bDescriptorType == USB_DT_INTERFACE) break; retval = usb_parse_endpoint(ddev, cfgno, config, inum, asnum, alt, num_ep, buffer, size); if (retval < 0) return retval; ++n; buffer += retval; size -= retval; } if (n != num_ep_orig) dev_notice(ddev, "config %d interface %d altsetting %d has %d " "endpoint descriptor%s, different from the interface " "descriptor's value: %d\n", cfgno, inum, asnum, n, str_plural(n), num_ep_orig); return buffer - buffer0; skip_to_next_interface_descriptor: i = find_next_descriptor(buffer, size, USB_DT_INTERFACE, USB_DT_INTERFACE, NULL); return buffer - buffer0 + i; } static int usb_parse_configuration(struct usb_device *dev, int cfgidx, struct usb_host_config *config, unsigned char *buffer, int size) { struct device *ddev = &dev->dev; unsigned char *buffer0 = buffer; int cfgno; int nintf, nintf_orig; int i, j, n; struct usb_interface_cache *intfc; unsigned char *buffer2; int size2; struct usb_descriptor_header *header; int retval; u8 inums[USB_MAXINTERFACES], nalts[USB_MAXINTERFACES]; unsigned iad_num = 0; memcpy(&config->desc, buffer, USB_DT_CONFIG_SIZE); nintf = nintf_orig = config->desc.bNumInterfaces; config->desc.bNumInterfaces = 0; // Adjusted later if (config->desc.bDescriptorType != USB_DT_CONFIG || config->desc.bLength < USB_DT_CONFIG_SIZE || config->desc.bLength > size) { dev_notice(ddev, "invalid descriptor for config index %d: " "type = 0x%X, length = %d\n", cfgidx, config->desc.bDescriptorType, config->desc.bLength); return -EINVAL; } cfgno = config->desc.bConfigurationValue; buffer += config->desc.bLength; size -= config->desc.bLength; if (nintf > USB_MAXINTERFACES) { dev_notice(ddev, "config %d has too many interfaces: %d, " "using maximum allowed: %d\n", cfgno, nintf, USB_MAXINTERFACES); nintf = USB_MAXINTERFACES; } /* Go through the descriptors, checking their length and counting the * number of altsettings for each interface */ n = 0; for ((buffer2 = buffer, size2 = size); size2 > 0; (buffer2 += header->bLength, size2 -= header->bLength)) { if (size2 < sizeof(struct usb_descriptor_header)) { dev_notice(ddev, "config %d descriptor has %d excess " "byte%s, ignoring\n", cfgno, size2, str_plural(size2)); break; } header = (struct usb_descriptor_header *) buffer2; if ((header->bLength > size2) || (header->bLength < 2)) { dev_notice(ddev, "config %d has an invalid descriptor " "of length %d, skipping remainder of the config\n", cfgno, header->bLength); break; } if (header->bDescriptorType == USB_DT_INTERFACE) { struct usb_interface_descriptor *d; int inum; d = (struct usb_interface_descriptor *) header; if (d->bLength < USB_DT_INTERFACE_SIZE) { dev_notice(ddev, "config %d has an invalid " "interface descriptor of length %d, " "skipping\n", cfgno, d->bLength); continue; } inum = d->bInterfaceNumber; if ((dev->quirks & USB_QUIRK_HONOR_BNUMINTERFACES) && n >= nintf_orig) { dev_notice(ddev, "config %d has more interface " "descriptors, than it declares in " "bNumInterfaces, ignoring interface " "number: %d\n", cfgno, inum); continue; } if (inum >= nintf_orig) dev_notice(ddev, "config %d has an invalid " "interface number: %d but max is %d\n", cfgno, inum, nintf_orig - 1); /* Have we already encountered this interface? * Count its altsettings */ for (i = 0; i < n; ++i) { if (inums[i] == inum) break; } if (i < n) { if (nalts[i] < 255) ++nalts[i]; } else if (n < USB_MAXINTERFACES) { inums[n] = inum; nalts[n] = 1; ++n; } } else if (header->bDescriptorType == USB_DT_INTERFACE_ASSOCIATION) { struct usb_interface_assoc_descriptor *d; d = (struct usb_interface_assoc_descriptor *)header; if (d->bLength < USB_DT_INTERFACE_ASSOCIATION_SIZE) { dev_notice(ddev, "config %d has an invalid interface association descriptor of length %d, skipping\n", cfgno, d->bLength); continue; } if (iad_num == USB_MAXIADS) { dev_notice(ddev, "found more Interface " "Association Descriptors " "than allocated for in " "configuration %d\n", cfgno); } else { config->intf_assoc[iad_num] = d; iad_num++; } } else if (header->bDescriptorType == USB_DT_DEVICE || header->bDescriptorType == USB_DT_CONFIG) dev_notice(ddev, "config %d contains an unexpected " "descriptor of type 0x%X, skipping\n", cfgno, header->bDescriptorType); } /* for ((buffer2 = buffer, size2 = size); ...) */ size = buffer2 - buffer; config->desc.wTotalLength = cpu_to_le16(buffer2 - buffer0); if (n != nintf) dev_notice(ddev, "config %d has %d interface%s, different from " "the descriptor's value: %d\n", cfgno, n, str_plural(n), nintf_orig); else if (n == 0) dev_notice(ddev, "config %d has no interfaces?\n", cfgno); config->desc.bNumInterfaces = nintf = n; /* Check for missing interface numbers */ for (i = 0; i < nintf; ++i) { for (j = 0; j < nintf; ++j) { if (inums[j] == i) break; } if (j >= nintf) dev_notice(ddev, "config %d has no interface number " "%d\n", cfgno, i); } /* Allocate the usb_interface_caches and altsetting arrays */ for (i = 0; i < nintf; ++i) { j = nalts[i]; if (j > USB_MAXALTSETTING) { dev_notice(ddev, "too many alternate settings for " "config %d interface %d: %d, " "using maximum allowed: %d\n", cfgno, inums[i], j, USB_MAXALTSETTING); nalts[i] = j = USB_MAXALTSETTING; } intfc = kzalloc(struct_size(intfc, altsetting, j), GFP_KERNEL); config->intf_cache[i] = intfc; if (!intfc) return -ENOMEM; kref_init(&intfc->ref); } /* FIXME: parse the BOS descriptor */ /* Skip over any Class Specific or Vendor Specific descriptors; * find the first interface descriptor */ config->extra = buffer; i = find_next_descriptor(buffer, size, USB_DT_INTERFACE, USB_DT_INTERFACE, &n); config->extralen = i; if (n > 0) dev_dbg(ddev, "skipped %d descriptor%s after %s\n", n, str_plural(n), "configuration"); buffer += i; size -= i; /* Parse all the interface/altsetting descriptors */ while (size > 0) { retval = usb_parse_interface(ddev, cfgno, config, buffer, size, inums, nalts); if (retval < 0) return retval; buffer += retval; size -= retval; } /* Check for missing altsettings */ for (i = 0; i < nintf; ++i) { intfc = config->intf_cache[i]; for (j = 0; j < intfc->num_altsetting; ++j) { for (n = 0; n < intfc->num_altsetting; ++n) { if (intfc->altsetting[n].desc. bAlternateSetting == j) break; } if (n >= intfc->num_altsetting) dev_notice(ddev, "config %d interface %d has no " "altsetting %d\n", cfgno, inums[i], j); } } return 0; } /* hub-only!! ... and only exported for reset/reinit path. * otherwise used internally on disconnect/destroy path */ void usb_destroy_configuration(struct usb_device *dev) { int c, i; if (!dev->config) return; if (dev->rawdescriptors) { for (i = 0; i < dev->descriptor.bNumConfigurations; i++) kfree(dev->rawdescriptors[i]); kfree(dev->rawdescriptors); dev->rawdescriptors = NULL; } for (c = 0; c < dev->descriptor.bNumConfigurations; c++) { struct usb_host_config *cf = &dev->config[c]; kfree(cf->string); for (i = 0; i < cf->desc.bNumInterfaces; i++) { if (cf->intf_cache[i]) kref_put(&cf->intf_cache[i]->ref, usb_release_interface_cache); } } kfree(dev->config); dev->config = NULL; } /* * Get the USB config descriptors, cache and parse'em * * hub-only!! ... and only in reset path, or usb_new_device() * (used by real hubs and virtual root hubs) */ int usb_get_configuration(struct usb_device *dev) { struct device *ddev = &dev->dev; int ncfg = dev->descriptor.bNumConfigurations; unsigned int cfgno, length; unsigned char *bigbuffer; struct usb_config_descriptor *desc; int result; if (ncfg > USB_MAXCONFIG) { dev_notice(ddev, "too many configurations: %d, " "using maximum allowed: %d\n", ncfg, USB_MAXCONFIG); dev->descriptor.bNumConfigurations = ncfg = USB_MAXCONFIG; } if (ncfg < 1) { dev_err(ddev, "no configurations\n"); return -EINVAL; } length = ncfg * sizeof(struct usb_host_config); dev->config = kzalloc(length, GFP_KERNEL); if (!dev->config) return -ENOMEM; length = ncfg * sizeof(char *); dev->rawdescriptors = kzalloc(length, GFP_KERNEL); if (!dev->rawdescriptors) return -ENOMEM; desc = kmalloc(USB_DT_CONFIG_SIZE, GFP_KERNEL); if (!desc) return -ENOMEM; for (cfgno = 0; cfgno < ncfg; cfgno++) { /* We grab just the first descriptor so we know how long * the whole configuration is */ result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, desc, USB_DT_CONFIG_SIZE); if (result < 0) { dev_err(ddev, "unable to read config index %d " "descriptor/%s: %d\n", cfgno, "start", result); if (result != -EPIPE) goto err; dev_notice(ddev, "chopping to %d config(s)\n", cfgno); dev->descriptor.bNumConfigurations = cfgno; break; } else if (result < 4) { dev_err(ddev, "config index %d descriptor too short " "(expected %i, got %i)\n", cfgno, USB_DT_CONFIG_SIZE, result); result = -EINVAL; goto err; } length = max_t(int, le16_to_cpu(desc->wTotalLength), USB_DT_CONFIG_SIZE); /* Now that we know the length, get the whole thing */ bigbuffer = kmalloc(length, GFP_KERNEL); if (!bigbuffer) { result = -ENOMEM; goto err; } if (dev->quirks & USB_QUIRK_DELAY_INIT) msleep(200); result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, bigbuffer, length); if (result < 0) { dev_err(ddev, "unable to read config index %d " "descriptor/%s\n", cfgno, "all"); kfree(bigbuffer); goto err; } if (result < length) { dev_notice(ddev, "config index %d descriptor too short " "(expected %i, got %i)\n", cfgno, length, result); length = result; } dev->rawdescriptors[cfgno] = bigbuffer; result = usb_parse_configuration(dev, cfgno, &dev->config[cfgno], bigbuffer, length); if (result < 0) { ++cfgno; goto err; } } err: kfree(desc); dev->descriptor.bNumConfigurations = cfgno; return result; } void usb_release_bos_descriptor(struct usb_device *dev) { if (dev->bos) { kfree(dev->bos->desc); kfree(dev->bos); dev->bos = NULL; } } static const __u8 bos_desc_len[256] = { [USB_CAP_TYPE_WIRELESS_USB] = USB_DT_USB_WIRELESS_CAP_SIZE, [USB_CAP_TYPE_EXT] = USB_DT_USB_EXT_CAP_SIZE, [USB_SS_CAP_TYPE] = USB_DT_USB_SS_CAP_SIZE, [USB_SSP_CAP_TYPE] = USB_DT_USB_SSP_CAP_SIZE(1), [CONTAINER_ID_TYPE] = USB_DT_USB_SS_CONTN_ID_SIZE, [USB_PTM_CAP_TYPE] = USB_DT_USB_PTM_ID_SIZE, }; /* Get BOS descriptor set */ int usb_get_bos_descriptor(struct usb_device *dev) { struct device *ddev = &dev->dev; struct usb_bos_descriptor *bos; struct usb_dev_cap_header *cap; struct usb_ssp_cap_descriptor *ssp_cap; unsigned char *buffer, *buffer0; int length, total_len, num, i, ssac; __u8 cap_type; int ret; bos = kzalloc(sizeof(*bos), GFP_KERNEL); if (!bos) return -ENOMEM; /* Get BOS descriptor */ ret = usb_get_descriptor(dev, USB_DT_BOS, 0, bos, USB_DT_BOS_SIZE); if (ret < USB_DT_BOS_SIZE || bos->bLength < USB_DT_BOS_SIZE) { dev_notice(ddev, "unable to get BOS descriptor or descriptor too short\n"); if (ret >= 0) ret = -ENOMSG; kfree(bos); return ret; } length = bos->bLength; total_len = le16_to_cpu(bos->wTotalLength); num = bos->bNumDeviceCaps; kfree(bos); if (total_len < length) return -EINVAL; dev->bos = kzalloc(sizeof(*dev->bos), GFP_KERNEL); if (!dev->bos) return -ENOMEM; /* Now let's get the whole BOS descriptor set */ buffer = kzalloc(total_len, GFP_KERNEL); if (!buffer) { ret = -ENOMEM; goto err; } dev->bos->desc = (struct usb_bos_descriptor *)buffer; ret = usb_get_descriptor(dev, USB_DT_BOS, 0, buffer, total_len); if (ret < total_len) { dev_notice(ddev, "unable to get BOS descriptor set\n"); if (ret >= 0) ret = -ENOMSG; goto err; } buffer0 = buffer; total_len -= length; buffer += length; for (i = 0; i < num; i++) { cap = (struct usb_dev_cap_header *)buffer; if (total_len < sizeof(*cap) || total_len < cap->bLength) { dev->bos->desc->bNumDeviceCaps = i; break; } cap_type = cap->bDevCapabilityType; length = cap->bLength; if (bos_desc_len[cap_type] && length < bos_desc_len[cap_type]) { dev->bos->desc->bNumDeviceCaps = i; break; } if (cap->bDescriptorType != USB_DT_DEVICE_CAPABILITY) { dev_notice(ddev, "descriptor type invalid, skip\n"); goto skip_to_next_descriptor; } switch (cap_type) { case USB_CAP_TYPE_EXT: dev->bos->ext_cap = (struct usb_ext_cap_descriptor *)buffer; break; case USB_SS_CAP_TYPE: dev->bos->ss_cap = (struct usb_ss_cap_descriptor *)buffer; break; case USB_SSP_CAP_TYPE: ssp_cap = (struct usb_ssp_cap_descriptor *)buffer; ssac = (le32_to_cpu(ssp_cap->bmAttributes) & USB_SSP_SUBLINK_SPEED_ATTRIBS); if (length >= USB_DT_USB_SSP_CAP_SIZE(ssac)) dev->bos->ssp_cap = ssp_cap; break; case CONTAINER_ID_TYPE: dev->bos->ss_id = (struct usb_ss_container_id_descriptor *)buffer; break; case USB_PTM_CAP_TYPE: dev->bos->ptm_cap = (struct usb_ptm_cap_descriptor *)buffer; break; default: break; } skip_to_next_descriptor: total_len -= length; buffer += length; } dev->bos->desc->wTotalLength = cpu_to_le16(buffer - buffer0); return 0; err: usb_release_bos_descriptor(dev); return ret; }
14 14 14 2 14 14 2 14 14 5 14 14 14 14 14 14 14 14 14 14 14 14 1 14 14 14 14 14 14 14 14 14 14 14 14 14 17 91 89 1 89 89 70 1 1 70 2 2 2 2 3 2 2 2 3 88 2 88 86 85 79 7 7 4 1 6 19 19 80 80 7 7 7 7 7 7 7 6 2 6 6 2 6 2 2 46 46 1 3 2 8 4 4 8 11 11 11 11 11 10 11 11 11 11 11 12 12 12 12 12 5 5 5 5 13 13 7 7 7 7 7 7 7 7 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 12 6 6 6 6 12 12 2 13 13 10 13 7 7 7 7 7 7 2 2 13 12 12 13 13 13 13 13 13 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 43 5 5 43 5 11 8 8 8 1 1 13 13 13 13 13 13 13 13 13 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 13 20 20 19 18 19 19 19 19 13 13 13 13 13 13 13 19 19 86 86 86 60 86 86 86 8 86 86 86 85 86 6 6 45 33 44 43 8 46 46 45 46 46 45 27 31 25 25 25 25 25 25 1 25 24 25 25 1 25 4 43 42 43 40 2 42 35 35 2 42 8 43 2 2 46 46 46 46 46 43 25 46 4 18 11 9 9 19 19 9 9 9 9 9 1 5 1 1 5 20 20 94 94 86 20 94 94 25 25 14 14 8 4 8 1 8 8 8 8 8 8 8 8 8 89 89 89 8 8 8 8 4 4 8 8 8 8 4 8 44 44 43 19 19 20 20 20 19 19 19 8 8 8 8 8 8 8 1 1 1 1 1 1 1 1 1 1 7 13 6 1 37 21 23 17 2 1 3 3 2 26 15 15 1 14 14 14 14 6 6 6 15 15 32 2 30 1 3 26 30 6 6 43 41 43 43 43 43 43 6 42 43 8 8 8 43 43 33 43 30 17 42 30 30 30 42 42 4 85 30 30 30 30 30 16 30 30 20 20 20 20 20 17 19 19 19 19 3 3 3 3 3 3 3 3 3 3 3 7 90 7 6 6 1 25 25 7 7 86 86 86 86 86 86 86 84 86 86 86 86 86 86 86 86 86 86 85 86 86 85 86 85 86 85 86 86 2 20 20 20 20 20 20 20 20 20 14 20 20 14 118 118 118 116 13 13 13 14 84 1 1 1 1 1 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 20 14 14 14 14 14 14 13 1 14 14 14 14 14 13 13 1 14 14 14 13 1 14 14 14 14 20 20 20 86 86 85 86 86 86 85 86 86 86 78 86 86 1 85 86 86 85 86 86 86 86 86 86 86 86 86 86 86 86 85 86 86 86 86 86 84 84 84 84 84 84 84 84 84 76 61 15 84 1 84 83 84 83 1 84 83 1 83 75 60 15 83 1 82 3 1 1 1 82 81 1 80 80 80 80 80 80 80 80 78 80 80 80 80 80 80 1 1 1 1 1 1 80 80 80 79 80 80 80 80 80 1 1 80 80 80 80 80 79 80 80 80 80 44 44 109 109 109 80 80 80 80 80 79 1 1 1 1 80 80 80 80 80 80 86 86 86 86 77 86 86 85 86 86 86 86 84 80 80 80 80 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/segment.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/f2fs_fs.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/sched/mm.h> #include <linux/prefetch.h> #include <linux/kthread.h> #include <linux/swap.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/random.h> #include "f2fs.h" #include "segment.h" #include "node.h" #include "gc.h" #include "iostat.h" #include <trace/events/f2fs.h> #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *revoke_entry_slab; static unsigned long __reverse_ulong(unsigned char *str) { unsigned long tmp = 0; int shift = 24, idx = 0; #if BITS_PER_LONG == 64 shift = 56; #endif while (shift >= 0) { tmp |= (unsigned long)str[idx++] << shift; shift -= BITS_PER_BYTE; } return tmp; } /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since * MSB and LSB are reversed in a byte by f2fs_set_bit. */ static inline unsigned long __reverse_ffs(unsigned long word) { int num = 0; #if BITS_PER_LONG == 64 if ((word & 0xffffffff00000000UL) == 0) num += 32; else word >>= 32; #endif if ((word & 0xffff0000) == 0) num += 16; else word >>= 16; if ((word & 0xff00) == 0) num += 8; else word >>= 8; if ((word & 0xf0) == 0) num += 4; else word >>= 4; if ((word & 0xc) == 0) num += 2; else word >>= 2; if ((word & 0x2) == 0) num += 1; return num; } /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 * f2fs_set_bit(7, bitmap) => 0000 0001 */ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == 0) goto pass; tmp = __reverse_ulong((unsigned char *)p); tmp &= ~0UL >> offset; if (size < BITS_PER_LONG) tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = size; unsigned long tmp; if (offset >= size) return size; size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; while (1) { if (*p == ~0UL) goto pass; tmp = __reverse_ulong((unsigned char *)p); if (offset) tmp |= ~0UL << (BITS_PER_LONG - offset); if (size < BITS_PER_LONG) tmp |= ~0UL >> size; if (tmp != ~0UL) goto found; pass: if (size <= BITS_PER_LONG) break; size -= BITS_PER_LONG; offset = 0; p++; } return result; found: return result - size + __reverse_ffz(tmp); } bool f2fs_need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (f2fs_lfs_mode(sbi)) return false; if (sbi->gc_mode == GC_URGENT_HIGH) return true; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + SM_I(sbi)->min_ssr_sections + reserved_sections(sbi)); } void f2fs_abort_atomic_write(struct inode *inode, bool clean) { struct f2fs_inode_info *fi = F2FS_I(inode); if (!f2fs_is_atomic_file(inode)) return; if (clean) truncate_inode_pages_final(inode->i_mapping); release_atomic_write_cnt(inode); clear_inode_flag(inode, FI_ATOMIC_COMMITTED); clear_inode_flag(inode, FI_ATOMIC_REPLACE); clear_inode_flag(inode, FI_ATOMIC_FILE); if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { clear_inode_flag(inode, FI_ATOMIC_DIRTIED); /* * The vfs inode keeps clean during commit, but the f2fs inode * doesn't. So clear the dirty state after commit and let * f2fs_mark_inode_dirty_sync ensure a consistent dirty state. */ f2fs_inode_synced(inode); f2fs_mark_inode_dirty_sync(inode, true); } stat_dec_atomic_inode(inode); F2FS_I(inode)->atomic_write_task = NULL; if (clean) { f2fs_i_size_write(inode, fi->original_i_size); fi->original_i_size = 0; } /* avoid stale dirty inode during eviction */ sync_inode_metadata(inode, 0); } static int __replace_atomic_write_block(struct inode *inode, pgoff_t index, block_t new_addr, block_t *old_addr, bool recover) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; struct node_info ni; int err; retry: set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto retry; } return err; } err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) { f2fs_put_dnode(&dn); return err; } if (recover) { /* dn.data_blkaddr is always valid */ if (!__is_valid_data_blkaddr(new_addr)) { if (new_addr == NULL_ADDR) dec_valid_block_count(sbi, inode, 1); f2fs_invalidate_blocks(sbi, dn.data_blkaddr, 1); f2fs_update_data_blkaddr(&dn, new_addr); } else { f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, true); } } else { blkcnt_t count = 1; err = inc_valid_block_count(sbi, inode, &count, true); if (err) { f2fs_put_dnode(&dn); return err; } *old_addr = dn.data_blkaddr; f2fs_truncate_data_blocks_range(&dn, 1); dec_valid_block_count(sbi, F2FS_I(inode)->cow_inode, count); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, ni.version, true, false); } f2fs_put_dnode(&dn); trace_f2fs_replace_atomic_write_block(inode, F2FS_I(inode)->cow_inode, index, old_addr ? *old_addr : 0, new_addr, recover); return 0; } static void __complete_revoke_list(struct inode *inode, struct list_head *head, bool revoke) { struct revoke_entry *cur, *tmp; pgoff_t start_index = 0; bool truncate = is_inode_flag_set(inode, FI_ATOMIC_REPLACE); list_for_each_entry_safe(cur, tmp, head, list) { if (revoke) { __replace_atomic_write_block(inode, cur->index, cur->old_addr, NULL, true); } else if (truncate) { f2fs_truncate_hole(inode, start_index, cur->index); start_index = cur->index + 1; } list_del(&cur->list); kmem_cache_free(revoke_entry_slab, cur); } if (!revoke && truncate) f2fs_do_truncate_blocks(inode, start_index * PAGE_SIZE, false); } static int __f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inode *cow_inode = fi->cow_inode; struct revoke_entry *new; struct list_head revoke_list; block_t blkaddr; struct dnode_of_data dn; pgoff_t len = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); pgoff_t off = 0, blen, index; int ret = 0, i; INIT_LIST_HEAD(&revoke_list); while (len) { blen = min_t(pgoff_t, ADDRS_PER_BLOCK(cow_inode), len); set_new_dnode(&dn, cow_inode, NULL, NULL, 0); ret = f2fs_get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { ret = 0; if (dn.max_level == 0) goto out; goto next; } blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { blkaddr = f2fs_data_blkaddr(&dn); if (!__is_valid_data_blkaddr(blkaddr)) { continue; } else if (!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) { f2fs_put_dnode(&dn); ret = -EFSCORRUPTED; goto out; } new = f2fs_kmem_cache_alloc(revoke_entry_slab, GFP_NOFS, true, NULL); ret = __replace_atomic_write_block(inode, index, blkaddr, &new->old_addr, false); if (ret) { f2fs_put_dnode(&dn); kmem_cache_free(revoke_entry_slab, new); goto out; } f2fs_update_data_blkaddr(&dn, NULL_ADDR); new->index = index; list_add_tail(&new->list, &revoke_list); } f2fs_put_dnode(&dn); next: off += blen; len -= blen; } out: if (time_to_inject(sbi, FAULT_TIMEOUT)) f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; } else { sbi->committed_atomic_block += fi->atomic_write_cnt; set_inode_flag(inode, FI_ATOMIC_COMMITTED); /* * inode may has no FI_ATOMIC_DIRTIED flag due to no write * before commit. */ if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { /* clear atomic dirty status and set vfs dirty status */ clear_inode_flag(inode, FI_ATOMIC_DIRTIED); f2fs_mark_inode_dirty_sync(inode, true); } } __complete_revoke_list(inode, &revoke_list, ret ? true : false); return ret; } int f2fs_commit_atomic_write(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (err) return err; f2fs_down_write(&fi->i_gc_rwsem[WRITE]); f2fs_lock_op(sbi); err = __f2fs_commit_atomic_write(inode); f2fs_unlock_op(sbi); f2fs_up_write(&fi->i_gc_rwsem[WRITE]); return err; } /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { if (f2fs_cp_error(sbi)) return; if (time_to_inject(sbi, FAULT_CHECKPOINT)) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FAULT_INJECT); /* balance_fs_bg is able to be pending */ if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi, false); if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ if (has_enough_free_secs(sbi, 0, 0)) return; if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && sbi->gc_thread->f2fs_gc_task) { DEFINE_WAIT(wait); prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, TASK_UNINTERRUPTIBLE); wake_up(&sbi->gc_thread->gc_wait_queue_head); io_schedule(); finish_wait(&sbi->gc_thread->fggc_wq, &wait); } else { struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .init_gc_type = BG_GC, .no_bg_gc = true, .should_migrate_blocks = false, .err_gc_skipped = false, .nr_free_secs = 1 }; f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); f2fs_gc(sbi, &gc_control); } } static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) { int factor = f2fs_rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); unsigned int threshold = SEGS_TO_BLKS(sbi, (factor * DEFAULT_DIRTY_THRESHOLD)); unsigned int global_threshold = threshold * 3 / 2; if (dents >= threshold || qdata >= threshold || nodes >= threshold || meta >= threshold || imeta >= threshold) return true; return dents + qdata + nodes + meta + imeta > global_threshold; } void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return; /* try to shrink extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, READ_EXTENT_CACHE)) f2fs_shrink_read_extent_tree(sbi, READ_EXTENT_CACHE_SHRINK_NUMBER); /* try to shrink age extent cache when there is no enough memory */ if (!f2fs_available_free_memory(sbi, AGE_EXTENT_CACHE)) f2fs_shrink_age_extent_tree(sbi, AGE_EXTENT_CACHE_SHRINK_NUMBER); /* check the # of cached NAT entries */ if (!f2fs_available_free_memory(sbi, NAT_ENTRIES)) f2fs_try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!f2fs_available_free_memory(sbi, FREE_NIDS)) f2fs_try_to_free_nids(sbi, MAX_FREE_NIDS); else f2fs_build_free_nids(sbi, false, false); if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ if (is_inflight_io(sbi, REQ_TIME) || (!f2fs_time_over(sbi, REQ_TIME) && f2fs_rwsem_is_locked(&sbi->cp_rwsem))) return; /* exceed periodical checkpoint timeout threshold */ if (f2fs_time_over(sbi, CP_TIME)) goto do_sync; /* checkpoint is the only way to shrink partial cached entries */ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && f2fs_available_free_memory(sbi, INO_ENTRIES)) return; do_sync: if (test_opt(sbi, DATA_FLUSH) && from_bg) { struct blk_plug plug; mutex_lock(&sbi->flush_lock); blk_start_plug(&plug); f2fs_sync_dirty_inodes(sbi, FILE_INODE, false); blk_finish_plug(&plug); mutex_unlock(&sbi->flush_lock); } stat_inc_cp_call_count(sbi, BACKGROUND); f2fs_sync_fs(sbi->sb, 1); } static int __submit_flush_wait(struct f2fs_sb_info *sbi, struct block_device *bdev) { int ret = blkdev_issue_flush(bdev); trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), test_opt(sbi, FLUSH_MERGE), ret); if (!ret) f2fs_update_iostat(sbi, NULL, FS_FLUSH_IO, 0); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino) { int ret = 0; int i; if (!f2fs_is_multi_device(sbi)) return __submit_flush_wait(sbi, sbi->sb->s_bdev); for (i = 0; i < sbi->s_ndevs; i++) { if (!f2fs_is_dirty_device(sbi, ino, i, FLUSH_INO)) continue; ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) break; } return ret; } static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) return 0; if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode); ret = submit_flush_wait(sbi, cmd->ino); atomic_inc(&fcc->issued_flush); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } fcc->dispatch_list = NULL; } wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; } int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; int ret; if (test_opt(sbi, NOBARRIER)) return 0; if (!test_opt(sbi, FLUSH_MERGE)) { atomic_inc(&fcc->queued_flush); ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } if (atomic_inc_return(&fcc->queued_flush) == 1 || f2fs_is_multi_device(sbi)) { ret = submit_flush_wait(sbi, ino); atomic_dec(&fcc->queued_flush); atomic_inc(&fcc->issued_flush); return ret; } cmd.ino = ino; init_completion(&cmd.wait); llist_add(&cmd.llnode, &fcc->issue_list); /* * update issue_list before we wake up issue_flush thread, this * smp_mb() pairs with another barrier in ___wait_event(), see * more details in comments of waitqueue_active(). */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct llist_node *list; list = llist_del_all(&fcc->issue_list); if (!list) { wait_for_completion(&cmd.wait); atomic_dec(&fcc->queued_flush); } else { struct flush_cmd *tmp, *next; ret = submit_flush_wait(sbi, ino); llist_for_each_entry_safe(tmp, next, list, llnode) { if (tmp == &cmd) { cmd.ret = ret; atomic_dec(&fcc->queued_flush); continue; } tmp->ret = ret; complete(&tmp->wait); } } } return cmd.ret; } int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct flush_cmd_control *fcc; if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; if (fcc->f2fs_issue_flush) return 0; goto init_thread; } fcc = f2fs_kzalloc(sbi, sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; atomic_set(&fcc->issued_flush, 0); atomic_set(&fcc->queued_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; if (!test_opt(sbi, FLUSH_MERGE)) return 0; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { int err = PTR_ERR(fcc->f2fs_issue_flush); fcc->f2fs_issue_flush = NULL; return err; } return 0; } void f2fs_destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; fcc->f2fs_issue_flush = NULL; kthread_stop(flush_thread); } if (free) { kfree(fcc); SM_I(sbi)->fcc_info = NULL; } } int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) { int ret = 0, i; if (!f2fs_is_multi_device(sbi)) return 0; if (test_opt(sbi, NOBARRIER)) return 0; for (i = 1; i < sbi->s_ndevs; i++) { int count = DEFAULT_RETRY_IO_COUNT; if (!f2fs_test_bit(i, (char *)&sbi->dirty_device)) continue; do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); } while (ret && --count); if (ret) { f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_FLUSH_FAIL); break; } spin_lock(&sbi->dev_lock); f2fs_clear_bit(i, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } return ret; } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); /* need not be added */ if (IS_CURSEG(sbi, segno)) return; if (!test_and_set_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]++; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (unlikely(t >= DIRTY)) { f2fs_bug_on(sbi, 1); return; } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); block_t valid_blocks = get_valid_blocks(sbi, segno, true); f2fs_bug_on(sbi, (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && !valid_blocks) || valid_blocks == CAP_BLKS_PER_SEC(sbi)); if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t valid_blocks; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[dirty_type])) dirty_i->nr_dirty[dirty_type]--; if (dirty_type == DIRTY) { struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; valid_blocks = get_valid_blocks(sbi, segno, true); if (valid_blocks == 0) { clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); #ifdef CONFIG_F2FS_CHECK_FS clear_bit(segno, SIT_I(sbi)->invalid_segmap); #endif } if (__is_large_section(sbi)) { unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) { clear_bit(secno, dirty_i->dirty_secmap); return; } if (!IS_CURSEC(sbi, secno)) set_bit(secno, dirty_i->dirty_secmap); } } } /* * Should not occur error such as -ENOMEM. * Adding dirty entry into seglist is not critical operation. * If a given segment is one of current working segments, it won't be added. */ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned short valid_blocks, ckpt_valid_blocks; unsigned int usable_blocks; if (segno == NULL_SEGNO || IS_CURSEG(sbi, segno)) return; usable_blocks = f2fs_usable_blks_in_seg(sbi, segno); mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } else if (valid_blocks < usable_blocks) { __locate_dirty_segment(sbi, segno, DIRTY); } else { /* Recovery routine with SSR needs this */ __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } /* This moves currently empty dirty blocks to prefree. Must hold seglist_lock */ void f2fs_dirty_to_prefree(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (IS_CURSEG(sbi, segno)) continue; __locate_dirty_segment(sbi, segno, PRE); __remove_dirty_segment(sbi, segno, DIRTY); } mutex_unlock(&dirty_i->seglist_lock); } block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); block_t ovp_holes = SEGS_TO_BLKS(sbi, ovp_hole_segs); struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); block_t holes[2] = {0, 0}; /* DATA and NODE */ block_t unusable; struct seg_entry *se; unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { se = get_seg_entry(sbi, segno); if (IS_NODESEG(se->type)) holes[NODE] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; else holes[DATA] += f2fs_usable_blks_in_seg(sbi, segno) - se->valid_blocks; } mutex_unlock(&dirty_i->seglist_lock); unusable = max(holes[DATA], holes[NODE]); if (unusable > ovp_holes) return unusable - ovp_holes; return 0; } int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable) { int ovp_hole_segs = (overprovision_segments(sbi) - reserved_segments(sbi)); if (F2FS_OPTION(sbi).unusable_cap_perc == 100) return 0; if (unusable > F2FS_OPTION(sbi).unusable_cap) return -EAGAIN; if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK) && dirty_segments(sbi) > ovp_hole_segs) return -EAGAIN; if (has_not_enough_free_secs(sbi, 0, 0)) return -EAGAIN; return 0; } /* This is only used by SBI_CP_DISABLED */ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno = 0; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; } mutex_unlock(&dirty_i->seglist_lock); return NULL_SEGNO; } static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc; f2fs_bug_on(sbi, !len); pend_list = &dcc->pend_list[plist_idx(len)]; dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS, true, NULL); INIT_LIST_HEAD(&dc->list); dc->bdev = bdev; dc->di.lstart = lstart; dc->di.start = start; dc->di.len = len; dc->ref = 0; dc->state = D_PREP; dc->queued = 0; dc->error = 0; init_completion(&dc->wait); list_add_tail(&dc->list, pend_list); spin_lock_init(&dc->lock); dc->bio_ref = 0; atomic_inc(&dcc->discard_cmd_cnt); dcc->undiscard_blks += len; return dc; } static bool f2fs_check_discard_tree(struct f2fs_sb_info *sbi) { #ifdef CONFIG_F2FS_CHECK_FS struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *cur = rb_first_cached(&dcc->root), *next; struct discard_cmd *cur_dc, *next_dc; while (cur) { next = rb_next(cur); if (!next) return true; cur_dc = rb_entry(cur, struct discard_cmd, rb_node); next_dc = rb_entry(next, struct discard_cmd, rb_node); if (cur_dc->di.lstart + cur_dc->di.len > next_dc->di.lstart) { f2fs_info(sbi, "broken discard_rbtree, " "cur(%u, %u) next(%u, %u)", cur_dc->di.lstart, cur_dc->di.len, next_dc->di.lstart, next_dc->di.len); return false; } cur = next; } #endif return true; } static struct discard_cmd *__lookup_discard_cmd(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node *node = dcc->root.rb_root.rb_node; struct discard_cmd *dc; while (node) { dc = rb_entry(node, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) node = node->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) node = node->rb_right; else return dc; } return NULL; } static struct discard_cmd *__lookup_discard_cmd_ret(struct rb_root_cached *root, block_t blkaddr, struct discard_cmd **prev_entry, struct discard_cmd **next_entry, struct rb_node ***insert_p, struct rb_node **insert_parent) { struct rb_node **pnode = &root->rb_root.rb_node; struct rb_node *parent = NULL, *tmp_node; struct discard_cmd *dc; *insert_p = NULL; *insert_parent = NULL; *prev_entry = NULL; *next_entry = NULL; if (RB_EMPTY_ROOT(&root->rb_root)) return NULL; while (*pnode) { parent = *pnode; dc = rb_entry(*pnode, struct discard_cmd, rb_node); if (blkaddr < dc->di.lstart) pnode = &(*pnode)->rb_left; else if (blkaddr >= dc->di.lstart + dc->di.len) pnode = &(*pnode)->rb_right; else goto lookup_neighbors; } *insert_p = pnode; *insert_parent = parent; dc = rb_entry(parent, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr > dc->di.lstart) tmp_node = rb_next(parent); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); tmp_node = parent; if (parent && blkaddr < dc->di.lstart) tmp_node = rb_prev(parent); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return NULL; lookup_neighbors: /* lookup prev node for merging backward later */ tmp_node = rb_prev(&dc->rb_node); *prev_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); /* lookup next node for merging frontward later */ tmp_node = rb_next(&dc->rb_node); *next_entry = rb_entry_safe(tmp_node, struct discard_cmd, rb_node); return dc; } static void __detach_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { if (dc->state == D_DONE) atomic_sub(dc->queued, &dcc->queued_discard); list_del(&dc->list); rb_erase_cached(&dc->rb_node, &dcc->root); dcc->undiscard_blks -= dc->di.len; kmem_cache_free(discard_cmd_slab, dc); atomic_dec(&dcc->discard_cmd_cnt); } static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned long flags; trace_f2fs_remove_discard(dc->bdev, dc->di.start, dc->di.len); spin_lock_irqsave(&dc->lock, flags); if (dc->bio_ref) { spin_unlock_irqrestore(&dc->lock, flags); return; } spin_unlock_irqrestore(&dc->lock, flags); f2fs_bug_on(sbi, dc->ref); if (dc->error == -EOPNOTSUPP) dc->error = 0; if (dc->error) f2fs_info_ratelimited(sbi, "Issue discard(%u, %u, %u) failed, ret: %d", dc->di.lstart, dc->di.start, dc->di.len, dc->error); __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; unsigned long flags; spin_lock_irqsave(&dc->lock, flags); if (!dc->error) dc->error = blk_status_to_errno(bio->bi_status); dc->bio_ref--; if (!dc->bio_ref && dc->state == D_SUBMIT) { dc->state = D_DONE; complete_all(&dc->wait); } spin_unlock_irqrestore(&dc->lock, flags); bio_put(bio); } static void __check_sit_bitmap(struct f2fs_sb_info *sbi, block_t start, block_t end) { #ifdef CONFIG_F2FS_CHECK_FS struct seg_entry *sentry; unsigned int segno; block_t blk = start; unsigned long offset, size, *map; while (blk < end) { segno = GET_SEGNO(sbi, blk); sentry = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blk); if (end < START_BLOCK(sbi, segno + 1)) size = GET_BLKOFF_FROM_SEG0(sbi, end); else size = BLKS_PER_SEG(sbi); map = (unsigned long *)(sentry->cur_valid_map); offset = __find_rev_next_bit(map, size, offset); f2fs_bug_on(sbi, offset != size); blk = START_BLOCK(sbi, segno + 1); } #endif } static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; dpolicy->ordered = false; dpolicy->granularity = granularity; dpolicy->max_requests = dcc->max_discard_request; dpolicy->io_aware_gran = dcc->discard_io_aware_gran; dpolicy->timeout = false; if (discard_type == DPOLICY_BG) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE) dpolicy->io_aware = true; else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE) dpolicy->io_aware = false; dpolicy->sync = false; dpolicy->ordered = true; if (utilization(sbi) > dcc->discard_urgent_util) { dpolicy->granularity = MIN_DISCARD_GRANULARITY; if (atomic_read(&dcc->discard_cmd_cnt)) dpolicy->max_interval = dcc->min_discard_issue_time; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = dcc->min_discard_issue_time; dpolicy->mid_interval = dcc->mid_discard_issue_time; dpolicy->max_interval = dcc->max_discard_issue_time; dpolicy->io_aware = false; } else if (discard_type == DPOLICY_FSTRIM) { dpolicy->io_aware = false; } else if (discard_type == DPOLICY_UMOUNT) { dpolicy->io_aware = false; /* we need to issue all to keep CP_TRIMMED_FLAG */ dpolicy->granularity = MIN_DISCARD_GRANULARITY; dpolicy->timeout = true; } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len); #ifdef CONFIG_BLK_DEV_ZONED static void __submit_zone_reset_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, blk_opf_t flag, struct list_head *wait_list, unsigned int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct block_device *bdev = dc->bdev; struct bio *bio = bio_alloc(bdev, 0, REQ_OP_ZONE_RESET | flag, GFP_NOFS); unsigned long flags; trace_f2fs_issue_reset_zone(bdev, dc->di.start); spin_lock_irqsave(&dc->lock, flags); dc->state = D_SUBMIT; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); if (issued) (*issued)++; atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, dc->di.lstart, dc->di.lstart + dc->di.len); bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(dc->di.start); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_ZONE_RESET_IO, dc->di.len * F2FS_BLKSIZE); } #endif /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, struct discard_cmd *dc, int *issued) { struct block_device *bdev = dc->bdev; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); blk_opf_t flag = dpolicy->sync ? REQ_SYNC : 0; block_t lstart, start, len, total_len; int err = 0; if (dc->state != D_PREP) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) return 0; #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) { int devi = f2fs_bdev_index(sbi, bdev); if (devi < 0) return -EINVAL; if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { __submit_zone_reset_cmd(sbi, dc, flag, wait_list, issued); return 0; } } #endif /* * stop issuing discard for any of below cases: * 1. device is conventional zone, but it doesn't support discard. * 2. device is regulare device, after snapshot it doesn't support * discard. */ if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; trace_f2fs_issue_discard(bdev, dc->di.start, dc->di.len); lstart = dc->di.lstart; start = dc->di.start; len = dc->di.len; total_len = len; dc->di.len = 0; while (total_len && *issued < dpolicy->max_requests && !err) { struct bio *bio = NULL; unsigned long flags; bool last = true; if (len > max_discard_blocks) { len = max_discard_blocks; last = false; } (*issued)++; if (*issued == dpolicy->max_requests) last = true; dc->di.len += len; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; } else { err = __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); } if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) dc->state = D_SUBMIT; spin_unlock_irqrestore(&dc->lock, flags); break; } f2fs_bug_on(sbi, !bio); /* * should keep before submission to avoid D_DONE * right away */ spin_lock_irqsave(&dc->lock, flags); if (last) dc->state = D_SUBMIT; else dc->state = D_PARTIAL; dc->bio_ref++; spin_unlock_irqrestore(&dc->lock, flags); atomic_inc(&dcc->queued_discard); dc->queued++; list_move_tail(&dc->list, wait_list); /* sanity check on discard range */ __check_sit_bitmap(sbi, lstart, lstart + len); bio->bi_private = dc; bio->bi_end_io = f2fs_submit_discard_endio; bio->bi_opf |= flag; submit_bio(bio); atomic_inc(&dcc->issued_discard); f2fs_update_iostat(sbi, NULL, FS_DISCARD_IO, len * F2FS_BLKSIZE); lstart += len; start += len; total_len -= len; len = total_len; } if (!err && len) { dcc->undiscard_blks -= len; __update_discard_tree_range(sbi, bdev, lstart, start, len); } return err; } static void __insert_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct rb_node **p = &dcc->root.rb_root.rb_node; struct rb_node *parent = NULL; struct discard_cmd *dc; bool leftmost = true; /* look up rb tree to find parent node */ while (*p) { parent = *p; dc = rb_entry(parent, struct discard_cmd, rb_node); if (lstart < dc->di.lstart) { p = &(*p)->rb_left; } else if (lstart >= dc->di.lstart + dc->di.len) { p = &(*p)->rb_right; leftmost = false; } else { /* Let's skip to add, if exists */ return; } } dc = __create_discard_cmd(sbi, bdev, lstart, start, len); rb_link_node(&dc->rb_node, parent, p); rb_insert_color_cached(&dc->rb_node, &dcc->root, leftmost); } static void __relocate_discard_cmd(struct discard_cmd_control *dcc, struct discard_cmd *dc) { list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->di.len)]); } static void __punch_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_info di = dc->di; bool modified = false; if (dc->state == D_DONE || dc->di.len == 1) { __remove_discard_cmd(sbi, dc); return; } dcc->undiscard_blks -= di.len; if (blkaddr > di.lstart) { dc->di.len = blkaddr - dc->di.lstart; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); modified = true; } if (blkaddr < di.lstart + di.len - 1) { if (modified) { __insert_discard_cmd(sbi, dc->bdev, blkaddr + 1, di.start + blkaddr + 1 - di.lstart, di.lstart + di.len - 1 - blkaddr); } else { dc->di.lstart++; dc->di.len--; dc->di.start++; dcc->undiscard_blks += dc->di.len; __relocate_discard_cmd(dcc, dc); } } } static void __update_discard_tree_range(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t lstart, block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct discard_cmd *dc; struct discard_info di = {0}; struct rb_node **insert_p = NULL, *insert_parent = NULL; unsigned int max_discard_blocks = SECTOR_TO_BLOCK(bdev_max_discard_sectors(bdev)); block_t end = lstart + len; dc = __lookup_discard_cmd_ret(&dcc->root, lstart, &prev_dc, &next_dc, &insert_p, &insert_parent); if (dc) prev_dc = dc; if (!prev_dc) { di.lstart = lstart; di.len = next_dc ? next_dc->di.lstart - lstart : len; di.len = min(di.len, len); di.start = start; } while (1) { struct rb_node *node; bool merged = false; struct discard_cmd *tdc = NULL; if (prev_dc) { di.lstart = prev_dc->di.lstart + prev_dc->di.len; if (di.lstart < lstart) di.lstart = lstart; if (di.lstart >= end) break; if (!next_dc || next_dc->di.lstart > end) di.len = end - di.lstart; else di.len = next_dc->di.lstart - di.lstart; di.start = start + di.lstart - lstart; } if (!di.len) goto next; if (prev_dc && prev_dc->state == D_PREP && prev_dc->bdev == bdev && __is_discard_back_mergeable(&di, &prev_dc->di, max_discard_blocks)) { prev_dc->di.len += di.len; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, prev_dc); di = prev_dc->di; tdc = prev_dc; merged = true; } if (next_dc && next_dc->state == D_PREP && next_dc->bdev == bdev && __is_discard_front_mergeable(&di, &next_dc->di, max_discard_blocks)) { next_dc->di.lstart = di.lstart; next_dc->di.len += di.len; next_dc->di.start = di.start; dcc->undiscard_blks += di.len; __relocate_discard_cmd(dcc, next_dc); if (tdc) __remove_discard_cmd(sbi, tdc); merged = true; } if (!merged) __insert_discard_cmd(sbi, bdev, di.lstart, di.start, di.len); next: prev_dc = next_dc; if (!prev_dc) break; node = rb_next(&prev_dc->rb_node); next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } } #ifdef CONFIG_BLK_DEV_ZONED static void __queue_zone_reset_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t lblkstart, block_t blklen) { trace_f2fs_queue_reset_zone(bdev, blkstart); mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __insert_discard_cmd(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } #endif static void __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { block_t lblkstart = blkstart; if (!f2fs_bdev_support_discard(bdev)) return; trace_f2fs_queue_discard(bdev, blkstart, blklen); if (f2fs_is_multi_device(sbi)) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } mutex_lock(&SM_I(sbi)->dcc_info->cmd_lock); __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); mutex_unlock(&SM_I(sbi)->dcc_info->cmd_lock); } static void __issue_discard_cmd_orderly(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int *issued) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; bool io_interrupted = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd_ret(&dcc->root, dcc->next_pos, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc) { struct rb_node *node; int err = 0; if (dc->state != D_PREP) goto next; if (dpolicy->io_aware && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } dcc->next_pos = dc->di.lstart + dc->di.len; err = __submit_discard_cmd(sbi, dpolicy, dc, issued); if (*issued >= dpolicy->max_requests) break; next: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); } blk_finish_plug(&plug); if (!dc) dcc->next_pos = 0; mutex_unlock(&dcc->cmd_lock); if (!(*issued) && io_interrupted) *issued = -1; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy); static int __issue_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; int i, issued; bool io_interrupted = false; if (dpolicy->timeout) f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT); retry: issued = 0; for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (i + 1 < dpolicy->granularity) break; if (i + 1 < dcc->max_ordered_discard && dpolicy->ordered) { __issue_discard_cmd_orderly(sbi, dpolicy, &issued); return issued; } pend_list = &dcc->pend_list[i]; mutex_lock(&dcc->cmd_lock); if (list_empty(pend_list)) goto next; if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); blk_start_plug(&plug); list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); if (dpolicy->timeout && f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT)) break; if (dpolicy->io_aware && i < dpolicy->io_aware_gran && !is_idle(sbi, DISCARD_TIME)) { io_interrupted = true; break; } __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) break; } blk_finish_plug(&plug); next: mutex_unlock(&dcc->cmd_lock); if (issued >= dpolicy->max_requests || io_interrupted) break; } if (dpolicy->type == DPOLICY_UMOUNT && issued) { __wait_all_discard_cmd(sbi, dpolicy); goto retry; } if (!issued && io_interrupted) issued = -1; return issued; } static bool __drop_discard_cmd(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *pend_list; struct discard_cmd *dc, *tmp; int i; bool dropped = false; mutex_lock(&dcc->cmd_lock); for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { pend_list = &dcc->pend_list[i]; list_for_each_entry_safe(dc, tmp, pend_list, list) { f2fs_bug_on(sbi, dc->state != D_PREP); __remove_discard_cmd(sbi, dc); dropped = true; } } mutex_unlock(&dcc->cmd_lock); return dropped; } void f2fs_drop_discard_cmd(struct f2fs_sb_info *sbi) { __drop_discard_cmd(sbi); } static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi, struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; unsigned int len = 0; wait_for_completion_io(&dc->wait); mutex_lock(&dcc->cmd_lock); f2fs_bug_on(sbi, dc->state != D_DONE); dc->ref--; if (!dc->ref) { if (!dc->error) len = dc->di.len; __remove_discard_cmd(sbi, dc); } mutex_unlock(&dcc->cmd_lock); return len; } static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, block_t start, block_t end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ? &(dcc->fstrim_list) : &(dcc->wait_list); struct discard_cmd *dc = NULL, *iter, *tmp; unsigned int trimmed = 0; next: dc = NULL; mutex_lock(&dcc->cmd_lock); list_for_each_entry_safe(iter, tmp, wait_list, list) { if (iter->di.lstart + iter->di.len <= start || end <= iter->di.lstart) continue; if (iter->di.len < dpolicy->granularity) continue; if (iter->state == D_DONE && !iter->ref) { wait_for_completion_io(&iter->wait); if (!iter->error) trimmed += iter->di.len; __remove_discard_cmd(sbi, iter); } else { iter->ref++; dc = iter; break; } } mutex_unlock(&dcc->cmd_lock); if (dc) { trimmed += __wait_one_discard_bio(sbi, dc); goto next; } return trimmed; } static unsigned int __wait_all_discard_cmd(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy) { struct discard_policy dp; unsigned int discard_blks; if (dpolicy) return __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX); /* wait all */ __init_discard_policy(sbi, &dp, DPOLICY_FSTRIM, MIN_DISCARD_GRANULARITY); discard_blks = __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); __init_discard_policy(sbi, &dp, DPOLICY_UMOUNT, MIN_DISCARD_GRANULARITY); discard_blks += __wait_discard_cmd_range(sbi, &dp, 0, UINT_MAX); return discard_blks; } /* This should be covered by global mutex, &sit_i->sentry_lock */ static void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *dc; bool need_wait = false; mutex_lock(&dcc->cmd_lock); dc = __lookup_discard_cmd(sbi, blkaddr); #ifdef CONFIG_BLK_DEV_ZONED if (dc && f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(dc->bdev)) { int devi = f2fs_bdev_index(sbi, dc->bdev); if (devi < 0) { mutex_unlock(&dcc->cmd_lock); return; } if (f2fs_blkz_is_seq(sbi, devi, dc->di.start)) { /* force submit zone reset */ if (dc->state == D_PREP) __submit_zone_reset_cmd(sbi, dc, REQ_SYNC, &dcc->wait_list, NULL); dc->ref++; mutex_unlock(&dcc->cmd_lock); /* wait zone reset */ __wait_one_discard_bio(sbi, dc); return; } } #endif if (dc) { if (dc->state == D_PREP) { __punch_discard_cmd(sbi, dc, blkaddr); } else { dc->ref++; need_wait = true; } } mutex_unlock(&dcc->cmd_lock); if (need_wait) __wait_one_discard_bio(sbi, dc); } void f2fs_stop_discard_thread(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (dcc && dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } } /** * f2fs_issue_discard_timeout() - Issue all discard cmd within UMOUNT_DISCARD_TIMEOUT * @sbi: the f2fs_sb_info data for discard cmd to issue * * When UMOUNT_DISCARD_TIMEOUT is exceeded, all remaining discard commands will be dropped * * Return true if issued all discard cmd or no discard cmd need issue, otherwise return false. */ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_policy dpolicy; bool dropped; if (!atomic_read(&dcc->discard_cmd_cnt)) return true; __init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity); __issue_discard_cmd(sbi, &dpolicy); dropped = __drop_discard_cmd(sbi); /* just to make sure there is no pending discard commands */ __wait_all_discard_cmd(sbi, NULL); f2fs_bug_on(sbi, atomic_read(&dcc->discard_cmd_cnt)); return !dropped; } static int issue_discard_thread(void *data) { struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; wait_queue_head_t *q = &dcc->discard_wait_queue; struct discard_policy dpolicy; unsigned int wait_ms = dcc->min_discard_issue_time; int issued; set_freezable(); do { wait_event_freezable_timeout(*q, kthread_should_stop() || dcc->discard_wake, msecs_to_jiffies(wait_ms)); if (sbi->gc_mode == GC_URGENT_HIGH || !f2fs_available_free_memory(sbi, DISCARD_CACHE)) __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, MIN_DISCARD_GRANULARITY); else __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, dcc->discard_granularity); if (dcc->discard_wake) dcc->discard_wake = false; /* clean up pending candidates before going to sleep */ if (atomic_read(&dcc->queued_discard)) __wait_all_discard_cmd(sbi, NULL); if (f2fs_readonly(sbi->sb)) continue; if (kthread_should_stop()) return 0; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || !atomic_read(&dcc->discard_cmd_cnt)) { wait_ms = dpolicy.max_interval; continue; } sb_start_intwrite(sbi->sb); issued = __issue_discard_cmd(sbi, &dpolicy); if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; } else { wait_ms = dpolicy.max_interval; } if (!atomic_read(&dcc->discard_cmd_cnt)) wait_ms = dpolicy.max_interval; sb_end_intwrite(sbi->sb); } while (!kthread_should_stop()); return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; block_t lblkstart = blkstart; int devi = 0; u64 remainder = 0; if (f2fs_is_multi_device(sbi)) { devi = f2fs_target_device_index(sbi, blkstart); if (blkstart < FDEV(devi).start_blk || blkstart > FDEV(devi).end_blk) { f2fs_err(sbi, "Invalid block %x", blkstart); return -EIO; } blkstart -= FDEV(devi).start_blk; } /* For sequential zones, reset the zone write pointer */ if (f2fs_blkz_is_seq(sbi, devi, blkstart)) { sector = SECTOR_FROM_BLOCK(blkstart); nr_sects = SECTOR_FROM_BLOCK(blklen); div64_u64_rem(sector, bdev_zone_sectors(bdev), &remainder); if (remainder || nr_sects != bdev_zone_sectors(bdev)) { f2fs_err(sbi, "(%d) %s: Unaligned zone reset attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path : "", blkstart, blklen); return -EIO; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { unsigned int nofs_flags; int ret; trace_f2fs_issue_reset_zone(bdev, blkstart); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, nr_sects); memalloc_nofs_restore(nofs_flags); return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); return 0; } /* For conventional zones, use regular discard if supported */ __queue_discard_cmd(sbi, bdev, lblkstart, blklen); return 0; } #endif static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_has_blkzoned(sbi) && bdev_is_zoned(bdev)) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif __queue_discard_cmd(sbi, bdev, blkstart, blklen); return 0; } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { sector_t start = blkstart, len = 0; struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; int err = 0; bdev = f2fs_target_device(sbi, blkstart, NULL); for (i = blkstart; i < blkstart + blklen; i++, len++) { if (i != start) { struct block_device *bdev2 = f2fs_target_device(sbi, i, NULL); if (bdev2 != bdev) { err = __issue_discard_async(sbi, bdev, start, len); if (err) return err; bdev = bdev2; start = i; len = 0; } } se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } if (len) err = __issue_discard_async(sbi, bdev, start, len); return err; } static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason & CP_DISCARD); struct discard_entry *de = NULL; struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == BLKS_PER_SEG(sbi) || !f2fs_hw_support_discard(sbi) || !f2fs_block_unit_discard(sbi)) return false; if (!force) { if (!f2fs_realtime_discard_enable(sbi) || (!se->valid_blocks && !IS_CURSEG(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->dcc_info->nr_discards <= SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, BLKS_PER_SEG(sbi), end + 1); if (start >= BLKS_PER_SEG(sbi)) break; end = __find_rev_next_zero_bit(dmap, BLKS_PER_SEG(sbi), start + 1); if (force && start && end != BLKS_PER_SEG(sbi) && (end - start) < cpc->trim_minlen) continue; if (check_only) return true; if (!de) { de = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_F2FS_ZERO, true, NULL); de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); list_add_tail(&de->list, head); } for (i = start; i < end; i++) __set_bit_le(i, (void *)de->discard_map); SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } static void release_discard_addr(struct discard_entry *entry) { list_del(&entry->list); kmem_cache_free(discard_entry_slab, entry); } void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi) { struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ list_for_each_entry_safe(entry, this, head, list) release_discard_addr(entry); } /* * Should call f2fs_clear_prefree_segments after checkpoint is done. */ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; mutex_lock(&dirty_i->seglist_lock); for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) __set_test_and_free(sbi, segno, false); mutex_unlock(&dirty_i->seglist_lock); } void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason & CP_DISCARD); bool section_alignment = F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION; if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) section_alignment = true; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; if (section_alignment && end != -1) end--; start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); if (start >= MAIN_SEGS(sbi)) break; end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), start + 1); if (section_alignment) { start = rounddown(start, SEGS_PER_SEC(sbi)); end = roundup(end, SEGS_PER_SEC(sbi)); } for (i = start; i < end; i++) { if (test_and_clear_bit(i, prefree_map)) dirty_i->nr_dirty[PRE]--; } if (!f2fs_realtime_discard_enable(sbi)) continue; if (force && start >= cpc->trim_start && (end - 1) <= cpc->trim_end) continue; /* Should cover 2MB zoned device for zone-based reset */ if (!f2fs_sb_has_blkzoned(sbi) && (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi))) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), SEGS_TO_BLKS(sbi, end - start)); continue; } next: secno = GET_SEC_FROM_SEG(sbi, start); start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), BLKS_PER_SEC(sbi)); start = start_segno + SEGS_PER_SEC(sbi); if (start < end) goto next; else end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); if (!f2fs_block_unit_discard(sbi)) goto wakeup; /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { unsigned int cur_pos = 0, next_pos, len, total_len = 0; bool is_valid = test_bit_le(0, entry->discard_map); find_next: if (is_valid) { next_pos = find_next_zero_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); len = next_pos - cur_pos; if (f2fs_sb_has_blkzoned(sbi) || (force && len < cpc->trim_minlen)) goto skip; f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, len); total_len += len; } else { next_pos = find_next_bit_le(entry->discard_map, BLKS_PER_SEG(sbi), cur_pos); } skip: cur_pos = next_pos; is_valid = !is_valid; if (cur_pos < BLKS_PER_SEG(sbi)) goto find_next; release_discard_addr(entry); dcc->nr_discards -= total_len; } wakeup: wake_up_discard_thread(sbi, false); } int f2fs_start_discard_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; int err = 0; if (f2fs_sb_has_readonly(sbi)) { f2fs_info(sbi, "Skip to start discard thread for readonly image"); return 0; } if (!f2fs_realtime_discard_enable(sbi)) return 0; dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(dcc->f2fs_issue_discard)) { err = PTR_ERR(dcc->f2fs_issue_discard); dcc->f2fs_issue_discard = NULL; } return err; } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc; int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; goto init_thread; } dcc = f2fs_kzalloc(sbi, sizeof(struct discard_cmd_control), GFP_KERNEL); if (!dcc) return -ENOMEM; dcc->discard_io_aware_gran = MAX_PLIST_NUM; dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEG(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) INIT_LIST_HEAD(&dcc->pend_list[i]); INIT_LIST_HEAD(&dcc->wait_list); INIT_LIST_HEAD(&dcc->fstrim_list); mutex_init(&dcc->cmd_lock); atomic_set(&dcc->issued_discard, 0); atomic_set(&dcc->queued_discard, 0); atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; dcc->max_discards = SEGS_TO_BLKS(sbi, MAIN_SEGS(sbi)); dcc->max_discard_request = DEF_MAX_DISCARD_REQUEST; dcc->min_discard_issue_time = DEF_MIN_DISCARD_ISSUE_TIME; dcc->mid_discard_issue_time = DEF_MID_DISCARD_ISSUE_TIME; dcc->max_discard_issue_time = DEF_MAX_DISCARD_ISSUE_TIME; dcc->discard_urgent_util = DEF_DISCARD_URGENT_UTIL; dcc->undiscard_blks = 0; dcc->next_pos = 0; dcc->root = RB_ROOT_CACHED; dcc->rbtree_check = false; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; init_thread: err = f2fs_start_discard_thread(sbi); if (err) { kfree(dcc); SM_I(sbi)->dcc_info = NULL; } return err; } static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; if (!dcc) return; f2fs_stop_discard_thread(sbi); /* * Recovery can cache discard commands, so in error path of * fill_super(), it needs to give a chance to handle them. */ f2fs_issue_discard_timeout(sbi); kfree(dcc); SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; return false; } return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); } static inline unsigned long long get_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr) { unsigned int segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return 0; return get_seg_entry(sbi, segno)->mtime; } static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned long long old_mtime) { struct seg_entry *se; unsigned int segno = GET_SEGNO(sbi, blkaddr); unsigned long long ctime = get_mtime(sbi, false); unsigned long long mtime = old_mtime ? old_mtime : ctime; if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); if (!se->mtime) se->mtime = mtime; else se->mtime = div_u64(se->mtime * se->valid_blocks + mtime, se->valid_blocks + 1); if (ctime > SIT_I(sbi)->max_mtime) SIT_I(sbi)->max_mtime = ctime; } /* * NOTE: when updating multiple blocks at the same time, please ensure * that the consecutive input blocks belong to the same segment. */ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, unsigned int segno, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif int i; int del_count = -del; f2fs_bug_on(sbi, GET_SEGNO(sbi, blkaddr) != GET_SEGNO(sbi, blkaddr + del_count - 1)); for (i = 0; i < del_count; i++) { exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_clear_bit(offset + i, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when clearing bitmap, blk:%u, old bit:%d", blkaddr + i, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(!exist)) { f2fs_err(sbi, "Bitmap was wrongly cleared, blk:%u", blkaddr + i); f2fs_bug_on(sbi, 1); se->valid_blocks++; del += 1; } else if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { /* * If checkpoints are off, we must not reuse data that * was used in the previous checkpoint. If it was used * before, we must track that to know how much space we * really have. */ if (f2fs_test_bit(offset + i, se->ckpt_valid_map)) { spin_lock(&sbi->stat_lock); sbi->unusable_block_count++; spin_unlock(&sbi->stat_lock); } } if (f2fs_block_unit_discard(sbi) && f2fs_test_and_clear_bit(offset + i, se->discard_map)) sbi->discard_blks++; if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) { se->ckpt_valid_blocks -= 1; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->ckpt_valid_blocks -= 1; } } if (__is_large_section(sbi)) sanity_check_valid_blocks(sbi, segno); return del; } static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se, unsigned int segno, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS bool mir_exist; #endif exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS mir_exist = f2fs_test_and_set_bit(offset, se->cur_valid_map_mir); if (unlikely(exist != mir_exist)) { f2fs_err(sbi, "Inconsistent error when setting bitmap, blk:%u, old bit:%d", blkaddr, exist); f2fs_bug_on(sbi, 1); } #endif if (unlikely(exist)) { f2fs_err(sbi, "Bitmap was wrongly set, blk:%u", blkaddr); f2fs_bug_on(sbi, 1); se->valid_blocks--; del = 0; } if (f2fs_block_unit_discard(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; /* * SSR should never reuse block which is checkpointed * or newly invalidated. */ if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) { se->ckpt_valid_blocks++; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->ckpt_valid_blocks++; } } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) { se->ckpt_valid_blocks += del; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->ckpt_valid_blocks += del; } if (__is_large_section(sbi)) sanity_check_valid_blocks(sbi, segno); return del; } /* * If releasing blocks, this function supports updating multiple consecutive blocks * at one time, but please note that these consecutive blocks need to belong to the * same segment. */ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) { struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; segno = GET_SEGNO(sbi, blkaddr); if (segno == NULL_SEGNO) return; se = get_seg_entry(sbi, segno); new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); f2fs_bug_on(sbi, (new_vblocks < 0 || (new_vblocks > f2fs_usable_blks_in_seg(sbi, segno)))); se->valid_blocks = new_vblocks; /* Update valid block bitmap */ if (del > 0) { del = update_sit_entry_for_alloc(sbi, se, segno, blkaddr, offset, del); } else { del = update_sit_entry_for_release(sbi, se, segno, blkaddr, offset, del); } __mark_sit_entry_dirty(sbi, segno); /* update total number of valid blocks to be written in ckpt area */ SIT_I(sbi)->written_valid_blocks += del; if (__is_large_section(sbi)) get_sec_entry(sbi, segno)->valid_blocks += del; } void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr, unsigned int len) { unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); block_t addr_start = addr, addr_end = addr + len - 1; unsigned int seg_num = GET_SEGNO(sbi, addr_end) - segno + 1; unsigned int i = 1, max_blocks = sbi->blocks_per_seg, cnt; f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR || addr == COMPRESS_ADDR) return; f2fs_invalidate_internal_cache(sbi, addr, len); /* add it into sit main buffer */ down_write(&sit_i->sentry_lock); if (seg_num == 1) cnt = len; else cnt = max_blocks - GET_BLKOFF_FROM_SEG0(sbi, addr); do { update_segment_mtime(sbi, addr_start, 0); update_sit_entry(sbi, addr_start, -cnt); /* add it into dirty seglist */ locate_dirty_segment(sbi, segno); /* update @addr_start and @cnt and @segno */ addr_start = START_BLOCK(sbi, ++segno); if (++i == seg_num) cnt = GET_BLKOFF_FROM_SEG0(sbi, addr_end) + 1; else cnt = max_blocks; } while (i <= seg_num); up_write(&sit_i->sentry_lock); } bool f2fs_is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno, offset; struct seg_entry *se; bool is_cp = false; if (!__is_valid_data_blkaddr(blkaddr)) return true; down_read(&sit_i->sentry_lock); segno = GET_SEGNO(sbi, blkaddr); se = get_seg_entry(sbi, segno); offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); if (f2fs_test_bit(offset, se->ckpt_valid_map)) is_cp = true; up_read(&sit_i->sentry_lock); return is_cp; } static unsigned short f2fs_curseg_valid_blocks(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (sbi->ckpt->alloc_type[type] == SSR) return BLKS_PER_SEG(sbi); return curseg->next_blkoff; } /* * Calculate the number of current summary pages for writing */ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) { int valid_sum_count = 0; int i, sum_in_page; for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { if (sbi->ckpt->alloc_type[i] != SSR && for_ra) valid_sum_count += le16_to_cpu(F2FS_CKPT(sbi)->cur_data_blkoff[i]); else valid_sum_count += f2fs_curseg_valid_blocks(sbi, i); } sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } /* * Caller should put this summary folio */ struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno) { if (unlikely(f2fs_cp_error(sbi))) return ERR_PTR(-EIO); return f2fs_get_meta_folio_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); memcpy(folio_address(folio), src, PAGE_SIZE); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } static void write_sum_page(struct f2fs_sb_info *sbi, struct f2fs_summary_block *sum_blk, block_t blk_addr) { f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); } static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; dst = folio_address(folio); memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); down_read(&curseg->journal_rwsem); memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); up_read(&curseg->journal_rwsem); memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); mutex_unlock(&curseg->curseg_mutex); folio_mark_dirty(folio); f2fs_folio_put(folio, true); } static int is_next_segment_free(struct f2fs_sb_info *sbi, struct curseg_info *curseg) { unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); if (segno < MAIN_SEGS(sbi) && segno % SEGS_PER_SEC(sbi)) return !test_bit(segno, free_i->free_segmap); return 0; } /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG */ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, bool pinning) { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); bool init = true; int i; int ret = 0; spin_lock(&free_i->segmap_lock); if (time_to_inject(sbi, FAULT_NO_SEGMENT)) { ret = -ENOSPC; goto out_unlock; } if (!new_sec && ((*newseg + 1) % SEGS_PER_SEC(sbi))) { segno = find_next_zero_bit(free_i->free_segmap, GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } #ifdef CONFIG_BLK_DEV_ZONED /* * If we format f2fs on zoned storage, let's try to get pinned sections * from beginning of the storage, which should be a conventional one. */ if (f2fs_sb_has_blkzoned(sbi)) { /* Prioritize writing to conventional zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else segno = max(sbi->first_seq_zone_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); #ifdef CONFIG_BLK_DEV_ZONED if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { hint = GET_SEC_FROM_SEG(sbi, sbi->first_seq_zone_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; f2fs_bug_on(sbi, 1); goto out_unlock; } } #endif if (secno >= MAIN_SECS(sbi)) { secno = find_first_zero_bit(free_i->free_secmap, MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; f2fs_bug_on(sbi, !pinning); goto out_unlock; } } segno = GET_SEG_FROM_SEC(sbi, secno); zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) goto got_it; if (sbi->secs_per_zone == 1) goto got_it; if (zoneno == old_zoneno) goto got_it; for (i = 0; i < NR_CURSEG_TYPE; i++) if (CURSEG_I(sbi, i)->zone == zoneno) break; if (i < NR_CURSEG_TYPE) { /* zone is in user, try another */ if (zoneno + 1 >= total_zones) hint = 0; else hint = (zoneno + 1) * sbi->secs_per_zone; init = false; goto find_other_zone; } got_it: /* set it as dirty segment in free segmap */ if (test_bit(segno, free_i->free_segmap)) { ret = -EFSCORRUPTED; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_FREE_BITMAP); goto out_unlock; } /* no free section in conventional device or conventional zone */ if (new_sec && pinning && f2fs_is_sequential_zone_area(sbi, START_BLOCK(sbi, segno))) { ret = -EAGAIN; goto out_unlock; } __set_inuse(sbi, segno); *newseg = segno; out_unlock: spin_unlock(&free_i->segmap_lock); if (ret == -ENOSPC && !pinning) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) { struct curseg_info *curseg = CURSEG_I(sbi, type); struct summary_footer *sum_footer; unsigned short seg_type = curseg->seg_type; /* only happen when get_new_segment() fails */ if (curseg->next_segno == NULL_SEGNO) return; curseg->inited = true; curseg->segno = curseg->next_segno; curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; sum_footer = &(curseg->sum_blk->footer); memset(sum_footer, 0, sizeof(struct summary_footer)); sanity_check_seg_type(sbi, seg_type); if (IS_DATASEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_DATA); if (IS_NODESEG(seg_type)) SET_SUM_TYPE(sum_footer, SUM_TYPE_NODE); __set_sit_entry_type(sbi, seg_type, curseg->segno, modified); } static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); if (__is_large_section(sbi)) { if (f2fs_need_rand_seg(sbi)) { unsigned int hint = GET_SEC_FROM_SEG(sbi, curseg->segno); if (GET_SEC_FROM_SEG(sbi, curseg->segno + 1) != hint) return curseg->segno; return get_random_u32_inclusive(curseg->segno + 1, GET_SEG_FROM_SEC(sbi, hint + 1) - 1); } return curseg->segno; } else if (f2fs_need_rand_seg(sbi)) { return get_random_u32_below(MAIN_SECS(sbi) * SEGS_PER_SEC(sbi)); } /* inmem log may not locate on any segment after mount */ if (!curseg->inited) return 0; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return 0; if (seg_type == CURSEG_HOT_DATA || IS_NODESEG(seg_type)) return 0; if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) return SIT_I(sbi)->last_victim[ALLOC_NEXT]; /* find segments from 0 to reuse freed segments */ if (F2FS_OPTION(sbi).alloc_mode == ALLOC_MODE_REUSE) return 0; return curseg->segno; } static void reset_curseg_fields(struct curseg_info *curseg) { curseg->inited = false; curseg->segno = NULL_SEGNO; curseg->next_segno = 0; } /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. */ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno = curseg->segno; bool pinning = type == CURSEG_COLD_DATA_PINNED; int ret; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); segno = __get_next_segno(sbi, type); ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) reset_curseg_fields(curseg); return ret; } curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) curseg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); return 0; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, int segno, block_t start) { struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; return __find_rev_next_zero_bit(target_map, BLKS_PER_SEG(sbi), start); } static int f2fs_find_next_ssr_block(struct f2fs_sb_info *sbi, struct curseg_info *seg) { return __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) { return __next_free_blkoff(sbi, segno, 0) < BLKS_PER_SEG(sbi); } /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ static int change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int new_segno = curseg->next_segno; struct f2fs_summary_block *sum_node; struct folio *sum_folio; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); __set_test_and_inuse(sbi, new_segno); mutex_lock(&dirty_i->seglist_lock); __remove_dirty_segment(sbi, new_segno, PRE); __remove_dirty_segment(sbi, new_segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_folio = f2fs_get_sum_folio(sbi, new_segno); if (IS_ERR(sum_folio)) { /* GC won't be able to use stale summary pages by cp_error */ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); return PTR_ERR(sum_folio); } sum_node = folio_address(sum_folio); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_folio_put(sum_folio, true); return 0; } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age); static int get_atssr_segment(struct f2fs_sb_info *sbi, int type, int target_type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); int ret = 0; curseg->seg_type = target_type; if (get_ssr_segment(sbi, type, alloc_mode, age)) { struct seg_entry *se = get_seg_entry(sbi, curseg->next_segno); curseg->seg_type = se->type; ret = change_curseg(sbi, type); } else { /* allocate cold segment by default */ curseg->seg_type = CURSEG_COLD_DATA; ret = new_curseg(sbi, type, true); } stat_inc_seg_type(sbi, curseg); return ret; } static int __f2fs_init_atgc_curseg(struct f2fs_sb_info *sbi, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_ALL_DATA_ATGC); int ret = 0; if (!sbi->am.atgc_enabled && !force) return 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); ret = get_atssr_segment(sbi, CURSEG_ALL_DATA_ATGC, CURSEG_COLD_DATA, SSR, 0); up_write(&SIT_I(sbi)->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi) { return __f2fs_init_atgc_curseg(sbi, false); } int f2fs_reinit_atgc_curseg(struct f2fs_sb_info *sbi) { int ret; if (!test_opt(sbi, ATGC)) return 0; if (sbi->am.atgc_enabled) return 0; if (le64_to_cpu(F2FS_CKPT(sbi)->elapsed_time) < sbi->am.age_threshold) return 0; ret = __f2fs_init_atgc_curseg(sbi, true); if (!ret) { sbi->am.atgc_enabled = true; f2fs_info(sbi, "reenabled age threshold GC"); } return ret; } static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) { write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); } else { mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_free(sbi, curseg->segno, true); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); } out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_save_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_save_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static void __f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); if (!curseg->inited) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) goto out; mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_inuse(sbi, curseg->segno); mutex_unlock(&DIRTY_I(sbi)->seglist_lock); out: mutex_unlock(&curseg->curseg_mutex); } void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi) { __f2fs_restore_inmem_curseg(sbi, CURSEG_COLD_DATA_PINNED); if (sbi->am.atgc_enabled) __f2fs_restore_inmem_curseg(sbi, CURSEG_ALL_DATA_ATGC); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type, int alloc_mode, unsigned long long age) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned segno = NULL_SEGNO; unsigned short seg_type = curseg->seg_type; int i, cnt; bool reversed = false; sanity_check_seg_type(sbi, seg_type); /* f2fs_need_SSR() already forces to do this */ if (!f2fs_get_victim(sbi, &segno, BG_GC, seg_type, alloc_mode, age, false)) { curseg->next_segno = segno; return 1; } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(seg_type)) { if (seg_type >= CURSEG_WARM_NODE) { reversed = true; i = CURSEG_COLD_NODE; } else { i = CURSEG_HOT_NODE; } cnt = NR_CURSEG_NODE_TYPE; } else { if (seg_type >= CURSEG_WARM_DATA) { reversed = true; i = CURSEG_COLD_DATA; } else { i = CURSEG_HOT_DATA; } cnt = NR_CURSEG_DATA_TYPE; } for (; cnt-- > 0; reversed ? i-- : i++) { if (i == seg_type) continue; if (!f2fs_get_victim(sbi, &segno, BG_GC, i, alloc_mode, age, false)) { curseg->next_segno = segno; return 1; } } /* find valid_blocks=0 in dirty list */ if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { segno = get_free_segment(sbi); if (segno != NULL_SEGNO) { curseg->next_segno = segno; return 1; } } return 0; } static bool need_new_seg(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && curseg->seg_type == CURSEG_WARM_NODE) return true; if (curseg->alloc_type == LFS && is_next_segment_free(sbi, curseg) && likely(!is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return true; if (!f2fs_need_SSR(sbi) || !get_ssr_segment(sbi, type, SSR, 0)) return true; return false; } int f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int segno; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&SIT_I(sbi)->sentry_lock); segno = CURSEG_I(sbi, type)->segno; if (segno < start || segno > end) goto unlock; if (f2fs_need_SSR(sbi) && get_ssr_segment(sbi, type, SSR, 0)) ret = change_curseg(sbi, type); else ret = new_curseg(sbi, type, true); stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, segno); unlock: up_write(&SIT_I(sbi)->sentry_lock); if (segno != curseg->segno) f2fs_notice(sbi, "For resize: curseg of type %d: %u ==> %u", type, segno, curseg->segno); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } static int __allocate_new_segment(struct f2fs_sb_info *sbi, int type, bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; int err = 0; if (type == CURSEG_COLD_DATA_PINNED && !curseg->inited) goto allocate; if (!force && curseg->inited && !curseg->next_blkoff && !get_valid_blocks(sbi, curseg->segno, new_sec) && !get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) return 0; allocate: old_segno = curseg->segno; err = new_curseg(sbi, type, true); if (err) return err; stat_inc_seg_type(sbi, curseg); locate_dirty_segment(sbi, old_segno); return 0; } int f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { int ret; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); ret = __allocate_new_segment(sbi, type, true, force); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } int f2fs_allocate_pinning_section(struct f2fs_sb_info *sbi) { int err; bool gc_required = true; retry: f2fs_lock_op(sbi); err = f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); if (f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1, true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); gc_required = false; if (!err) goto retry; } return err; } int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; int err = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) err += __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); f2fs_up_read(&SM_I(sbi)->curseg_lock); return err; } bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; bool has_candidate = false; down_write(&SIT_I(sbi)->sentry_lock); for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { if (add_discard_addrs(sbi, cpc, true)) { has_candidate = true; break; } } up_write(&SIT_I(sbi)->sentry_lock); cpc->trim_start = trim_start; return has_candidate; } static unsigned int __issue_discard_cmd_range(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, unsigned int start, unsigned int end) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct discard_cmd *prev_dc = NULL, *next_dc = NULL; struct rb_node **insert_p = NULL, *insert_parent = NULL; struct discard_cmd *dc; struct blk_plug plug; int issued; unsigned int trimmed = 0; next: issued = 0; mutex_lock(&dcc->cmd_lock); if (unlikely(dcc->rbtree_check)) f2fs_bug_on(sbi, !f2fs_check_discard_tree(sbi)); dc = __lookup_discard_cmd_ret(&dcc->root, start, &prev_dc, &next_dc, &insert_p, &insert_parent); if (!dc) dc = next_dc; blk_start_plug(&plug); while (dc && dc->di.lstart <= end) { struct rb_node *node; int err = 0; if (dc->di.len < dpolicy->granularity) goto skip; if (dc->state != D_PREP) { list_move_tail(&dc->list, &dcc->fstrim_list); goto skip; } err = __submit_discard_cmd(sbi, dpolicy, dc, &issued); if (issued >= dpolicy->max_requests) { start = dc->di.lstart + dc->di.len; if (err) __remove_discard_cmd(sbi, dc); blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); goto next; } skip: node = rb_next(&dc->rb_node); if (err) __remove_discard_cmd(sbi, dc); dc = rb_entry_safe(node, struct discard_cmd, rb_node); if (fatal_signal_pending(current)) break; } blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); return trimmed; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; block_t start_block, end_block; struct cp_control cpc; struct discard_policy dpolicy; unsigned long long trimmed = 0; int err = 0; bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi); if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; if (end < MAIN_BLKADDR(sbi)) goto out; if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { f2fs_warn(sbi, "Found FS corruption, run fsck to fix."); return -EFSCORRUPTED; } /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); if (need_align) { start_segno = rounddown(start_segno, SEGS_PER_SEC(sbi)); end_segno = roundup(end_segno + 1, SEGS_PER_SEC(sbi)) - 1; } cpc.reason = CP_DISCARD; cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); cpc.trim_start = start_segno; cpc.trim_end = end_segno; if (sbi->discard_blks == 0) goto out; f2fs_down_write(&sbi->gc_lock); stat_inc_cp_call_count(sbi, TOTAL_CALL); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); if (err) goto out; /* * We filed discard candidates, but actually we don't need to wait for * all of them, since they'll be issued in idle time along with runtime * discard option. User configuration looks like using runtime discard * or periodic fstrim instead of it. */ if (f2fs_realtime_discard_enable(sbi)) goto out; start_block = START_BLOCK(sbi, start_segno); end_block = START_BLOCK(sbi, end_segno + 1); __init_discard_policy(sbi, &dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen); trimmed = __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block); trimmed += __wait_discard_cmd_range(sbi, &dpolicy, start_block, end_block); out: if (!err) range->len = F2FS_BLK_TO_BYTES(trimmed); return err; } int f2fs_rw_hint_to_seg_type(struct f2fs_sb_info *sbi, enum rw_hint hint) { if (F2FS_OPTION(sbi).active_logs == 2) return CURSEG_HOT_DATA; else if (F2FS_OPTION(sbi).active_logs == 4) return CURSEG_COLD_DATA; /* active_log == 6 */ switch (hint) { case WRITE_LIFE_SHORT: return CURSEG_HOT_DATA; case WRITE_LIFE_EXTREME: return CURSEG_COLD_DATA; default: return CURSEG_WARM_DATA; } } /* * This returns write hints for each segment type. This hints will be * passed down to block layer as below by default. * * User F2FS Block * ---- ---- ----- * META WRITE_LIFE_NONE|REQ_META * HOT_NODE WRITE_LIFE_NONE * WARM_NODE WRITE_LIFE_MEDIUM * COLD_NODE WRITE_LIFE_LONG * ioctl(COLD) COLD_DATA WRITE_LIFE_EXTREME * extension list " " * * -- buffered io * COLD_DATA WRITE_LIFE_EXTREME * HOT_DATA WRITE_LIFE_SHORT * WARM_DATA WRITE_LIFE_NOT_SET * * -- direct io * WRITE_LIFE_EXTREME COLD_DATA WRITE_LIFE_EXTREME * WRITE_LIFE_SHORT HOT_DATA WRITE_LIFE_SHORT * WRITE_LIFE_NOT_SET WARM_DATA WRITE_LIFE_NOT_SET * WRITE_LIFE_NONE " WRITE_LIFE_NONE * WRITE_LIFE_MEDIUM " WRITE_LIFE_MEDIUM * WRITE_LIFE_LONG " WRITE_LIFE_LONG */ enum rw_hint f2fs_io_type_to_rw_hint(struct f2fs_sb_info *sbi, enum page_type type, enum temp_type temp) { switch (type) { case DATA: switch (temp) { case WARM: return WRITE_LIFE_NOT_SET; case HOT: return WRITE_LIFE_SHORT; case COLD: return WRITE_LIFE_EXTREME; default: return WRITE_LIFE_NONE; } case NODE: switch (temp) { case WARM: return WRITE_LIFE_MEDIUM; case HOT: return WRITE_LIFE_NONE; case COLD: return WRITE_LIFE_LONG; default: return WRITE_LIFE_NONE; } case META: return WRITE_LIFE_NONE; default: return WRITE_LIFE_NONE; } } static int __get_segment_type_2(struct f2fs_io_info *fio) { if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } static int __get_segment_type_4(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio_inode(fio); if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_info ei = {}; if (f2fs_lookup_age_extent_cache(inode, pgofs, &ei)) { if (!ei.age) return NO_CHECK_TYPE; if (ei.age <= sbi->hot_data_age_threshold) return CURSEG_HOT_DATA; if (ei.age <= sbi->warm_data_age_threshold) return CURSEG_WARM_DATA; return CURSEG_COLD_DATA; } return NO_CHECK_TYPE; } static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { struct inode *inode = fio_inode(fio); int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) return CURSEG_COLD_DATA_PINNED; if (page_private_gcing(fio->page)) { if (fio->sbi->am.atgc_enabled && (fio->io_type == FS_DATA_IO) && (fio->sbi->gc_mode != GC_URGENT_HIGH) && __is_valid_data_blkaddr(fio->old_blkaddr) && !is_inode_flag_set(inode, FI_OPU_WRITE)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; } if (file_is_cold(inode) || f2fs_need_compress_data(inode)) return CURSEG_COLD_DATA; type = __get_age_segment_type(inode, page_folio(fio->page)->index); if (type != NO_CHECK_TYPE) return type; if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || f2fs_is_cow_file(inode)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); } else { if (IS_DNODE(fio->page)) return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; return CURSEG_COLD_NODE; } } enum temp_type f2fs_get_segment_temp(struct f2fs_sb_info *sbi, enum log_type type) { struct curseg_info *curseg = CURSEG_I(sbi, type); enum temp_type temp = COLD; switch (curseg->seg_type) { case CURSEG_HOT_NODE: case CURSEG_HOT_DATA: temp = HOT; break; case CURSEG_WARM_NODE: case CURSEG_WARM_DATA: temp = WARM; break; case CURSEG_COLD_NODE: case CURSEG_COLD_DATA: temp = COLD; break; default: f2fs_bug_on(sbi, 1); } return temp; } static int __get_segment_type(struct f2fs_io_info *fio) { enum log_type type = CURSEG_HOT_DATA; switch (F2FS_OPTION(fio->sbi).active_logs) { case 2: type = __get_segment_type_2(fio); break; case 4: type = __get_segment_type_4(fio); break; case 6: type = __get_segment_type_6(fio); break; default: f2fs_bug_on(fio->sbi, true); } fio->temp = f2fs_get_segment_temp(fio->sbi, type); return type; } static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, struct curseg_info *seg) { /* To allocate block chunks in different sizes, use random number */ if (--seg->fragment_remained_chunk > 0) return; seg->fragment_remained_chunk = get_random_u32_inclusive(1, sbi->max_fragment_chunk); seg->next_blkoff += get_random_u32_inclusive(1, sbi->max_fragment_hole); } int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned long long old_mtime; bool from_gc = (type == CURSEG_ALL_DATA_ATGC); struct seg_entry *se = NULL; bool segment_full = false; int ret = 0; f2fs_down_read(&SM_I(sbi)->curseg_lock); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); if (curseg->segno == NULL_SEGNO) { ret = -ENOSPC; goto out_err; } if (from_gc) { f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO); se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr)); sanity_check_seg_type(sbi, se->type); f2fs_bug_on(sbi, IS_NODESEG(se->type)); } *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); f2fs_bug_on(sbi, curseg->next_blkoff >= BLKS_PER_SEG(sbi)); f2fs_wait_discard_bio(sbi, *new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (curseg->alloc_type == SSR) { curseg->next_blkoff = f2fs_find_next_ssr_block(sbi, curseg); } else { curseg->next_blkoff++; if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) f2fs_randomize_chunk(sbi, curseg); } if (curseg->next_blkoff >= f2fs_usable_blks_in_seg(sbi, curseg->segno)) segment_full = true; stat_inc_block_count(sbi, curseg); if (from_gc) { old_mtime = get_segment_mtime(sbi, old_blkaddr); } else { update_segment_mtime(sbi, old_blkaddr, 0); old_mtime = 0; } update_segment_mtime(sbi, *new_blkaddr, old_mtime); /* * SIT information should be updated before segment allocation, * since SSR needs latest valid block information. */ update_sit_entry(sbi, *new_blkaddr, 1); update_sit_entry(sbi, old_blkaddr, -1); /* * If the current segment is full, flush it out and replace it with a * new segment. */ if (segment_full) { if (type == CURSEG_COLD_DATA_PINNED && !((curseg->segno + 1) % sbi->segs_per_sec)) { write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); reset_curseg_fields(curseg); goto skip_new_segment; } if (from_gc) { ret = get_atssr_segment(sbi, type, se->type, AT_SSR, se->mtime); } else { if (need_new_seg(sbi, type)) ret = new_curseg(sbi, type, false); else ret = change_curseg(sbi, type); stat_inc_seg_type(sbi, curseg); } if (ret) goto out_err; } skip_new_segment: /* * segment dirty status should be updated after segment allocation, * so we just need to update status only one time after previous * segment being closed. */ locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); if (IS_DATASEG(curseg->seg_type)) atomic64_inc(&sbi->allocated_data_blocks); up_write(&sit_i->sentry_lock); if (page && IS_NODESEG(curseg->seg_type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); f2fs_inode_chksum_set(sbi, page); } if (fio) { struct f2fs_bio_info *io; INIT_LIST_HEAD(&fio->list); fio->in_list = 1; io = sbi->write_io[fio->type] + fio->temp; spin_lock(&io->io_lock); list_add_tail(&fio->list, &io->io_list); spin_unlock(&io->io_lock); } mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return 0; out_err: *new_blkaddr = NULL_ADDR; up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_read(&SM_I(sbi)->curseg_lock); return ret; } void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt) { if (!f2fs_is_multi_device(sbi)) return; while (1) { unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; /* update device state for fsync */ f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); /* update device state for checkpoint */ if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { spin_lock(&sbi->dev_lock); f2fs_set_bit(devidx, (char *)&sbi->dirty_device); spin_unlock(&sbi->dev_lock); } if (blkcnt <= blks) break; blkcnt -= blks; blkaddr += blks; } } static int log_type_to_seg_type(enum log_type type) { int seg_type = CURSEG_COLD_DATA; switch (type) { case CURSEG_HOT_DATA: case CURSEG_WARM_DATA: case CURSEG_COLD_DATA: case CURSEG_HOT_NODE: case CURSEG_WARM_NODE: case CURSEG_COLD_NODE: seg_type = (int)type; break; case CURSEG_COLD_DATA_PINNED: case CURSEG_ALL_DATA_ATGC: seg_type = CURSEG_COLD_DATA; break; default: break; } return seg_type; } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { struct folio *folio = page_folio(fio->page); enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, folio); goto out; } if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr, 1); /* writeout dirty page into bdev */ f2fs_submit_page_write(fio); f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); out: if (keep_order) f2fs_up_read(&fio->sbi->io_order_lock); } void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, .type = META, .temp = HOT, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = folio->index, .new_blkaddr = folio->index, .page = folio_page(folio, 0), .encrypted_page = NULL, .in_list = 0, }; if (unlikely(folio->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; folio_start_writeback(folio); f2fs_submit_page_write(&fio); stat_inc_meta_count(sbi, folio->index); f2fs_update_iostat(sbi, NULL, io_type, F2FS_BLKSIZE); } void f2fs_do_write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); f2fs_update_iostat(fio->sbi, NULL, fio->io_type, F2FS_BLKSIZE); } void f2fs_outplace_write_data(struct dnode_of_data *dn, struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); if (fio->io_type == FS_DATA_IO || fio->io_type == FS_CP_DATA_IO) f2fs_update_age_extent_cache(dn); set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); f2fs_update_iostat(sbi, dn->inode, fio->io_type, F2FS_BLKSIZE); } int f2fs_inplace_write_data(struct f2fs_io_info *fio) { int err; struct f2fs_sb_info *sbi = fio->sbi; unsigned int segno; fio->new_blkaddr = fio->old_blkaddr; /* i/o temperature is needed for passing down write hints */ __get_segment_type(fio); segno = GET_SEGNO(sbi, fio->new_blkaddr); if (!IS_DATASEG(get_seg_entry(sbi, segno)->type)) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); goto drop_bio; } if (f2fs_cp_error(sbi)) { err = -EIO; goto drop_bio; } if (fio->meta_gc) f2fs_truncate_meta_inode_pages(sbi, fio->new_blkaddr, 1); stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !IS_F2FS_IPU_NOCACHE(sbi)) err = f2fs_merge_page_bio(fio); else err = f2fs_submit_page_bio(fio); if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio_inode(fio), fio->io_type, F2FS_BLKSIZE); } return err; drop_bio: if (fio->bio && *(fio->bio)) { struct bio *bio = *(fio->bio); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); *(fio->bio) = NULL; } return err; } static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, unsigned int segno) { int i; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { if (CURSEG_I(sbi, i)->segno == segno) break; } return i; } void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr, bool from_gc) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; unsigned short old_blkoff; unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; f2fs_down_write(&SM_I(sbi)->curseg_lock); if (!recover_curseg) { /* for recovery flow */ if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { if (old_blkaddr == NULL_ADDR) type = CURSEG_COLD_DATA; else type = CURSEG_WARM_DATA; } } else { if (IS_CURSEG(sbi, segno)) { /* se->type is volatile as SSR allocation */ type = __f2fs_get_curseg(sbi, segno); f2fs_bug_on(sbi, type == NO_CHECK_TYPE); } else { type = CURSEG_WARM_DATA; } } curseg = CURSEG_I(sbi, type); f2fs_bug_on(sbi, !IS_DATASEG(curseg->seg_type)); mutex_lock(&curseg->curseg_mutex); down_write(&sit_i->sentry_lock); old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); curseg->sum_blk->entries[curseg->next_blkoff] = *sum; if (!recover_curseg || recover_newaddr) { if (!from_gc) update_segment_mtime(sbi, new_blkaddr, 0); update_sit_entry(sbi, new_blkaddr, 1); } if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) { f2fs_invalidate_internal_cache(sbi, old_blkaddr, 1); if (!from_gc) update_segment_mtime(sbi, old_blkaddr, 0); update_sit_entry(sbi, old_blkaddr, -1); } locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, new_blkaddr)); locate_dirty_segment(sbi, old_cursegno); if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; if (change_curseg(sbi, type)) goto out_unlock; } curseg->next_blkoff = old_blkoff; curseg->alloc_type = old_alloc_type; } out_unlock: up_write(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); f2fs_up_write(&SM_I(sbi)->curseg_lock); } void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, unsigned char version, bool recover_curseg, bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); f2fs_do_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg, recover_newaddr, false); f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, bool ordered, bool locked) { if (folio_test_writeback(folio)) { struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); /* submit cached IPU IO */ f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { folio_wait_writeback(folio); f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); } else { folio_wait_stable(folio); } } } void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct folio *cfolio; if (!f2fs_meta_inode_gc_required(inode)) return; if (!__is_valid_data_blkaddr(blkaddr)) return; cfolio = filemap_lock_folio(META_MAPPING(sbi), blkaddr); if (!IS_ERR(cfolio)) { f2fs_folio_wait_writeback(cfolio, DATA, true, true); f2fs_folio_put(cfolio, true); } } void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); block_t i; if (!f2fs_meta_inode_gc_required(inode)) return; for (i = 0; i < len; i++) f2fs_wait_on_block_writeback(inode, blkaddr + i); f2fs_truncate_meta_inode_pages(sbi, blkaddr, len); } static int read_compacted_summaries(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; unsigned char *kaddr; struct folio *folio; block_t start; int i, j, offset; start = start_sum_block(sbi); folio = f2fs_get_meta_folio(sbi, start++); if (IS_ERR(folio)) return PTR_ERR(folio); kaddr = folio_address(folio); /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blk_off; unsigned int segno; seg_i = CURSEG_I(sbi, i); segno = le32_to_cpu(ckpt->cur_data_segno[i]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[i]); seg_i->next_segno = segno; reset_curseg(sbi, i, 0); seg_i->alloc_type = ckpt->alloc_type[i]; seg_i->next_blkoff = blk_off; if (seg_i->alloc_type == SSR) blk_off = BLKS_PER_SEG(sbi); for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; f2fs_folio_put(folio, true); folio = f2fs_get_meta_folio(sbi, start++); if (IS_ERR(folio)) return PTR_ERR(folio); kaddr = folio_address(folio); offset = 0; } } f2fs_folio_put(folio, true); return 0; } static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_summary_block *sum; struct curseg_info *curseg; struct folio *new; unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; int err = 0; /* get segment number and block addr */ if (IS_DATASEG(type)) { segno = le32_to_cpu(ckpt->cur_data_segno[type]); blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - CURSEG_HOT_DATA]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type); else blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); } else { segno = le32_to_cpu(ckpt->cur_node_segno[type - CURSEG_HOT_NODE]); blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - CURSEG_HOT_NODE]); if (__exist_node_summaries(sbi)) blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, type - CURSEG_HOT_NODE); else blk_addr = GET_SUM_BLOCK(sbi, segno); } new = f2fs_get_meta_folio(sbi, blk_addr); if (IS_ERR(new)) return PTR_ERR(new); sum = folio_address(new); if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; for (i = 0; i < BLKS_PER_SEG(sbi); i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; } } else { err = f2fs_restore_node_summary(sbi, segno, sum); if (err) goto out; } } /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); /* update journal info */ down_write(&curseg->journal_rwsem); memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); up_write(&curseg->journal_rwsem); memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); out: f2fs_folio_put(new, true); return err; } static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = f2fs_npages_for_summary_flush(sbi, true); if (npages >= 2) f2fs_ra_meta_pages(sbi, start_sum_block(sbi), npages, META_CP, true); /* restore for compacted data summary */ err = read_compacted_summaries(sbi); if (err) return err; type = CURSEG_HOT_NODE; } if (__exist_node_summaries(sbi)) f2fs_ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_PERSIST_TYPE, type), NR_CURSEG_PERSIST_TYPE - type, META_CP, true); for (; type <= CURSEG_COLD_NODE; type++) { err = read_normal_summaries(sbi, type); if (err) return err; } /* sanity check for summary blocks */ if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) { f2fs_err(sbi, "invalid journal entries nats %u sits %u", nats_in_cursum(nat_j), sits_in_cursum(sit_j)); return -EINVAL; } return 0; } static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) { struct folio *folio; unsigned char *kaddr; struct f2fs_summary *summary; struct curseg_info *seg_i; int written_size = 0; int i, j; folio = f2fs_grab_meta_folio(sbi, blkaddr++); kaddr = folio_address(folio); memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { seg_i = CURSEG_I(sbi, i); for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { if (!folio) { folio = f2fs_grab_meta_folio(sbi, blkaddr++); kaddr = folio_address(folio); memset(kaddr, 0, PAGE_SIZE); written_size = 0; } summary = (struct f2fs_summary *)(kaddr + written_size); *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; folio_mark_dirty(folio); f2fs_folio_put(folio, true); folio = NULL; } } if (folio) { folio_mark_dirty(folio); f2fs_folio_put(folio, true); } } static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else end = type + NR_CURSEG_NODE_TYPE; for (i = type; i < end; i++) write_current_sum_page(sbi, i, blkaddr + (i - type)); } void f2fs_write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); } void f2fs_write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { for (i = 0; i < nats_in_cursum(journal); i++) { if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(journal); i++) if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) return update_sits_in_cursum(journal, 1); } return -1; } static struct folio *get_current_sit_folio(struct f2fs_sb_info *sbi, unsigned int segno) { return f2fs_get_meta_folio(sbi, current_sit_addr(sbi, segno)); } static struct folio *get_next_sit_folio(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); struct folio *folio; pgoff_t src_off, dst_off; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); folio = f2fs_grab_meta_folio(sbi, dst_off); seg_info_to_sit_folio(sbi, folio, start); folio_mark_dirty(folio); set_to_next_sit(sit_i, start); return folio; } static struct sit_entry_set *grab_sit_entry_set(void) { struct sit_entry_set *ses = f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_NOFS, true, NULL); ses->entry_cnt = 0; INIT_LIST_HEAD(&ses->set_list); return ses; } static void release_sit_entry_set(struct sit_entry_set *ses) { list_del(&ses->set_list); kmem_cache_free(sit_entry_set_slab, ses); } static void adjust_sit_entry_set(struct sit_entry_set *ses, struct list_head *head) { struct sit_entry_set *next = ses; if (list_is_last(&ses->set_list, head)) return; list_for_each_entry_continue(next, head, set_list) if (ses->entry_cnt <= next->entry_cnt) { list_move_tail(&ses->set_list, &next->set_list); return; } list_move_tail(&ses->set_list, head); } static void add_sit_entry(unsigned int segno, struct list_head *head) { struct sit_entry_set *ses; unsigned int start_segno = START_SEGNO(segno); list_for_each_entry(ses, head, set_list) { if (ses->start_segno == start_segno) { ses->entry_cnt++; adjust_sit_entry_set(ses, head); return; } } ses = grab_sit_entry_set(); ses->start_segno = start_segno; ses->entry_cnt++; list_add(&ses->set_list, head); } static void add_sits_in_set(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); struct list_head *set_list = &sm_info->sit_entry_set; unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; unsigned int segno; for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) add_sit_entry(segno, set_list); } static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; int i; down_write(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } update_sits_in_cursum(journal, -i); up_write(&curseg->journal_rwsem); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = !is_sbi_flag_set(sbi, SBI_IS_RESIZEFS); struct seg_entry *se; down_write(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) goto out; /* * add and account sit entries of dirty bitmap in sit entry * set temporarily */ add_sits_in_set(sbi); /* * if there are no enough space in journal to store dirty sit * entries, remove all entries from journal and add and account * them in sit entry set. */ if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL) || !to_journal) remove_sits_in_journal(sbi); /* * there are two steps to flush sit entries: * #1, flush sit entries to journal in current cold data summary block. * #2, flush sit entries to sit page. */ list_for_each_entry_safe(ses, tmp, head, set_list) { struct folio *folio = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start_segno = ses->start_segno; unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, (unsigned long)MAIN_SEGS(sbi)); unsigned int segno = start_segno; if (to_journal && !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; if (to_journal) { down_write(&curseg->journal_rwsem); } else { folio = get_next_sit_folio(sbi, start_segno); raw_sit = folio_address(folio); } /* flush dirty sit entries in region of current sit set */ for_each_set_bit_from(segno, bitmap, end) { int offset, sit_offset; se = get_seg_entry(sbi, segno); #ifdef CONFIG_F2FS_CHECK_FS if (memcmp(se->cur_valid_map, se->cur_valid_map_mir, SIT_VBLOCK_MAP_SIZE)) f2fs_bug_on(sbi, 1); #endif /* add discard candidates */ if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } if (to_journal) { offset = f2fs_lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, &sit_in_journal(journal, offset)); check_block_count(sbi, segno, &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); check_block_count(sbi, segno, &raw_sit->entries[sit_offset]); } /* update ckpt_valid_block */ if (__is_large_section(sbi)) { set_ckpt_valid_blocks(sbi, segno); sanity_check_valid_blocks(sbi, segno); } __clear_bit(segno, bitmap); sit_i->dirty_sentries--; ses->entry_cnt--; } if (to_journal) up_write(&curseg->journal_rwsem); else f2fs_folio_put(folio, true); f2fs_bug_on(sbi, ses->entry_cnt); release_sit_entry_set(ses); } f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } up_write(&sit_i->sentry_lock); set_prefree_as_free_segments(sbi); } static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *bitmap; unsigned int bitmap_size, main_bitmap_size, sit_bitmap_size; unsigned int discard_map = f2fs_block_unit_discard(sbi) ? 1 : 0; /* allocate memory for SIT information */ sit_i = f2fs_kzalloc(sbi, sizeof(struct sit_info), GFP_KERNEL); if (!sit_i) return -ENOMEM; SM_I(sbi)->sit_info = sit_i; sit_i->sentries = f2fs_kvzalloc(sbi, array_size(sizeof(struct seg_entry), MAIN_SEGS(sbi)), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; main_bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (3 + discard_map); #else bitmap_size = MAIN_SEGS(sbi) * SIT_VBLOCK_MAP_SIZE * (2 + discard_map); #endif sit_i->bitmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!sit_i->bitmap) return -ENOMEM; bitmap = sit_i->bitmap; for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; sit_i->sentries[start].ckpt_valid_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sentries[start].cur_valid_map_mir = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; #endif if (discard_map) { sit_i->sentries[start].discard_map = bitmap; bitmap += SIT_VBLOCK_MAP_SIZE; } } sit_i->tmp_map = f2fs_kzalloc(sbi, SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->tmp_map) return -ENOMEM; if (__is_large_section(sbi)) { sit_i->sec_entries = f2fs_kvzalloc(sbi, array_size(sizeof(struct sec_entry), MAIN_SECS(sbi)), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; } /* get information related with SIT */ sit_segs = le32_to_cpu(raw_super->segment_count_sit) >> 1; /* setup SIT bitmap from ckeckpoint pack */ sit_bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); sit_i->sit_bitmap = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap) return -ENOMEM; #ifdef CONFIG_F2FS_CHECK_FS sit_i->sit_bitmap_mir = kmemdup(src_bitmap, sit_bitmap_size, GFP_KERNEL); if (!sit_i->sit_bitmap_mir) return -ENOMEM; sit_i->invalid_segmap = f2fs_kvzalloc(sbi, main_bitmap_size, GFP_KERNEL); if (!sit_i->invalid_segmap) return -ENOMEM; #endif sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = SEGS_TO_BLKS(sbi, sit_segs); sit_i->written_valid_blocks = 0; sit_i->bitmap_size = sit_bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); sit_i->mounted_time = ktime_get_boottime_seconds(); init_rwsem(&sit_i->sentry_lock); return 0; } static int build_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i; unsigned int bitmap_size, sec_bitmap_size; /* allocate memory for free segmap information */ free_i = f2fs_kzalloc(sbi, sizeof(struct free_segmap_info), GFP_KERNEL); if (!free_i) return -ENOMEM; SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); free_i->free_segmap = f2fs_kvmalloc(sbi, bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); free_i->free_secmap = f2fs_kvmalloc(sbi, sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; /* set all segments as dirty temporarily */ memset(free_i->free_segmap, 0xff, bitmap_size); memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; spin_lock_init(&free_i->segmap_lock); return 0; } static int build_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array; int i; array = f2fs_kzalloc(sbi, array_size(NR_CURSEG_TYPE, sizeof(*array)), GFP_KERNEL); if (!array) return -ENOMEM; SM_I(sbi)->curseg_array = array; for (i = 0; i < NO_CHECK_TYPE; i++) { mutex_init(&array[i].curseg_mutex); array[i].sum_blk = f2fs_kzalloc(sbi, PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; init_rwsem(&array[i].journal_rwsem); array[i].journal = f2fs_kzalloc(sbi, sizeof(struct f2fs_journal), GFP_KERNEL); if (!array[i].journal) return -ENOMEM; array[i].seg_type = log_type_to_seg_type(i); reset_curseg_fields(&array[i]); } return restore_curseg_summaries(sbi); } static int build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_journal *journal = curseg->journal; struct seg_entry *se; struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; int err = 0; block_t sit_valid_blocks[2] = {0, 0}; do { readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_VECS, META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { struct f2fs_sit_block *sit_blk; struct folio *folio; se = &sit_i->sentries[start]; folio = get_current_sit_folio(sbi, start); if (IS_ERR(folio)) return PTR_ERR(folio); sit_blk = folio_address(folio); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_folio_put(folio, true); err = check_block_count(sbi, start, &sit); if (err) return err; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); return -EFSCORRUPTED; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (!f2fs_block_unit_discard(sbi)) goto init_discard_map_done; /* build discard map only one time */ if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); goto init_discard_map_done; } memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += BLKS_PER_SEG(sbi) - se->valid_blocks; init_discard_map_done: if (__is_large_section(sbi)) get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); down_read(&curseg->journal_rwsem); for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int old_valid_blocks; start = le32_to_cpu(segno_in_journal(journal, i)); if (start >= MAIN_SEGS(sbi)) { f2fs_err(sbi, "Wrong journal entry on segno %u", start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_CORRUPTED_JOURNAL); break; } se = &sit_i->sentries[start]; sit = sit_in_journal(journal, i); old_valid_blocks = se->valid_blocks; sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks; err = check_block_count(sbi, start, &sit); if (err) break; seg_info_from_raw_sit(se, &sit); if (se->type >= NR_PERSISTENT_LOG) { f2fs_err(sbi, "Invalid segment type: %u, segno: %u", se->type, start); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_SUM_TYPE); break; } sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks; if (f2fs_block_unit_discard(sbi)) { if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE); } else { memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); sbi->discard_blks += old_valid_blocks; sbi->discard_blks -= se->valid_blocks; } } if (__is_large_section(sbi)) { get_sec_entry(sbi, start)->valid_blocks += se->valid_blocks; get_sec_entry(sbi, start)->valid_blocks -= old_valid_blocks; } } up_read(&curseg->journal_rwsem); /* update ckpt_valid_block */ if (__is_large_section(sbi)) { unsigned int segno; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { set_ckpt_valid_blocks(sbi, segno); sanity_check_valid_blocks(sbi, segno); } } if (err) return err; if (sit_valid_blocks[NODE] != valid_node_count(sbi)) { f2fs_err(sbi, "SIT is corrupted node# %u vs %u", sit_valid_blocks[NODE], valid_node_count(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_NODE_COUNT); return -EFSCORRUPTED; } if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] > valid_user_blocks(sbi)) { f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u", sit_valid_blocks[DATA], sit_valid_blocks[NODE], valid_user_blocks(sbi)); f2fs_handle_error(sbi, ERROR_INCONSISTENT_BLOCK_COUNT); return -EFSCORRUPTED; } return 0; } static void init_free_segmap(struct f2fs_sb_info *sbi) { unsigned int start; int type; struct seg_entry *sentry; for (start = 0; start < MAIN_SEGS(sbi); start++) { if (f2fs_usable_blks_in_seg(sbi, start) == 0) continue; sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); else SIT_I(sbi)->written_valid_blocks += sentry->valid_blocks; } /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); __set_test_and_inuse(sbi, curseg_t->segno); } } static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno = 0, offset = 0, secno; block_t valid_blocks, usable_blks_in_seg; while (1) { /* find dirty segment based on free segmap */ segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, false); usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno); if (valid_blocks == usable_blks_in_seg || !valid_blocks) continue; if (valid_blocks > usable_blks_in_seg) { f2fs_bug_on(sbi, 1); continue; } mutex_lock(&dirty_i->seglist_lock); __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); } if (!__is_large_section(sbi)) return; mutex_lock(&dirty_i->seglist_lock); for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { valid_blocks = get_valid_blocks(sbi, segno, true); secno = GET_SEC_FROM_SEG(sbi, segno); if (!valid_blocks || valid_blocks == CAP_BLKS_PER_SEC(sbi)) continue; if (IS_CURSEC(sbi, secno)) continue; set_bit(secno, dirty_i->dirty_secmap); } mutex_unlock(&dirty_i->seglist_lock); } static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->victim_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; dirty_i->pinned_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->pinned_secmap) return -ENOMEM; dirty_i->pinned_secmap_cnt = 0; dirty_i->enable_pin_section = true; return 0; } static int build_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i; unsigned int bitmap_size, i; /* allocate memory for dirty segments list information */ dirty_i = f2fs_kzalloc(sbi, sizeof(struct dirty_seglist_info), GFP_KERNEL); if (!dirty_i) return -ENOMEM; SM_I(sbi)->dirty_info = dirty_i; mutex_init(&dirty_i->seglist_lock); bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } if (__is_large_section(sbi)) { bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->dirty_secmap = f2fs_kvzalloc(sbi, bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_secmap) return -ENOMEM; } init_dirty_segmap(sbi); return init_victim_secmap(sbi); } static int sanity_check_curseg(struct f2fs_sb_info *sbi) { int i; /* * In LFS/SSR curseg, .next_blkoff should point to an unused blkaddr; * In LFS curseg, all blkaddr after .next_blkoff should be unused. */ for (i = 0; i < NR_PERSISTENT_LOG; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); struct seg_entry *se = get_seg_entry(sbi, curseg->segno); unsigned int blkofs = curseg->next_blkoff; if (f2fs_sb_has_readonly(sbi) && i != CURSEG_HOT_DATA && i != CURSEG_HOT_NODE) continue; sanity_check_seg_type(sbi, curseg->seg_type); if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) { f2fs_err(sbi, "Current segment has invalid alloc_type:%d", curseg->alloc_type); f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } if (f2fs_test_bit(blkofs, se->cur_valid_map)) goto out; if (curseg->alloc_type == SSR) continue; for (blkofs += 1; blkofs < BLKS_PER_SEG(sbi); blkofs++) { if (!f2fs_test_bit(blkofs, se->cur_valid_map)) continue; out: f2fs_err(sbi, "Current segment's next free block offset is inconsistent with bitmap, logtype:%u, segno:%u, type:%u, next_blkoff:%u, blkofs:%u", i, curseg->segno, curseg->alloc_type, curseg->next_blkoff, blkofs); f2fs_handle_error(sbi, ERROR_INVALID_CURSEG); return -EFSCORRUPTED; } } return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int check_zone_write_pointer(struct f2fs_sb_info *sbi, struct f2fs_dev_info *fdev, struct blk_zone *zone) { unsigned int zone_segno; block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int ret; unsigned int nofs_flags; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block); zone_segno = GET_SEGNO(sbi, zone_block); /* * Skip check of zones cursegs point to, since * fix_curseg_write_pointer() checks them. */ if (zone_segno >= MAIN_SEGS(sbi)) return 0; /* * Get # of valid block of the zone. */ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true); if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno))) { f2fs_notice(sbi, "Open zones: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); return 0; } if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) || (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL)) return 0; if (!valid_block_cnt) { f2fs_notice(sbi, "Zone without valid block has non-zero write " "pointer. Reset the write pointer: cond[%s]", blk_zone_cond_str(zone->cond)); ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block, zone->len >> log_sectors_per_block); if (ret) f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", fdev->path, ret); return ret; } /* * If there are valid blocks and the write pointer doesn't match * with them, we need to report the inconsistency and fill * the zone till the end to close the zone. This inconsistency * does not cause write error because the zone will not be * selected for write operation until it get discarded. */ f2fs_notice(sbi, "Valid blocks are not aligned with write " "pointer: valid block[0x%x,0x%x] cond[%s]", zone_segno, valid_block_cnt, blk_zone_cond_str(zone->cond)); nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, zone->start, zone->len); memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, zone->len - (zone->wp - zone->start), GFP_NOFS, 0); if (ret) f2fs_err(sbi, "Fill up zone failed: %s (errno=%d)", fdev->path, ret); } else if (ret) { f2fs_err(sbi, "Finishing zone failed: %s (errno=%d)", fdev->path, ret); } return ret; } static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, block_t zone_blkaddr) { int i; for (i = 0; i < sbi->s_ndevs; i++) { if (!bdev_is_zoned(FDEV(i).bdev)) continue; if (sbi->s_ndevs == 1 || (FDEV(i).start_blk <= zone_blkaddr && zone_blkaddr <= FDEV(i).end_blk)) return &FDEV(i); } return NULL; } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { memcpy(data, zone, sizeof(struct blk_zone)); return 0; } static int do_fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) { struct curseg_info *cs = CURSEG_I(sbi, type); struct f2fs_dev_info *zbd; struct blk_zone zone; unsigned int cs_section, wp_segno, wp_blkoff, wp_sector_off; block_t cs_zone_block, wp_block; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; sector_t zone_sector; int err; cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); zbd = get_target_zoned_dev(sbi, cs_zone_block); if (!zbd) return 0; /* report zone for the sector the curseg points to */ zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { f2fs_err(sbi, "Report zone failed: %s errno=(%d)", zbd->path, err); return err; } if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; /* * When safely unmounted in the previous mount, we could use current * segments. Otherwise, allocate new sections. */ if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { wp_block = zbd->start_blk + (zone.wp >> log_sectors_per_block); wp_segno = GET_SEGNO(sbi, wp_block); wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno); wp_sector_off = zone.wp & GENMASK(log_sectors_per_block - 1, 0); if (cs->segno == wp_segno && cs->next_blkoff == wp_blkoff && wp_sector_off == 0) return 0; f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: " "curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno, cs->next_blkoff, wp_segno, wp_blkoff); } /* Allocate a new section if it's not new. */ if (cs->next_blkoff || cs->segno != GET_SEG_FROM_SEC(sbi, GET_ZONE_FROM_SEC(sbi, cs_section))) { unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff; f2fs_allocate_new_section(sbi, type, true); f2fs_notice(sbi, "Assign new section to curseg[%d]: " "[0x%x,0x%x] -> [0x%x,0x%x]", type, old_segno, old_blkoff, cs->segno, cs->next_blkoff); } /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) return -EIO; /* check newly assigned zone */ cs_section = GET_SEC_FROM_SEG(sbi, cs->segno); cs_zone_block = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, cs_section)); zbd = get_target_zoned_dev(sbi, cs_zone_block); if (!zbd) return 0; zone_sector = (sector_t)(cs_zone_block - zbd->start_blk) << log_sectors_per_block; err = blkdev_report_zones(zbd->bdev, zone_sector, 1, report_one_zone_cb, &zone); if (err != 1) { f2fs_err(sbi, "Report zone failed: %s errno=(%d)", zbd->path, err); return err; } if (zone.type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; if (zone.wp != zone.start) { f2fs_notice(sbi, "New zone for curseg[%d] is not yet discarded. " "Reset the zone: curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); err = __f2fs_issue_discard_zone(sbi, zbd->bdev, cs_zone_block, zone.len >> log_sectors_per_block); if (err) { f2fs_err(sbi, "Discard zone failed: %s (errno=%d)", zbd->path, err); return err; } } return 0; } static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; for (i = 0; i < NR_PERSISTENT_LOG; i++) { ret = do_fix_curseg_write_pointer(sbi, i); if (ret) return ret; } return 0; } struct check_zone_write_pointer_args { struct f2fs_sb_info *sbi; struct f2fs_dev_info *fdev; }; static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct check_zone_write_pointer_args *args; args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); } static int check_write_pointer(struct f2fs_sb_info *sbi) { int i, ret; struct check_zone_write_pointer_args args; for (i = 0; i < sbi->s_ndevs; i++) { if (!bdev_is_zoned(FDEV(i).bdev)) continue; args.sbi = sbi; args.fdev = &FDEV(i); ret = blkdev_report_zones(FDEV(i).bdev, 0, BLK_ALL_ZONES, check_zone_write_pointer_cb, &args); if (ret < 0) return ret; } return 0; } int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { int ret; if (!f2fs_sb_has_blkzoned(sbi) || f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return 0; f2fs_notice(sbi, "Checking entire write pointers"); ret = fix_curseg_write_pointer(sbi); if (!ret) ret = check_write_pointer(sbi); return ret; } /* * Return the number of usable blocks in a segment. The number of blocks * returned is always equal to the number of blocks in a segment for * segments fully contained within a sequential zone capacity or a * conventional zone. For segments partially contained in a sequential * zone capacity, the number of usable blocks up to the zone capacity * is returned. 0 is returned in all other cases. */ static inline unsigned int f2fs_usable_zone_blks_in_seg( struct f2fs_sb_info *sbi, unsigned int segno) { block_t seg_start, sec_start_blkaddr, sec_cap_blkaddr; unsigned int secno; if (!sbi->unusable_blocks_per_sec) return BLKS_PER_SEG(sbi); secno = GET_SEC_FROM_SEG(sbi, segno); seg_start = START_BLOCK(sbi, segno); sec_start_blkaddr = START_BLOCK(sbi, GET_SEG_FROM_SEC(sbi, secno)); sec_cap_blkaddr = sec_start_blkaddr + CAP_BLKS_PER_SEC(sbi); /* * If segment starts before zone capacity and spans beyond * zone capacity, then usable blocks are from seg start to * zone capacity. If the segment starts after the zone capacity, * then there are no usable blocks. */ if (seg_start >= sec_cap_blkaddr) return 0; if (seg_start + BLKS_PER_SEG(sbi) > sec_cap_blkaddr) return sec_cap_blkaddr - seg_start; return BLKS_PER_SEG(sbi); } #else int f2fs_check_and_fix_write_pointer(struct f2fs_sb_info *sbi) { return 0; } static inline unsigned int f2fs_usable_zone_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { return 0; } #endif unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno) { if (f2fs_sb_has_blkzoned(sbi)) return f2fs_usable_zone_blks_in_seg(sbi, segno); return BLKS_PER_SEG(sbi); } unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi) { if (f2fs_sb_has_blkzoned(sbi)) return CAP_SEGS_PER_SEC(sbi); return SEGS_PER_SEC(sbi); } unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi); unsigned int secno = 0, start = 0; unsigned int total_valid_blocks = 0; unsigned long long mtime = 0; unsigned int i = 0; secno = GET_SEC_FROM_SEG(sbi, segno); start = GET_SEG_FROM_SEC(sbi, secno); if (!__is_large_section(sbi)) { mtime = get_seg_entry(sbi, start + i)->mtime; goto out; } for (i = 0; i < usable_segs_per_sec; i++) { /* for large section, only check the mtime of valid segments */ struct seg_entry *se = get_seg_entry(sbi, start+i); mtime += se->mtime * se->valid_blocks; total_valid_blocks += se->valid_blocks; } if (total_valid_blocks == 0) return INVALID_MTIME; mtime = div_u64(mtime, total_valid_blocks); out: if (unlikely(mtime == INVALID_MTIME)) mtime -= 1; return mtime; } /* * Update min, max modified time for cost-benefit GC algorithm */ static void init_min_max_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); unsigned int segno; down_write(&sit_i->sentry_lock); sit_i->min_mtime = ULLONG_MAX; for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { unsigned long long mtime = 0; mtime = f2fs_get_section_mtime(sbi, segno); if (sit_i->min_mtime > mtime) sit_i->min_mtime = mtime; } sit_i->max_mtime = get_mtime(sbi, false); sit_i->dirty_max_mtime = 0; up_write(&sit_i->sentry_lock); } int f2fs_build_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_sm_info *sm_info; int err; sm_info = f2fs_kzalloc(sbi, sizeof(struct f2fs_sm_info), GFP_KERNEL); if (!sm_info) return -ENOMEM; /* init sm info */ sbi->sm_info = sm_info; sm_info->seg0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); sm_info->main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); sm_info->segment_count = le32_to_cpu(raw_super->segment_count); sm_info->reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); sm_info->ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); sm_info->main_segments = le32_to_cpu(raw_super->segment_count_main); sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; if (!f2fs_lfs_mode(sbi)) sm_info->ipu_policy = BIT(F2FS_IPU_FSYNC); sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; sm_info->min_seq_blocks = BLKS_PER_SEG(sbi); sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->min_ssr_sections = reserved_sections(sbi); INIT_LIST_HEAD(&sm_info->sit_entry_set); init_f2fs_rwsem(&sm_info->curseg_lock); err = f2fs_create_flush_cmd_control(sbi); if (err) return err; err = create_discard_cmd_control(sbi); if (err) return err; err = build_sit_info(sbi); if (err) return err; err = build_free_segmap(sbi); if (err) return err; err = build_curseg(sbi); if (err) return err; /* reinit free segmap based on SIT */ err = build_sit_entries(sbi); if (err) return err; init_free_segmap(sbi); err = build_dirty_segmap(sbi); if (err) return err; err = sanity_check_curseg(sbi); if (err) return err; init_min_max_mtime(sbi); return 0; } static void discard_dirty_segmap(struct f2fs_sb_info *sbi, enum dirty_type dirty_type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); mutex_lock(&dirty_i->seglist_lock); kvfree(dirty_i->dirty_segmap[dirty_type]); dirty_i->nr_dirty[dirty_type] = 0; mutex_unlock(&dirty_i->seglist_lock); } static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); kvfree(dirty_i->pinned_secmap); kvfree(dirty_i->victim_secmap); } static void destroy_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); int i; if (!dirty_i) return; /* discard pre-free/dirty segments list */ for (i = 0; i < NR_DIRTY_TYPE; i++) discard_dirty_segmap(sbi, i); if (__is_large_section(sbi)) { mutex_lock(&dirty_i->seglist_lock); kvfree(dirty_i->dirty_secmap); mutex_unlock(&dirty_i->seglist_lock); } destroy_victim_secmap(sbi); SM_I(sbi)->dirty_info = NULL; kfree(dirty_i); } static void destroy_curseg(struct f2fs_sb_info *sbi) { struct curseg_info *array = SM_I(sbi)->curseg_array; int i; if (!array) return; SM_I(sbi)->curseg_array = NULL; for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); kfree(array[i].journal); } kfree(array); } static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; if (!free_i) return; SM_I(sbi)->free_info = NULL; kvfree(free_i->free_segmap); kvfree(free_i->free_secmap); kfree(free_i); } static void destroy_sit_info(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); if (!sit_i) return; if (sit_i->sentries) kvfree(sit_i->bitmap); kfree(sit_i->tmp_map); kvfree(sit_i->sentries); kvfree(sit_i->sec_entries); kvfree(sit_i->dirty_sentries_bitmap); SM_I(sbi)->sit_info = NULL; kvfree(sit_i->sit_bitmap); #ifdef CONFIG_F2FS_CHECK_FS kvfree(sit_i->sit_bitmap_mir); kvfree(sit_i->invalid_segmap); #endif kfree(sit_i); } void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi) { struct f2fs_sm_info *sm_info = SM_I(sbi); if (!sm_info) return; f2fs_destroy_flush_cmd_control(sbi, true); destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); destroy_sit_info(sbi); sbi->sm_info = NULL; kfree(sm_info); } int __init f2fs_create_segment_manager_caches(void) { discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) goto fail; discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd", sizeof(struct discard_cmd)); if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) goto destroy_discard_cmd; revoke_entry_slab = f2fs_kmem_cache_create("f2fs_revoke_entry", sizeof(struct revoke_entry)); if (!revoke_entry_slab) goto destroy_sit_entry_set; return 0; destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); destroy_discard_cmd: kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; } void f2fs_destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(revoke_entry_slab); }
10 10 6 10 8 10 6 10 9 4 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ /* * jfs_umount.c * * note: file system in transition to aggregate/fileset: * (ref. jfs_mount.c) * * file system unmount is interpreted as mount of the single/only * fileset in the aggregate and, if unmount of the last fileset, * as unmount of the aggerate; */ #include <linux/fs.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_superblock.h" #include "jfs_dmap.h" #include "jfs_imap.h" #include "jfs_metapage.h" #include "jfs_debug.h" /* * NAME: jfs_umount(vfsp, flags, crp) * * FUNCTION: vfs_umount() * * PARAMETERS: vfsp - virtual file system pointer * flags - unmount for shutdown * crp - credential * * RETURN : EBUSY - device has open files */ int jfs_umount(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct inode *ipbmap = sbi->ipbmap; struct inode *ipimap = sbi->ipimap; struct inode *ipaimap = sbi->ipaimap; struct inode *ipaimap2 = sbi->ipaimap2; struct jfs_log *log; int rc = 0; jfs_info("UnMount JFS: sb:0x%p", sb); /* * update superblock and close log * * if mounted read-write and log based recovery was enabled */ if ((log = sbi->log)) /* * Wait for outstanding transactions to be written to log: */ jfs_flush_journal(log, 2); /* * close fileset inode allocation map (aka fileset inode) */ diUnmount(ipimap, 0); diFreeSpecial(ipimap); sbi->ipimap = NULL; /* * close secondary aggregate inode allocation map */ if (ipaimap2) { diUnmount(ipaimap2, 0); diFreeSpecial(ipaimap2); sbi->ipaimap2 = NULL; } /* * close aggregate inode allocation map */ diUnmount(ipaimap, 0); diFreeSpecial(ipaimap); sbi->ipaimap = NULL; /* * close aggregate block allocation map */ dbUnmount(ipbmap, 0); diFreeSpecial(ipbmap); sbi->ipbmap = NULL; /* * Make sure all metadata makes it to disk before we mark * the superblock as clean */ filemap_write_and_wait(sbi->direct_inode->i_mapping); /* * ensure all file system file pages are propagated to their * home blocks on disk (and their in-memory buffer pages are * invalidated) BEFORE updating file system superblock state * (to signify file system is unmounted cleanly, and thus in * consistent state) and log superblock active file system * list (to signify skip logredo()). */ if (log) { /* log = NULL if read-only mount */ updateSuper(sb, FM_CLEAN); /* * close log: * * remove file system from log active file system list. */ rc = lmLogClose(sb); } jfs_info("UnMount JFS Complete: rc = %d", rc); return rc; } int jfs_umount_rw(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); struct jfs_log *log = sbi->log; if (!log) return 0; /* * close log: * * remove file system from log active file system list. */ jfs_flush_journal(log, 2); /* * Make sure all metadata makes it to disk */ dbSync(sbi->ipbmap); diSync(sbi->ipimap); /* * Note that we have to do this even if sync_blockdev() will * do exactly the same a few instructions later: We can't * mark the superblock clean before everything is flushed to * disk. */ filemap_write_and_wait(sbi->direct_inode->i_mapping); updateSuper(sb, FM_CLEAN); return lmLogClose(sb); }
5 4 3 1 2 5 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/file.h> #include <linux/io_uring.h> #include <uapi/linux/io_uring.h> #include "../fs/internal.h" #include "io_uring.h" #include "statx.h" struct io_statx { struct file *file; int dfd; unsigned int mask; unsigned int flags; struct filename *filename; struct statx __user *buffer; }; int io_statx_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); const char __user *path; if (sqe->buf_index || sqe->splice_fd_in) return -EINVAL; if (req->flags & REQ_F_FIXED_FILE) return -EBADF; sx->dfd = READ_ONCE(sqe->fd); sx->mask = READ_ONCE(sqe->len); path = u64_to_user_ptr(READ_ONCE(sqe->addr)); sx->buffer = u64_to_user_ptr(READ_ONCE(sqe->addr2)); sx->flags = READ_ONCE(sqe->statx_flags); sx->filename = getname_uflags(path, sx->flags); if (IS_ERR(sx->filename)) { int ret = PTR_ERR(sx->filename); sx->filename = NULL; return ret; } req->flags |= REQ_F_NEED_CLEANUP; req->flags |= REQ_F_FORCE_ASYNC; return 0; } int io_statx(struct io_kiocb *req, unsigned int issue_flags) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); int ret; WARN_ON_ONCE(issue_flags & IO_URING_F_NONBLOCK); ret = do_statx(sx->dfd, sx->filename, sx->flags, sx->mask, sx->buffer); io_req_set_res(req, ret, 0); return IOU_COMPLETE; } void io_statx_cleanup(struct io_kiocb *req) { struct io_statx *sx = io_kiocb_to_cmd(req, struct io_statx); if (sx->filename) putname(sx->filename); }
33 33 33 33 33 33 33 33 33 27 31 12 21 31 29 31 31 31 31 24 23 14 30 31 11 11 11 25 21 21 22 22 13 11 11 22 22 11 12 12 12 3 3 3 3 3 2 2 3 12 4 4 3 3 8 8 7 8 31 8 30 31 12 10 8 3 3 3 8 25 25 25 31 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 33 33 11 23 23 23 23 33 33 33 33 33 33 33 20 20 20 1 1 1 1 20 33 1 36 1 35 35 35 35 35 11 1 1 1 1 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 1 33 1 1 1 1 1 33 20 21 33 33 33 34 34 34 33 33 33 33 32 32 32 1 33 36 36 35 16 21 35 35 33 36 32 32 32 11 9 32 32 2 2 32 32 32 2 32 32 31 22 19 22 22 2 19 32 32 8 30 31 26 30 31 1 31 29 29 29 26 6 2 2 2 6 32 32 32 32 8 2 8 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 3 1 1 3 3 3 3 6 6 6 6 2 2 2 2 5 5 4 3 5 5 8 8 8 8 8 8 19 19 19 19 19 19 15 19 19 19 14 15 5 19 20 20 20 20 20 20 20 20 3 19 20 20 5 20 1 3 2 1 1 19 20 20 20 20 4 5 5 4 4 2 2 1 3 2 3 1 1 1 3 3 3 2 2 3 3 3 3 3 1 1 1 1 5 10 10 10 16 11 16 4 4 3 4 4 6 6 6 6 2 2 2 1 1 1 4 4 4 1 3 3 4 3 3 2 1 1 5 5 5 5 5 5 20 16 20 20 5 17 12 10 17 4 12 11 16 6 10 10 10 10 2 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 19 9 7 3 6 2 2 9 1 1 9 9 9 6 6 5 9 9 3 2 1 1 1 1 1 3 2 4 5 5 3 3 4 3 4 4 4 4 4 4 4 4 4 2 4 4 1 1 3 3 3 1 2 1 1 1 2 2 1 2 1 1 1 4 3 1 4 4 4 5 5 5 5 3 52 52 25 9 25 6 21 61 60 11 61 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/falloc.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/slab.h> #include <linux/btrfs.h> #include <linux/uio.h> #include <linux/iversion.h> #include <linux/fsverity.h> #include "ctree.h" #include "direct-io.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "tree-log.h" #include "locking.h" #include "qgroup.h" #include "compression.h" #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "file-item.h" #include "ioctl.h" #include "file.h" #include "super.h" #include "print-tree.h" /* * Unlock folio after btrfs_file_write() is done with it. */ static void btrfs_drop_folio(struct btrfs_fs_info *fs_info, struct folio *folio, u64 pos, u64 copied) { u64 block_start = round_down(pos, fs_info->sectorsize); u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start; ASSERT(block_len <= U32_MAX); /* * Folio checked is some magic around finding folios that have been * modified without going through btrfs_dirty_folio(). Clear it here. * There should be no need to mark the pages accessed as * prepare_one_folio() should have marked them accessed in * prepare_one_folio() via find_or_create_page() */ btrfs_folio_clamp_clear_checked(fs_info, folio, block_start, block_len); folio_unlock(folio); folio_put(folio); } /* * After copy_folio_from_iter_atomic(), update the following things for delalloc: * - Mark newly dirtied folio as DELALLOC in the io tree. * Used to advise which range is to be written back. * - Mark modified folio as Uptodate/Dirty and not needing COW fixup * - Update inode size for past EOF write */ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos, size_t write_bytes, struct extent_state **cached, bool noreserve) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret = 0; u64 num_bytes; u64 start_pos; u64 end_of_last_block; u64 end_pos = pos + write_bytes; loff_t isize = i_size_read(&inode->vfs_inode); unsigned int extra_bits = 0; if (write_bytes == 0) return 0; if (noreserve) extra_bits |= EXTENT_NORESERVE; start_pos = round_down(pos, fs_info->sectorsize); num_bytes = round_up(write_bytes + pos - start_pos, fs_info->sectorsize); ASSERT(num_bytes <= U32_MAX); ASSERT(folio_pos(folio) <= pos && folio_pos(folio) + folio_size(folio) >= pos + write_bytes); end_of_last_block = start_pos + num_bytes - 1; /* * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, cached); ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); if (ret) return ret; btrfs_folio_clamp_set_uptodate(fs_info, folio, start_pos, num_bytes); btrfs_folio_clamp_clear_checked(fs_info, folio, start_pos, num_bytes); btrfs_folio_clamp_set_dirty(fs_info, folio, start_pos, num_bytes); /* * we've only changed i_size in ram, and we haven't updated * the disk i_size. There is no need to log the inode * at this time. */ if (end_pos > isize) i_size_write(&inode->vfs_inode, end_pos); return 0; } /* * this is very complex, but the basic idea is to drop all extents * in the range start - end. hint_block is filled in with a block number * that would be a good hint to the block allocator for this file. * * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. * * Note: the VFS' inode number of bytes is not updated, it's up to the caller * to deal with that. We set the field 'bytes_found' of the arguments structure * with the number of allocated bytes found in the target range, so that the * caller can update the inode's number of bytes in an atomic way when * replacing extents in a range to avoid races with stat(2). */ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct btrfs_key key; struct btrfs_key new_key; u64 ino = btrfs_ino(inode); u64 search_start = args->start; u64 disk_bytenr = 0; u64 num_bytes = 0; u64 extent_offset = 0; u64 extent_end = 0; u64 last_end = args->start; int del_nr = 0; int del_slot = 0; int extent_type; int recow; int ret; int modify_tree = -1; int update_refs; int found = 0; struct btrfs_path *path = args->path; args->bytes_found = 0; args->extent_inserted = false; /* Must always have a path if ->replace_extent is true */ ASSERT(!(args->replace_extent && !args->path)); if (!path) { path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } } if (args->drop_cache) btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false); if (data_race(args->start >= inode->disk_i_size) && !args->replace_extent) modify_tree = 0; update_refs = (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID); while (1) { recow = 0; ret = btrfs_lookup_file_extent(trans, root, path, ino, search_start, modify_tree); if (ret < 0) break; if (ret > 0 && path->slots[0] > 0 && search_start == args->start) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } ret = 0; next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { if (WARN_ON(del_nr > 0)) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } ret = btrfs_next_leaf(root, path); if (ret < 0) break; if (ret > 0) { ret = 0; break; } leaf = path->nodes[0]; recow = 1; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid > ino) break; if (WARN_ON_ONCE(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) { ASSERT(del_nr == 0); path->slots[0]++; goto next_slot; } if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end) break; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); extent_offset = btrfs_file_extent_offset(leaf, fi); extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { extent_end = key.offset + btrfs_file_extent_ram_bytes(leaf, fi); } else { /* can't happen */ BUG(); } /* * Don't skip extent items representing 0 byte lengths. They * used to be created (bug) if while punching holes we hit * -ENOSPC condition. So if we find one here, just ensure we * delete it, otherwise we would insert a new file extent item * with the same key (offset) as that 0 bytes length file * extent item in the call to setup_items_for_insert() later * in this function. */ if (extent_end == key.offset && extent_end >= search_start) { last_end = extent_end; goto delete_extent_item; } if (extent_end <= search_start) { path->slots[0]++; goto next_slot; } found = 1; search_start = max(key.offset, args->start); if (recow || !modify_tree) { modify_tree = -1; btrfs_release_path(path); continue; } /* * | - range to drop - | * | -------- extent -------- | */ if (args->start > key.offset && args->end < extent_end) { if (WARN_ON(del_nr > 0)) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->start; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { btrfs_release_path(path); continue; } if (ret < 0) break; leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_offset += args->start - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->start); if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { .action = BTRFS_ADD_DELAYED_REF, .bytenr = disk_bytenr, .num_bytes = num_bytes, .parent = 0, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; btrfs_init_data_ref(&ref, new_key.objectid, args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } } key.offset = args->start; } /* * From here on out we will have actually dropped something, so * last_end can be updated. */ last_end = extent_end; /* * | ---- range to drop ----- | * | -------- extent -------- | */ if (args->start <= key.offset && args->end < extent_end) { if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } memcpy(&new_key, &key, sizeof(new_key)); new_key.offset = args->end; btrfs_set_item_key_safe(trans, path, &new_key); extent_offset += args->end - key.offset; btrfs_set_file_extent_offset(leaf, fi, extent_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - args->end); if (update_refs && disk_bytenr > 0) args->bytes_found += args->end - key.offset; break; } search_start = extent_end; /* * | ---- range to drop ----- | * | -------- extent -------- | */ if (args->start > key.offset && args->end >= extent_end) { if (WARN_ON(del_nr > 0)) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } if (extent_type == BTRFS_FILE_EXTENT_INLINE) { ret = -EOPNOTSUPP; break; } btrfs_set_file_extent_num_bytes(leaf, fi, args->start - key.offset); if (update_refs && disk_bytenr > 0) args->bytes_found += extent_end - args->start; if (args->end == extent_end) break; path->slots[0]++; goto next_slot; } /* * | ---- range to drop ----- | * | ------ extent ------ | */ if (args->start <= key.offset && args->end >= extent_end) { delete_extent_item: if (del_nr == 0) { del_slot = path->slots[0]; del_nr = 1; } else { if (WARN_ON(del_slot + del_nr != path->slots[0])) { btrfs_print_leaf(leaf); ret = -EINVAL; break; } del_nr++; } if (update_refs && extent_type == BTRFS_FILE_EXTENT_INLINE) { args->bytes_found += extent_end - key.offset; extent_end = ALIGN(extent_end, fs_info->sectorsize); } else if (update_refs && disk_bytenr > 0) { struct btrfs_ref ref = { .action = BTRFS_DROP_DELAYED_REF, .bytenr = disk_bytenr, .num_bytes = num_bytes, .parent = 0, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; btrfs_init_data_ref(&ref, key.objectid, key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); break; } args->bytes_found += extent_end - key.offset; } if (args->end == extent_end) break; if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { path->slots[0]++; goto next_slot; } ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) { btrfs_abort_transaction(trans, ret); break; } del_nr = 0; del_slot = 0; btrfs_release_path(path); continue; } BUG(); } if (!ret && del_nr > 0) { /* * Set path->slots[0] to first slot, so that after the delete * if items are move off from our leaf to its immediate left or * right neighbor leafs, we end up with a correct and adjusted * path->slots[0] for our insertion (if args->replace_extent). */ path->slots[0] = del_slot; ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret) btrfs_abort_transaction(trans, ret); } leaf = path->nodes[0]; /* * If btrfs_del_items() was called, it might have deleted a leaf, in * which case it unlocked our path, so check path->locks[0] matches a * write lock. */ if (!ret && args->replace_extent && path->locks[0] == BTRFS_WRITE_LOCK && btrfs_leaf_free_space(leaf) >= sizeof(struct btrfs_item) + args->extent_item_size) { key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = args->start; if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) { struct btrfs_key slot_key; btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]); if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } btrfs_setup_item_for_insert(trans, root, path, &key, args->extent_item_size); args->extent_inserted = true; } if (!args->path) btrfs_free_path(path); else if (!args->extent_inserted) btrfs_release_path(path); out: args->drop_end = found ? min(args->end, last_end) : args->end; return ret; } static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) return false; *start = key.offset; *end = extent_end; return true; } /* * Mark extent in the range start - end as written. * * This changes extent type from 'pre-allocated' to 'regular'. If only * part of extent is marked as written, the extent will be split into * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; struct btrfs_key new_key; u64 bytenr; u64 num_bytes; u64 extent_end; u64 orig_offset; u64 other_start; u64 other_end; u64 split; int del_nr = 0; int del_slot = 0; int recow; int ret = 0; u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; again: recow = 0; split = start; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) goto out; if (ret > 0 && path->slots[0] > 0) path->slots[0]--; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if (key.offset > start || extent_end < end) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); memcpy(&new_key, &key, sizeof(new_key)); if (start == key.offset && end < extent_end) { other_start = 0; other_end = start; if (extent_mergeable(leaf, path->slots[0] - 1, ino, bytenr, orig_offset, &other_start, &other_end)) { new_key.offset = end; btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - end); btrfs_set_file_extent_offset(leaf, fi, end - orig_offset); fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, end - other_start); goto out; } } if (start > key.offset && end == extent_end) { other_start = end; other_end = 0; if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_num_bytes(leaf, fi, start - key.offset); btrfs_set_file_extent_generation(leaf, fi, trans->transid); path->slots[0]++; new_key.offset = start; btrfs_set_item_key_safe(trans, path, &new_key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, other_end - start); btrfs_set_file_extent_offset(leaf, fi, start - orig_offset); goto out; } } while (start > key.offset || end < extent_end) { if (key.offset == start) split = end; new_key.offset = split; ret = btrfs_duplicate_item(trans, root, path, &new_key); if (ret == -EAGAIN) { btrfs_release_path(path); goto again; } if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; fi = btrfs_item_ptr(leaf, path->slots[0] - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); ref.action = BTRFS_ADD_DELAYED_REF; ref.bytenr = bytenr; ref.num_bytes = num_bytes; ref.parent = 0; ref.owning_root = btrfs_root_id(root); ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } if (split == start) { key.offset = start; } else { if (start != key.offset) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } path->slots[0]--; extent_end = end; } recow = 1; } other_start = end; other_end = 0; ref.action = BTRFS_DROP_DELAYED_REF; ref.bytenr = bytenr; ref.num_bytes = num_bytes; ref.parent = 0; ref.owning_root = btrfs_root_id(root); ref.ref_root = btrfs_root_id(root); btrfs_init_data_ref(&ref, ino, orig_offset, 0, false); if (extent_mergeable(leaf, path->slots[0] + 1, ino, bytenr, orig_offset, &other_start, &other_end)) { if (recow) { btrfs_release_path(path); goto again; } extent_end = other_end; del_slot = path->slots[0] + 1; del_nr++; ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } other_start = 0; other_end = start; if (extent_mergeable(leaf, path->slots[0] - 1, ino, bytenr, orig_offset, &other_start, &other_end)) { if (recow) { btrfs_release_path(path); goto again; } key.offset = other_start; del_slot = path->slots[0]; del_nr++; ret = btrfs_free_extent(trans, &ref); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } } if (del_nr == 0) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); } else { fi = btrfs_item_ptr(leaf, del_slot - 1, struct btrfs_file_extent_item); btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); btrfs_set_file_extent_generation(leaf, fi, trans->transid); btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); ret = btrfs_del_items(trans, root, path, del_slot, del_nr); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; } } out: return ret; } /* * On error return an unlocked folio and the error value * On success return a locked folio and 0 */ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; if (IS_ALIGNED(clamp_start, blocksize) && IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); if (ret) return ret; folio_lock(folio); if (!folio_test_uptodate(folio)) { folio_unlock(folio); return -EIO; } /* * Since btrfs_read_folio() will unlock the folio before it returns, * there is a window where btrfs_release_folio() can be called to * release the page. Here we check both inode mapping and page * private to make sure the page was not released. * * The private flag check is essential for subpage as we need to store * extra bitmap using folio private. */ if (folio->mapping != inode->i_mapping || !folio_test_private(folio)) { folio_unlock(folio); return -EAGAIN; } return 0; } static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) { gfp_t gfp; gfp = btrfs_alloc_write_mask(inode->i_mapping); if (nowait) { gfp &= ~__GFP_DIRECT_RECLAIM; gfp |= GFP_NOWAIT; } return gfp; } /* * Get folio into the page cache and lock it. */ static noinline int prepare_one_folio(struct inode *inode, struct folio **folio_ret, loff_t pos, size_t write_bytes, bool nowait) { unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | fgf_set_order(write_bytes); struct folio *folio; int ret = 0; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); if (IS_ERR(folio)) return PTR_ERR(folio); ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. */ folio_put(folio); if (!nowait && ret == -EAGAIN) { ret = 0; goto again; } return ret; } *folio_ret = folio; return 0; } /* * Locks the extent and properly waits for data=ordered extents to finish * before allowing the folios to be modified if need. * * Return: * 1 - the extent is locked * 0 - the extent is not locked, and everything is OK * -EAGAIN - need to prepare the folios again */ static noinline int lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, loff_t pos, size_t write_bytes, u64 *lockstart, u64 *lockend, bool nowait, struct extent_state **cached_state) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 start_pos; u64 last_pos; int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; if (nowait) { if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state)) { folio_unlock(folio); folio_put(folio); return -EAGAIN; } } else { btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); return -EAGAIN; } if (ordered) btrfs_put_ordered_extent(ordered); *lockstart = start_pos; *lockend = last_pos; ret = 1; } /* * We should be called after prepare_one_folio() which should have locked * all pages in the range. */ WARN_ON(!folio_test_locked(folio)); return ret; } /* * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) * * @pos: File offset. * @write_bytes: The length to write, will be updated to the nocow writeable * range. * * This function will flush ordered extents in the range to ensure proper * nocow checks. * * Return: * > 0 If we can nocow, and updates @write_bytes. * 0 If we can't do a nocow write. * -EAGAIN If we can't do a nocow write because snapshoting of the inode's * root is in progress. * < 0 If an error happened. * * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. */ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 num_bytes; int ret; if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); lockend = round_up(pos + *write_bytes, fs_info->sectorsize) - 1; num_bytes = lockend - lockstart + 1; if (nowait) { if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, &cached_state)) { btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; } } else { btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } void btrfs_check_nocow_unlock(struct btrfs_inode *inode) { btrfs_drew_write_unlock(&inode->root->snapshot_lock); } int btrfs_write_check(struct kiocb *iocb, size_t count) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); loff_t pos = iocb->ki_pos; int ret; loff_t oldsize; /* * Quickly bail out on NOWAIT writes if we don't have the nodatacow or * prealloc flags, as without those flags we always have to COW. We will * later check if we can really COW into the target range (using * can_nocow_extent() at btrfs_get_blocks_direct_write()). */ if ((iocb->ki_flags & IOCB_NOWAIT) && !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return -EAGAIN; ret = file_remove_privs(file); if (ret) return ret; /* * We reserve space for updating the inode when we reserve space for the * extent we are going to write, so we will enospc out there. We don't * need to start yet another transaction to update the inode as we will * update the inode when we finish writing whatever data we write. */ if (!IS_NOCMTIME(inode)) { inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); inode_inc_iversion(inode); } oldsize = i_size_read(inode); if (pos > oldsize) { /* Expand hole size to cover write data, preventing empty gap */ loff_t end_pos = round_up(pos + count, fs_info->sectorsize); ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos); if (ret) return ret; } return 0; } static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, u64 start, u64 len, bool only_release_metadata) { if (len == 0) return; if (only_release_metadata) { btrfs_check_nocow_unlock(inode); btrfs_delalloc_release_metadata(inode, len, true); } else { const struct btrfs_fs_info *fs_info = inode->root->fs_info; btrfs_delalloc_release_space(inode, data_reserved, round_down(start, fs_info->sectorsize), len, true); } } /* * Reserve data and metadata space for this buffered write range. * * Return >0 for the number of bytes reserved, which is always block aligned. * Return <0 for error. */ static ssize_t reserve_space(struct btrfs_inode *inode, struct extent_changeset **data_reserved, u64 start, size_t *len, bool nowait, bool *only_release_metadata) { const struct btrfs_fs_info *fs_info = inode->root->fs_info; const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); size_t reserve_bytes; int ret; ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); if (ret < 0) { int can_nocow; if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) return -EAGAIN; /* * If we don't have to COW at the offset, reserve metadata only. * write_bytes may get smaller than requested here. */ can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); if (can_nocow < 0) ret = can_nocow; if (can_nocow > 0) ret = 0; if (ret) return ret; *only_release_metadata = true; } reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, reserve_bytes, nowait); if (ret) { if (!*only_release_metadata) btrfs_free_reserved_data_space(inode, *data_reserved, start, *len); else btrfs_check_nocow_unlock(inode); if (nowait && ret == -ENOSPC) ret = -EAGAIN; return ret; } return reserve_bytes; } /* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ static void shrink_reserved_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, u64 reserved_start, u64 reserved_len, u64 new_len, bool only_release_metadata) { const u64 diff = reserved_len - new_len; ASSERT(new_len <= reserved_len); btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); if (only_release_metadata) btrfs_delalloc_release_metadata(inode, diff, true); else btrfs_delalloc_release_space(inode, data_reserved, reserved_start + new_len, diff, true); } /* Calculate the maximum amount of bytes we can write into one folio. */ static size_t calc_write_bytes(const struct btrfs_inode *inode, const struct iov_iter *iter, u64 start) { const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); return min(max_folio_size - (start & (max_folio_size - 1)), iov_iter_count(iter)); } /* * Do the heavy-lifting work to copy one range into one folio of the page cache. * * Return > 0 in case we copied all bytes or just some of them. * Return 0 if no bytes were copied, in which case the caller should retry. * Return <0 on error. */ static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, struct extent_changeset **data_reserved, u64 start, bool nowait) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; size_t write_bytes = calc_write_bytes(inode, iter, start); size_t copied; const u64 reserved_start = round_down(start, fs_info->sectorsize); u64 reserved_len; struct folio *folio = NULL; int extents_locked; u64 lockstart; u64 lockend; bool only_release_metadata = false; const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); int ret; /* * Fault all pages before locking them in prepare_one_folio() to avoid * recursive lock. */ if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) return -EFAULT; extent_changeset_release(*data_reserved); ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, &only_release_metadata); if (ret < 0) return ret; reserved_len = ret; /* Write range must be inside the reserved range. */ ASSERT(reserved_start <= start); ASSERT(start + write_bytes <= reserved_start + reserved_len); again: ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, bdp_flags); if (ret) { btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); return ret; } ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); if (ret) { btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); return ret; } /* * The reserved range goes beyond the current folio, shrink the reserved * space to the folio boundary. */ if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { const u64 last_block = folio_pos(folio) + folio_size(folio); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, only_release_metadata); write_bytes = last_block - start; reserved_len = last_block - reserved_start; } extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, write_bytes, &lockstart, &lockend, nowait, &cached_state); if (extents_locked < 0) { if (!nowait && extents_locked == -EAGAIN) goto again; btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); ret = extents_locked; return ret; } copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), write_bytes, iter); flush_dcache_folio(folio); if (unlikely(copied < write_bytes)) { u64 last_block; /* * The original write range doesn't need an uptodate folio as * the range is block aligned. But now a short copy happened. * We cannot handle it without an uptodate folio. * * So just revert the range and we will retry. */ if (!folio_test_uptodate(folio)) { iov_iter_revert(iter, copied); copied = 0; } /* No copied bytes, unlock, release reserved space and exit. */ if (copied == 0) { if (extents_locked) btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); else btrfs_free_extent_state(cached_state); btrfs_delalloc_release_extents(inode, reserved_len); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); btrfs_drop_folio(fs_info, folio, start, copied); return 0; } /* Release the reserved space beyond the last block. */ last_block = round_up(start + copied, fs_info->sectorsize); shrink_reserved_space(inode, *data_reserved, reserved_start, reserved_len, last_block - reserved_start, only_release_metadata); reserved_len = last_block - reserved_start; } ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, only_release_metadata); /* * If we have not locked the extent range, because the range's start * offset is >= i_size, we might still have a non-NULL cached extent * state, acquired while marking the extent range as delalloc through * btrfs_dirty_page(). Therefore free any possible cached extent state * to avoid a memory leak. */ if (extents_locked) btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); else btrfs_free_extent_state(cached_state); btrfs_delalloc_release_extents(inode, reserved_len); if (ret) { btrfs_drop_folio(fs_info, folio, start, copied); release_space(inode, *data_reserved, reserved_start, reserved_len, only_release_metadata); return ret; } if (only_release_metadata) btrfs_check_nocow_unlock(inode); btrfs_drop_folio(fs_info, folio, start, copied); return copied; } ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; loff_t pos; struct inode *inode = file_inode(file); struct extent_changeset *data_reserved = NULL; size_t num_written = 0; ssize_t ret; loff_t old_isize; unsigned int ilock_flags = 0; const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; /* * We can only trust the isize with inode lock held, or it can race with * other buffered writes and cause incorrect call of * pagecache_isize_extended() to overwrite existing data. */ old_isize = i_size_read(inode); ret = generic_write_checks(iocb, iter); if (ret <= 0) goto out; ret = btrfs_write_check(iocb, ret); if (ret < 0) goto out; pos = iocb->ki_pos; while (iov_iter_count(iter) > 0) { ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); if (ret < 0) break; pos += ret; num_written += ret; cond_resched(); } extent_changeset_free(data_reserved); if (num_written > 0) { pagecache_isize_extended(inode, old_isize, iocb->ki_pos); iocb->ki_pos += num_written; } out: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? num_written : ret; } static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); loff_t count; ssize_t ret; btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { /* * The write got truncated by generic_write_checks_count(). We * can't do a partial encoded write. */ ret = -EFBIG; } if (ret || encoded->len == 0) goto out; ret = btrfs_write_check(iocb, encoded->len); if (ret < 0) goto out; ret = btrfs_do_encoded_write(iocb, from, encoded); out: btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded) { struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; /* * If the fs flips readonly due to some impossible error, although we * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ if (BTRFS_FS_ERROR(inode->root->fs_info)) return -EROFS; if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) return -EOPNOTSUPP; if (encoded) { num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; } else if (iocb->ki_flags & IOCB_DIRECT) { num_written = btrfs_direct_write(iocb, from); num_sync = num_written; } else { num_written = btrfs_buffered_write(iocb, from); num_sync = num_written; } btrfs_set_inode_last_sub_trans(inode); if (num_sync > 0) { num_sync = generic_write_sync(iocb, num_sync); if (num_sync < 0) num_written = num_sync; } return num_written; } static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { return btrfs_do_write_iter(iocb, from, NULL); } int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; if (private) { kfree(private->filldir_buf); btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } /* * Set by setattr when we are about to truncate a file from a non-zero * size to a zero size. This tries to flush down new bytes that may * have been written if the application were using truncate to replace * a file in place. */ if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; } static int start_ordered_ops(struct btrfs_inode *inode, loff_t start, loff_t end) { int ret; struct blk_plug plug; /* * This is only called in fsync, which would do synchronous writes, so * a plug can merge adjacent IOs as much as possible. Esp. in case of * multiple disks using raid profile, a large IO can be split to * several segments of stripe length (currently 64K). */ blk_start_plug(&plug); ret = btrfs_fdatawrite_range(inode, start, end); blk_finish_plug(&plug); return ret; } static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) { struct btrfs_inode *inode = ctx->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; if (btrfs_inode_in_log(inode, btrfs_get_fs_generation(fs_info)) && list_empty(&ctx->ordered_extents)) return true; /* * If we are doing a fast fsync we can not bail out if the inode's * last_trans is <= then the last committed transaction, because we only * update the last_trans of the inode during ordered extent completion, * and for a fast fsync we don't wait for that, we only wait for the * writeback to complete. */ if (inode->last_trans <= btrfs_get_last_trans_committed(fs_info) && (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || list_empty(&ctx->ordered_extents))) return true; return false; } /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. * * It needs to call filemap_fdatawait so that all ordered extent updates are * in the metadata btree are up to date for copying to the log. * * It drops the inode mutex before doing the tree log commit. This is an * important optimization for directories because holding the mutex prevents * new operations on the dir while we write to disk. */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct dentry *dentry = file_dentry(file); struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; u64 len; bool full_sync; bool skip_ilock = false; if (current->journal_info == BTRFS_TRANS_DIO_WRITE_STUB) { skip_ilock = true; current->journal_info = NULL; btrfs_assert_inode_locked(inode); } trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* * Always set the range to a full range, otherwise we can get into * several problems, from missing file extent items to represent holes * when not using the NO_HOLES feature, to log tree corruption due to * races between hole detection during logging and completion of ordered * extents outside the range, to missing checksums due to ordered extents * for which we flushed only a subset of their pages. */ start = 0; end = LLONG_MAX; len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete * out of the ->i_mutex. If so, we can flush the dirty pages by * multi-task, and make the performance up. See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ ret = start_ordered_ops(inode, start, end); if (ret) goto out; if (skip_ilock) down_write(&inode->i_mmap_lock); else btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); /* * Before we acquired the inode's lock and the mmap lock, someone may * have dirtied more pages in the target range. We need to make sure * that writeback for any such pages does not start while we are logging * the inode, because if it does, any of the following might happen when * we are not doing a full inode sync: * * 1) We log an extent after its writeback finishes but before its * checksums are added to the csum tree, leading to -EIO errors * when attempting to read the extent after a log replay. * * 2) We can end up logging an extent before its writeback finishes. * Therefore after the log replay we will have a file extent item * pointing to an unwritten extent (and no data checksums as well). * * So trigger writeback for any eventual new dirty pages and then we * wait for all ordered extents to complete below. */ ret = start_ordered_ops(inode, start, end); if (ret) { if (skip_ilock) up_write(&inode->i_mmap_lock); else btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } /* * Always check for the full sync flag while holding the inode's lock, * to avoid races with other tasks. The flag must be either set all the * time during logging or always off all the time while logging. * We check the flag here after starting delalloc above, because when * running delalloc the full sync flag may be set if we need to drop * extra extent map ranges due to temporary memory allocation failures. */ full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * * For a full fsync we wait for the ordered extents to complete while * for a fast fsync we wait just for writeback to complete, and then * attach the ordered extents to the transaction so that a transaction * commit waits for their completion, to avoid data loss if we fsync, * the current transaction commits before the ordered extents complete * and a power failure happens right after that. * * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the * logical address recorded in the ordered extent may change. We need * to wait for the IO to stabilize the logical address. */ if (full_sync || btrfs_is_zoned(fs_info)) { ret = btrfs_wait_ordered_range(inode, start, len); clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags); } else { /* * Get our ordered extents as soon as possible to avoid doing * checksum lookups in the csum tree, and use instead the * checksums attached to the ordered extents. */ btrfs_get_ordered_extents_for_logging(inode, &ctx.ordered_extents); ret = filemap_fdatawait_range(inode->vfs_inode.i_mapping, start, end); if (ret) goto out_release_extents; /* * Check and clear the BTRFS_INODE_COW_WRITE_ERROR now after * starting and waiting for writeback, because for buffered IO * it may have been set during the end IO callback * (end_bbio_data_write() -> btrfs_finish_ordered_extent()) in * case an error happened and we need to wait for ordered * extents to complete so that any extent maps that point to * unwritten locations are dropped and we don't log them. */ if (test_and_clear_bit(BTRFS_INODE_COW_WRITE_ERROR, &inode->runtime_flags)) ret = btrfs_wait_ordered_range(inode, start, len); } if (ret) goto out_release_extents; atomic_inc(&root->log_batch); if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever * reason, it's no longer relevant. */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * An ordered extent might have started before and completed * already with io errors, in which case the inode was not * updated and we end up here. So check the inode's mapping * for any errors that might have happened since we last * checked called fsync. */ ret = filemap_check_wb_err(inode->vfs_inode.i_mapping, file->f_wb_err); goto out_release_extents; } btrfs_init_log_ctx_scratch_eb(&ctx); /* * We use start here because we will need to wait on the IO to complete * in btrfs_sync_log, which could require joining a transaction (for * example checking cross references in the nocow path). If we use join * here we could get into a situation where we're waiting on IO to * happen that is blocked on a transaction trying to commit. With start * we inc the extwriter counter, so we wait for all extwriters to exit * before we start blocking joiners. This comment is to keep somebody * from thinking they are super smart and changing this to * btrfs_join_transaction *cough*Josef*cough*. */ trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_release_extents; } trans->in_fsync = true; ret = btrfs_log_dentry_safe(trans, dentry, &ctx); /* * Scratch eb no longer needed, release before syncing log or commit * transaction, to avoid holding unnecessary memory during such long * operations. */ if (ctx.scratch_eb) { free_extent_buffer(ctx.scratch_eb); ctx.scratch_eb = NULL; } btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent * version of the file in the log. It is possible that * someone will come in and modify the file, but that's * fine because the log is consistent on disk, and we * have references to all of the file's extents * * It is possible that someone will come in and log the * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ if (skip_ilock) up_write(&inode->i_mmap_lock); else btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); goto out; } /* We successfully logged the inode, attempt to sync the log. */ if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { ret = btrfs_end_transaction(trans); goto out; } } /* * At this point we need to commit the transaction because we had * btrfs_need_log_full_commit() or some other error. * * If we didn't do a full sync we have to stop the trans handle, wait on * the ordered extents, start it again and commit the transaction. If * we attempt to wait on the ordered extents here we could deadlock with * something like fallocate() that is holding the extent lock trying to * start a transaction while some other thread is trying to commit the * transaction while we (fsync) are currently holding the transaction * open. */ if (!full_sync) { ret = btrfs_end_transaction(trans); if (ret) goto out; ret = btrfs_wait_ordered_range(inode, start, len); if (ret) goto out; /* * This is safe to use here because we're only interested in * making sure the transaction that had the ordered extents is * committed. We aren't waiting on anything past this point, * we're purely getting the transaction and committing it. */ trans = btrfs_attach_transaction_barrier(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); /* * We committed the transaction and there's no currently * running transaction, this means everything we care * about made it to disk and we are done. */ if (ret == -ENOENT) ret = 0; goto out; } } ret = btrfs_commit_transaction(trans); out: free_extent_buffer(ctx.scratch_eb); ASSERT(list_empty(&ctx.list)); ASSERT(list_empty(&ctx.conflict_inodes)); err = file_check_and_advance_wb_err(file); if (!ret) ret = err; return ret > 0 ? -EIO : ret; out_release_extents: btrfs_release_log_ctx_extents(&ctx); if (skip_ilock) up_write(&inode->i_mmap_lock); else btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } /* * btrfs_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate_setsize() writes the inode size before removing pages, once we have * the page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; size_t fsize = folio_size(folio); int ret; u64 reserved_space; u64 page_start; u64 page_end; u64 end; reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); page_end = page_start + folio_size(folio) - 1; end = page_end; /* * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, page_start, reserved_space); if (ret < 0) goto out_noreserve; ret = file_update_time(vmf->vma->vm_file); if (ret < 0) goto out; again: down_read(&BTRFS_I(inode)->i_mmap_lock); folio_lock(folio); size = i_size_read(inode); if ((folio->mapping != inode->i_mapping) || (page_start >= size)) { /* Page got truncated out from underneath us. */ goto out_unlock; } folio_wait_writeback(folio); btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); ret = set_folio_extent_mapped(folio); if (ret < 0) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } /* * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, end + 1, fsize - reserved_space, true); } } /* * page_mkwrite gets called when the page is firstly dirtied after it's * faulted in, but write(2) could also dirty a page and set delalloc * bits, thus in this case for space account reason, we still need to * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). */ btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); if (ret < 0) { btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } /* Page is wholly or partially inside EOF. */ if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else zero_start = fsize; if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, reserved_space, true); extent_changeset_free(data_reserved); out_noreserve: sb_end_pagefault(inode->i_sb); if (ret < 0) return vmf_error(ret); /* Make the VM retry the fault. */ return VM_FAULT_NOPAGE; } static const struct vm_operations_struct btrfs_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = btrfs_page_mkwrite, }; static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct address_space *mapping = filp->f_mapping; if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); vma->vm_ops = &btrfs_file_vm_ops; return 0; } static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) return false; if (key.offset == end) return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) return true; return false; } static int fill_holes(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, u64 offset, u64 end) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; struct extent_map *hole_em; struct btrfs_key key; int ret; if (btrfs_fs_incompat(fs_info, NO_HOLES)) goto out; key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret <= 0) { /* * We should have dropped this offset, so if we find it then * something has gone horribly wrong. */ if (ret == 0) ret = -EINVAL; return ret; } leaf = path->nodes[0]; if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) { u64 num_bytes; path->slots[0]--; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - offset; btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); goto out; } if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) { u64 num_bytes; key.offset = offset; btrfs_set_item_key_safe(trans, path, &key); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end - offset; btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); btrfs_set_file_extent_offset(leaf, fi, 0); btrfs_set_file_extent_generation(leaf, fi, trans->transid); goto out; } btrfs_release_path(path); ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, end - offset); if (ret) return ret; out: btrfs_release_path(path); hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); } else { hole_em->start = offset; hole_em->len = end - offset; hole_em->ram_bytes = hole_em->len; hole_em->disk_bytenr = EXTENT_MAP_HOLE; hole_em->disk_num_bytes = 0; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); btrfs_free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); } return 0; } /* * Find a hole extent on given inode and change start/len to the end of hole * extent.(hole/vacuum extent whose em->start <= start && * em->start + em->len > start) * When a hole extent is found, return 1 and modify start/len. */ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em; int ret = 0; em = btrfs_get_extent(inode, NULL, round_down(*start, fs_info->sectorsize), round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->disk_bytenr == EXTENT_MAP_HOLE) { ret = 1; *len = em->start + em->len > *start + *len ? 0 : *start + *len - em->start - em->len; *start = em->start + em->len; } btrfs_free_extent_map(em); return ret; } /* * Check if there is no folio in the range. * * We cannot utilize filemap_range_has_page() in a filemap with large folios * as we can hit the following false positive: * * start end * | | * |//|//|//|//| | | | | | | | |//|//| * \ / \ / * Folio A Folio B * * That large folio A and B cover the start and end indexes. * In that case filemap_range_has_page() will always return true, but the above * case is fine for btrfs_punch_hole_lock_range() usage. * * So here we only ensure that no other folios is in the range, excluding the * head/tail large folio. */ static bool check_range_has_page(struct inode *inode, u64 start, u64 end) { struct folio_batch fbatch; bool ret = false; /* * For subpage case, if the range is not at page boundary, we could * have pages at the leading/tailing part of the range. * This could lead to dead loop since filemap_range_has_page() * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). * * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(start, PAGE_SIZE); const u64 page_lockend = round_down(end + 1, PAGE_SIZE); const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; pgoff_t tmp = start_index; int found_folios; /* The same page or adjacent pages. */ if (page_lockend <= page_lockstart) return false; folio_batch_init(&fbatch); found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); for (int i = 0; i < found_folios; i++) { struct folio *folio = fbatch.folios[i]; /* A large folio begins before the start. Not a target. */ if (folio->index < start_index) continue; /* A large folio extends beyond the end. Not a target. */ if (folio->index + folio_nr_pages(folio) > end_index) continue; /* A folio doesn't cover the head/tail index. Found a target. */ ret = true; break; } folio_batch_release(&fbatch); return ret; } static void btrfs_punch_hole_lock_range(struct inode *inode, const u64 lockstart, const u64 lockend, struct extent_state **cached_state) { while (1) { truncate_pagecache_range(inode, lockstart, lockend); btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive * mode, we have locked the inode's i_mmap_lock in exclusive mode, * we have flushed all delalloc in the range and we have waited * for any ordered extents in the range to complete. * We can race with anyone reading pages from this range, so after * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ if (!check_range_has_page(inode, lockstart, lockend)) break; btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, const u64 replace_len, const u64 bytes_to_drop) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *extent; struct extent_buffer *leaf; struct btrfs_key key; int slot; int ret; if (replace_len == 0) return 0; if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) { btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; } key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; write_extent_buffer(leaf, extent_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ if (extent_info->disk_offset == 0) { btrfs_update_inode_bytes(inode, 0, bytes_to_drop); return 0; } btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop); if (extent_info->is_new_extent && extent_info->insertions == 0) { key.objectid = extent_info->disk_offset; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = extent_info->disk_len; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), extent_info->file_offset, extent_info->qgroup_reserved, &key); } else { struct btrfs_ref ref = { .action = BTRFS_ADD_DELAYED_REF, .bytenr = extent_info->disk_offset, .num_bytes = extent_info->disk_len, .owning_root = btrfs_root_id(root), .ref_root = btrfs_root_id(root), }; u64 ref_offset; ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, btrfs_ino(inode), ref_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); } extent_info->insertions++; return ret; } /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing * the file range with an extent. * When not punching a hole, we don't want to end up in a state where we dropped * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_block_rsv *rsv; unsigned int rsv_count; u64 cur_offset; u64 len = end - start; int ret = 0; if (end <= start) return -EINVAL; rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP); if (!rsv) { ret = -ENOMEM; goto out; } rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); rsv->failfast = true; /* * 1 - update the inode * 1 - removing the extents in the range * 1 - adding the hole extent if no_holes isn't set or if we are * replacing the range with a new extent */ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out_free; } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); if (WARN_ON(ret)) goto out_trans; trans->block_rsv = rsv; cur_offset = start; drop_args.path = path; drop_args.end = end + 1; drop_args.drop_cache = true; while (cur_offset < end) { drop_args.start = cur_offset; ret = btrfs_drop_extents(trans, root, inode, &drop_args); /* If we are punching a hole decrement the inode's byte count */ if (!extent_info) btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); if (ret != -ENOSPC) { /* * The only time we don't want to abort is if we are * attempting to clone a partial inline extent, in which * case we'll get EOPNOTSUPP. However if we aren't * clone we need to abort no matter what, because if we * got EOPNOTSUPP via prealloc then we messed up and * need to abort. */ if (ret && (ret != -EOPNOTSUPP || (extent_info && extent_info->is_new_extent))) btrfs_abort_transaction(trans, ret); break; } trans->block_rsv = &fs_info->trans_block_rsv; if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole * entries for the area we dropped, so now the * fs is corrupted, so we must abort the * transaction. */ btrfs_abort_transaction(trans, ret); break; } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we * know to not set disk_i_size in this area until a new * file extent is inserted here. */ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (ret) { /* * We couldn't clear our area, so we could * presumably adjust up and corrupt the fs, so * we need to abort. */ btrfs_abort_transaction(trans, ret); break; } } if (extent_info && drop_args.drop_end > extent_info->file_offset) { u64 replace_len = drop_args.drop_end - extent_info->file_offset; ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len, drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); break; } extent_info->data_len -= replace_len; extent_info->data_offset += replace_len; extent_info->file_offset += replace_len; } /* * We are releasing our handle on the transaction, balance the * dirty pages of the btree inode and flush delayed items, and * then get a new transaction handle, which may now point to a * new transaction in case someone else may have committed the * transaction we used to replace/drop file extent items. So * bump the inode's iversion and update mtime and ctime except * if we are called from a dedupe context. This is because a * power failure/crash may happen after the transaction is * committed and before we finish replacing/dropping all the * file extent items we need. */ inode_inc_iversion(&inode->vfs_inode); if (!extent_info || extent_info->update_times) inode_set_mtime_to_ts(&inode->vfs_inode, inode_set_ctime_current(&inode->vfs_inode)); ret = btrfs_update_inode(trans, inode); if (ret) break; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); trans = btrfs_start_transaction(root, rsv_count); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; break; } ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); if (WARN_ON(ret)) break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; len = end - cur_offset; if (!extent_info && len) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; if (ret && !len) { ret = 0; break; } } } /* * If we were cloning, force the next fsync to be a full one since we * we replaced (or just dropped in the case of cloning holes when * NO_HOLES is enabled) file extent items and did not setup new extent * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) btrfs_set_inode_full_sync(inode); if (ret) goto out_trans; trans->block_rsv = &fs_info->trans_block_rsv; /* * If we are using the NO_HOLES feature we might have had already an * hole that overlaps a part of the region [lockstart, lockend] and * ends at (or beyond) lockend. Since we have no file extent items to * represent holes, drop_end can be less than lockend and so we must * make sure we have an extent map representing the existing hole (the * call to __btrfs_drop_extents() might have dropped the existing extent * map representing the existing hole), otherwise the fast fsync path * will not record the existence of the hole region * [existing_hole_start, lockend]. */ if (drop_args.drop_end <= end) drop_args.drop_end = end + 1; /* * Don't insert file hole extent item if it's for a range beyond eof * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ if (!extent_info && cur_offset < ino_size && cur_offset < drop_args.drop_end) { ret = fill_holes(trans, inode, path, cur_offset, drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); goto out_trans; } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; } } if (extent_info) { ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; } } out_trans: if (!trans) goto out_free; trans->block_rsv = &fs_info->trans_block_rsv; if (ret) btrfs_end_transaction(trans); else *trans_out = trans; out_free: btrfs_free_block_rsv(fs_info, rsv); out: return ret; } static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_state *cached_state = NULL; struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; u64 lockstart; u64 lockend; u64 tail_start; u64 tail_len; const u64 orig_start = offset; const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; bool truncated_block = false; bool updated_inode = false; btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(BTRFS_I(inode), offset, len); if (ret) goto out_only_mutex; ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { /* Already in a large hole */ ret = 0; goto out_only_mutex; } ret = file_modified(file); if (ret) goto out_only_mutex; lockstart = round_up(offset, fs_info->sectorsize); lockend = round_down(offset + len, fs_info->sectorsize) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* * Only do this if we are in the same block and we aren't doing the * entire block. */ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, orig_start, orig_end); } else { ret = 0; } goto out_only_mutex; } /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } /* Check the aligned pages after the first unaligned page, * if offset != orig_start, which means the first unaligned page * including several following pages are already in holes, * the extra check can be skipped */ if (offset == orig_start) { /* after truncate page, check hole again */ len = offset + len - lockstart; offset = lockstart; ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) goto out_only_mutex; if (ret && !len) { ret = 0; goto out_only_mutex; } lockstart = offset; } /* Check the tail unaligned part is in a hole */ tail_start = lockend + 1; tail_len = offset + len - tail_start; if (tail_len) { ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len); if (unlikely(ret < 0)) goto out_only_mutex; if (!ret) { /* zero the front end of the last page */ if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), tail_start + tail_len - 1, orig_start, orig_end); if (ret) goto out_only_mutex; } } } if (lockend < lockstart) { ret = 0; goto out_only_mutex; } btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) goto out; ASSERT(trans != NULL); inode_inc_iversion(inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = btrfs_update_inode(trans, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* * If we only end up zeroing part of a page, we still need to * update the inode item, so that all the time fields are * updated as well as the necessary btrfs inode in memory fields * for detecting, at fsync time, if the inode isn't yet in the * log tree or it's there but not up to date. */ struct timespec64 now = inode_set_ctime_current(inode); inode_inc_iversion(inode); inode_set_mtime_to_ts(inode, now); trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); } else { int ret2; ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); if (!ret) ret = ret2; } } btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } /* Helper structure to record which range is already reserved */ struct falloc_range { struct list_head list; u64 start; u64 len; }; /* * Helper function to add falloc range * * Caller should have locked the larger range of extent containing * [start, len) */ static int add_falloc_range(struct list_head *head, u64 start, u64 len) { struct falloc_range *range = NULL; if (!list_empty(head)) { /* * As fallocate iterates by bytenr order, we only need to check * the last range. */ range = list_last_entry(head, struct falloc_range, list); if (range->start + range->len == start) { range->len += len; return 0; } } range = kmalloc(sizeof(*range), GFP_KERNEL); if (!range) return -ENOMEM; range->start = start; range->len = len; list_add_tail(&range->list, head); return 0; } static int btrfs_fallocate_update_isize(struct inode *inode, const u64 end, const int mode) { struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; int ret; int ret2; if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode)) return 0; trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) return PTR_ERR(trans); inode_set_ctime_current(inode); i_size_write(inode, end); btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); ret = btrfs_update_inode(trans, BTRFS_I(inode)); ret2 = btrfs_end_transaction(trans); return ret ? ret : ret2; } enum { RANGE_BOUNDARY_WRITTEN_EXTENT, RANGE_BOUNDARY_PREALLOC_EXTENT, RANGE_BOUNDARY_HOLE, }; static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { const u64 sectorsize = inode->root->fs_info->sectorsize; struct extent_map *em; int ret; offset = round_down(offset, sectorsize); em = btrfs_get_extent(inode, NULL, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); if (em->disk_bytenr == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; btrfs_free_extent_map(em); return ret; } static int btrfs_zero_range(struct inode *inode, loff_t offset, loff_t len, const int mode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_map *em; struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; const u64 orig_start = offset; const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; bool space_reserved = false; em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } /* * Avoid hole punching and extent allocation for some cases. More cases * could be considered, but these are unlikely common and we keep things * as simple as possible for now. Also, intentionally, if the target * range contains one or more prealloc extents together with regular * extents and holes, we drop all the existing extents and allocate a * new prealloc extent, so that we get a larger contiguous disk extent. */ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { const u64 em_end = em->start + em->len; if (em_end >= offset + len) { /* * The whole range is already a prealloc extent, * do nothing except updating the inode's i_size if * needed. */ btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } /* * Part of the range is already a prealloc extent, so operate * only on the remaining part of the range. */ alloc_start = em_end; ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; alloc_hint = btrfs_extent_map_block_start(em) + em->len; } btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { em = btrfs_get_extent(BTRFS_I(inode), NULL, alloc_start, sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; } if (em->flags & EXTENT_FLAG_PREALLOC) { btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { btrfs_free_extent_map(em); ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; } alloc_start = round_up(offset, sectorsize); alloc_end = round_down(offset + len, sectorsize); /* * For unaligned ranges, check the pages at the boundaries, they might * map to an extent, in which case we need to partially zero them, or * they might map to a hole, in which case we need our allocation range * to cover them. */ if (!IS_ALIGNED(offset, sectorsize)) { ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) goto out; } else { ret = 0; } } if (!IS_ALIGNED(offset + len, sectorsize)) { ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, orig_start, orig_end); if (ret) goto out; } else { ret = 0; } } reserve_space: if (alloc_start < alloc_end) { struct extent_state *cached_state = NULL; const u64 lockstart = alloc_start; const u64 lockend = alloc_end - 1; bytes_to_reserve = alloc_end - alloc_start; ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), bytes_to_reserve); if (ret < 0) goto out; space_reserved = true; btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; goto out; } } ret = btrfs_fallocate_update_isize(inode, offset + len, mode); out: if (ret && space_reserved) btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, alloc_start, bytes_to_reserve); extent_changeset_free(data_reserved); return ret; } static long btrfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct falloc_range *range; struct falloc_range *tmp; LIST_HEAD(reserve_list); u64 cur_offset; u64 last_byte; u64 alloc_start; u64 alloc_end; u64 alloc_hint = 0; u64 locked_end; u64 actual_end = 0; u64 data_space_needed = 0; u64 data_space_reserved = 0; u64 qgroup_reserved = 0; struct extent_map *em; int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize; int ret; /* Do not allow fallocate in ZONED mode */ if (btrfs_is_zoned(inode_to_fs_info(inode))) return -EOPNOTSUPP; alloc_start = round_down(offset, blocksize); alloc_end = round_up(offset + len, blocksize); cur_offset = alloc_start; /* Make sure we aren't being give some crap mode */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); if (ret) goto out; } ret = file_modified(file); if (ret) goto out; /* * TODO: Move these two operations after we have checked * accurate reserved space, or fallocate can still fail but * with page truncated or size expanded. * * But that's a minor problem and won't do much harm BTW. */ if (alloc_start > inode->i_size) { ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode), alloc_start); if (ret) goto out; } else if (offset + len > inode->i_size) { /* * If we are fallocating from the end of the file onward we * need to zero out the end of the block if i_size lands in the * middle of a block. */ ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, inode->i_size, (u64)-1); if (ret) goto out; } /* * We have locked the inode at the VFS level (in exclusive mode) and we * have locked the i_mmap_lock lock (in exclusive mode). Now before * locking the file range, flush all dealloc in the range and wait for * all ordered extents in the range to complete. After this we can lock * the file range and, due to the previous locking we did, we know there * can't be more delalloc or ordered extents in the range. */ ret = btrfs_wait_ordered_range(BTRFS_I(inode), alloc_start, alloc_end - alloc_start); if (ret) goto out; if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } locked_end = alloc_end - 1; btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, cur_offset, alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); break; } last_byte = min(btrfs_extent_map_end(em), alloc_end); actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { btrfs_free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, cur_offset, range_len); if (ret < 0) { btrfs_free_extent_map(em); break; } qgroup_reserved += range_len; data_space_needed += range_len; } btrfs_free_extent_map(em); cur_offset = last_byte; } if (!ret && data_space_needed > 0) { /* * We are safe to reserve space here as we can't have delalloc * in the range, see above. */ ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), data_space_needed); if (!ret) data_space_reserved = data_space_needed; } /* * If ret is still 0, means we're OK to fallocate. * Or just cleanup the list and exit. */ list_for_each_entry_safe(range, tmp, &reserve_list, list) { if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, range->len, blocksize, offset + len, &alloc_hint); /* * btrfs_prealloc_file_range() releases space even * if it returns an error. */ data_space_reserved -= range->len; qgroup_reserved -= range->len; } else if (data_space_reserved > 0) { btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, range->start, range->len); data_space_reserved -= range->len; qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); kfree(range); } if (ret < 0) goto out_unlock; /* * We didn't need to allocate any more space, but we still extended the * size of the file so we need to update i_size and the inode item. */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } /* * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range * that has unflushed and/or flushing delalloc. There might be other adjacent * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; u64 delalloc_len = 0; struct btrfs_ordered_extent *oe; u64 oe_start; u64 oe_end; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. */ if (*search_io_tree) { spin_lock(&inode->lock); if (inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; delalloc_len = btrfs_count_range_bits(&inode->io_tree, delalloc_start_ret, end, len, EXTENT_DELALLOC, 1, cached_state); } else { spin_unlock(&inode->lock); } } if (delalloc_len > 0) { /* * If delalloc was found then *delalloc_start_ret has a sector size * aligned value (rounded down). */ *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; if (*delalloc_start_ret == start) { /* Delalloc for the whole range, nothing more to do. */ if (*delalloc_end_ret == end) return true; /* Else trim our search range for ordered extents. */ start = *delalloc_end_ret + 1; len = end + 1 - start; } } else { /* No delalloc, future calls don't need to search again. */ *search_io_tree = false; } /* * Now also check if there's any ordered extent in the range. * We do this because: * * 1) When delalloc is flushed, the file range is locked, we clear the * EXTENT_DELALLOC bit from the io tree and create an extent map and * an ordered extent for the write. So we might just have been called * after delalloc is flushed and before the ordered extent completes * and inserts the new file extent item in the subvolume's btree; * * 2) We may have an ordered extent created by flushing delalloc for a * subrange that starts before the subrange we found marked with * EXTENT_DELALLOC in the io tree. * * We could also use the extent map tree to find such delalloc that is * being flushed, but using the ordered extents tree is more efficient * because it's usually much smaller as ordered extents are removed from * the tree once they complete. With the extent maps, we mau have them * in the extent map tree for a very long time, and they were either * created by previous writes or loaded by read operations. */ oe = btrfs_lookup_first_ordered_range(inode, start, len); if (!oe) return (delalloc_len > 0); /* The ordered extent may span beyond our search range. */ oe_start = max(oe->file_offset, start); oe_end = min(oe->file_offset + oe->num_bytes - 1, end); btrfs_put_ordered_extent(oe); /* Don't have unflushed delalloc, return the ordered extent range. */ if (delalloc_len == 0) { *delalloc_start_ret = oe_start; *delalloc_end_ret = oe_end; return true; } /* * We have both unflushed delalloc (io_tree) and an ordered extent. * If the ranges are adjacent returned a combined range, otherwise * return the leftmost range. */ if (oe_start < *delalloc_start_ret) { if (oe_end < *delalloc_start_ret) *delalloc_end_ret = oe_end; *delalloc_start_ret = oe_start; } else if (*delalloc_end_ret + 1 == oe_start) { *delalloc_end_ret = oe_end; } return true; } /* * Check if there's delalloc in a given range. * * @inode: The inode. * @start: The start offset of the range. It does not need to be * sector size aligned. * @end: The end offset (inclusive value) of the search range. * It does not need to be sector size aligned. * @cached_state: Extent state record used for speeding up delalloc * searches in the inode's io_tree. Can be NULL. * @delalloc_start_ret: Output argument, set to the start offset of the * subrange found with delalloc (may not be sector size * aligned). * @delalloc_end_ret: Output argument, set to he end offset (inclusive value) * of the subrange found with delalloc. * * Returns true if a subrange with delalloc is found within the given range, and * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and * end offsets of the subrange. */ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); u64 prev_delalloc_end = 0; bool search_io_tree = true; bool ret = false; while (cur_offset <= end) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, cached_state, &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) break; if (prev_delalloc_end == 0) { /* First subrange found. */ *delalloc_start_ret = max(delalloc_start, start); *delalloc_end_ret = delalloc_end; ret = true; } else if (delalloc_start == prev_delalloc_end + 1) { /* Subrange adjacent to the previous one, merge them. */ *delalloc_end_ret = delalloc_end; } else { /* Subrange not adjacent to the previous one, exit. */ break; } prev_delalloc_end = delalloc_end; cur_offset = delalloc_end + 1; cond_resched(); } return ret; } /* * Check if there's a hole or delalloc range in a range representing a hole (or * prealloc extent) found in the inode's subvolume btree. * * @inode: The inode. * @whence: Seek mode (SEEK_DATA or SEEK_HOLE). * @start: Start offset of the hole region. It does not need to be sector * size aligned. * @end: End offset (inclusive value) of the hole region. It does not * need to be sector size aligned. * @start_ret: Return parameter, used to set the start of the subrange in the * hole that matches the search criteria (seek mode), if such * subrange is found (return value of the function is true). * The value returned here may not be sector size aligned. * * Returns true if a subrange matching the given seek mode is found, and if one * is found, it updates @start_ret with the start of the subrange. */ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, struct extent_state **cached_state, u64 start, u64 end, u64 *start_ret) { u64 delalloc_start; u64 delalloc_end; bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; return true; } if (delalloc && whence == SEEK_HOLE) { /* * We found delalloc but it starts after out start offset. So we * have a hole between our start offset and the delalloc start. */ if (start < delalloc_start) { *start_ret = start; return true; } /* * Delalloc range starts at our start offset. * If the delalloc range's length is smaller than our range, * then it means we have a hole that starts where the delalloc * subrange ends. */ if (delalloc_end < end) { *start_ret = delalloc_end + 1; return true; } /* There's delalloc for the whole range. */ return false; } if (!delalloc && whence == SEEK_HOLE) { *start_ret = start; return true; } /* * No delalloc in the range and we are seeking for data. The caller has * to iterate to the next extent item in the subvolume btree. */ return false; } static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); struct btrfs_file_private *private; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; struct extent_state **delalloc_cached_state; const loff_t i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; struct btrfs_path *path; struct btrfs_key key; u64 last_extent_end; u64 lockstart; u64 lockend; u64 start; int ret; bool found = false; if (i_size == 0 || offset >= i_size) return -ENXIO; /* * Quick path. If the inode has no prealloc extents and its number of * bytes used matches its i_size, then it can not have holes. */ if (whence == SEEK_HOLE && !(inode->flags & BTRFS_INODE_PREALLOC) && inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; spin_lock(&inode->lock); private = file->private_data; spin_unlock(&inode->lock); if (private && private->owner_task != current) { /* * Not allocated by us, don't use it as its cached state is used * by the task that allocated it and we don't want neither to * mess with it nor get incorrect results because it reflects an * invalid state for the current task. */ private = NULL; } else if (!private) { private = kzalloc(sizeof(*private), GFP_KERNEL); /* * No worries if memory allocation failed. * The private structure is used only for speeding up multiple * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, * so everything will still be correct. */ if (private) { bool free = false; private->owner_task = current; spin_lock(&inode->lock); if (file->private_data) free = true; else file->private_data = private; spin_unlock(&inode->lock); if (free) { kfree(private); private = NULL; } } } if (private) delalloc_cached_state = &private->llseek_cached_state; else delalloc_cached_state = NULL; /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. */ start = max_t(loff_t, 0, offset); lockstart = round_down(start, fs_info->sectorsize); lockend = round_up(i_size, fs_info->sectorsize); if (lockend <= lockstart) lockend = lockstart + fs_info->sectorsize; lockend--; path = btrfs_alloc_path(); if (!path) return -ENOMEM; path->reada = READA_FORWARD; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; last_extent_end = lockstart; btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { goto out; } else if (ret > 0 && path->slots[0] > 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1); if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } while (start < i_size) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_file_extent_item *extent; u64 extent_end; u8 type; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; else if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) break; extent_end = btrfs_file_extent_end(path); /* * In the first iteration we may have a slot that points to an * extent that ends before our start offset, so skip it. */ if (extent_end <= start) { path->slots[0]++; continue; } /* We have an implicit hole, NO_HOLES feature is likely set. */ if (last_extent_end < key.offset) { u64 search_start = last_extent_end; u64 found_start; /* * First iteration, @start matches @offset and it's * within the hole. */ if (start == offset) search_start = offset; found = find_desired_extent_in_hole(inode, whence, delalloc_cached_state, search_start, key.offset - 1, &found_start); if (found) { start = found_start; break; } /* * Didn't find data or a hole (due to delalloc) in the * implicit hole range, so need to analyze the extent. */ } extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); type = btrfs_file_extent_type(leaf, extent); /* * Can't access the extent's disk_bytenr field if this is an * inline extent, since at that offset, it's where the extent * data starts. */ if (type == BTRFS_FILE_EXTENT_PREALLOC || (type == BTRFS_FILE_EXTENT_REG && btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { /* * Explicit hole or prealloc extent, search for delalloc. * A prealloc extent is treated like a hole. */ u64 search_start = key.offset; u64 found_start; /* * First iteration, @start matches @offset and it's * within the hole. */ if (start == offset) search_start = offset; found = find_desired_extent_in_hole(inode, whence, delalloc_cached_state, search_start, extent_end - 1, &found_start); if (found) { start = found_start; break; } /* * Didn't find data or a hole (due to delalloc) in the * implicit hole range, so need to analyze the next * extent item. */ } else { /* * Found a regular or inline extent. * If we are seeking for data, adjust the start offset * and stop, we're done. */ if (whence == SEEK_DATA) { start = max_t(u64, key.offset, offset); found = true; break; } /* * Else, we are seeking for a hole, check the next file * extent item. */ } start = extent_end; last_extent_end = extent_end; path->slots[0]++; if (fatal_signal_pending(current)) { ret = -EINTR; goto out; } cond_resched(); } /* We have an implicit hole from the last extent found up to i_size. */ if (!found && start < i_size) { found = find_desired_extent_in_hole(inode, whence, delalloc_cached_state, start, i_size - 1, &start); if (!found) start = i_size; } out: btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) return ret; if (whence == SEEK_DATA && start >= i_size) return -ENXIO; return min_t(loff_t, start, i_size); } static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; switch (whence) { default: return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(file, offset, whence); btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } if (offset < 0) return offset; return vfs_setpos(file, offset, inode->i_sb->s_maxbytes); } static int btrfs_file_open(struct inode *inode, struct file *filp) { int ret; filp->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; ret = fsverity_file_open(inode, filp); if (ret) return ret; return generic_file_open(inode, filp); } static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { ssize_t ret = 0; if (iocb->ki_flags & IOCB_DIRECT) { ret = btrfs_direct_read(iocb, to); if (ret < 0 || !iov_iter_count(to) || iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp))) return ret; } return filemap_read(iocb, to, ret); } const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, .read_iter = btrfs_file_read_iter, .splice_read = filemap_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, .mmap = btrfs_file_mmap, .open = btrfs_file_open, .release = btrfs_release_file, .get_unmapped_area = thp_get_unmapped_area, .fsync = btrfs_sync_file, .fallocate = btrfs_fallocate, .unlocked_ioctl = btrfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif .remap_file_range = btrfs_remap_file_range, .uring_cmd = btrfs_uring_cmd, .fop_flags = FOP_BUFFER_RASYNC | FOP_BUFFER_WASYNC, }; int btrfs_fdatawrite_range(struct btrfs_inode *inode, loff_t start, loff_t end) { struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; /* * So with compression we will find and lock a dirty page and clear the * first one as dirty, setup an async extent, and immediately return * with the entire range locked but with nobody actually marked with * writeback. So we can't just filemap_write_and_wait_range() and * expect it to work since it will just kick off a thread to do the * actual work. So we need to call filemap_fdatawrite_range _again_ * since it will wait on the page lock, which won't be unlocked until * after the pages have been marked as writeback and so we're good to go * from there. We have to do this otherwise we'll miss the ordered * extents and that results in badness. Please Josef, do not think you * know better and pull this out at some point in the future, it is * right and you are wrong. */ ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags)) ret = filemap_fdatawrite_range(mapping, start, end); return ret; }
10 725 399 248 1 36 1222 36 455 455 452 359 362 362 362 351 362 108 53 21 406 718 281 452 281 74 74 74 73 74 26 29 29 26 20 51 1236 10 482 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _IP6_FIB_H #define _IP6_FIB_H #include <linux/ipv6_route.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/notifier.h> #include <net/dst.h> #include <net/flow.h> #include <net/ip_fib.h> #include <net/netlink.h> #include <net/inetpeer.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> #include <uapi/linux/bpf.h> #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 #else #define FIB6_TABLE_HASHSZ 1 #endif #define RT6_DEBUG 2 struct rt6_info; struct fib6_info; struct fib6_config { u32 fc_table; u32 fc_metric; int fc_dst_len; int fc_src_len; int fc_ifindex; u32 fc_flags; u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, fc_ignore_dev_down:1, __unused : 14; u32 fc_nh_id; struct in6_addr fc_dst; struct in6_addr fc_src; struct in6_addr fc_prefsrc; struct in6_addr fc_gateway; unsigned long fc_expires; struct nlattr *fc_mx; int fc_mx_len; int fc_mp_len; struct nlattr *fc_mp; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; bool fc_is_fdb; }; struct fib6_node { struct fib6_node __rcu *parent; struct fib6_node __rcu *left; struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; struct fib6_gc_args { int timeout; int more; }; #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL static inline bool fib6_routes_require_src(const struct net *net) { return false; } static inline void fib6_routes_require_src_inc(struct net *net) {} static inline void fib6_routes_require_src_dec(struct net *net) {} #else static inline bool fib6_routes_require_src(const struct net *net) { return net->ipv6.fib6_routes_require_src > 0; } static inline void fib6_routes_require_src_inc(struct net *net) { net->ipv6.fib6_routes_require_src++; } static inline void fib6_routes_require_src_dec(struct net *net) { net->ipv6.fib6_routes_require_src--; } #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif /* * routing information * */ struct rt6key { struct in6_addr addr; int plen; }; struct fib6_table; struct rt6_exception_bucket { struct hlist_head chain; int depth; }; struct rt6_exception { struct hlist_node hlist; struct rt6_info *rt6i; unsigned long stamp; struct rcu_head rcu; }; #define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 struct fib6_nh { struct fib_nh_common nh_common; #ifdef CONFIG_IPV6_ROUTER_PREF unsigned long last_probe; #endif struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; }; struct fib6_info { struct fib6_table *fib6_table; struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ union { struct list_head fib6_siblings; struct list_head nh_list; }; unsigned int fib6_nsiblings; refcount_t fib6_ref; unsigned long expires; struct hlist_node gc_link; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct rt6key fib6_dst; u32 fib6_flags; struct rt6key fib6_src; struct rt6key fib6_prefsrc; u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; u8 offload; u8 trap; u8 offload_failed; u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, unused:4; struct list_head purge_link; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; }; struct rt6_info { struct dst_entry dst; struct fib6_info __rcu *from; int sernum; struct rt6key rt6i_dst; struct rt6key rt6i_src; struct in6_addr rt6i_gateway; struct inet6_dev *rt6i_idev; u32 rt6i_flags; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; }; struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) #define dst_rt6_info(_ptr) container_of_const(_ptr, struct rt6_info, dst) static inline struct inet6_dev *ip6_dst_idev(const struct dst_entry *dst) { return dst_rt6_info(dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) { return rt->fib6_src.plen > 0; } /* The callers should hold f6i->fib6_table->tb6_lock if a route has ever * been added to a table before. */ static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } /* The callers should hold f6i->fib6_table->tb6_lock if a route has ever * been added to a table before. */ static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } /* Function to safely get fn->fn_sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely * Return false if not */ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = READ_ONCE(fn->fn_sernum); /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ smp_rmb(); status = true; } return status; } static inline u32 rt6_get_cookie(const struct rt6_info *rt) { struct fib6_info *from; u32 cookie = 0; if (rt->sernum) return rt->sernum; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) { /* dst_release() accepts a NULL parameter. * We rely on dst being first structure in struct rt6_info */ BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0); dst_release(&rt->dst); } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh); void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) { DEBUG_NET_WARN_ON_ONCE(!hlist_unhashed(&f6i->gc_link)); call_rcu_hurry(&f6i->rcu, fib6_info_destroy_rcu); } } enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif FWS_L, FWS_R, FWS_C, FWS_U }; struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; unsigned int skip_in_node; int (*func)(struct fib6_walker *); void *args; }; struct rt6_statistics { __u32 fib_nodes; /* all fib6 nodes */ __u32 fib_route_nodes; /* intermediate nodes */ __u32 fib_rt_entries; /* rt entries in fib table */ __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ /* The following stat is not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ }; #define RTN_TL_ROOT 0x0001 #define RTN_ROOT 0x0002 /* tree root node */ #define RTN_RTINFO 0x0004 /* node with valid routing info */ /* * priority levels (or metrics) * */ struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; /* writes protected by rtnl_mutex */ struct hlist_head tb6_gc_hlist; /* GC candidates */ #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC #define RT6_TABLE_MAIN RT_TABLE_MAIN #define RT6_TABLE_DFLT RT6_TABLE_MAIN #define RT6_TABLE_INFO RT6_TABLE_MAIN #define RT6_TABLE_PREFIX RT6_TABLE_MAIN #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_MIN 1 #define FIB6_TABLE_MAX RT_TABLE_MAX #define RT6_TABLE_LOCAL RT_TABLE_LOCAL #else #define FIB6_TABLE_MIN RT_TABLE_MAIN #define FIB6_TABLE_MAX FIB6_TABLE_MIN #define RT6_TABLE_LOCAL RT6_TABLE_MAIN #endif typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, struct flowi6 *, const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib6_info *rt; unsigned int nsiblings; }; /* * exported functions */ struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); /* called with rcu lock held; can return error pointer * caller needs to select path */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); static inline void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) { const struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) *addr = from->fib6_prefsrc.addr; else *addr = in6addr_any; rcu_read_unlock(); } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack); int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); void fib6_gc_cleanup(void); int fib6_init(void); /* Add the route to the gc list if it is not already there * * The callers should hold f6i->fib6_table->tb6_lock. */ static inline void fib6_add_gc_list(struct fib6_info *f6i) { /* If fib6_node is null, the f6i is not in (or removed from) the * table. * * There is a gap between finding the f6i from the table and * calling this function without the protection of the tb6_lock. * This check makes sure the f6i is not added to the gc list when * it is not on the table. */ if (!rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock))) return; if (hlist_unhashed(&f6i->gc_link)) hlist_add_head(&f6i->gc_link, &f6i->fib6_table->tb6_gc_hlist); } /* Remove the route from the gc list if it is on the list. * * The callers should hold f6i->fib6_table->tb6_lock. */ static inline void fib6_remove_gc_list(struct fib6_info *f6i) { if (!hlist_unhashed(&f6i->gc_link)) hlist_del_init(&f6i->gc_link); } struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; loff_t skip; struct fib6_table *tbl; int sernum; }; extern const struct seq_operations ipv6_route_seq_ops; int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(const struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i); void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed); #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct fib6_info *, rt); }; #endif INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup, struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return INDIRECT_CALL_4(lookup, ip6_pol_route_output, ip6_pol_route_input, ip6_pol_route_lookup, __ip6_route_redirect, net, table, fl6, skb, flags); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { return net->ipv6.fib6_has_custom_rules; } int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(const struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv6.fib6_rules_require_fldissect) return false; memset(flkeys, 0, sizeof(*flkeys)); __skb_flow_dissect(net, skb, &flow_keys_dissector, flkeys, NULL, 0, 0, 0, flag); fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; return true; } #else static inline bool fib6_has_custom_rules(const struct net *net) { return false; } static inline int fib6_rules_init(void) { return 0; } static inline void fib6_rules_cleanup(void) { return ; } static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib6_rules_seq_read(const struct net *net) { return 0; } static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { return false; } #endif #endif
450 348 361 378 119 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 /* * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * Copyright (c) 2009-2010, Code Aurora Forum. * Copyright 2016 Intel Corp. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #ifndef _DRM_DRV_H_ #define _DRM_DRV_H_ #include <linux/list.h> #include <linux/irqreturn.h> #include <video/nomodeset.h> #include <drm/drm_device.h> struct dmem_cgroup_region; struct drm_fb_helper; struct drm_fb_helper_surface_size; struct drm_file; struct drm_gem_object; struct drm_master; struct drm_minor; struct dma_buf; struct dma_buf_attachment; struct drm_display_mode; struct drm_mode_create_dumb; struct drm_printer; struct sg_table; /** * enum drm_driver_feature - feature flags * * See &drm_driver.driver_features, drm_device.driver_features and * drm_core_check_feature(). */ enum drm_driver_feature { /** * @DRIVER_GEM: * * Driver use the GEM memory manager. This should be set for all modern * drivers. */ DRIVER_GEM = BIT(0), /** * @DRIVER_MODESET: * * Driver supports mode setting interfaces (KMS). */ DRIVER_MODESET = BIT(1), /** * @DRIVER_RENDER: * * Driver supports dedicated render nodes. See also the :ref:`section on * render nodes <drm_render_node>` for details. */ DRIVER_RENDER = BIT(3), /** * @DRIVER_ATOMIC: * * Driver supports the full atomic modesetting userspace API. Drivers * which only use atomic internally, but do not support the full * userspace API (e.g. not all properties converted to atomic, or * multi-plane updates are not guaranteed to be tear-free) should not * set this flag. */ DRIVER_ATOMIC = BIT(4), /** * @DRIVER_SYNCOBJ: * * Driver supports &drm_syncobj for explicit synchronization of command * submission. */ DRIVER_SYNCOBJ = BIT(5), /** * @DRIVER_SYNCOBJ_TIMELINE: * * Driver supports the timeline flavor of &drm_syncobj for explicit * synchronization of command submission. */ DRIVER_SYNCOBJ_TIMELINE = BIT(6), /** * @DRIVER_COMPUTE_ACCEL: * * Driver supports compute acceleration devices. This flag is mutually exclusive with * @DRIVER_RENDER and @DRIVER_MODESET. Devices that support both graphics and compute * acceleration should be handled by two drivers that are connected using auxiliary bus. */ DRIVER_COMPUTE_ACCEL = BIT(7), /** * @DRIVER_GEM_GPUVA: * * Driver supports user defined GPU VA bindings for GEM objects. */ DRIVER_GEM_GPUVA = BIT(8), /** * @DRIVER_CURSOR_HOTSPOT: * * Driver supports and requires cursor hotspot information in the * cursor plane (e.g. cursor plane has to actually track the mouse * cursor and the clients are required to set hotspot in order for * the cursor planes to work correctly). */ DRIVER_CURSOR_HOTSPOT = BIT(9), /* IMPORTANT: Below are all the legacy flags, add new ones above. */ /** * @DRIVER_USE_AGP: * * Set up DRM AGP support, see drm_agp_init(), the DRM core will manage * AGP resources. New drivers don't need this. */ DRIVER_USE_AGP = BIT(25), /** * @DRIVER_LEGACY: * * Denote a legacy driver using shadow attach. Do not use. */ DRIVER_LEGACY = BIT(26), /** * @DRIVER_PCI_DMA: * * Driver is capable of PCI DMA, mapping of PCI DMA buffers to userspace * will be enabled. Only for legacy drivers. Do not use. */ DRIVER_PCI_DMA = BIT(27), /** * @DRIVER_SG: * * Driver can perform scatter/gather DMA, allocation and mapping of * scatter/gather buffers will be enabled. Only for legacy drivers. Do * not use. */ DRIVER_SG = BIT(28), /** * @DRIVER_HAVE_DMA: * * Driver supports DMA, the userspace DMA API will be supported. Only * for legacy drivers. Do not use. */ DRIVER_HAVE_DMA = BIT(29), /** * @DRIVER_HAVE_IRQ: * * Legacy irq support. Only for legacy drivers. Do not use. */ DRIVER_HAVE_IRQ = BIT(30), }; /** * struct drm_driver - DRM driver structure * * This structure represent the common code for a family of cards. There will be * one &struct drm_device for each card present in this family. It contains lots * of vfunc entries, and a pile of those probably should be moved to more * appropriate places like &drm_mode_config_funcs or into a new operations * structure for GEM drivers. */ struct drm_driver { /** * @load: * * Backward-compatible driver callback to complete initialization steps * after the driver is registered. For this reason, may suffer from * race conditions and its use is deprecated for new drivers. It is * therefore only supported for existing drivers not yet converted to * the new scheme. See devm_drm_dev_alloc() and drm_dev_register() for * proper and race-free way to set up a &struct drm_device. * * This is deprecated, do not use! * * Returns: * * Zero on success, non-zero value on failure. */ int (*load) (struct drm_device *, unsigned long flags); /** * @open: * * Driver callback when a new &struct drm_file is opened. Useful for * setting up driver-private data structures like buffer allocators, * execution contexts or similar things. Such driver-private resources * must be released again in @postclose. * * Since the display/modeset side of DRM can only be owned by exactly * one &struct drm_file (see &drm_file.is_master and &drm_device.master) * there should never be a need to set up any modeset related resources * in this callback. Doing so would be a driver design bug. * * Returns: * * 0 on success, a negative error code on failure, which will be * promoted to userspace as the result of the open() system call. */ int (*open) (struct drm_device *, struct drm_file *); /** * @postclose: * * One of the driver callbacks when a new &struct drm_file is closed. * Useful for tearing down driver-private data structures allocated in * @open like buffer allocators, execution contexts or similar things. * * Since the display/modeset side of DRM can only be owned by exactly * one &struct drm_file (see &drm_file.is_master and &drm_device.master) * there should never be a need to tear down any modeset related * resources in this callback. Doing so would be a driver design bug. */ void (*postclose) (struct drm_device *, struct drm_file *); /** * @unload: * * Reverse the effects of the driver load callback. Ideally, * the clean up performed by the driver should happen in the * reverse order of the initialization. Similarly to the load * hook, this handler is deprecated and its usage should be * dropped in favor of an open-coded teardown function at the * driver layer. See drm_dev_unregister() and drm_dev_put() * for the proper way to remove a &struct drm_device. * * The unload() hook is called right after unregistering * the device. * */ void (*unload) (struct drm_device *); /** * @release: * * Optional callback for destroying device data after the final * reference is released, i.e. the device is being destroyed. * * This is deprecated, clean up all memory allocations associated with a * &drm_device using drmm_add_action(), drmm_kmalloc() and related * managed resources functions. */ void (*release) (struct drm_device *); /** * @master_set: * * Called whenever the minor master is set. Only used by vmwgfx. */ void (*master_set)(struct drm_device *dev, struct drm_file *file_priv, bool from_open); /** * @master_drop: * * Called whenever the minor master is dropped. Only used by vmwgfx. */ void (*master_drop)(struct drm_device *dev, struct drm_file *file_priv); /** * @debugfs_init: * * Allows drivers to create driver-specific debugfs files. */ void (*debugfs_init)(struct drm_minor *minor); /** * @gem_create_object: constructor for gem objects * * Hook for allocating the GEM object struct, for use by the CMA * and SHMEM GEM helpers. Returns a GEM object on success, or an * ERR_PTR()-encoded error code otherwise. */ struct drm_gem_object *(*gem_create_object)(struct drm_device *dev, size_t size); /** * @prime_handle_to_fd: * * PRIME export function. Only used by vmwgfx. */ int (*prime_handle_to_fd)(struct drm_device *dev, struct drm_file *file_priv, uint32_t handle, uint32_t flags, int *prime_fd); /** * @prime_fd_to_handle: * * PRIME import function. Only used by vmwgfx. */ int (*prime_fd_to_handle)(struct drm_device *dev, struct drm_file *file_priv, int prime_fd, uint32_t *handle); /** * @gem_prime_import: * * Import hook for GEM drivers. * * This defaults to drm_gem_prime_import() if not set. */ struct drm_gem_object * (*gem_prime_import)(struct drm_device *dev, struct dma_buf *dma_buf); /** * @gem_prime_import_sg_table: * * Optional hook used by the PRIME helper functions * drm_gem_prime_import() respectively drm_gem_prime_import_dev(). */ struct drm_gem_object *(*gem_prime_import_sg_table)( struct drm_device *dev, struct dma_buf_attachment *attach, struct sg_table *sgt); /** * @dumb_create: * * This creates a new dumb buffer in the driver's backing storage manager (GEM, * TTM or something else entirely) and returns the resulting buffer handle. This * handle can then be wrapped up into a framebuffer modeset object. * * Note that userspace is not allowed to use such objects for render * acceleration - drivers must create their own private ioctls for such a use * case. * * Width, height and depth are specified in the &drm_mode_create_dumb * argument. The callback needs to fill the handle, pitch and size for * the created buffer. * * Called by the user via ioctl. * * Returns: * * Zero on success, negative errno on failure. */ int (*dumb_create)(struct drm_file *file_priv, struct drm_device *dev, struct drm_mode_create_dumb *args); /** * @dumb_map_offset: * * Allocate an offset in the drm device node's address space to be able to * memory map a dumb buffer. * * The default implementation is drm_gem_create_mmap_offset(). GEM based * drivers must not overwrite this. * * Called by the user via ioctl. * * Returns: * * Zero on success, negative errno on failure. */ int (*dumb_map_offset)(struct drm_file *file_priv, struct drm_device *dev, uint32_t handle, uint64_t *offset); /** * @fbdev_probe: * * Allocates and initialize the fb_info structure for fbdev emulation. * Furthermore it also needs to allocate the DRM framebuffer used to * back the fbdev. * * This callback is mandatory for fbdev support. * * Returns: * * 0 on success ot a negative error code otherwise. */ int (*fbdev_probe)(struct drm_fb_helper *fbdev_helper, struct drm_fb_helper_surface_size *sizes); /** * @show_fdinfo: * * Print device specific fdinfo. See Documentation/gpu/drm-usage-stats.rst. */ void (*show_fdinfo)(struct drm_printer *p, struct drm_file *f); /** @major: driver major number */ int major; /** @minor: driver minor number */ int minor; /** @patchlevel: driver patch level */ int patchlevel; /** @name: driver name */ char *name; /** @desc: driver description */ char *desc; /** * @driver_features: * Driver features, see &enum drm_driver_feature. Drivers can disable * some features on a per-instance basis using * &drm_device.driver_features. */ u32 driver_features; /** * @ioctls: * * Array of driver-private IOCTL description entries. See the chapter on * :ref:`IOCTL support in the userland interfaces * chapter<drm_driver_ioctl>` for the full details. */ const struct drm_ioctl_desc *ioctls; /** @num_ioctls: Number of entries in @ioctls. */ int num_ioctls; /** * @fops: * * File operations for the DRM device node. See the discussion in * :ref:`file operations<drm_driver_fops>` for in-depth coverage and * some examples. */ const struct file_operations *fops; }; void *__devm_drm_dev_alloc(struct device *parent, const struct drm_driver *driver, size_t size, size_t offset); struct dmem_cgroup_region * drmm_cgroup_register_region(struct drm_device *dev, const char *region_name, u64 size); /** * devm_drm_dev_alloc - Resource managed allocation of a &drm_device instance * @parent: Parent device object * @driver: DRM driver * @type: the type of the struct which contains struct &drm_device * @member: the name of the &drm_device within @type. * * This allocates and initialize a new DRM device. No device registration is done. * Call drm_dev_register() to advertice the device to user space and register it * with other core subsystems. This should be done last in the device * initialization sequence to make sure userspace can't access an inconsistent * state. * * The initial ref-count of the object is 1. Use drm_dev_get() and * drm_dev_put() to take and drop further ref-counts. * * It is recommended that drivers embed &struct drm_device into their own device * structure. * * Note that this manages the lifetime of the resulting &drm_device * automatically using devres. The DRM device initialized with this function is * automatically put on driver detach using drm_dev_put(). * * RETURNS: * Pointer to new DRM device, or ERR_PTR on failure. */ #define devm_drm_dev_alloc(parent, driver, type, member) \ ((type *) __devm_drm_dev_alloc(parent, driver, sizeof(type), \ offsetof(type, member))) struct drm_device *drm_dev_alloc(const struct drm_driver *driver, struct device *parent); void *__drm_dev_alloc(struct device *parent, const struct drm_driver *driver, size_t size, size_t offset); int drm_dev_register(struct drm_device *dev, unsigned long flags); void drm_dev_unregister(struct drm_device *dev); void drm_dev_get(struct drm_device *dev); void drm_dev_put(struct drm_device *dev); void drm_put_dev(struct drm_device *dev); bool drm_dev_enter(struct drm_device *dev, int *idx); void drm_dev_exit(int idx); void drm_dev_unplug(struct drm_device *dev); int drm_dev_wedged_event(struct drm_device *dev, unsigned long method); /** * drm_dev_is_unplugged - is a DRM device unplugged * @dev: DRM device * * This function can be called to check whether a hotpluggable is unplugged. * Unplugging itself is singalled through drm_dev_unplug(). If a device is * unplugged, these two functions guarantee that any store before calling * drm_dev_unplug() is visible to callers of this function after it completes * * WARNING: This function fundamentally races against drm_dev_unplug(). It is * recommended that drivers instead use the underlying drm_dev_enter() and * drm_dev_exit() function pairs. */ static inline bool drm_dev_is_unplugged(struct drm_device *dev) { int idx; if (drm_dev_enter(dev, &idx)) { drm_dev_exit(idx); return false; } return true; } /** * drm_core_check_all_features - check driver feature flags mask * @dev: DRM device to check * @features: feature flag(s) mask * * This checks @dev for driver features, see &drm_driver.driver_features, * &drm_device.driver_features, and the various &enum drm_driver_feature flags. * * Returns true if all features in the @features mask are supported, false * otherwise. */ static inline bool drm_core_check_all_features(const struct drm_device *dev, u32 features) { u32 supported = dev->driver->driver_features & dev->driver_features; return features && (supported & features) == features; } /** * drm_core_check_feature - check driver feature flags * @dev: DRM device to check * @feature: feature flag * * This checks @dev for driver features, see &drm_driver.driver_features, * &drm_device.driver_features, and the various &enum drm_driver_feature flags. * * Returns true if the @feature is supported, false otherwise. */ static inline bool drm_core_check_feature(const struct drm_device *dev, enum drm_driver_feature feature) { return drm_core_check_all_features(dev, feature); } /** * drm_drv_uses_atomic_modeset - check if the driver implements * atomic_commit() * @dev: DRM device * * This check is useful if drivers do not have DRIVER_ATOMIC set but * have atomic modesetting internally implemented. */ static inline bool drm_drv_uses_atomic_modeset(struct drm_device *dev) { return drm_core_check_feature(dev, DRIVER_ATOMIC) || (dev->mode_config.funcs && dev->mode_config.funcs->atomic_commit != NULL); } /* TODO: Inline drm_firmware_drivers_only() in all its callers. */ static inline bool drm_firmware_drivers_only(void) { return video_firmware_drivers_only(); } #if defined(CONFIG_DEBUG_FS) void drm_debugfs_dev_init(struct drm_device *dev, struct dentry *root); #else static inline void drm_debugfs_dev_init(struct drm_device *dev, struct dentry *root) { } #endif #endif
11 15 15 60 60 1 15 11 11 11 11 11 11 11 11 11 11 2 2 4 4 4 4 11 2 11 11 11 11 1 2 1 1 2 2 2 1 1 2 1 1 3 3 3 1 3 61 15 60 60 60 34 61 24 26 5 45 15 21 6 15 15 15 11 11 11 11 11 15 15 15 1 15 1 15 15 15 15 15 8 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/msdos.c * * Code extracted from drivers/block/genhd.c * Copyright (C) 1991-1998 Linus Torvalds * * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug * in the early extended-partition checks and added DM partitions * * Support for DiskManager v6.0x added by Mark Lord, * with information provided by OnTrack. This now works for linux fdisk * and LILO, as well as loadlin and bootln. Note that disks other than * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). * * More flexible handling of extended partitions - aeb, 950831 * * Check partition table on IDE disks for common CHS translations * * Re-organised Feb 1998 Russell King * * BSD disklabel support by Yossi Gottlieb <yogo@math.tau.ac.il> * updated by Marc Espie <Marc.Espie@openbsd.org> * * Unixware slices support by Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl> * and Krzysztof G. Baranowski <kgb@knm.org.pl> */ #include <linux/msdos_fs.h> #include <linux/msdos_partition.h> #include "check.h" #include "efi.h" /* * Many architectures don't like unaligned accesses, while * the nr_sects and start_sect partition table entries are * at a 2 (mod 4) address. */ #include <linux/unaligned.h> static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); } static inline sector_t start_sect(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->start_sect); } static inline int is_extended_partition(struct msdos_partition *p) { return (p->sys_ind == DOS_EXTENDED_PARTITION || p->sys_ind == WIN98_EXTENDED_PARTITION || p->sys_ind == LINUX_EXTENDED_PARTITION); } #define MSDOS_LABEL_MAGIC1 0x55 #define MSDOS_LABEL_MAGIC2 0xAA static inline int msdos_magic_present(unsigned char *p) { return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); } /* Value is EBCDIC 'IBMA' */ #define AIX_LABEL_MAGIC1 0xC9 #define AIX_LABEL_MAGIC2 0xC2 #define AIX_LABEL_MAGIC3 0xD4 #define AIX_LABEL_MAGIC4 0xC1 static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) { struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); Sector sect; unsigned char *d; int slot, ret = 0; if (!(p[0] == AIX_LABEL_MAGIC1 && p[1] == AIX_LABEL_MAGIC2 && p[2] == AIX_LABEL_MAGIC3 && p[3] == AIX_LABEL_MAGIC4)) return 0; /* * Assume the partition table is valid if Linux partitions exists. * Note that old Solaris/x86 partitions use the same indicator as * Linux swap partitions, so we consider that a Linux partition as * well. */ for (slot = 1; slot <= 4; slot++, pt++) { if (pt->sys_ind == SOLARIS_X86_PARTITION || pt->sys_ind == LINUX_RAID_PARTITION || pt->sys_ind == LINUX_DATA_PARTITION || pt->sys_ind == LINUX_LVM_PARTITION || is_extended_partition(pt)) return 0; } d = read_part_sector(state, 7, &sect); if (d) { if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') ret = 1; put_dev_sector(sect); } return ret; } static void set_info(struct parsed_partitions *state, int slot, u32 disksig) { struct partition_meta_info *info = &state->parts[slot].info; snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, slot); info->volname[0] = 0; state->parts[slot].has_info = true; } /* * Create devices for each logical partition in an extended partition. * The logical partitions form a linked list, with each entry being * a partition table with two entries. The first entry * is the real data partition (with a start relative to the partition * table start). The second is a pointer to the next logical partition * (with a start relative to the entire extended partition). * We do not create a Linux partition for the partition tables, but * only for the actual data partitions. */ static void parse_extended(struct parsed_partitions *state, sector_t first_sector, sector_t first_size, u32 disksig) { struct msdos_partition *p; Sector sect; unsigned char *data; sector_t this_sector, this_size; sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; while (1) { if (++loopct > 100) return; if (state->next == state->limit) return; data = read_part_sector(state, this_sector, &sect); if (!data) return; if (!msdos_magic_present(data + 510)) goto done; p = (struct msdos_partition *) (data + 0x1be); /* * Usually, the first entry is the real data partition, * the 2nd entry is the next extended partition, or empty, * and the 3rd and 4th entries are unused. * However, DRDOS sometimes has the extended partition as * the first entry (when the data partition is empty), * and OS/2 seems to use all four entries. */ /* * First process the data partition(s) */ for (i = 0; i < 4; i++, p++) { sector_t offs, size, next; if (!nr_sects(p) || is_extended_partition(p)) continue; /* Check the 3rd and 4th entries - these sometimes contain random garbage */ offs = start_sect(p)*sector_size; size = nr_sects(p)*sector_size; next = this_sector + offs; if (i >= 2) { if (offs + size > this_size) continue; if (next < first_sector) continue; if (next + size > first_sector + first_size) continue; } put_partition(state, state->next, next, size); set_info(state, state->next, disksig); if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[state->next].flags = ADDPART_FLAG_RAID; loopct = 0; if (++state->next == state->limit) goto done; } /* * Next, process the (first) extended partition, if present. * (So far, there seems to be no reason to make * parse_extended() recursive and allow a tree * of extended partitions.) * It should be a link to the next logical partition. */ p -= 4; for (i = 0; i < 4; i++, p++) if (nr_sects(p) && is_extended_partition(p)) break; if (i == 4) goto done; /* nothing left to do */ this_sector = first_sector + start_sect(p) * sector_size; this_size = nr_sects(p) * sector_size; put_dev_sector(sect); } done: put_dev_sector(sect); } #define SOLARIS_X86_NUMSLICE 16 #define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) struct solaris_x86_slice { __le16 s_tag; /* ID tag of partition */ __le16 s_flag; /* permission flags */ __le32 s_start; /* start sector no of partition */ __le32 s_size; /* # of blocks in partition */ }; struct solaris_x86_vtoc { unsigned int v_bootinfo[3]; /* info needed by mboot */ __le32 v_sanity; /* to verify vtoc sanity */ __le32 v_version; /* layout version */ char v_volume[8]; /* volume name */ __le16 v_sectorsz; /* sector size in bytes */ __le16 v_nparts; /* number of partitions */ unsigned int v_reserved[10]; /* free space */ struct solaris_x86_slice v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ char v_asciilabel[128]; /* for compatibility */ }; /* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also indicates linux swap. Be careful before believing this is Solaris. */ static void parse_solaris_x86(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_SOLARIS_X86_PARTITION Sector sect; struct solaris_x86_vtoc *v; int i; short max_nparts; v = read_part_sector(state, offset + 1, &sect); if (!v) return; if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { put_dev_sector(sect); return; } { char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <solaris:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); } if (le32_to_cpu(v->v_version) != 1) { char tmp[64]; snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", le32_to_cpu(v->v_version)); strlcat(state->pp_buf, tmp, PAGE_SIZE); put_dev_sector(sect); return; } /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; for (i = 0; i < max_nparts && state->next < state->limit; i++) { struct solaris_x86_slice *s = &v->v_slice[i]; char tmp[3 + 10 + 1 + 1]; if (s->s_size == 0) continue; snprintf(tmp, sizeof(tmp), " [s%d]", i); strlcat(state->pp_buf, tmp, PAGE_SIZE); /* solaris partitions are relative to current MS-DOS * one; must add the offset of the current partition */ put_partition(state, state->next++, le32_to_cpu(s->s_start)+offset, le32_to_cpu(s->s_size)); } put_dev_sector(sect); strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } /* check against BSD src/sys/sys/disklabel.h for consistency */ #define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ #define BSD_MAXPARTITIONS 16 #define OPENBSD_MAXPARTITIONS 16 #define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ struct bsd_disklabel { __le32 d_magic; /* the magic number */ __s16 d_type; /* drive type */ __s16 d_subtype; /* controller/d_type specific */ char d_typename[16]; /* type name, e.g. "eagle" */ char d_packname[16]; /* pack identifier */ __u32 d_secsize; /* # of bytes per sector */ __u32 d_nsectors; /* # of data sectors per track */ __u32 d_ntracks; /* # of tracks per cylinder */ __u32 d_ncylinders; /* # of data cylinders per unit */ __u32 d_secpercyl; /* # of data sectors per cylinder */ __u32 d_secperunit; /* # of data sectors per unit */ __u16 d_sparespertrack; /* # of spare sectors per track */ __u16 d_sparespercyl; /* # of spare sectors per cylinder */ __u32 d_acylinders; /* # of alt. cylinders per unit */ __u16 d_rpm; /* rotational speed */ __u16 d_interleave; /* hardware sector interleave */ __u16 d_trackskew; /* sector 0 skew, per track */ __u16 d_cylskew; /* sector 0 skew, per cylinder */ __u32 d_headswitch; /* head switch time, usec */ __u32 d_trkseek; /* track-to-track seek, usec */ __u32 d_flags; /* generic flags */ #define NDDATA 5 __u32 d_drivedata[NDDATA]; /* drive-type specific information */ #define NSPARE 5 __u32 d_spare[NSPARE]; /* reserved for future use */ __le32 d_magic2; /* the magic number (again) */ __le16 d_checksum; /* xor of data incl. partitions */ /* filesystem and partition information: */ __le16 d_npartitions; /* number of partitions in following */ __le32 d_bbsize; /* size of boot area at sn0, bytes */ __le32 d_sbsize; /* max size of fs superblock, bytes */ struct bsd_partition { /* the partition table */ __le32 p_size; /* number of sectors in partition */ __le32 p_offset; /* starting sector */ __le32 p_fsize; /* filesystem basic fragment size */ __u8 p_fstype; /* filesystem type, see below */ __u8 p_frag; /* filesystem fragments per block */ __le16 p_cpg; /* filesystem cylinders per group */ } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ }; #if defined(CONFIG_BSD_DISKLABEL) /* * Create devices for BSD partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ static void parse_bsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin, char *flavour, int max_partitions) { Sector sect; struct bsd_disklabel *l; struct bsd_partition *p; char tmp[64]; l = read_part_sector(state, offset + 1, &sect); if (!l) return; if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { put_dev_sector(sect); return; } snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); strlcat(state->pp_buf, tmp, PAGE_SIZE); if (le16_to_cpu(l->d_npartitions) < max_partitions) max_partitions = le16_to_cpu(l->d_npartitions); for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { sector_t bsd_start, bsd_size; if (state->next == state->limit) break; if (p->p_fstype == BSD_FS_UNUSED) continue; bsd_start = le32_to_cpu(p->p_offset); bsd_size = le32_to_cpu(p->p_size); /* FreeBSD has relative offset if C partition offset is zero */ if (memcmp(flavour, "bsd\0", 4) == 0 && le32_to_cpu(l->d_partitions[2].p_offset) == 0) bsd_start += offset; if (offset == bsd_start && size == bsd_size) /* full parent partition, we have it already */ continue; if (offset > bsd_start || offset+size < bsd_start+bsd_size) { strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); continue; } put_partition(state, state->next++, bsd_start, bsd_size); } put_dev_sector(sect); if (le16_to_cpu(l->d_npartitions) > max_partitions) { snprintf(tmp, sizeof(tmp), " (ignored %d more)", le16_to_cpu(l->d_npartitions) - max_partitions); strlcat(state->pp_buf, tmp, PAGE_SIZE); } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } #endif static void parse_freebsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); #endif } static void parse_netbsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); #endif } static void parse_openbsd(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_BSD_DISKLABEL parse_bsd(state, offset, size, origin, "openbsd", OPENBSD_MAXPARTITIONS); #endif } #define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ #define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ #define UNIXWARE_NUMSLICE 16 #define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ struct unixware_slice { __le16 s_label; /* label */ __le16 s_flags; /* permission flags */ __le32 start_sect; /* starting sector */ __le32 nr_sects; /* number of sectors in slice */ }; struct unixware_disklabel { __le32 d_type; /* drive type */ __le32 d_magic; /* the magic number */ __le32 d_version; /* version number */ char d_serial[12]; /* serial number of the device */ __le32 d_ncylinders; /* # of data cylinders per device */ __le32 d_ntracks; /* # of tracks per cylinder */ __le32 d_nsectors; /* # of data sectors per track */ __le32 d_secsize; /* # of bytes per sector */ __le32 d_part_start; /* # of first sector of this partition*/ __le32 d_unknown1[12]; /* ? */ __le32 d_alt_tbl; /* byte offset of alternate table */ __le32 d_alt_len; /* byte length of alternate table */ __le32 d_phys_cyl; /* # of physical cylinders per device */ __le32 d_phys_trk; /* # of physical tracks per cylinder */ __le32 d_phys_sec; /* # of physical sectors per track */ __le32 d_phys_bytes; /* # of physical bytes per sector */ __le32 d_unknown2; /* ? */ __le32 d_unknown3; /* ? */ __le32 d_pad[8]; /* pad */ struct unixware_vtoc { __le32 v_magic; /* the magic number */ __le32 v_version; /* version number */ char v_name[8]; /* volume name */ __le16 v_nslices; /* # of slices */ __le16 v_unknown1; /* ? */ __le32 v_reserved[10]; /* reserved */ struct unixware_slice v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ } vtoc; }; /* 408 */ /* * Create devices for Unixware partitions listed in a disklabel, under a * dos-like partition. See parse_extended() for more information. */ static void parse_unixware(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_UNIXWARE_DISKLABEL Sector sect; struct unixware_disklabel *l; struct unixware_slice *p; l = read_part_sector(state, offset + 29, &sect); if (!l) return; if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { put_dev_sector(sect); return; } { char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <unixware:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); } p = &l->vtoc.v_slice[1]; /* I omit the 0th slice as it is the same as whole disk. */ while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { if (state->next == state->limit) break; if (p->s_label != UNIXWARE_FS_UNUSED) put_partition(state, state->next++, le32_to_cpu(p->start_sect), le32_to_cpu(p->nr_sects)); p++; } put_dev_sector(sect); strlcat(state->pp_buf, " >\n", PAGE_SIZE); #endif } #define MINIX_NR_SUBPARTITIONS 4 /* * Minix 2.0.0/2.0.2 subpartition support. * Anand Krishnamurthy <anandk@wiproge.med.ge.com> * Rajeev V. Pillai <rajeevvp@yahoo.com> */ static void parse_minix(struct parsed_partitions *state, sector_t offset, sector_t size, int origin) { #ifdef CONFIG_MINIX_SUBPARTITION Sector sect; unsigned char *data; struct msdos_partition *p; int i; data = read_part_sector(state, offset, &sect); if (!data) return; p = (struct msdos_partition *)(data + 0x1be); /* The first sector of a Minix partition can have either * a secondary MBR describing its subpartitions, or * the normal boot sector. */ if (msdos_magic_present(data + 510) && p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); strlcat(state->pp_buf, tmp, PAGE_SIZE); for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { if (state->next == state->limit) break; /* add each partition in use */ if (p->sys_ind == MINIX_PARTITION) put_partition(state, state->next++, start_sect(p), nr_sects(p)); } strlcat(state->pp_buf, " >\n", PAGE_SIZE); } put_dev_sector(sect); #endif /* CONFIG_MINIX_SUBPARTITION */ } static struct { unsigned char id; void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); } subtypes[] = { {FREEBSD_PARTITION, parse_freebsd}, {NETBSD_PARTITION, parse_netbsd}, {OPENBSD_PARTITION, parse_openbsd}, {MINIX_PARTITION, parse_minix}, {UNIXWARE_PARTITION, parse_unixware}, {SOLARIS_X86_PARTITION, parse_solaris_x86}, {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, {0, NULL}, }; int msdos_partition(struct parsed_partitions *state) { sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; struct fat_boot_sector *fb; int slot; u32 disksig; sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, &sect); if (!data) return -1; /* * Note order! (some AIX disks, e.g. unbootable kind, * have no MSDOS 55aa) */ if (aix_magic_present(state, data)) { put_dev_sector(sect); #ifdef CONFIG_AIX_PARTITION return aix_partition(state); #else strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); return 0; #endif } if (!msdos_magic_present(data + 510)) { put_dev_sector(sect); return 0; } /* * Now that the 55aa signature is present, this is probably * either the boot sector of a FAT filesystem or a DOS-type * partition table. Reject this in case the boot indicator * is not 0 or 0x80. */ p = (struct msdos_partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table. */ fb = (struct fat_boot_sector *) data; if (slot == 1 && fb->reserved && fb->fats && fat_valid_media(fb->media)) { strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } else { put_dev_sector(sect); return 0; } } } #ifdef CONFIG_EFI_PARTITION p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. */ if (p->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT) { put_dev_sector(sect); return 0; } } #endif p = (struct msdos_partition *) (data + 0x1be); disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); /* * Look for partitions in two passes: * First find the primary and DOS-type extended partitions. * On the second pass look inside *BSD, Unixware and Solaris partitions. */ state->next = 5; for (slot = 1 ; slot <= 4 ; slot++, p++) { sector_t start = start_sect(p)*sector_size; sector_t size = nr_sects(p)*sector_size; if (!size) continue; if (is_extended_partition(p)) { /* * prevent someone doing mkfs or mkswap on an * extended partition, but leave room for LILO * FIXME: this uses one logical sector for > 512b * sector, although it may not be enough/proper. */ sector_t n = 2; n = min(size, max(sector_size, n)); put_partition(state, slot, start, n); strlcat(state->pp_buf, " <", PAGE_SIZE); parse_extended(state, start, size, disksig); strlcat(state->pp_buf, " >", PAGE_SIZE); continue; } put_partition(state, slot, start, size); set_info(state, slot, disksig); if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; if (p->sys_ind == DM6_PARTITION) strlcat(state->pp_buf, "[DM]", PAGE_SIZE); if (p->sys_ind == EZD_PARTITION) strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); } strlcat(state->pp_buf, "\n", PAGE_SIZE); /* second pass - output for each on a separate line */ p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { unsigned char id = p->sys_ind; int n; if (!nr_sects(p)) continue; for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) ; if (!subtypes[n].parse) continue; subtypes[n].parse(state, start_sect(p) * sector_size, nr_sects(p) * sector_size, slot); } put_dev_sector(sect); return 1; }
1462 1 1 1 486 2207 1659 487 485 487 485 486 487 487 422 428 427 487 3578 1659 3573 1584 1584 1238 1235 1238 1235 1314 1311 3 1313 486 487 56 55 56 3585 3597 3585 6 6 6 4 6 235 235 235 234 235 235 234 235 235 235 234 64 64 235 235 234 235 235 6 234 235 235 1 234 235 21 235 235 235 33 235 235 235 234 235 235 235 2 63 63 63 63 2 2 65 65 63 2 377 346 346 374 375 374 373 375 377 377 377 377 376 374 375 375 375 49 49 6 4 6 6 6 6 6 6 6 6 487 486 487 487 486 485 46 43 345 346 346 346 346 346 346 346 113 113 112 157 156 2 155 157 1 1 155 43 155 152 595 157 43 155 155 45 45 155 3 3 3 3 3 3 3 3 3 462 462 462 2 2 2 2 2 234 235 230 160 157 5 3 78 78 78 5 1 78 3 2 235 235 235 63 2 2 2 2 478 478 244 235 305 477 260 260 219 479 42 436 216 222 221 479 244 478 235 235 7 65 479 479 235 234 265 265 1 265 8 4 265 265 22 47 22 22 47 43 43 264 265 501 330 500 499 498 496 10 497 253 254 290 490 486 243 488 487 485 244 484 203 486 42 487 486 487 486 485 480 3 2 477 479 478 477 476 478 479 477 479 479 479 479 477 479 307 302 301 300 301 11 23 22 270 242 270 270 243 270 26 264 265 242 264 244 264 19 265 265 264 265 265 264 265 265 22 5 22 22 21 22 22 243 242 264 265 22 265 265 265 3 265 264 8 19 27 28 39 40 2 40 46 46 46 48 45 48 48 48 48 48 48 47 48 48 48 49 55 62 53 61 7 211 248 3 247 233 224 232 244 57 22 22 4 22 3 3 22 22 22 3 2 21 79 203 204 60 61 61 59 59 16 58 56 11 55 15 54 61 48 45 4 3 1 53 52 52 50 5 49 58 61 44 54 46 1 1 1 1 1 1 1 1 69 68 2 65 65 66 38 38 36 65 41 41 37 69 1 69 2 69 3 69 32 69 44 66 65 66 66 44 20 34 23 43 21 42 39 43 26 26 43 31 43 44 49 49 36 49 3 52 68 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/fork.c * * Copyright (C) 1991, 1992 Linus Torvalds */ /* * 'fork.c' contains the help-routines for the 'fork' system call * (see also entry.S and others). * Fork is rather simple, once you get the hang of it, but the memory * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' */ #include <linux/anon_inodes.h> #include <linux/slab.h> #include <linux/sched/autogroup.h> #include <linux/sched/mm.h> #include <linux/sched/user.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/stat.h> #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> #include <linux/sched/ext.h> #include <linux/seq_file.h> #include <linux/rtmutex.h> #include <linux/init.h> #include <linux/unistd.h> #include <linux/module.h> #include <linux/vmalloc.h> #include <linux/completion.h> #include <linux/personality.h> #include <linux/mempolicy.h> #include <linux/sem.h> #include <linux/file.h> #include <linux/fdtable.h> #include <linux/iocontext.h> #include <linux/key.h> #include <linux/kmsan.h> #include <linux/binfmts.h> #include <linux/mman.h> #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/memblock.h> #include <linux/nsproxy.h> #include <linux/capability.h> #include <linux/cpu.h> #include <linux/cgroup.h> #include <linux/security.h> #include <linux/hugetlb.h> #include <linux/seccomp.h> #include <linux/swap.h> #include <linux/syscalls.h> #include <linux/syscall_user_dispatch.h> #include <linux/jiffies.h> #include <linux/futex.h> #include <linux/compat.h> #include <linux/kthread.h> #include <linux/task_io_accounting_ops.h> #include <linux/rcupdate.h> #include <linux/ptrace.h> #include <linux/mount.h> #include <linux/audit.h> #include <linux/memcontrol.h> #include <linux/ftrace.h> #include <linux/proc_fs.h> #include <linux/profile.h> #include <linux/rmap.h> #include <linux/ksm.h> #include <linux/acct.h> #include <linux/userfaultfd_k.h> #include <linux/tsacct_kern.h> #include <linux/cn_proc.h> #include <linux/freezer.h> #include <linux/delayacct.h> #include <linux/taskstats_kern.h> #include <linux/tty.h> #include <linux/fs_struct.h> #include <linux/magic.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> #include <linux/oom.h> #include <linux/khugepaged.h> #include <linux/signalfd.h> #include <linux/uprobes.h> #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> #include <linux/kcov.h> #include <linux/livepatch.h> #include <linux/thread_info.h> #include <linux/stackleak.h> #include <linux/kasan.h> #include <linux/scs.h> #include <linux/io_uring.h> #include <linux/bpf.h> #include <linux/stackprotector.h> #include <linux/user_events.h> #include <linux/iommu.h> #include <linux/rseq.h> #include <uapi/linux/pidfd.h> #include <linux/pidfs.h> #include <linux/tick.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <trace/events/sched.h> #define CREATE_TRACE_POINTS #include <trace/events/task.h> #include <kunit/visibility.h> /* * Minimum number of threads to boot the kernel */ #define MIN_THREADS 20 /* * Maximum number of threads */ #define MAX_THREADS FUTEX_TID_MASK /* * Protected counters by write_lock_irq(&tasklist_lock) */ unsigned long total_forks; /* Handle normal Linux uptimes. */ int nr_threads; /* The idle threads do not count.. */ static int max_threads; /* tunable limit on nr_threads */ #define NAMED_ARRAY_INDEX(x) [x] = __stringify(x) static const char * const resident_page_types[] = { NAMED_ARRAY_INDEX(MM_FILEPAGES), NAMED_ARRAY_INDEX(MM_ANONPAGES), NAMED_ARRAY_INDEX(MM_SWAPENTS), NAMED_ARRAY_INDEX(MM_SHMEMPAGES), }; DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ #ifdef CONFIG_PROVE_RCU int lockdep_tasklist_lock_is_held(void) { return lockdep_is_held(&tasklist_lock); } EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); #endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { int cpu; int total = 0; for_each_possible_cpu(cpu) total += per_cpu(process_counts, cpu); return total; } void __weak arch_release_task_struct(struct task_struct *tsk) { } static struct kmem_cache *task_struct_cachep; static inline struct task_struct *alloc_task_struct_node(int node) { return kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node); } static inline void free_task_struct(struct task_struct *tsk) { kmem_cache_free(task_struct_cachep, tsk); } /* * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) # ifdef CONFIG_VMAP_STACK /* * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB * flush. Try to minimize the number of calls by caching stacks. */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); struct vm_stack { struct rcu_head rcu; struct vm_struct *stack_vm_area; }; static bool try_release_thread_stack_to_cache(struct vm_struct *vm) { unsigned int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *tmp = NULL; if (this_cpu_try_cmpxchg(cached_stacks[i], &tmp, vm)) return true; } return false; } static void thread_stack_free_rcu(struct rcu_head *rh) { struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu); if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area)) return; vfree(vm_stack); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct vm_stack *vm_stack = tsk->stack; vm_stack->stack_vm_area = tsk->stack_vm_area; call_rcu(&vm_stack->rcu, thread_stack_free_rcu); } static int free_vm_stack_cache(unsigned int cpu) { struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *vm_stack = cached_vm_stacks[i]; if (!vm_stack) continue; vfree(vm_stack->addr); cached_vm_stacks[i] = NULL; } return 0; } static int memcg_charge_kernel_stack(struct vm_struct *vm) { int i; int ret; int nr_charged = 0; BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); if (ret) goto err; nr_charged++; } return 0; err: for (i = 0; i < nr_charged; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); return ret; } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct vm_struct *vm; void *stack; int i; for (i = 0; i < NR_CACHED_STACKS; i++) { struct vm_struct *s; s = this_cpu_xchg(cached_stacks[i], NULL); if (!s) continue; /* Reset stack metadata. */ kasan_unpoison_range(s->addr, THREAD_SIZE); stack = kasan_reset_tag(s->addr); /* Clear stale pointers from reused stack. */ memset(stack, 0, THREAD_SIZE); if (memcg_charge_kernel_stack(s)) { vfree(s->addr); return -ENOMEM; } tsk->stack_vm_area = s; tsk->stack = stack; return 0; } /* * Allocated stacks are cached and later reused by new threads, * so memcg accounting is performed manually on assigning/releasing * stacks to tasks. Drop __GFP_ACCOUNT. */ stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, THREADINFO_GFP & ~__GFP_ACCOUNT, node, __builtin_return_address(0)); if (!stack) return -ENOMEM; vm = find_vm_area(stack); if (memcg_charge_kernel_stack(vm)) { vfree(stack); return -ENOMEM; } /* * We can't call find_vm_area() in interrupt context, and * free_thread_stack() can be called in interrupt context, * so cache the vm_struct. */ tsk->stack_vm_area = vm; stack = kasan_reset_tag(stack); tsk->stack = stack; return 0; } static void free_thread_stack(struct task_struct *tsk) { if (!try_release_thread_stack_to_cache(tsk->stack_vm_area)) thread_stack_delayed_free(tsk); tsk->stack = NULL; tsk->stack_vm_area = NULL; } # else /* !CONFIG_VMAP_STACK */ static void thread_stack_free_rcu(struct rcu_head *rh) { __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); if (likely(page)) { tsk->stack = kasan_reset_tag(page_address(page)); return 0; } return -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } # endif /* CONFIG_VMAP_STACK */ # else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */ static struct kmem_cache *thread_stack_cache; static void thread_stack_free_rcu(struct rcu_head *rh) { kmem_cache_free(thread_stack_cache, rh); } static void thread_stack_delayed_free(struct task_struct *tsk) { struct rcu_head *rh = tsk->stack; call_rcu(rh, thread_stack_free_rcu); } static int alloc_thread_stack_node(struct task_struct *tsk, int node) { unsigned long *stack; stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); stack = kasan_reset_tag(stack); tsk->stack = stack; return stack ? 0 : -ENOMEM; } static void free_thread_stack(struct task_struct *tsk) { thread_stack_delayed_free(tsk); tsk->stack = NULL; } void thread_stack_cache_init(void) { thread_stack_cache = kmem_cache_create_usercopy("thread_stack", THREAD_SIZE, THREAD_SIZE, 0, 0, THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } # endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */ /* SLAB cache for signal_struct structures (tsk->signal) */ static struct kmem_cache *signal_cachep; /* SLAB cache for sighand_struct structures (tsk->sighand) */ struct kmem_cache *sighand_cachep; /* SLAB cache for files_struct structures (tsk->files) */ struct kmem_cache *files_cachep; /* SLAB cache for fs_struct structures (tsk->fs) */ struct kmem_cache *fs_cachep; /* SLAB cache for vm_area_struct structures */ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return NULL; vma_init(vma, mm); return vma; } static void vm_area_init_from(const struct vm_area_struct *src, struct vm_area_struct *dest) { dest->vm_mm = src->vm_mm; dest->vm_ops = src->vm_ops; dest->vm_start = src->vm_start; dest->vm_end = src->vm_end; dest->anon_vma = src->anon_vma; dest->vm_pgoff = src->vm_pgoff; dest->vm_file = src->vm_file; dest->vm_private_data = src->vm_private_data; vm_flags_init(dest, src->vm_flags); memcpy(&dest->vm_page_prot, &src->vm_page_prot, sizeof(dest->vm_page_prot)); /* * src->shared.rb may be modified concurrently when called from * dup_mmap(), but the clone will reinitialize it. */ data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared))); memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx, sizeof(dest->vm_userfaultfd_ctx)); #ifdef CONFIG_ANON_VMA_NAME dest->anon_name = src->anon_name; #endif #ifdef CONFIG_SWAP memcpy(&dest->swap_readahead_info, &src->swap_readahead_info, sizeof(dest->swap_readahead_info)); #endif #ifndef CONFIG_MMU dest->vm_region = src->vm_region; #endif #ifdef CONFIG_NUMA dest->vm_policy = src->vm_policy; #endif } struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) { struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) return NULL; ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); ASSERT_EXCLUSIVE_WRITER(orig->vm_file); vm_area_init_from(orig, new); vma_lock_init(new, true); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); return new; } void vm_area_free(struct vm_area_struct *vma) { /* The vma should be detached while being destroyed. */ vma_assert_detached(vma); vma_numab_state_free(vma); free_anon_vma_name(vma); kmem_cache_free(vm_area_cachep, vma); } static void account_kernel_stack(struct task_struct *tsk, int account) { if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm = task_stack_vm_area(tsk); int i; for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB, account * (PAGE_SIZE / 1024)); } else { void *stack = task_stack_page(tsk); /* All stack pages are in the same node. */ mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } } void exit_task_stack_account(struct task_struct *tsk) { account_kernel_stack(tsk, -1); if (IS_ENABLED(CONFIG_VMAP_STACK)) { struct vm_struct *vm; int i; vm = task_stack_vm_area(tsk); for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) memcg_kmem_uncharge_page(vm->pages[i], 0); } } static void release_task_stack(struct task_struct *tsk) { if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD)) return; /* Better to leak the stack than to free prematurely */ free_thread_stack(tsk); } #ifdef CONFIG_THREAD_INFO_IN_TASK void put_task_stack(struct task_struct *tsk) { if (refcount_dec_and_test(&tsk->stack_refcount)) release_task_stack(tsk); } #endif void free_task(struct task_struct *tsk) { #ifdef CONFIG_SECCOMP WARN_ON_ONCE(tsk->seccomp.filter); #endif release_user_cpus_ptr(tsk); scs_release(tsk); #ifndef CONFIG_THREAD_INFO_IN_TASK /* * The task is finally done with both the stack and thread_info, * so free both. */ release_task_stack(tsk); #else /* * If the task had a separate stack allocation, it should be gone * by now. */ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0); #endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); arch_release_task_struct(tsk); if (tsk->flags & PF_KTHREAD) free_kthread_struct(tsk); bpf_task_storage_free(tsk); free_task_struct(tsk); } EXPORT_SYMBOL(free_task); static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) { struct file *exe_file; exe_file = get_mm_exe_file(oldmm); RCU_INIT_POINTER(mm->exe_file, exe_file); /* * We depend on the oldmm having properly denied write access to the * exe_file already. */ if (exe_file && exe_file_deny_write_access(exe_file)) pr_warn_once("exe_file_deny_write_access() failed in %s\n", __func__); } #ifdef CONFIG_MMU static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); VMA_ITERATOR(vmi, mm, 0); if (mmap_write_lock_killable(oldmm)) return -EINTR; flush_cache_dup_mm(oldmm); uprobe_dup_mmap(oldmm, mm); /* * Not linked in yet - no deadlock potential: */ mmap_write_lock_nested(mm, SINGLE_DEPTH_NESTING); /* No ordering required: file already has been exposed. */ dup_mm_exe_file(mm, oldmm); mm->total_vm = oldmm->total_vm; mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; /* Use __mt_dup() to efficiently build an identical maple tree. */ retval = __mt_dup(&oldmm->mm_mt, &mm->mm_mt, GFP_KERNEL); if (unlikely(retval)) goto out; mt_clear_in_rcu(vmi.mas.tree); for_each_vma(vmi, mpnt) { struct file *file; vma_start_write(mpnt); if (mpnt->vm_flags & VM_DONTCOPY) { retval = vma_iter_clear_gfp(&vmi, mpnt->vm_start, mpnt->vm_end, GFP_KERNEL); if (retval) goto loop_out; vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; /* * Don't duplicate many vmas if we've been oom-killed (for * example) */ if (fatal_signal_pending(current)) { retval = -EINTR; goto loop_out; } if (mpnt->vm_flags & VM_ACCOUNT) { unsigned long len = vma_pages(mpnt); if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ goto fail_nomem; charge = len; } tmp = vm_area_dup(mpnt); if (!tmp) goto fail_nomem; /* track_pfn_copy() will later take care of copying internal state. */ if (unlikely(tmp->vm_flags & VM_PFNMAP)) untrack_pfn_clear(tmp); retval = vma_dup_policy(mpnt, tmp); if (retval) goto fail_nomem_policy; tmp->vm_mm = mm; retval = dup_userfaultfd(tmp, &uf); if (retval) goto fail_nomem_anon_vma_fork; if (tmp->vm_flags & VM_WIPEONFORK) { /* * VM_WIPEONFORK gets a clean slate in the child. * Don't prepare anon_vma until fault since we don't * copy page for current vma. */ tmp->anon_vma = NULL; } else if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; vm_flags_clear(tmp, VM_LOCKED_MASK); /* * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) hugetlb_dup_vma_private(tmp); /* * Link the vma into the MT. After using __mt_dup(), memory * allocation is not necessary here, so it cannot fail. */ vma_iter_bulk_store(&vmi, tmp); mm->map_count++; if (tmp->vm_ops && tmp->vm_ops->open) tmp->vm_ops->open(tmp); file = tmp->vm_file; if (file) { struct address_space *mapping = file->f_mapping; get_file(file); i_mmap_lock_write(mapping); if (vma_is_shared_maywrite(tmp)) mapping_allow_writable(mapping); flush_dcache_mmap_lock(mapping); /* insert tmp into the share list, just after mpnt */ vma_interval_tree_insert_after(tmp, mpnt, &mapping->i_mmap); flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt); if (retval) { mpnt = vma_next(&vmi); goto loop_out; } } /* a new mm has just been created */ retval = arch_dup_mmap(oldmm, mm); loop_out: vma_iter_free(&vmi); if (!retval) { mt_set_in_rcu(vmi.mas.tree); ksm_fork(mm, oldmm); khugepaged_fork(mm, oldmm); } else { /* * The entire maple tree has already been duplicated. If the * mmap duplication fails, mark the failure point with * XA_ZERO_ENTRY. In exit_mmap(), if this marker is encountered, * stop releasing VMAs that have not been duplicated after this * point. */ if (mpnt) { mas_set_range(&vmi.mas, mpnt->vm_start, mpnt->vm_end - 1); mas_store(&vmi.mas, XA_ZERO_ENTRY); /* Avoid OOM iterating a broken tree */ set_bit(MMF_OOM_SKIP, &mm->flags); } /* * The mm_struct is going to exit, but the locks will be dropped * first. Set the mm_struct as unstable is advisable as it is * not fully initialised. */ set_bit(MMF_UNSTABLE, &mm->flags); } out: mmap_write_unlock(mm); flush_tlb_mm(oldmm); mmap_write_unlock(oldmm); if (!retval) dup_userfaultfd_complete(&uf); else dup_userfaultfd_fail(&uf); return retval; fail_nomem_anon_vma_fork: mpol_put(vma_policy(tmp)); fail_nomem_policy: vm_area_free(tmp); fail_nomem: retval = -ENOMEM; vm_unacct_memory(charge); goto loop_out; } static inline int mm_alloc_pgd(struct mm_struct *mm) { mm->pgd = pgd_alloc(mm); if (unlikely(!mm->pgd)) return -ENOMEM; return 0; } static inline void mm_free_pgd(struct mm_struct *mm) { pgd_free(mm, mm->pgd); } #else static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { mmap_write_lock(oldmm); dup_mm_exe_file(mm, oldmm); mmap_write_unlock(oldmm); return 0; } #define mm_alloc_pgd(mm) (0) #define mm_free_pgd(mm) #endif /* CONFIG_MMU */ #ifdef CONFIG_MM_ID static DEFINE_IDA(mm_ida); static inline int mm_alloc_id(struct mm_struct *mm) { int ret; ret = ida_alloc_range(&mm_ida, MM_ID_MIN, MM_ID_MAX, GFP_KERNEL); if (ret < 0) return ret; mm->mm_id = ret; return 0; } static inline void mm_free_id(struct mm_struct *mm) { const mm_id_t id = mm->mm_id; mm->mm_id = MM_ID_DUMMY; if (id == MM_ID_DUMMY) return; if (WARN_ON_ONCE(id < MM_ID_MIN || id > MM_ID_MAX)) return; ida_free(&mm_ida, id); } #else /* !CONFIG_MM_ID */ static inline int mm_alloc_id(struct mm_struct *mm) { return 0; } static inline void mm_free_id(struct mm_struct *mm) {} #endif /* CONFIG_MM_ID */ static void check_mm(struct mm_struct *mm) { int i; BUILD_BUG_ON_MSG(ARRAY_SIZE(resident_page_types) != NR_MM_COUNTERS, "Please make sure 'struct resident_page_types[]' is updated as well"); for (i = 0; i < NR_MM_COUNTERS; i++) { long x = percpu_counter_sum(&mm->rss_stat[i]); if (unlikely(x)) pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n", mm, resident_page_types[i], x); } if (mm_pgtables_bytes(mm)) pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) VM_BUG_ON_MM(mm->pmd_huge_pte, mm); #endif } #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) #define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) static void do_check_lazy_tlb(void *arg) { struct mm_struct *mm = arg; WARN_ON_ONCE(current->active_mm == mm); } static void do_shoot_lazy_tlb(void *arg) { struct mm_struct *mm = arg; if (current->active_mm == mm) { WARN_ON_ONCE(current->mm); current->active_mm = &init_mm; switch_mm(mm, &init_mm, current); } } static void cleanup_lazy_tlbs(struct mm_struct *mm) { if (!IS_ENABLED(CONFIG_MMU_LAZY_TLB_SHOOTDOWN)) { /* * In this case, lazy tlb mms are refounted and would not reach * __mmdrop until all CPUs have switched away and mmdrop()ed. */ return; } /* * Lazy mm shootdown does not refcount "lazy tlb mm" usage, rather it * requires lazy mm users to switch to another mm when the refcount * drops to zero, before the mm is freed. This requires IPIs here to * switch kernel threads to init_mm. * * archs that use IPIs to flush TLBs can piggy-back that lazy tlb mm * switch with the final userspace teardown TLB flush which leaves the * mm lazy on this CPU but no others, reducing the need for additional * IPIs here. There are cases where a final IPI is still required here, * such as the final mmdrop being performed on a different CPU than the * one exiting, or kernel threads using the mm when userspace exits. * * IPI overheads have not found to be expensive, but they could be * reduced in a number of possible ways, for example (roughly * increasing order of complexity): * - The last lazy reference created by exit_mm() could instead switch * to init_mm, however it's probable this will run on the same CPU * immediately afterwards, so this may not reduce IPIs much. * - A batch of mms requiring IPIs could be gathered and freed at once. * - CPUs store active_mm where it can be remotely checked without a * lock, to filter out false-positives in the cpumask. * - After mm_users or mm_count reaches zero, switching away from the * mm could clear mm_cpumask to reduce some IPIs, perhaps together * with some batching or delaying of the final IPIs. * - A delayed freeing and RCU-like quiescing sequence based on mm * switching to avoid IPIs completely. */ on_each_cpu_mask(mm_cpumask(mm), do_shoot_lazy_tlb, (void *)mm, 1); if (IS_ENABLED(CONFIG_DEBUG_VM_SHOOT_LAZIES)) on_each_cpu(do_check_lazy_tlb, (void *)mm, 1); } /* * Called when the last reference to the mm * is dropped: either by a lazy thread or by * mmput. Free the page directory and the mm. */ void __mmdrop(struct mm_struct *mm) { BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); /* Ensure no CPUs are using this as their lazy tlb mm */ cleanup_lazy_tlbs(mm); WARN_ON_ONCE(mm == current->active_mm); mm_free_pgd(mm); mm_free_id(mm); destroy_context(mm); mmu_notifier_subscriptions_destroy(mm); check_mm(mm); put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; mm = container_of(work, struct mm_struct, async_put_work); __mmdrop(mm); } static void mmdrop_async(struct mm_struct *mm) { if (unlikely(atomic_dec_and_test(&mm->mm_count))) { INIT_WORK(&mm->async_put_work, mmdrop_async_fn); schedule_work(&mm->async_put_work); } } static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); sched_autogroup_exit(sig); /* * __mmdrop is not safe to call from softirq context on x86 due to * pgd_dtor so postpone it to the async context */ if (sig->oom_mm) mmdrop_async(sig->oom_mm); kmem_cache_free(signal_cachep, sig); } static inline void put_signal_struct(struct signal_struct *sig) { if (refcount_dec_and_test(&sig->sigcnt)) free_signal_struct(sig); } void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); sched_ext_free(tsk); io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); security_task_free(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); sched_core_free(tsk); free_task(tsk); } EXPORT_SYMBOL_GPL(__put_task_struct); void __put_task_struct_rcu_cb(struct rcu_head *rhp) { struct task_struct *task = container_of(rhp, struct task_struct, rcu); __put_task_struct(task); } EXPORT_SYMBOL_GPL(__put_task_struct_rcu_cb); void __init __weak arch_task_cache_init(void) { } /* * set_max_threads */ static void __init set_max_threads(unsigned int max_threads_suggested) { u64 threads; unsigned long nr_pages = memblock_estimated_nr_free_pages(); /* * The number of threads shall be limited such that the thread * structures may only consume a small part of the available memory. */ if (fls64(nr_pages) + fls64(PAGE_SIZE) > 64) threads = MAX_THREADS; else threads = div64_u64((u64) nr_pages * (u64) PAGE_SIZE, (u64) THREAD_SIZE * 8UL); if (threads > max_threads_suggested) threads = max_threads_suggested; max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); } #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT /* Initialized by the architecture: */ int arch_task_struct_size __read_mostly; #endif static void __init task_struct_whitelist(unsigned long *offset, unsigned long *size) { /* Fetch thread_struct whitelist for the architecture. */ arch_thread_struct_whitelist(offset, size); /* * Handle zero-sized whitelist or empty thread_struct, otherwise * adjust offset to position of thread_struct in task_struct. */ if (unlikely(*size == 0)) *offset = 0; else *offset += offsetof(struct task_struct, thread); } void __init fork_init(void) { int i; #ifndef ARCH_MIN_TASKALIGN #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ task_struct_whitelist(&useroffset, &usersize); task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, SLAB_PANIC|SLAB_ACCOUNT, useroffset, usersize, NULL); /* do the arch specific task caches init */ arch_task_cache_init(); set_max_threads(MAX_THREADS); init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2; init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY); set_userns_rlimit_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY); #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", NULL, free_vm_stack_cache); #endif scs_init(); lockdep_init_task(&init_task); uprobes_init(); } int __weak arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; return 0; } void set_task_stack_end_magic(struct task_struct *tsk) { unsigned long *stackend; stackend = end_of_stack(tsk); *stackend = STACK_END_MAGIC; /* for overflow detection */ } static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; int err; if (node == NUMA_NO_NODE) node = tsk_fork_get_node(orig); tsk = alloc_task_struct_node(node); if (!tsk) return NULL; err = arch_dup_task_struct(tsk, orig); if (err) goto free_tsk; err = alloc_thread_stack_node(tsk, node); if (err) goto free_tsk; #ifdef CONFIG_THREAD_INFO_IN_TASK refcount_set(&tsk->stack_refcount, 1); #endif account_kernel_stack(tsk, 1); err = scs_prepare(tsk, node); if (err) goto free_stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under * the sighand lock in case orig has changed between now and * then. Until then, filter must be NULL to avoid messing up * the usage counts on the error path calling free_task. */ tsk->seccomp.filter = NULL; #endif setup_thread_stack(tsk, orig); clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); #endif if (orig->cpus_ptr == &orig->cpus_mask) tsk->cpus_ptr = &tsk->cpus_mask; dup_user_cpus_ptr(tsk, orig, node); /* * One for the user space visible state that goes away when reaped. * One for the scheduler. */ refcount_set(&tsk->rcu_users, 2); /* One for the rcu users */ refcount_set(&tsk->usage, 1); #ifdef CONFIG_BLK_DEV_IO_TRACE tsk->btrace_seq = 0; #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; tsk->worker_private = NULL; kcov_task_init(tsk); kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; #endif #ifdef CONFIG_BLK_CGROUP tsk->throttle_disk = NULL; tsk->use_memdelay = 0; #endif #ifdef CONFIG_ARCH_HAS_CPU_PASID tsk->pasid_activated = 0; #endif #ifdef CONFIG_MEMCG tsk->active_memcg = NULL; #endif #ifdef CONFIG_X86_BUS_LOCK_DETECT tsk->reported_split_lock = 0; #endif #ifdef CONFIG_SCHED_MM_CID tsk->mm_cid = -1; tsk->last_mm_cid = -1; tsk->mm_cid_active = 0; tsk->migrate_from_cpu = -1; #endif return tsk; free_stack: exit_task_stack_account(tsk); free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; } __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) { default_dump_filter = (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) & MMF_DUMP_FILTER_MASK; return 1; } __setup("coredump_filter=", coredump_filter_setup); #include <linux/init_task.h> static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); mm->ioctx_table = NULL; #endif } static __always_inline void mm_clear_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG if (mm->owner == p) WRITE_ONCE(mm->owner, NULL); #endif } static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { #ifdef CONFIG_MEMCG mm->owner = p; #endif } static void mm_init_uprobes_state(struct mm_struct *mm) { #ifdef CONFIG_UPROBES mm->uprobes_state.xol_area = NULL; #endif } static void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); mm_lock_seqcount_init(mm); #ifdef CONFIG_PER_VMA_LOCK rcuwait_init(&mm->vma_writer_wait); #endif } static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; atomic64_set(&mm->pinned_vm, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); mm_pasid_init(mm); RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_subscriptions_init(mm); init_tlb_flush_pending(mm); futex_mm_init(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(CONFIG_SPLIT_PMD_PTLOCKS) mm->pmd_huge_pte = NULL; #endif mm_init_uprobes_state(mm); hugetlb_count_init(mm); if (current->mm) { mm->flags = mmf_init_flags(current->mm->flags); mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK; } else { mm->flags = default_dump_filter; mm->def_flags = 0; } if (mm_alloc_pgd(mm)) goto fail_nopgd; if (mm_alloc_id(mm)) goto fail_noid; if (init_new_context(p, mm)) goto fail_nocontext; if (mm_alloc_cid(mm, p)) goto fail_cid; if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS)) goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: mm_destroy_cid(mm); fail_cid: destroy_context(mm); fail_nocontext: mm_free_id(mm); fail_noid: mm_free_pgd(mm); fail_nopgd: free_mm(mm); return NULL; } /* * Allocate and initialize an mm_struct. */ struct mm_struct *mm_alloc(void) { struct mm_struct *mm; mm = allocate_mm(); if (!mm) return NULL; memset(mm, 0, sizeof(*mm)); return mm_init(mm, current, current_user_ns()); } EXPORT_SYMBOL_IF_KUNIT(mm_alloc); static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); uprobe_clear_state(mm); exit_aio(mm); ksm_exit(mm); khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); mm_put_huge_zero_folio(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); list_del(&mm->mmlist); spin_unlock(&mmlist_lock); } if (mm->binfmt) module_put(mm->binfmt->module); lru_gen_del_mm(mm); futex_hash_free(mm); mmdrop(mm); } /* * Decrement the use count and release all resources for an mm. */ void mmput(struct mm_struct *mm) { might_sleep(); if (atomic_dec_and_test(&mm->mm_users)) __mmput(mm); } EXPORT_SYMBOL_GPL(mmput); #ifdef CONFIG_MMU static void mmput_async_fn(struct work_struct *work) { struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); __mmput(mm); } void mmput_async(struct mm_struct *mm) { if (atomic_dec_and_test(&mm->mm_users)) { INIT_WORK(&mm->async_put_work, mmput_async_fn); schedule_work(&mm->async_put_work); } } EXPORT_SYMBOL_GPL(mmput_async); #endif /** * set_mm_exe_file - change a reference to the mm's executable file * @mm: The mm to change. * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main users are mmput() and sys_execve(). Callers prevent concurrent * invocations: in mmput() nobody alive left, in execve it happens before * the new mm is made visible to anyone. * * Can only fail if new_exe_file != NULL. */ int set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct file *old_exe_file; /* * It is safe to dereference the exe_file without RCU as * this function is only called if nobody else can access * this mm -- see comment above for justification. */ old_exe_file = rcu_dereference_raw(mm->exe_file); if (new_exe_file) { /* * We expect the caller (i.e., sys_execve) to already denied * write access, so this is unlikely to fail. */ if (unlikely(exe_file_deny_write_access(new_exe_file))) return -EACCES; get_file(new_exe_file); } rcu_assign_pointer(mm->exe_file, new_exe_file); if (old_exe_file) { exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * replace_mm_exe_file - replace a reference to the mm's executable file * @mm: The mm to change. * @new_exe_file: The new file to use. * * This changes mm's executable file (shown as symlink /proc/[pid]/exe). * * Main user is sys_prctl(PR_SET_MM_MAP/EXE_FILE). */ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) { struct vm_area_struct *vma; struct file *old_exe_file; int ret = 0; /* Forbid mm->exe_file change if old file still mapped. */ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, &old_exe_file->f_path)) { ret = -EBUSY; break; } } mmap_read_unlock(mm); fput(old_exe_file); if (ret) return ret; } ret = exe_file_deny_write_access(new_exe_file); if (ret) return -EACCES; get_file(new_exe_file); /* set the new file */ mmap_write_lock(mm); old_exe_file = rcu_dereference_raw(mm->exe_file); rcu_assign_pointer(mm->exe_file, new_exe_file); mmap_write_unlock(mm); if (old_exe_file) { exe_file_allow_write_access(old_exe_file); fput(old_exe_file); } return 0; } /** * get_mm_exe_file - acquire a reference to the mm's executable file * @mm: The mm of interest. * * Returns %NULL if mm has no associated executable file. * User must release file via fput(). */ struct file *get_mm_exe_file(struct mm_struct *mm) { struct file *exe_file; rcu_read_lock(); exe_file = get_file_rcu(&mm->exe_file); rcu_read_unlock(); return exe_file; } /** * get_task_exe_file - acquire a reference to the task's executable file * @task: The task. * * Returns %NULL if task's mm (if any) has no associated executable file or * this is a kernel thread with borrowed mm (see the comment above get_task_mm). * User must release file via fput(). */ struct file *get_task_exe_file(struct task_struct *task) { struct file *exe_file = NULL; struct mm_struct *mm; if (task->flags & PF_KTHREAD) return NULL; task_lock(task); mm = task->mm; if (mm) exe_file = get_mm_exe_file(mm); task_unlock(task); return exe_file; } /** * get_task_mm - acquire a reference to the task's mm * @task: The task. * * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning * this kernel workthread has transiently adopted a user mm with use_mm, * to do its AIO) is not set and if so returns a reference to it, after * bumping up the use count. User must release the mm via mmput() * after use. Typically used by /proc and ptrace. */ struct mm_struct *get_task_mm(struct task_struct *task) { struct mm_struct *mm; if (task->flags & PF_KTHREAD) return NULL; task_lock(task); mm = task->mm; if (mm) mmget(mm); task_unlock(task); return mm; } EXPORT_SYMBOL_GPL(get_task_mm); static bool may_access_mm(struct mm_struct *mm, struct task_struct *task, unsigned int mode) { if (mm == current->mm) return true; if (ptrace_may_access(task, mode)) return true; if ((mode & PTRACE_MODE_READ) && perfmon_capable()) return true; return false; } struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) { struct mm_struct *mm; int err; err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); mm = get_task_mm(task); if (!mm) { mm = ERR_PTR(-ESRCH); } else if (!may_access_mm(mm, task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } up_read(&task->signal->exec_update_lock); return mm; } static void complete_vfork_done(struct task_struct *tsk) { struct completion *vfork; task_lock(tsk); vfork = tsk->vfork_done; if (likely(vfork)) { tsk->vfork_done = NULL; complete(vfork); } task_unlock(tsk); } static int wait_for_vfork_done(struct task_struct *child, struct completion *vfork) { unsigned int state = TASK_KILLABLE|TASK_FREEZABLE; int killed; cgroup_enter_frozen(); killed = wait_for_completion_state(vfork, state); cgroup_leave_frozen(false); if (killed) { task_lock(child); child->vfork_done = NULL; task_unlock(child); } put_task_struct(child); return killed; } /* Please note the differences between mmput and mm_release. * mmput is called whenever we stop holding onto a mm_struct, * error success whatever. * * mm_release is called after a mm_struct has been removed * from the current process. * * This difference is important for error handling, when we * only half set up a mm_struct for a new process and need to restore * the old one. Because we mmput the new mm_struct before * restoring the old one. . . * Eric Biederman 10 January 1998 */ static void mm_release(struct task_struct *tsk, struct mm_struct *mm) { uprobe_free_utask(tsk); /* Get rid of any cached register state */ deactivate_mm(tsk, mm); /* * Signal userspace if we're not exiting with a core dump * because we want to leave the value intact for debugging * purposes. */ if (tsk->clear_child_tid) { if (atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has * not set up a proper pointer then tough luck. */ put_user(0, tsk->clear_child_tid); do_futex(tsk->clear_child_tid, FUTEX_WAKE, 1, NULL, NULL, 0, 0); } tsk->clear_child_tid = NULL; } /* * All done, finally we can wake up parent and return this mm to him. * Also kthread_stop() uses this completion for synchronization. */ if (tsk->vfork_done) complete_vfork_done(tsk); } void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exit_release(tsk); mm_release(tsk, mm); } void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm) { futex_exec_release(tsk); mm_release(tsk, mm); } /** * dup_mm() - duplicates an existing mm structure * @tsk: the task_struct with which the new mm will be associated. * @oldmm: the mm to duplicate. * * Allocates a new mm structure and duplicates the provided @oldmm structure * content into it. * * Return: the duplicated mm or NULL on failure. */ static struct mm_struct *dup_mm(struct task_struct *tsk, struct mm_struct *oldmm) { struct mm_struct *mm; int err; mm = allocate_mm(); if (!mm) goto fail_nomem; memcpy(mm, oldmm, sizeof(*mm)); if (!mm_init(mm, tsk, mm->user_ns)) goto fail_nomem; uprobe_start_dup_mmap(); err = dup_mmap(mm, oldmm); if (err) goto free_pt; uprobe_end_dup_mmap(); mm->hiwater_rss = get_mm_rss(mm); mm->hiwater_vm = mm->total_vm; if (mm->binfmt && !try_module_get(mm->binfmt->module)) goto free_pt; return mm; free_pt: /* don't put binfmt in mmput, we haven't got module yet */ mm->binfmt = NULL; mm_init_owner(mm, NULL); mmput(mm); if (err) uprobe_end_dup_mmap(); fail_nomem: return NULL; } static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) { struct mm_struct *mm, *oldmm; tsk->min_flt = tsk->maj_flt = 0; tsk->nvcsw = tsk->nivcsw = 0; #ifdef CONFIG_DETECT_HUNG_TASK tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw; tsk->last_switch_time = 0; #endif tsk->mm = NULL; tsk->active_mm = NULL; /* * Are we cloning a kernel thread? * * We need to steal a active VM for that.. */ oldmm = current->mm; if (!oldmm) return 0; if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; } else { mm = dup_mm(tsk, current->mm); if (!mm) return -ENOMEM; } tsk->mm = mm; tsk->active_mm = mm; sched_mm_cid_fork(tsk); return 0; } static int copy_fs(unsigned long clone_flags, struct task_struct *tsk) { struct fs_struct *fs = current->fs; if (clone_flags & CLONE_FS) { /* tsk->fs is already what we want */ spin_lock(&fs->lock); /* "users" and "in_exec" locked for check_unsafe_exec() */ if (fs->in_exec) { spin_unlock(&fs->lock); return -EAGAIN; } fs->users++; spin_unlock(&fs->lock); return 0; } tsk->fs = copy_fs_struct(fs); if (!tsk->fs) return -ENOMEM; return 0; } static int copy_files(unsigned long clone_flags, struct task_struct *tsk, int no_files) { struct files_struct *oldf, *newf; /* * A background process may not have any files ... */ oldf = current->files; if (!oldf) return 0; if (no_files) { tsk->files = NULL; return 0; } if (clone_flags & CLONE_FILES) { atomic_inc(&oldf->count); return 0; } newf = dup_fd(oldf, NULL); if (IS_ERR(newf)) return PTR_ERR(newf); tsk->files = newf; return 0; } static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk) { struct sighand_struct *sig; if (clone_flags & CLONE_SIGHAND) { refcount_inc(&current->sighand->count); return 0; } sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL); RCU_INIT_POINTER(tsk->sighand, sig); if (!sig) return -ENOMEM; refcount_set(&sig->count, 1); spin_lock_irq(&current->sighand->siglock); memcpy(sig->action, current->sighand->action, sizeof(sig->action)); spin_unlock_irq(&current->sighand->siglock); /* Reset all signal handler not set to SIG_IGN to SIG_DFL. */ if (clone_flags & CLONE_CLEAR_SIGHAND) flush_signal_handlers(tsk, 0); return 0; } void __cleanup_sighand(struct sighand_struct *sighand) { if (refcount_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). */ kmem_cache_free(sighand_cachep, sighand); } } /* * Initialize POSIX timer handling for a thread group. */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { struct posix_cputimers *pct = &sig->posix_cputimers; unsigned long cpu_limit; cpu_limit = READ_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); posix_cputimers_group_init(pct, cpu_limit); } static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) { struct signal_struct *sig; if (clone_flags & CLONE_THREAD) return 0; sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; sig->nr_threads = 1; sig->quick_threads = 1; atomic_set(&sig->live, 1); refcount_set(&sig->sigcnt, 1); /* list_add(thread_node, thread_head) without INIT_LIST_HEAD() */ sig->thread_head = (struct list_head)LIST_HEAD_INIT(tsk->thread_node); tsk->thread_node = (struct list_head)LIST_HEAD_INIT(sig->thread_head); init_waitqueue_head(&sig->wait_chldexit); sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_HLIST_HEAD(&sig->multiprocess); seqlock_init(&sig->stats_lock); prev_cputime_init(&sig->prev_cputime); #ifdef CONFIG_POSIX_TIMERS INIT_HLIST_HEAD(&sig->posix_timers); INIT_HLIST_HEAD(&sig->ignored_posix_timers); hrtimer_setup(&sig->real_timer, it_real_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL); #endif task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); tty_audit_fork(sig); sched_autogroup_fork(sig); sig->oom_score_adj = current->signal->oom_score_adj; sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); init_rwsem(&sig->exec_update_lock); return 0; } static void copy_seccomp(struct task_struct *p) { #ifdef CONFIG_SECCOMP /* * Must be called with sighand->lock held, which is common to * all threads in the group. Holding cred_guard_mutex is not * needed because this new task is not yet running and cannot * be racing exec. */ assert_spin_locked(&current->sighand->siglock); /* Ref-count the new filter user, and assign it. */ get_seccomp_filter(current); p->seccomp = current->seccomp; /* * Explicitly enable no_new_privs here in case it got set * between the task_struct being duplicated and holding the * sighand lock. The seccomp state and nnp must be in sync. */ if (task_no_new_privs(current)) task_set_no_new_privs(p); /* * If the parent gained a seccomp mode after copying thread * flags and between before we held the sighand lock, we have * to manually enable the seccomp thread flag here. */ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) set_task_syscall_work(p, SECCOMP); #endif } SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr) { current->clear_child_tid = tidptr; return task_pid_vnr(current); } static void rt_mutex_init_task(struct task_struct *p) { raw_spin_lock_init(&p->pi_lock); #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT_CACHED; p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) INIT_HLIST_NODE(&task->pid_links[type]); } static inline void init_task_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { if (type == PIDTYPE_PID) task->thread_pid = pid; else task->signal->pids[type] = pid; } static inline void rcu_copy_process(struct task_struct *p) { #ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; INIT_LIST_HEAD(&p->rcu_node_entry); #endif /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TASKS_RCU p->rcu_tasks_holdout = false; INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); p->rcu_tasks_idle_cpu = -1; INIT_LIST_HEAD(&p->rcu_tasks_exit_list); #endif /* #ifdef CONFIG_TASKS_RCU */ #ifdef CONFIG_TASKS_TRACE_RCU p->trc_reader_nesting = 0; p->trc_reader_special.s = 0; INIT_LIST_HEAD(&p->trc_holdout_list); INIT_LIST_HEAD(&p->trc_blkd_node); #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ } /** * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd * @pid: the struct pid for which to create a pidfd * @flags: flags of the new @pidfd * @ret_file: return the new pidfs file * * Allocate a new file that stashes @pid and reserve a new pidfd number in the * caller's file descriptor table. The pidfd is reserved but not installed yet. * * The helper verifies that @pid is still in use, without PIDFD_THREAD the * task identified by @pid must be a thread-group leader. * * If this function returns successfully the caller is responsible to either * call fd_install() passing the returned pidfd and pidfd file as arguments in * order to install the pidfd into its file descriptor table or they must use * put_unused_fd() and fput() on the returned pidfd and pidfd file * respectively. * * This function is useful when a pidfd must already be reserved but there * might still be points of failure afterwards and the caller wants to ensure * that no pidfd is leaked into its file descriptor table. * * Return: On success, a reserved pidfd is returned from the function and a new * pidfd file is returned in the last argument to the function. On * error, a negative error code is returned from the function and the * last argument remains unchanged. */ int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret_file) { struct file *pidfs_file; /* * PIDFD_STALE is only allowed to be passed if the caller knows * that @pid is already registered in pidfs and thus * PIDFD_INFO_EXIT information is guaranteed to be available. */ if (!(flags & PIDFD_STALE)) { /* * While holding the pidfd waitqueue lock removing the * task linkage for the thread-group leader pid * (PIDTYPE_TGID) isn't possible. Thus, if there's still * task linkage for PIDTYPE_PID not having thread-group * leader linkage for the pid means it wasn't a * thread-group leader in the first place. */ guard(spinlock_irq)(&pid->wait_pidfd.lock); /* Task has already been reaped. */ if (!pid_has_task(pid, PIDTYPE_PID)) return -ESRCH; /* * If this struct pid isn't used as a thread-group * leader but the caller requested to create a * thread-group leader pidfd then report ENOENT. */ if (!(flags & PIDFD_THREAD) && !pid_has_task(pid, PIDTYPE_TGID)) return -ENOENT; } CLASS(get_unused_fd, pidfd)(O_CLOEXEC); if (pidfd < 0) return pidfd; pidfs_file = pidfs_alloc_file(pid, flags | O_RDWR); if (IS_ERR(pidfs_file)) return PTR_ERR(pidfs_file); *ret_file = pidfs_file; return take_fd(pidfd); } static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); free_task(tsk); } static __always_inline void delayed_free_task(struct task_struct *tsk) { if (IS_ENABLED(CONFIG_MEMCG)) call_rcu(&tsk->rcu, __delayed_free_task); else free_task(tsk); } static void copy_oom_score_adj(u64 clone_flags, struct task_struct *tsk) { /* Skip if kernel thread */ if (!tsk->mm) return; /* Skip if spawning a thread or using vfork */ if ((clone_flags & (CLONE_VM | CLONE_THREAD | CLONE_VFORK)) != CLONE_VM) return; /* We need to synchronize with __set_oom_adj */ mutex_lock(&oom_adj_mutex); set_bit(MMF_MULTIPROCESS, &tsk->mm->flags); /* Update the values in case they were changed after copy_signal */ tsk->signal->oom_score_adj = current->signal->oom_score_adj; tsk->signal->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_unlock(&oom_adj_mutex); } #ifdef CONFIG_RV static void rv_task_fork(struct task_struct *p) { int i; for (i = 0; i < RV_PER_TASK_MONITORS; i++) p->rv[i].da_mon.monitoring = false; } #else #define rv_task_fork(p) do {} while (0) #endif static bool need_futex_hash_allocate_default(u64 clone_flags) { if ((clone_flags & (CLONE_THREAD | CLONE_VM)) != (CLONE_THREAD | CLONE_VM)) return false; return true; } /* * This creates a new process as a copy of the old one, * but does not actually start it yet. * * It copies the registers, and all the appropriate * parts of the process environment (as per the clone * flags). The actual kick-off is left to the caller. */ __latent_entropy struct task_struct *copy_process( struct pid *pid, int trace, int node, struct kernel_clone_args *args) { int pidfd = -1, retval; struct task_struct *p; struct multiprocess_signals delayed; struct file *pidfile = NULL; const u64 clone_flags = args->flags; struct nsproxy *nsp = current->nsproxy; /* * Don't allow sharing the root directory with processes in a different * namespace */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. */ if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND)) return ERR_PTR(-EINVAL); /* * Shared signal handlers imply shared VM. By way of the above, * thread groups also imply shared VM. Blocking this case allows * for various simplifications in other code. */ if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM)) return ERR_PTR(-EINVAL); /* * Siblings of global init remain as zombies on exit since they are * not reaped by their parent (swapper). To solve this and to avoid * multi-rooted process trees, prevent global and container-inits * from creating siblings. */ if ((clone_flags & CLONE_PARENT) && current->signal->flags & SIGNAL_UNKILLABLE) return ERR_PTR(-EINVAL); /* * If the new process will be in a different pid or user namespace * do not allow it to share a thread group with the forking task. */ if (clone_flags & CLONE_THREAD) { if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || (task_active_pid_ns(current) != nsp->pid_ns_for_children)) return ERR_PTR(-EINVAL); } if (clone_flags & CLONE_PIDFD) { /* * - CLONE_DETACHED is blocked so that we can potentially * reuse it later for CLONE_PIDFD. */ if (clone_flags & CLONE_DETACHED) return ERR_PTR(-EINVAL); } /* * Force any signals received before this point to be delivered * before the fork happens. Collect up signals sent to multiple * processes that happen during the fork and delay them so that * they appear to happen after the fork. */ sigemptyset(&delayed.signal); INIT_HLIST_NODE(&delayed.node); spin_lock_irq(&current->sighand->siglock); if (!(clone_flags & CLONE_THREAD)) hlist_add_head(&delayed.node, &current->signal->multiprocess); recalc_sigpending(); spin_unlock_irq(&current->sighand->siglock); retval = -ERESTARTNOINTR; if (task_sigpending(current)) goto fork_out; retval = -ENOMEM; p = dup_task_struct(current, node); if (!p) goto fork_out; p->flags &= ~PF_KTHREAD; if (args->kthread) p->flags |= PF_KTHREAD; if (args->user_worker) { /* * Mark us a user worker, and block any signal that isn't * fatal or STOP */ p->flags |= PF_USER_WORKER; siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP)); } if (args->io_thread) p->flags |= PF_IO_WORKER; if (args->name) strscpy_pad(p->comm, args->name, sizeof(p->comm)); p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL; /* * Clear TID on mm_release()? */ p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL; ftrace_graph_init_task(p); rt_mutex_init_task(p); lockdep_assert_irqs_enabled(); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif retval = copy_creds(p, clone_flags); if (retval < 0) goto bad_fork_free; retval = -EAGAIN; if (is_rlimit_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) { if (p->real_cred->user != INIT_USER && !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) goto bad_fork_cleanup_count; } current->flags &= ~PF_NPROC_EXCEEDED; /* * If multiple threads are within copy_process(), then this check * triggers too late. This doesn't hurt, the check is only there * to stop root fork bombs. */ retval = -EAGAIN; if (data_race(nr_threads >= max_threads)) goto bad_fork_cleanup_count; delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER | PF_IDLE | PF_NO_SETAFFINITY); p->flags |= PF_FORKNOEXEC; INIT_LIST_HEAD(&p->children); INIT_LIST_HEAD(&p->sibling); rcu_copy_process(p); p->vfork_done = NULL; spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME p->utimescaled = p->stimescaled = 0; #endif prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqcount_init(&p->vtime.seqcount); p->vtime.starttime = 0; p->vtime.state = VTIME_INACTIVE; #endif #ifdef CONFIG_IO_URING p->io_uring = NULL; #endif p->default_timer_slack_ns = current->timer_slack_ns; #ifdef CONFIG_PSI p->psi_flags = 0; #endif task_io_accounting_init(&p->ioac); acct_clear_integrals(p); posix_cputimers_init(&p->posix_cputimers); tick_dep_init_task(p); p->io_context = NULL; audit_set_context(p, NULL); cgroup_fork(p); if (args->kthread) { if (!set_kthread_struct(p)) goto bad_fork_cleanup_delayacct; } #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; goto bad_fork_cleanup_delayacct; } #endif #ifdef CONFIG_CPUSETS p->cpuset_mem_spread_rotor = NUMA_NO_NODE; seqcount_spinlock_init(&p->mems_allowed_seq, &p->alloc_lock); #endif #ifdef CONFIG_TRACE_IRQFLAGS memset(&p->irqtrace, 0, sizeof(p->irqtrace)); p->irqtrace.hardirq_disable_ip = _THIS_IP_; p->irqtrace.softirq_enable_ip = _THIS_IP_; p->softirqs_enabled = 1; p->softirq_context = 0; #endif p->pagefault_disabled = 0; #ifdef CONFIG_LOCKDEP lockdep_init_task(p); #endif #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif #ifdef CONFIG_BCACHE p->sequential_io = 0; p->sequential_io_avg = 0; #endif #ifdef CONFIG_BPF_SYSCALL RCU_INIT_POINTER(p->bpf_storage, NULL); p->bpf_ctx = NULL; #endif /* Perform scheduler related setup. Assign this task to a CPU. */ retval = sched_fork(clone_flags, p); if (retval) goto bad_fork_cleanup_policy; retval = perf_event_init_task(p, clone_flags); if (retval) goto bad_fork_sched_cancel_fork; retval = audit_alloc(p); if (retval) goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; retval = copy_semundo(clone_flags, p); if (retval) goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p, args->no_files); if (retval) goto bad_fork_cleanup_semundo; retval = copy_fs(clone_flags, p); if (retval) goto bad_fork_cleanup_files; retval = copy_sighand(clone_flags, p); if (retval) goto bad_fork_cleanup_fs; retval = copy_signal(clone_flags, p); if (retval) goto bad_fork_cleanup_sighand; retval = copy_mm(clone_flags, p); if (retval) goto bad_fork_cleanup_signal; retval = copy_namespaces(clone_flags, p); if (retval) goto bad_fork_cleanup_mm; retval = copy_io(clone_flags, p); if (retval) goto bad_fork_cleanup_namespaces; retval = copy_thread(p, args); if (retval) goto bad_fork_cleanup_io; stackleak_task_init(p); if (pid != &init_struct_pid) { pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid, args->set_tid_size); if (IS_ERR(pid)) { retval = PTR_ERR(pid); goto bad_fork_cleanup_thread; } } /* * This has to happen after we've potentially unshared the file * descriptor table (so that the pidfd doesn't leak into the child * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { int flags = (clone_flags & CLONE_THREAD) ? PIDFD_THREAD : 0; /* * Note that no task has been attached to @pid yet indicate * that via CLONE_PIDFD. */ retval = pidfd_prepare(pid, flags | PIDFD_STALE, &pidfile); if (retval < 0) goto bad_fork_free_pid; pidfd = retval; retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; } #ifdef CONFIG_BLOCK p->plug = NULL; #endif futex_init_task(p); /* * sigaltstack should be cleared when sharing the same VM */ if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) sas_ss_reset(p); /* * Syscall tracing and stepping should be turned off in the * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); #if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { p->group_leader = current->group_leader; p->tgid = current->tgid; } else { p->group_leader = p; p->tgid = p->pid; } p->nr_dirtied = 0; p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10); p->dirty_paused_when = 0; p->pdeath_signal = 0; p->task_works = NULL; clear_posix_cputimers_work(p); #ifdef CONFIG_KRETPROBES p->kretprobe_instances.first = NULL; #endif #ifdef CONFIG_RETHOOK p->rethooks.first = NULL; #endif /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted that the new process's css_set can be changed * between here and cgroup_post_fork() if an organisation operation is in * progress. */ retval = cgroup_can_fork(p, args); if (retval) goto bad_fork_put_pidfd; /* * Now that the cgroups are pinned, re-clone the parent cgroup and put * the new task on the correct runqueue. All this *before* the task * becomes visible. * * This isn't part of ->can_fork() because while the re-cloning is * cgroup specific, it unconditionally needs to place the task on a * runqueue. */ retval = sched_cgroup_fork(p, args); if (retval) goto bad_fork_cancel_cgroup; /* * Allocate a default futex hash for the user process once the first * thread spawns. */ if (need_futex_hash_allocate_default(clone_flags)) { retval = futex_hash_allocate_default(); if (retval) goto bad_fork_core_free; /* * If we fail beyond this point we don't free the allocated * futex hash map. We assume that another thread will be created * and makes use of it. The hash map will be freed once the main * thread terminates. */ } /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do * not want user-space to be able to predict the process start-time by * stalling fork(2) after we recorded the start_time but before it is * visible to the system. */ p->start_time = ktime_get_ns(); p->start_boottime = ktime_get_boottime_ns(); /* * Make it visible to the rest of the system, but dont wake it up yet. * Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; if (clone_flags & CLONE_THREAD) p->exit_signal = -1; else p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; p->exit_signal = args->exit_signal; } klp_copy_process(p); sched_core_fork(p); spin_lock(&current->sighand->siglock); rv_task_fork(p); rseq_fork(p, clone_flags); /* Don't start children in a dying pid namespace */ if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { retval = -ENOMEM; goto bad_fork_core_free; } /* Let kill terminate clone/fork in the middle */ if (fatal_signal_pending(current)) { retval = -EINTR; goto bad_fork_core_free; } /* No more failure paths after this point. */ /* * Copy seccomp details explicitly here, in case they were changed * before holding sighand lock. */ copy_seccomp(p); init_task_pid_links(p); if (likely(p->pid)) { ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace); init_task_pid(p, PIDTYPE_PID, pid); if (thread_group_leader(p)) { init_task_pid(p, PIDTYPE_TGID, pid); init_task_pid(p, PIDTYPE_PGID, task_pgrp(current)); init_task_pid(p, PIDTYPE_SID, task_session(current)); if (is_child_reaper(pid)) { ns_of_pid(pid)->child_reaper = p; p->signal->flags |= SIGNAL_UNKILLABLE; } p->signal->shared_pending.signal = delayed.signal; p->signal->tty = tty_kref_get(current->signal->tty); /* * Inherit has_child_subreaper flag under the same * tasklist_lock with adding child to the process tree * for propagate_has_child_subreaper optimization. */ p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || p->real_parent->signal->is_child_subreaper; list_add_tail(&p->sibling, &p->real_parent->children); list_add_tail_rcu(&p->tasks, &init_task.tasks); attach_pid(p, PIDTYPE_TGID); attach_pid(p, PIDTYPE_PGID); attach_pid(p, PIDTYPE_SID); __this_cpu_inc(process_counts); } else { current->signal->nr_threads++; current->signal->quick_threads++; atomic_inc(&current->signal->live); refcount_inc(&current->signal->sigcnt); task_join_group_stop(p); list_add_tail_rcu(&p->thread_node, &p->signal->thread_head); } attach_pid(p, PIDTYPE_PID); nr_threads++; } total_forks++; hlist_del_init(&delayed.node); spin_unlock(&current->sighand->siglock); syscall_tracepoint_update(p); write_unlock_irq(&tasklist_lock); if (pidfile) fd_install(pidfd, pidfile); proc_fork_connector(p); sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); trace_task_newtask(p, clone_flags); uprobe_copy_process(p, clone_flags); user_events_fork(p, clone_flags); copy_oom_score_adj(clone_flags, p); return p; bad_fork_core_free: sched_core_free(p); spin_unlock(&current->sighand->siglock); write_unlock_irq(&tasklist_lock); bad_fork_cancel_cgroup: cgroup_cancel_fork(p, args); bad_fork_put_pidfd: if (clone_flags & CLONE_PIDFD) { fput(pidfile); put_unused_fd(pidfd); } bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: exit_thread(p); bad_fork_cleanup_io: if (p->io_context) exit_io_context(p); bad_fork_cleanup_namespaces: exit_task_namespaces(p); bad_fork_cleanup_mm: if (p->mm) { mm_clear_owner(p->mm, p); mmput(p->mm); } bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: exit_fs(p); /* blocking */ bad_fork_cleanup_files: exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); bad_fork_cleanup_security: security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_sched_cancel_fork: sched_cancel_fork(p); bad_fork_cleanup_policy: lockdep_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); #endif bad_fork_cleanup_delayacct: delayacct_tsk_free(p); bad_fork_cleanup_count: dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1); exit_creds(p); bad_fork_free: WRITE_ONCE(p->__state, TASK_DEAD); exit_task_stack_account(p); put_task_stack(p); delayed_free_task(p); fork_out: spin_lock_irq(&current->sighand->siglock); hlist_del_init(&delayed.node); spin_unlock_irq(&current->sighand->siglock); return ERR_PTR(retval); } static inline void init_idle_pids(struct task_struct *idle) { enum pid_type type; for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { INIT_HLIST_NODE(&idle->pid_links[type]); /* not really needed */ init_task_pid(idle, type, &init_struct_pid); } } static int idle_dummy(void *dummy) { /* This function is never called */ return 0; } struct task_struct * __init fork_idle(int cpu) { struct task_struct *task; struct kernel_clone_args args = { .flags = CLONE_VM, .fn = &idle_dummy, .fn_arg = NULL, .kthread = 1, .idle = 1, }; task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args); if (!IS_ERR(task)) { init_idle_pids(task); init_idle(task, cpu); } return task; } /* * This is like kernel_clone(), but shaved down and tailored to just * creating io_uring workers. It returns a created task, or an error pointer. * The returned task is inactive, and the caller must fire it up through * wake_up_new_task(p). All signals are blocked in the created task. */ struct task_struct *create_io_thread(int (*fn)(void *), void *arg, int node) { unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| CLONE_IO; struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, .io_thread = 1, .user_worker = 1, }; return copy_process(NULL, 0, node, &args); } /* * Ok, this is the main fork-routine. * * It copies the process, and if successful kick-starts * it and waits for it to finish using the VM if required. * * args->exit_signal is expected to be checked for sanity by the caller. */ pid_t kernel_clone(struct kernel_clone_args *args) { u64 clone_flags = args->flags; struct completion vfork; struct pid *pid; struct task_struct *p; int trace = 0; pid_t nr; /* * For legacy clone() calls, CLONE_PIDFD uses the parent_tid argument * to return the pidfd. Hence, CLONE_PIDFD and CLONE_PARENT_SETTID are * mutually exclusive. With clone3() CLONE_PIDFD has grown a separate * field in struct clone_args and it still doesn't make sense to have * them both point at the same memory location. Performing this check * here has the advantage that we don't need to have a separate helper * to check for legacy clone(). */ if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID) && (args->pidfd == args->parent_tid)) return -EINVAL; /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly * requested, no event is reported; otherwise, report if the event * for the type of forking is enabled. */ if (!(clone_flags & CLONE_UNTRACED)) { if (clone_flags & CLONE_VFORK) trace = PTRACE_EVENT_VFORK; else if (args->exit_signal != SIGCHLD) trace = PTRACE_EVENT_CLONE; else trace = PTRACE_EVENT_FORK; if (likely(!ptrace_event_enabled(current, trace))) trace = 0; } p = copy_process(NULL, trace, NUMA_NO_NODE, args); add_latent_entropy(); if (IS_ERR(p)) return PTR_ERR(p); /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ trace_sched_process_fork(current, p); pid = get_task_pid(p, PIDTYPE_PID); nr = pid_vnr(pid); if (clone_flags & CLONE_PARENT_SETTID) put_user(nr, args->parent_tid); if (clone_flags & CLONE_VFORK) { p->vfork_done = &vfork; init_completion(&vfork); get_task_struct(p); } if (IS_ENABLED(CONFIG_LRU_GEN_WALKS_MMU) && !(clone_flags & CLONE_VM)) { /* lock the task to synchronize with memcg migration */ task_lock(p); lru_gen_add_mm(p->mm); task_unlock(p); } wake_up_new_task(p); /* forking complete and child started to run, tell ptracer */ if (unlikely(trace)) ptrace_event_pid(trace, pid); if (clone_flags & CLONE_VFORK) { if (!wait_for_vfork_done(p, &vfork)) ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } put_pid(pid); return nr; } /* * Create a kernel thread. */ pid_t kernel_thread(int (*fn)(void *), void *arg, const char *name, unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, .name = name, .kthread = 1, }; return kernel_clone(&args); } /* * Create a user mode thread. */ pid_t user_mode_thread(int (*fn)(void *), void *arg, unsigned long flags) { struct kernel_clone_args args = { .flags = ((lower_32_bits(flags) | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL), .exit_signal = (lower_32_bits(flags) & CSIGNAL), .fn = fn, .fn_arg = arg, }; return kernel_clone(&args); } #ifdef __ARCH_WANT_SYS_FORK SYSCALL_DEFINE0(fork) { #ifdef CONFIG_MMU struct kernel_clone_args args = { .exit_signal = SIGCHLD, }; return kernel_clone(&args); #else /* can not support in nommu mode */ return -EINVAL; #endif } #endif #ifdef __ARCH_WANT_SYS_VFORK SYSCALL_DEFINE0(vfork) { struct kernel_clone_args args = { .flags = CLONE_VFORK | CLONE_VM, .exit_signal = SIGCHLD, }; return kernel_clone(&args); } #endif #ifdef __ARCH_WANT_SYS_CLONE #ifdef CONFIG_CLONE_BACKWARDS SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, unsigned long, tls, int __user *, child_tidptr) #elif defined(CONFIG_CLONE_BACKWARDS2) SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #elif defined(CONFIG_CLONE_BACKWARDS3) SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp, int, stack_size, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #else SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int __user *, parent_tidptr, int __user *, child_tidptr, unsigned long, tls) #endif { struct kernel_clone_args args = { .flags = (lower_32_bits(clone_flags) & ~CSIGNAL), .pidfd = parent_tidptr, .child_tid = child_tidptr, .parent_tid = parent_tidptr, .exit_signal = (lower_32_bits(clone_flags) & CSIGNAL), .stack = newsp, .tls = tls, }; return kernel_clone(&args); } #endif noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; struct clone_args args; pid_t *kset_tid = kargs->set_tid; BUILD_BUG_ON(offsetofend(struct clone_args, tls) != CLONE_ARGS_SIZE_VER0); BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != CLONE_ARGS_SIZE_VER1); BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != CLONE_ARGS_SIZE_VER2); BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) return -EINVAL; err = copy_struct_from_user(&args, sizeof(args), uargs, usize); if (err) return err; if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) return -EINVAL; if (unlikely(!args.set_tid && args.set_tid_size > 0)) return -EINVAL; if (unlikely(args.set_tid && args.set_tid_size == 0)) return -EINVAL; /* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */ if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || !valid_signal(args.exit_signal))) return -EINVAL; if ((args.flags & CLONE_INTO_CGROUP) && (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) return -EINVAL; *kargs = (struct kernel_clone_args){ .flags = args.flags, .pidfd = u64_to_user_ptr(args.pidfd), .child_tid = u64_to_user_ptr(args.child_tid), .parent_tid = u64_to_user_ptr(args.parent_tid), .exit_signal = args.exit_signal, .stack = args.stack, .stack_size = args.stack_size, .tls = args.tls, .set_tid_size = args.set_tid_size, .cgroup = args.cgroup, }; if (args.set_tid && copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), (kargs->set_tid_size * sizeof(pid_t)))) return -EFAULT; kargs->set_tid = kset_tid; return 0; } /** * clone3_stack_valid - check and prepare stack * @kargs: kernel clone args * * Verify that the stack arguments userspace gave us are sane. * In addition, set the stack direction for userspace since it's easy for us to * determine. */ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs) { if (kargs->stack == 0) { if (kargs->stack_size > 0) return false; } else { if (kargs->stack_size == 0) return false; if (!access_ok((void __user *)kargs->stack, kargs->stack_size)) return false; #if !defined(CONFIG_STACK_GROWSUP) kargs->stack += kargs->stack_size; #endif } return true; } static bool clone3_args_valid(struct kernel_clone_args *kargs) { /* Verify that no unknown flags are passed along. */ if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP)) return false; /* * - make the CLONE_DETACHED bit reusable for clone3 * - make the CSIGNAL bits reusable for clone3 */ if (kargs->flags & (CLONE_DETACHED | (CSIGNAL & (~CLONE_NEWTIME)))) return false; if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) == (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) return false; if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) && kargs->exit_signal) return false; if (!clone3_stack_valid(kargs)) return false; return true; } /** * sys_clone3 - create a new process with specific properties * @uargs: argument structure * @size: size of @uargs * * clone3() is the extensible successor to clone()/clone2(). * It takes a struct as argument that is versioned by its size. * * Return: On success, a positive PID for the child process. * On error, a negative errno number. */ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size) { int err; struct kernel_clone_args kargs; pid_t set_tid[MAX_PID_NS_LEVEL]; #ifdef __ARCH_BROKEN_SYS_CLONE3 #warning clone3() entry point is missing, please fix return -ENOSYS; #endif kargs.set_tid = set_tid; err = copy_clone_args_from_user(&kargs, uargs, size); if (err) return err; if (!clone3_args_valid(&kargs)) return -EINVAL; return kernel_clone(&kargs); } void walk_process_tree(struct task_struct *top, proc_visitor visitor, void *data) { struct task_struct *leader, *parent, *child; int res; read_lock(&tasklist_lock); leader = top = top->group_leader; down: for_each_thread(leader, parent) { list_for_each_entry(child, &parent->children, sibling) { res = visitor(child, data); if (res) { if (res < 0) goto out; leader = child; goto down; } up: ; } } if (leader != top) { child = leader; parent = child->real_parent; leader = parent->group_leader; goto up; } out: read_unlock(&tasklist_lock); } #ifndef ARCH_MIN_MMSTRUCT_ALIGN #define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif static void sighand_ctor(void *data) { struct sighand_struct *sighand = data; spin_lock_init(&sighand->siglock); init_waitqueue_head(&sighand->signalfd_wqh); } void __init mm_cache_init(void) { unsigned int mm_size; /* * The mm_cpumask is located at the end of mm_struct, and is * dynamically sized based on the maximum CPU number this system * can have, taking hotplug into account (nr_cpu_ids). */ mm_size = sizeof(struct mm_struct) + cpumask_size() + mm_cid_size(); mm_cachep = kmem_cache_create_usercopy("mm_struct", mm_size, ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, offsetof(struct mm_struct, saved_auxv), sizeof_field(struct mm_struct, saved_auxv), NULL); } void __init proc_caches_init(void) { struct kmem_cache_args args = { .use_freeptr_offset = true, .freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr), }; sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = kmem_cache_create("vm_area_struct", sizeof(struct vm_area_struct), &args, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } /* * Check constraints on flags passed to the unshare system call. */ static int check_unshare_flags(unsigned long unshare_flags) { if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP| CLONE_NEWTIME)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing * to unshare. Note that unsharing the address space or the * signal handlers also need to unshare the signal queues (aka * CLONE_THREAD). */ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { if (!thread_group_empty(current)) return -EINVAL; } if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) { if (refcount_read(&current->sighand->count) > 1) return -EINVAL; } if (unshare_flags & CLONE_VM) { if (!current_is_single_threaded()) return -EINVAL; } return 0; } /* * Unshare the filesystem structure if it is being shared */ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) { struct fs_struct *fs = current->fs; if (!(unshare_flags & CLONE_FS) || !fs) return 0; /* don't need lock here; in the worst case we'll do useless copy */ if (fs->users == 1) return 0; *new_fsp = copy_fs_struct(fs); if (!*new_fsp) return -ENOMEM; return 0; } /* * Unshare file descriptor table if it is being shared */ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) { struct files_struct *fd = current->files; if ((unshare_flags & CLONE_FILES) && (fd && atomic_read(&fd->count) > 1)) { fd = dup_fd(fd, NULL); if (IS_ERR(fd)) return PTR_ERR(fd); *new_fdp = fd; } return 0; } /* * unshare allows a process to 'unshare' part of the process * context which was originally shared using clone. copy_* * functions used by kernel_clone() cannot be used here directly * because they modify an inactive task_struct that is being * constructed. Here we are modifying the current, active, * task_struct. */ int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; int err; /* * If unsharing a user namespace must also unshare the thread group * and unshare the filesystem root and working directories. */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing vm, must also unshare signal handlers. */ if (unshare_flags & CLONE_VM) unshare_flags |= CLONE_SIGHAND; /* * If unsharing a signal handlers, must also unshare the signal queues. */ if (unshare_flags & CLONE_SIGHAND) unshare_flags |= CLONE_THREAD; /* * If unsharing namespace, must also unshare filesystem information. */ if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; err = check_unshare_flags(unshare_flags); if (err) goto bad_unshare_out; /* * CLONE_NEWIPC must also detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old * namespace are unreachable. */ if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM)) do_sysvsem = 1; err = unshare_fs(unshare_flags, &new_fs); if (err) goto bad_unshare_out; err = unshare_fd(unshare_flags, &new_fd); if (err) goto bad_unshare_cleanup_fs; err = unshare_userns(unshare_flags, &new_cred); if (err) goto bad_unshare_cleanup_fd; err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy, new_cred, new_fs); if (err) goto bad_unshare_cleanup_cred; if (new_cred) { err = set_cred_ucounts(new_cred); if (err) goto bad_unshare_cleanup_cred; } if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (do_sysvsem) { /* * CLONE_SYSVSEM is equivalent to sys_exit(). */ exit_sem(current); } if (unshare_flags & CLONE_NEWIPC) { /* Orphan segments in old ns (see sem above). */ exit_shm(current); shm_init_task(current); } if (new_nsproxy) switch_task_namespaces(current, new_nsproxy); task_lock(current); if (new_fs) { fs = current->fs; spin_lock(&fs->lock); current->fs = new_fs; if (--fs->users) new_fs = NULL; else new_fs = fs; spin_unlock(&fs->lock); } if (new_fd) swap(current->files, new_fd); task_unlock(current); if (new_cred) { /* Install the new user namespace */ commit_creds(new_cred); new_cred = NULL; } } perf_event_namespaces(current); bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); bad_unshare_cleanup_fd: if (new_fd) put_files_struct(new_fd); bad_unshare_cleanup_fs: if (new_fs) free_fs_struct(new_fs); bad_unshare_out: return err; } SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) { return ksys_unshare(unshare_flags); } /* * Helper to unshare the files of the current task. * We don't want to expose copy_files internals to * the exec layer of the kernel. */ int unshare_files(void) { struct task_struct *task = current; struct files_struct *old, *copy = NULL; int error; error = unshare_fd(CLONE_FILES, &copy); if (error || !copy) return error; old = task->files; task_lock(task); task->files = copy; task_unlock(task); put_files_struct(old); return 0; } int sysctl_max_threads(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int ret; int threads = max_threads; int min = 1; int max = MAX_THREADS; t = *table; t.data = &threads; t.extra1 = &min; t.extra2 = &max; ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); if (ret || !write) return ret; max_threads = threads; return 0; }
21 21 97 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Asynchronous Compression operations * * Copyright (c) 2016, Intel Corporation * Authors: Weigang Li <weigang.li@intel.com> * Giovanni Cabiddu <giovanni.cabiddu@intel.com> */ #ifndef _CRYPTO_ACOMP_H #define _CRYPTO_ACOMP_H #include <linux/atomic.h> #include <linux/args.h> #include <linux/compiler_types.h> #include <linux/container_of.h> #include <linux/crypto.h> #include <linux/err.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/spinlock_types.h> #include <linux/types.h> /* Set this bit if source is virtual address instead of SG list. */ #define CRYPTO_ACOMP_REQ_SRC_VIRT 0x00000002 /* Set this bit for if virtual address source cannot be used for DMA. */ #define CRYPTO_ACOMP_REQ_SRC_NONDMA 0x00000004 /* Set this bit if destination is virtual address instead of SG list. */ #define CRYPTO_ACOMP_REQ_DST_VIRT 0x00000008 /* Set this bit for if virtual address destination cannot be used for DMA. */ #define CRYPTO_ACOMP_REQ_DST_NONDMA 0x00000010 /* Private flags that should not be touched by the user. */ #define CRYPTO_ACOMP_REQ_PRIVATE \ (CRYPTO_ACOMP_REQ_SRC_VIRT | CRYPTO_ACOMP_REQ_SRC_NONDMA | \ CRYPTO_ACOMP_REQ_DST_VIRT | CRYPTO_ACOMP_REQ_DST_NONDMA) #define CRYPTO_ACOMP_DST_MAX 131072 #define MAX_SYNC_COMP_REQSIZE 0 #define ACOMP_REQUEST_ON_STACK(name, tfm) \ char __##name##_req[sizeof(struct acomp_req) + \ MAX_SYNC_COMP_REQSIZE] CRYPTO_MINALIGN_ATTR; \ struct acomp_req *name = acomp_request_on_stack_init( \ __##name##_req, (tfm)) #define ACOMP_REQUEST_CLONE(name, gfp) \ acomp_request_clone(name, sizeof(__##name##_req), gfp) struct acomp_req; struct folio; struct acomp_req_chain { crypto_completion_t compl; void *data; struct scatterlist ssg; struct scatterlist dsg; union { const u8 *src; struct folio *sfolio; }; union { u8 *dst; struct folio *dfolio; }; u32 flags; }; /** * struct acomp_req - asynchronous (de)compression request * * @base: Common attributes for asynchronous crypto requests * @src: Source scatterlist * @dst: Destination scatterlist * @svirt: Source virtual address * @dvirt: Destination virtual address * @slen: Size of the input buffer * @dlen: Size of the output buffer and number of bytes produced * @chain: Private API code data, do not use * @__ctx: Start of private context data */ struct acomp_req { struct crypto_async_request base; union { struct scatterlist *src; const u8 *svirt; }; union { struct scatterlist *dst; u8 *dvirt; }; unsigned int slen; unsigned int dlen; struct acomp_req_chain chain; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct crypto_acomp - user-instantiated objects which encapsulate * algorithms and core processing logic * * @compress: Function performs a compress operation * @decompress: Function performs a de-compress operation * @reqsize: Context size for (de)compression requests * @fb: Synchronous fallback tfm * @base: Common crypto API algorithm data structure */ struct crypto_acomp { int (*compress)(struct acomp_req *req); int (*decompress)(struct acomp_req *req); unsigned int reqsize; struct crypto_tfm base; }; #define COMP_ALG_COMMON { \ struct crypto_alg base; \ } struct comp_alg_common COMP_ALG_COMMON; /** * DOC: Asynchronous Compression API * * The Asynchronous Compression API is used with the algorithms of type * CRYPTO_ALG_TYPE_ACOMPRESS (listed as type "acomp" in /proc/crypto) */ /** * crypto_alloc_acomp() -- allocate ACOMPRESS tfm handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * compression algorithm e.g. "deflate" * @type: specifies the type of the algorithm * @mask: specifies the mask for the algorithm * * Allocate a handle for a compression algorithm. The returned struct * crypto_acomp is the handle that is required for any subsequent * API invocation for the compression operations. * * Return: allocated handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_acomp *crypto_alloc_acomp(const char *alg_name, u32 type, u32 mask); /** * crypto_alloc_acomp_node() -- allocate ACOMPRESS tfm handle with desired NUMA node * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * compression algorithm e.g. "deflate" * @type: specifies the type of the algorithm * @mask: specifies the mask for the algorithm * @node: specifies the NUMA node the ZIP hardware belongs to * * Allocate a handle for a compression algorithm. Drivers should try to use * (de)compressors on the specified NUMA node. * The returned struct crypto_acomp is the handle that is required for any * subsequent API invocation for the compression operations. * * Return: allocated handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_acomp *crypto_alloc_acomp_node(const char *alg_name, u32 type, u32 mask, int node); static inline struct crypto_tfm *crypto_acomp_tfm(struct crypto_acomp *tfm) { return &tfm->base; } static inline struct comp_alg_common *__crypto_comp_alg_common( struct crypto_alg *alg) { return container_of(alg, struct comp_alg_common, base); } static inline struct crypto_acomp *__crypto_acomp_tfm(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_acomp, base); } static inline struct comp_alg_common *crypto_comp_alg_common( struct crypto_acomp *tfm) { return __crypto_comp_alg_common(crypto_acomp_tfm(tfm)->__crt_alg); } static inline unsigned int crypto_acomp_reqsize(struct crypto_acomp *tfm) { return tfm->reqsize; } static inline void acomp_request_set_tfm(struct acomp_req *req, struct crypto_acomp *tfm) { crypto_request_set_tfm(&req->base, crypto_acomp_tfm(tfm)); } static inline bool acomp_is_async(struct crypto_acomp *tfm) { return crypto_comp_alg_common(tfm)->base.cra_flags & CRYPTO_ALG_ASYNC; } static inline struct crypto_acomp *crypto_acomp_reqtfm(struct acomp_req *req) { return __crypto_acomp_tfm(req->base.tfm); } /** * crypto_free_acomp() -- free ACOMPRESS tfm handle * * @tfm: ACOMPRESS tfm handle allocated with crypto_alloc_acomp() * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_acomp(struct crypto_acomp *tfm) { crypto_destroy_tfm(tfm, crypto_acomp_tfm(tfm)); } static inline int crypto_has_acomp(const char *alg_name, u32 type, u32 mask) { type &= ~CRYPTO_ALG_TYPE_MASK; type |= CRYPTO_ALG_TYPE_ACOMPRESS; mask |= CRYPTO_ALG_TYPE_ACOMPRESS_MASK; return crypto_has_alg(alg_name, type, mask); } static inline const char *crypto_acomp_alg_name(struct crypto_acomp *tfm) { return crypto_tfm_alg_name(crypto_acomp_tfm(tfm)); } static inline const char *crypto_acomp_driver_name(struct crypto_acomp *tfm) { return crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm)); } /** * acomp_request_alloc() -- allocates asynchronous (de)compression request * * @tfm: ACOMPRESS tfm handle allocated with crypto_alloc_acomp() * @gfp: gfp to pass to kzalloc (defaults to GFP_KERNEL) * * Return: allocated handle in case of success or NULL in case of an error */ static inline struct acomp_req *acomp_request_alloc_extra_noprof( struct crypto_acomp *tfm, size_t extra, gfp_t gfp) { struct acomp_req *req; size_t len; len = ALIGN(sizeof(*req) + crypto_acomp_reqsize(tfm), CRYPTO_MINALIGN); if (check_add_overflow(len, extra, &len)) return NULL; req = kzalloc_noprof(len, gfp); if (likely(req)) acomp_request_set_tfm(req, tfm); return req; } #define acomp_request_alloc_noprof(tfm, ...) \ CONCATENATE(acomp_request_alloc_noprof_, COUNT_ARGS(__VA_ARGS__))( \ tfm, ##__VA_ARGS__) #define acomp_request_alloc_noprof_0(tfm) \ acomp_request_alloc_noprof_1(tfm, GFP_KERNEL) #define acomp_request_alloc_noprof_1(tfm, gfp) \ acomp_request_alloc_extra_noprof(tfm, 0, gfp) #define acomp_request_alloc(...) alloc_hooks(acomp_request_alloc_noprof(__VA_ARGS__)) /** * acomp_request_alloc_extra() -- allocate acomp request with extra memory * * @tfm: ACOMPRESS tfm handle allocated with crypto_alloc_acomp() * @extra: amount of extra memory * @gfp: gfp to pass to kzalloc * * Return: allocated handle in case of success or NULL in case of an error */ #define acomp_request_alloc_extra(...) alloc_hooks(acomp_request_alloc_extra_noprof(__VA_ARGS__)) static inline void *acomp_request_extra(struct acomp_req *req) { struct crypto_acomp *tfm = crypto_acomp_reqtfm(req); size_t len; len = ALIGN(sizeof(*req) + crypto_acomp_reqsize(tfm), CRYPTO_MINALIGN); return (void *)((char *)req + len); } static inline bool acomp_req_on_stack(struct acomp_req *req) { return crypto_req_on_stack(&req->base); } /** * acomp_request_free() -- zeroize and free asynchronous (de)compression * request as well as the output buffer if allocated * inside the algorithm * * @req: request to free */ static inline void acomp_request_free(struct acomp_req *req) { if (!req || acomp_req_on_stack(req)) return; kfree_sensitive(req); } /** * acomp_request_set_callback() -- Sets an asynchronous callback * * Callback will be called when an asynchronous operation on a given * request is finished. * * @req: request that the callback will be set for * @flgs: specify for instance if the operation may backlog * @cmlp: callback which will be called * @data: private data used by the caller */ static inline void acomp_request_set_callback(struct acomp_req *req, u32 flgs, crypto_completion_t cmpl, void *data) { flgs &= ~CRYPTO_ACOMP_REQ_PRIVATE; flgs |= req->base.flags & CRYPTO_ACOMP_REQ_PRIVATE; crypto_request_set_callback(&req->base, flgs, cmpl, data); } /** * acomp_request_set_params() -- Sets request parameters * * Sets parameters required by an acomp operation * * @req: asynchronous compress request * @src: pointer to input buffer scatterlist * @dst: pointer to output buffer scatterlist. If this is NULL, the * acomp layer will allocate the output memory * @slen: size of the input buffer * @dlen: size of the output buffer. If dst is NULL, this can be used by * the user to specify the maximum amount of memory to allocate */ static inline void acomp_request_set_params(struct acomp_req *req, struct scatterlist *src, struct scatterlist *dst, unsigned int slen, unsigned int dlen) { req->src = src; req->dst = dst; req->slen = slen; req->dlen = dlen; req->base.flags &= ~(CRYPTO_ACOMP_REQ_SRC_VIRT | CRYPTO_ACOMP_REQ_SRC_NONDMA | CRYPTO_ACOMP_REQ_DST_VIRT | CRYPTO_ACOMP_REQ_DST_NONDMA); } /** * acomp_request_set_src_sg() -- Sets source scatterlist * * Sets source scatterlist required by an acomp operation. * * @req: asynchronous compress request * @src: pointer to input buffer scatterlist * @slen: size of the input buffer */ static inline void acomp_request_set_src_sg(struct acomp_req *req, struct scatterlist *src, unsigned int slen) { req->src = src; req->slen = slen; req->base.flags &= ~CRYPTO_ACOMP_REQ_SRC_NONDMA; req->base.flags &= ~CRYPTO_ACOMP_REQ_SRC_VIRT; } /** * acomp_request_set_src_dma() -- Sets DMA source virtual address * * Sets source virtual address required by an acomp operation. * The address must be usable for DMA. * * @req: asynchronous compress request * @src: virtual address pointer to input buffer * @slen: size of the input buffer */ static inline void acomp_request_set_src_dma(struct acomp_req *req, const u8 *src, unsigned int slen) { req->svirt = src; req->slen = slen; req->base.flags &= ~CRYPTO_ACOMP_REQ_SRC_NONDMA; req->base.flags |= CRYPTO_ACOMP_REQ_SRC_VIRT; } /** * acomp_request_set_src_nondma() -- Sets non-DMA source virtual address * * Sets source virtual address required by an acomp operation. * The address can not be used for DMA. * * @req: asynchronous compress request * @src: virtual address pointer to input buffer * @slen: size of the input buffer */ static inline void acomp_request_set_src_nondma(struct acomp_req *req, const u8 *src, unsigned int slen) { req->svirt = src; req->slen = slen; req->base.flags |= CRYPTO_ACOMP_REQ_SRC_NONDMA; req->base.flags |= CRYPTO_ACOMP_REQ_SRC_VIRT; } /** * acomp_request_set_src_folio() -- Sets source folio * * Sets source folio required by an acomp operation. * * @req: asynchronous compress request * @folio: pointer to input folio * @off: input folio offset * @len: size of the input buffer */ static inline void acomp_request_set_src_folio(struct acomp_req *req, struct folio *folio, size_t off, unsigned int len) { sg_init_table(&req->chain.ssg, 1); sg_set_folio(&req->chain.ssg, folio, len, off); acomp_request_set_src_sg(req, &req->chain.ssg, len); } /** * acomp_request_set_dst_sg() -- Sets destination scatterlist * * Sets destination scatterlist required by an acomp operation. * * @req: asynchronous compress request * @dst: pointer to output buffer scatterlist * @dlen: size of the output buffer */ static inline void acomp_request_set_dst_sg(struct acomp_req *req, struct scatterlist *dst, unsigned int dlen) { req->dst = dst; req->dlen = dlen; req->base.flags &= ~CRYPTO_ACOMP_REQ_DST_NONDMA; req->base.flags &= ~CRYPTO_ACOMP_REQ_DST_VIRT; } /** * acomp_request_set_dst_dma() -- Sets DMA destination virtual address * * Sets destination virtual address required by an acomp operation. * The address must be usable for DMA. * * @req: asynchronous compress request * @dst: virtual address pointer to output buffer * @dlen: size of the output buffer */ static inline void acomp_request_set_dst_dma(struct acomp_req *req, u8 *dst, unsigned int dlen) { req->dvirt = dst; req->dlen = dlen; req->base.flags &= ~CRYPTO_ACOMP_REQ_DST_NONDMA; req->base.flags |= CRYPTO_ACOMP_REQ_DST_VIRT; } /** * acomp_request_set_dst_nondma() -- Sets non-DMA destination virtual address * * Sets destination virtual address required by an acomp operation. * The address can not be used for DMA. * * @req: asynchronous compress request * @dst: virtual address pointer to output buffer * @dlen: size of the output buffer */ static inline void acomp_request_set_dst_nondma(struct acomp_req *req, u8 *dst, unsigned int dlen) { req->dvirt = dst; req->dlen = dlen; req->base.flags |= CRYPTO_ACOMP_REQ_DST_NONDMA; req->base.flags |= CRYPTO_ACOMP_REQ_DST_VIRT; } /** * acomp_request_set_dst_folio() -- Sets destination folio * * Sets destination folio required by an acomp operation. * * @req: asynchronous compress request * @folio: pointer to input folio * @off: input folio offset * @len: size of the input buffer */ static inline void acomp_request_set_dst_folio(struct acomp_req *req, struct folio *folio, size_t off, unsigned int len) { sg_init_table(&req->chain.dsg, 1); sg_set_folio(&req->chain.dsg, folio, len, off); acomp_request_set_dst_sg(req, &req->chain.dsg, len); } /** * crypto_acomp_compress() -- Invoke asynchronous compress operation * * Function invokes the asynchronous compress operation * * @req: asynchronous compress request * * Return: zero on success; error code in case of error */ int crypto_acomp_compress(struct acomp_req *req); /** * crypto_acomp_decompress() -- Invoke asynchronous decompress operation * * Function invokes the asynchronous decompress operation * * @req: asynchronous compress request * * Return: zero on success; error code in case of error */ int crypto_acomp_decompress(struct acomp_req *req); static inline struct acomp_req *acomp_request_on_stack_init( char *buf, struct crypto_acomp *tfm) { struct acomp_req *req = (void *)buf; crypto_stack_request_init(&req->base, crypto_acomp_tfm(tfm)); return req; } struct acomp_req *acomp_request_clone(struct acomp_req *req, size_t total, gfp_t gfp); #endif
491 493 493 493 103 102 5 4 102 1 1 491 493 493 493 381 381 106 8 103 87 97 2 2 11 11 11 11 11 11 11 3 2 3 98 3 9 19 9 9 9 9 9 9 9 9 9 1 9 8 116 110 110 110 110 11 98 121 121 5 119 116 116 97 97 96 96 3 1 1 7 2 1 1 7 1 97 2 95 97 97 96 97 74 2 97 97 99 99 3 99 99 11 2 96 10 96 1 97 97 9 33 273 17 17 1 2 4 186 1 7 6 6 6 6 6 6 11 11 8 2 6 8 5 1 4 8 17 13 11 11 11 11 11 15 15 3 9 8 9 5 7 7 7 9 9 9 490 493 379 2 2 2 484 2 2 13 186 1 1 3 3 13 499 497 6 492 486 1 7 7 6 6 7 10 10 10 4 4 4 10 10 9 10 18 18 15 7 7 10 10 9 2 2 2 2 2 1 12 11 10 9 8 8 8 1 2 2 2 2 2 2 8 8 7 2 6 6 5 4 4 4 6 3 3 3 2 3 33 1 1 1 14 13 1 11 11 11 10 9 9 8 7 7 4 4 7 7 7 2 9 14 34 12 8 33 1 15 14 13 13 13 13 7 7 7 13 13 18 18 18 99 99 18 2 123 123 122 136 11 100 19 1 18 99 11 99 99 99 99 97 97 11 99 1 18 1 17 100 2 1 1 3 134 8 7 136 3 2 2 3 97 40 40 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/seccomp.c * * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com> * * Copyright (C) 2012 Google, Inc. * Will Drewry <wad@chromium.org> * * This defines a simple but solid secure-computing facility. * * Mode 1 uses a fixed list of allowed system calls. * Mode 2 allows user-defined system call filters in the form * of Berkeley Packet Filters/Linux Socket Filters. */ #define pr_fmt(fmt) "seccomp: " fmt #include <linux/refcount.h> #include <linux/audit.h> #include <linux/compat.h> #include <linux/coredump.h> #include <linux/kmemleak.h> #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> #include <linux/slab.h> #include <linux/syscalls.h> #include <linux/sysctl.h> #include <asm/syscall.h> /* Not exposed in headers: strictly internal use only. */ #define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1) #ifdef CONFIG_SECCOMP_FILTER #include <linux/file.h> #include <linux/filter.h> #include <linux/pid.h> #include <linux/ptrace.h> #include <linux/capability.h> #include <linux/uaccess.h> #include <linux/anon_inodes.h> #include <linux/lockdep.h> /* * When SECCOMP_IOCTL_NOTIF_ID_VALID was first introduced, it had the * wrong direction flag in the ioctl number. This is the broken one, * which the kernel needs to keep supporting until all userspaces stop * using the wrong command number. */ #define SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR SECCOMP_IOR(2, __u64) enum notify_state { SECCOMP_NOTIFY_INIT, SECCOMP_NOTIFY_SENT, SECCOMP_NOTIFY_REPLIED, }; struct seccomp_knotif { /* The struct pid of the task whose filter triggered the notification */ struct task_struct *task; /* The "cookie" for this request; this is unique for this filter. */ u64 id; /* * The seccomp data. This pointer is valid the entire time this * notification is active, since it comes from __seccomp_filter which * eclipses the entire lifecycle here. */ const struct seccomp_data *data; /* * Notification states. When SECCOMP_RET_USER_NOTIF is returned, a * struct seccomp_knotif is created and starts out in INIT. Once the * handler reads the notification off of an FD, it transitions to SENT. * If a signal is received the state transitions back to INIT and * another message is sent. When the userspace handler replies, state * transitions to REPLIED. */ enum notify_state state; /* The return values, only valid when in SECCOMP_NOTIFY_REPLIED */ int error; long val; u32 flags; /* * Signals when this has changed states, such as the listener * dying, a new seccomp addfd message, or changing to REPLIED */ struct completion ready; struct list_head list; /* outstanding addfd requests */ struct list_head addfd; }; /** * struct seccomp_kaddfd - container for seccomp_addfd ioctl messages * * @file: A reference to the file to install in the other task * @fd: The fd number to install it at. If the fd number is -1, it means the * installing process should allocate the fd as normal. * @flags: The flags for the new file descriptor. At the moment, only O_CLOEXEC * is allowed. * @ioctl_flags: The flags used for the seccomp_addfd ioctl. * @setfd: whether or not SECCOMP_ADDFD_FLAG_SETFD was set during notify_addfd * @ret: The return value of the installing process. It is set to the fd num * upon success (>= 0). * @completion: Indicates that the installing process has completed fd * installation, or gone away (either due to successful * reply, or signal) * @list: list_head for chaining seccomp_kaddfd together. * */ struct seccomp_kaddfd { struct file *file; int fd; unsigned int flags; __u32 ioctl_flags; union { bool setfd; /* To only be set on reply */ int ret; }; struct completion completion; struct list_head list; }; /** * struct notification - container for seccomp userspace notifications. Since * most seccomp filters will not have notification listeners attached and this * structure is fairly large, we store the notification-specific stuff in a * separate structure. * * @requests: A semaphore that users of this notification can wait on for * changes. Actual reads and writes are still controlled with * filter->notify_lock. * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags. * @next_id: The id of the next request. * @notifications: A list of struct seccomp_knotif elements. */ struct notification { atomic_t requests; u32 flags; u64 next_id; struct list_head notifications; }; #ifdef SECCOMP_ARCH_NATIVE /** * struct action_cache - per-filter cache of seccomp actions per * arch/syscall pair * * @allow_native: A bitmap where each bit represents whether the * filter will always allow the syscall, for the * native architecture. * @allow_compat: A bitmap where each bit represents whether the * filter will always allow the syscall, for the * compat architecture. */ struct action_cache { DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR); #ifdef SECCOMP_ARCH_COMPAT DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR); #endif }; #else struct action_cache { }; static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, const struct seccomp_data *sd) { return false; } static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) { } #endif /* SECCOMP_ARCH_NATIVE */ /** * struct seccomp_filter - container for seccomp BPF programs * * @refs: Reference count to manage the object lifetime. * A filter's reference count is incremented for each directly * attached task, once for the dependent filter, and if * requested for the user notifier. When @refs reaches zero, * the filter can be freed. * @users: A filter's @users count is incremented for each directly * attached task (filter installation, fork(), thread_sync), * and once for the dependent filter (tracked in filter->prev). * When it reaches zero it indicates that no direct or indirect * users of that filter exist. No new tasks can get associated with * this filter after reaching 0. The @users count is always smaller * or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @wait_killable_recv: Put notifying process in killable state once the * notification is received by the userspace listener. * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate * @notif: the struct that holds all notification related information * @notify_lock: A lock for all notification-related accesses. * @wqh: A wait queue for poll if a notifier is in use. * * seccomp_filter objects are organized in a tree linked via the @prev * pointer. For any task, it appears to be a singly-linked list starting * with current->seccomp.filter, the most recently attached or inherited filter. * However, multiple filters may share a @prev node, by way of fork(), which * results in a unidirectional tree existing in memory. This is similar to * how namespaces work. * * seccomp_filter objects should never be modified after being attached * to a task_struct (other than @refs). */ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; bool wait_killable_recv; struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; struct mutex notify_lock; wait_queue_head_t wqh; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) /* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ static void populate_seccomp_data(struct seccomp_data *sd) { /* * Instead of using current_pt_reg(), we're already doing the work * to safely fetch "current", so just use "task" everywhere below. */ struct task_struct *task = current; struct pt_regs *regs = task_pt_regs(task); unsigned long args[6]; sd->nr = syscall_get_nr(task, regs); sd->arch = syscall_get_arch(task); syscall_get_arguments(task, regs, args); sd->args[0] = args[0]; sd->args[1] = args[1]; sd->args[2] = args[2]; sd->args[3] = args[3]; sd->args[4] = args[4]; sd->args[5] = args[5]; sd->instruction_pointer = KSTK_EIP(task); } /** * seccomp_check_filter - verify seccomp filter code * @filter: filter to verify * @flen: length of filter * * Takes a previously checked filter (by bpf_check_classic) and * redirects all filter code that loads struct sk_buff data * and related data through seccomp_bpf_load. It also * enforces length and alignment checking of those loads. * * Returns 0 if the rule set is legal or -EINVAL if not. */ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) { int pc; for (pc = 0; pc < flen; pc++) { struct sock_filter *ftest = &filter[pc]; u16 code = ftest->code; u32 k = ftest->k; switch (code) { case BPF_LD | BPF_W | BPF_ABS: ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_LD | BPF_W | BPF_LEN: ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_LDX | BPF_W | BPF_LEN: ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ case BPF_RET | BPF_K: case BPF_RET | BPF_A: case BPF_ALU | BPF_ADD | BPF_K: case BPF_ALU | BPF_ADD | BPF_X: case BPF_ALU | BPF_SUB | BPF_K: case BPF_ALU | BPF_SUB | BPF_X: case BPF_ALU | BPF_MUL | BPF_K: case BPF_ALU | BPF_MUL | BPF_X: case BPF_ALU | BPF_DIV | BPF_K: case BPF_ALU | BPF_DIV | BPF_X: case BPF_ALU | BPF_AND | BPF_K: case BPF_ALU | BPF_AND | BPF_X: case BPF_ALU | BPF_OR | BPF_K: case BPF_ALU | BPF_OR | BPF_X: case BPF_ALU | BPF_XOR | BPF_K: case BPF_ALU | BPF_XOR | BPF_X: case BPF_ALU | BPF_LSH | BPF_K: case BPF_ALU | BPF_LSH | BPF_X: case BPF_ALU | BPF_RSH | BPF_K: case BPF_ALU | BPF_RSH | BPF_X: case BPF_ALU | BPF_NEG: case BPF_LD | BPF_IMM: case BPF_LDX | BPF_IMM: case BPF_MISC | BPF_TAX: case BPF_MISC | BPF_TXA: case BPF_LD | BPF_MEM: case BPF_LDX | BPF_MEM: case BPF_ST: case BPF_STX: case BPF_JMP | BPF_JA: case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JEQ | BPF_X: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGE | BPF_X: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JGT | BPF_X: case BPF_JMP | BPF_JSET | BPF_K: case BPF_JMP | BPF_JSET | BPF_X: continue; default: return -EINVAL; } } return 0; } #ifdef SECCOMP_ARCH_NATIVE static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, size_t bitmap_size, int syscall_nr) { if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size)) return false; syscall_nr = array_index_nospec(syscall_nr, bitmap_size); return test_bit(syscall_nr, bitmap); } /** * seccomp_cache_check_allow - lookup seccomp cache * @sfilter: The seccomp filter * @sd: The seccomp data to lookup the cache with * * Returns true if the seccomp_data is cached and allowed. */ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, const struct seccomp_data *sd) { int syscall_nr = sd->nr; const struct action_cache *cache = &sfilter->cache; #ifndef SECCOMP_ARCH_COMPAT /* A native-only architecture doesn't need to check sd->arch. */ return seccomp_cache_check_allow_bitmap(cache->allow_native, SECCOMP_ARCH_NATIVE_NR, syscall_nr); #else if (likely(sd->arch == SECCOMP_ARCH_NATIVE)) return seccomp_cache_check_allow_bitmap(cache->allow_native, SECCOMP_ARCH_NATIVE_NR, syscall_nr); if (likely(sd->arch == SECCOMP_ARCH_COMPAT)) return seccomp_cache_check_allow_bitmap(cache->allow_compat, SECCOMP_ARCH_COMPAT_NR, syscall_nr); #endif /* SECCOMP_ARCH_COMPAT */ WARN_ON_ONCE(true); return false; } #endif /* SECCOMP_ARCH_NATIVE */ #define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL))) /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters * @match: stores struct seccomp_filter that resulted in the return value, * unless filter returned SECCOMP_RET_ALLOW, in which case it will * be unchanged. * * Returns valid seccomp BPF response codes. */ static u32 seccomp_run_filters(const struct seccomp_data *sd, struct seccomp_filter **match) { u32 ret = SECCOMP_RET_ALLOW; /* Make sure cross-thread synced filter points somewhere sane. */ struct seccomp_filter *f = READ_ONCE(current->seccomp.filter); /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; if (seccomp_cache_check_allow(f, sd)) return SECCOMP_RET_ALLOW; /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (; f; f = f->prev) { u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd); if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) { ret = cur_ret; *match = f; } } return ret; } #endif /* CONFIG_SECCOMP_FILTER */ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) { assert_spin_locked(&current->sighand->siglock); if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) return false; return true; } void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { } static inline void seccomp_assign_mode(struct task_struct *task, unsigned long seccomp_mode, unsigned long flags) { assert_spin_locked(&task->sighand->siglock); task->seccomp.mode = seccomp_mode; /* * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER /* Returns 1 if the parent is an ancestor of the child. */ static int is_ancestor(struct seccomp_filter *parent, struct seccomp_filter *child) { /* NULL is the root ancestor. */ if (parent == NULL) return 1; for (; child; child = child->prev) if (child == parent) return 1; return 0; } /** * seccomp_can_sync_threads: checks if all threads can be synchronized * * Expects sighand and cred_guard_mutex locks to be held. * * Returns 0 on success, -ve on error, or the pid of a thread which was * either not in the correct seccomp mode or did not have an ancestral * seccomp filter. */ static inline pid_t seccomp_can_sync_threads(void) { struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); assert_spin_locked(&current->sighand->siglock); /* Validate all threads being eligible for synchronization. */ caller = current; for_each_thread(caller, thread) { pid_t failed; /* Skip current, since it is initiating the sync. */ if (thread == caller) continue; /* Skip exited threads. */ if (thread->flags & PF_EXITING) continue; if (thread->seccomp.mode == SECCOMP_MODE_DISABLED || (thread->seccomp.mode == SECCOMP_MODE_FILTER && is_ancestor(thread->seccomp.filter, caller->seccomp.filter))) continue; /* Return the first thread that cannot be synchronized. */ failed = task_pid_vnr(thread); /* If the pid cannot be resolved, then return -ESRCH */ if (WARN_ON(failed == 0)) failed = -ESRCH; return failed; } return 0; } static inline void seccomp_filter_free(struct seccomp_filter *filter) { if (filter) { bpf_prog_destroy(filter->prog); kfree(filter); } } static void __seccomp_filter_orphan(struct seccomp_filter *orig) { while (orig && refcount_dec_and_test(&orig->users)) { if (waitqueue_active(&orig->wqh)) wake_up_poll(&orig->wqh, EPOLLHUP); orig = orig->prev; } } static void __put_seccomp_filter(struct seccomp_filter *orig) { /* Clean up single-reference branches iteratively. */ while (orig && refcount_dec_and_test(&orig->refs)) { struct seccomp_filter *freeme = orig; orig = orig->prev; seccomp_filter_free(freeme); } } static void __seccomp_filter_release(struct seccomp_filter *orig) { /* Notify about any unused filters in the task's former filter tree. */ __seccomp_filter_orphan(orig); /* Finally drop all references to the task's former tree. */ __put_seccomp_filter(orig); } /** * seccomp_filter_release - Detach the task from its filter tree, * drop its reference count, and notify * about unused filters * * @tsk: task the filter should be released from. * * This function should only be called when the task is exiting as * it detaches it from its filter tree. PF_EXITING has to be set * for the task. */ void seccomp_filter_release(struct task_struct *tsk) { struct seccomp_filter *orig; if (WARN_ON((tsk->flags & PF_EXITING) == 0)) return; if (READ_ONCE(tsk->seccomp.filter) == NULL) return; spin_lock_irq(&tsk->sighand->siglock); orig = tsk->seccomp.filter; /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; spin_unlock_irq(&tsk->sighand->siglock); __seccomp_filter_release(orig); } /** * seccomp_sync_threads: sets all threads to use current's filter * * @flags: SECCOMP_FILTER_FLAG_* flags to set during sync. * * Expects sighand and cred_guard_mutex locks to be held, and for * seccomp_can_sync_threads() to have returned success already * without dropping the locks. * */ static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); assert_spin_locked(&current->sighand->siglock); /* * Don't touch any of the threads if the process is being killed. * This allows for a lockless check in seccomp_filter_release. */ if (current->signal->flags & SIGNAL_GROUP_EXIT) return; /* Synchronize all threads. */ caller = current; for_each_thread(caller, thread) { /* Skip current, since it needs no changes. */ if (thread == caller) continue; /* * Skip exited threads. seccomp_filter_release could have * been already called for this task. */ if (thread->flags & PF_EXITING) continue; /* Get a task reference for the new leaf node. */ get_seccomp_filter(caller); /* * Drop the task reference to the shared ancestor since * current's path will hold a reference. (This also * allows a put before the assignment.) */ __seccomp_filter_release(thread->seccomp.filter); /* Make our new filter tree visible. */ smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); atomic_set(&thread->seccomp.filter_count, atomic_read(&caller->seccomp.filter_count)); /* * Don't let an unprivileged task work around * the no_new_privs restriction by creating * a thread that sets it up, enters seccomp, * then dies. */ if (task_no_new_privs(caller)) task_set_no_new_privs(thread); /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER, flags); } } /** * seccomp_prepare_filter: Prepares a seccomp filter for use. * @fprog: BPF program to install * * Returns filter on success or an ERR_PTR on failure. */ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; const bool save_orig = #if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE) true; #else false; #endif if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter)); /* * Installing a seccomp filter requires that the task has * CAP_SYS_ADMIN in its namespace or be running with no_new_privs. * This avoids scenarios where unprivileged tasks can affect the * behavior of privileged children. */ if (!task_no_new_privs(current) && !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN); if (!sfilter) return ERR_PTR(-ENOMEM); mutex_init(&sfilter->notify_lock); ret = bpf_prog_create_from_user(&sfilter->prog, fprog, seccomp_check_filter, save_orig); if (ret < 0) { kfree(sfilter); return ERR_PTR(ret); } refcount_set(&sfilter->refs, 1); refcount_set(&sfilter->users, 1); init_waitqueue_head(&sfilter->wqh); return sfilter; } /** * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog * @user_filter: pointer to the user data containing a sock_fprog. * * Returns 0 on success and non-zero otherwise. */ static struct seccomp_filter * seccomp_prepare_user_filter(const char __user *user_filter) { struct sock_fprog fprog; struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; fprog.len = fprog32.len; fprog.filter = compat_ptr(fprog32.filter); } else /* falls through to the if below. */ #endif if (copy_from_user(&fprog, user_filter, sizeof(fprog))) goto out; filter = seccomp_prepare_filter(&fprog); out: return filter; } #ifdef SECCOMP_ARCH_NATIVE /** * seccomp_is_const_allow - check if filter is constant allow with given data * @fprog: The BPF programs * @sd: The seccomp data to check against, only syscall number and arch * number are considered constant. */ static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, struct seccomp_data *sd) { unsigned int reg_value = 0; unsigned int pc; bool op_res; if (WARN_ON_ONCE(!fprog)) return false; /* Our single exception to filtering. */ #ifdef __NR_uretprobe #ifdef SECCOMP_ARCH_COMPAT if (sd->arch == SECCOMP_ARCH_NATIVE) #endif if (sd->nr == __NR_uretprobe) return true; #endif for (pc = 0; pc < fprog->len; pc++) { struct sock_filter *insn = &fprog->filter[pc]; u16 code = insn->code; u32 k = insn->k; switch (code) { case BPF_LD | BPF_W | BPF_ABS: switch (k) { case offsetof(struct seccomp_data, nr): reg_value = sd->nr; break; case offsetof(struct seccomp_data, arch): reg_value = sd->arch; break; default: /* can't optimize (non-constant value load) */ return false; } break; case BPF_RET | BPF_K: /* reached return with constant values only, check allow */ return k == SECCOMP_RET_ALLOW; case BPF_JMP | BPF_JA: pc += insn->k; break; case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: switch (BPF_OP(code)) { case BPF_JEQ: op_res = reg_value == k; break; case BPF_JGE: op_res = reg_value >= k; break; case BPF_JGT: op_res = reg_value > k; break; case BPF_JSET: op_res = !!(reg_value & k); break; default: /* can't optimize (unknown jump) */ return false; } pc += op_res ? insn->jt : insn->jf; break; case BPF_ALU | BPF_AND | BPF_K: reg_value &= k; break; default: /* can't optimize (unknown insn) */ return false; } } /* ran off the end of the filter?! */ WARN_ON(1); return false; } static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, void *bitmap, const void *bitmap_prev, size_t bitmap_size, int arch) { struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; struct seccomp_data sd; int nr; if (bitmap_prev) { /* The new filter must be as restrictive as the last. */ bitmap_copy(bitmap, bitmap_prev, bitmap_size); } else { /* Before any filters, all syscalls are always allowed. */ bitmap_fill(bitmap, bitmap_size); } for (nr = 0; nr < bitmap_size; nr++) { /* No bitmap change: not a cacheable action. */ if (!test_bit(nr, bitmap)) continue; sd.nr = nr; sd.arch = arch; /* No bitmap change: continue to always allow. */ if (seccomp_is_const_allow(fprog, &sd)) continue; /* * Not a cacheable action: always run filters. * atomic clear_bit() not needed, filter not visible yet. */ __clear_bit(nr, bitmap); } } /** * seccomp_cache_prepare - emulate the filter to find cacheable syscalls * @sfilter: The seccomp filter * * Returns 0 if successful or -errno if error occurred. */ static void seccomp_cache_prepare(struct seccomp_filter *sfilter) { struct action_cache *cache = &sfilter->cache; const struct action_cache *cache_prev = sfilter->prev ? &sfilter->prev->cache : NULL; seccomp_cache_prepare_bitmap(sfilter, cache->allow_native, cache_prev ? cache_prev->allow_native : NULL, SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE); #ifdef SECCOMP_ARCH_COMPAT seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat, cache_prev ? cache_prev->allow_compat : NULL, SECCOMP_ARCH_COMPAT_NR, SECCOMP_ARCH_COMPAT); #endif /* SECCOMP_ARCH_COMPAT */ } #endif /* SECCOMP_ARCH_NATIVE */ /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior * @filter: seccomp filter to add to the current process * * Caller must be holding current->sighand->siglock lock. * * Returns 0 on success, -ve on error, or * - in TSYNC mode: the pid of a thread which was either not in the correct * seccomp mode or did not have an ancestral seccomp filter * - in NEW_LISTENER mode: the fd of the new listener */ static long seccomp_attach_filter(unsigned int flags, struct seccomp_filter *filter) { unsigned long total_insns; struct seccomp_filter *walker; assert_spin_locked(&current->sighand->siglock); /* Validate resulting filter length. */ total_insns = filter->prog->len; for (walker = current->seccomp.filter; walker; walker = walker->prev) total_insns += walker->prog->len + 4; /* 4 instr penalty */ if (total_insns > MAX_INSNS_PER_PATH) return -ENOMEM; /* If thread sync has been requested, check that it is possible. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) { int ret; ret = seccomp_can_sync_threads(); if (ret) { if (flags & SECCOMP_FILTER_FLAG_TSYNC_ESRCH) return -ESRCH; else return ret; } } /* Set log flag, if present. */ if (flags & SECCOMP_FILTER_FLAG_LOG) filter->log = true; /* Set wait killable flag, if present. */ if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) filter->wait_killable_recv = true; /* * If there is an existing filter, make it the prev and don't drop its * task reference. */ filter->prev = current->seccomp.filter; seccomp_cache_prepare(filter); current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) seccomp_sync_threads(flags); return 0; } static void __get_seccomp_filter(struct seccomp_filter *filter) { refcount_inc(&filter->refs); } /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; __get_seccomp_filter(orig); refcount_inc(&orig->users); } #endif /* CONFIG_SECCOMP_FILTER */ /* For use with seccomp_actions_logged */ #define SECCOMP_LOG_KILL_PROCESS (1 << 0) #define SECCOMP_LOG_KILL_THREAD (1 << 1) #define SECCOMP_LOG_TRAP (1 << 2) #define SECCOMP_LOG_ERRNO (1 << 3) #define SECCOMP_LOG_TRACE (1 << 4) #define SECCOMP_LOG_LOG (1 << 5) #define SECCOMP_LOG_ALLOW (1 << 6) #define SECCOMP_LOG_USER_NOTIF (1 << 7) static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS | SECCOMP_LOG_KILL_THREAD | SECCOMP_LOG_TRAP | SECCOMP_LOG_ERRNO | SECCOMP_LOG_USER_NOTIF | SECCOMP_LOG_TRACE | SECCOMP_LOG_LOG; static inline void seccomp_log(unsigned long syscall, long signr, u32 action, bool requested) { bool log = false; switch (action) { case SECCOMP_RET_ALLOW: break; case SECCOMP_RET_TRAP: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP; break; case SECCOMP_RET_ERRNO: log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO; break; case SECCOMP_RET_TRACE: log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE; break; case SECCOMP_RET_USER_NOTIF: log = requested && seccomp_actions_logged & SECCOMP_LOG_USER_NOTIF; break; case SECCOMP_RET_LOG: log = seccomp_actions_logged & SECCOMP_LOG_LOG; break; case SECCOMP_RET_KILL_THREAD: log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD; break; case SECCOMP_RET_KILL_PROCESS: default: log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS; } /* * Emit an audit message when the action is RET_KILL_*, RET_LOG, or the * FILTER_FLAG_LOG bit was set. The admin has the ability to silence * any action from being logged by removing the action name from the * seccomp_actions_logged sysctl. */ if (!log) return; audit_seccomp(syscall, signr, action); } /* * Secure computing mode 1 allows only read/write/exit/sigreturn. * To be fully secure this must be combined with rlimit * to limit the stack allocations too. */ static const int mode1_syscalls[] = { __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn, #ifdef __NR_uretprobe __NR_uretprobe, #endif -1, /* negative terminated */ }; static void __secure_computing_strict(int this_syscall) { const int *allowed_syscalls = mode1_syscalls; #ifdef CONFIG_COMPAT if (in_compat_syscall()) allowed_syscalls = get_compat_mode1_syscalls(); #endif do { if (*allowed_syscalls == this_syscall) return; } while (*++allowed_syscalls != -1); #ifdef SECCOMP_DEBUG dump_stack(); #endif current->seccomp.mode = SECCOMP_MODE_DEAD; seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true); do_exit(SIGKILL); } #ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER void secure_computing_strict(int this_syscall) { int mode = current->seccomp.mode; if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) && unlikely(current->ptrace & PT_SUSPEND_SECCOMP)) return; if (mode == SECCOMP_MODE_DISABLED) return; else if (mode == SECCOMP_MODE_STRICT) __secure_computing_strict(this_syscall); else BUG(); } int __secure_computing(void) { int this_syscall = syscall_get_nr(current, current_pt_regs()); secure_computing_strict(this_syscall); return 0; } #else #ifdef CONFIG_SECCOMP_FILTER static u64 seccomp_next_notify_id(struct seccomp_filter *filter) { /* * Note: overflow is ok here, the id just needs to be unique per * filter. */ lockdep_assert_held(&filter->notify_lock); return filter->notif->next_id++; } static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) { int fd; /* * Remove the notification, and reset the list pointers, indicating * that it has been handled. */ list_del_init(&addfd->list); if (!addfd->setfd) fd = receive_fd(addfd->file, NULL, addfd->flags); else fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); addfd->ret = fd; if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) { /* If we fail reset and return an error to the notifier */ if (fd < 0) { n->state = SECCOMP_NOTIFY_SENT; } else { /* Return the FD we just added */ n->flags = 0; n->error = 0; n->val = fd; } } /* * Mark the notification as completed. From this point, addfd mem * might be invalidated and we can't safely read it anymore. */ complete(&addfd->completion); } static bool should_sleep_killable(struct seccomp_filter *match, struct seccomp_knotif *n) { return match->wait_killable_recv && n->state == SECCOMP_NOTIFY_SENT; } static int seccomp_do_user_notification(int this_syscall, struct seccomp_filter *match, const struct seccomp_data *sd) { int err; u32 flags = 0; long ret = 0; struct seccomp_knotif n = {}; struct seccomp_kaddfd *addfd, *tmp; mutex_lock(&match->notify_lock); err = -ENOSYS; if (!match->notif) goto out; n.task = current; n.state = SECCOMP_NOTIFY_INIT; n.data = sd; n.id = seccomp_next_notify_id(match); init_completion(&n.ready); list_add_tail(&n.list, &match->notif->notifications); INIT_LIST_HEAD(&n.addfd); atomic_inc(&match->notif->requests); if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP) wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM); else wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); /* * This is where we wait for a reply from userspace. */ do { bool wait_killable = should_sleep_killable(match, &n); mutex_unlock(&match->notify_lock); if (wait_killable) err = wait_for_completion_killable(&n.ready); else err = wait_for_completion_interruptible(&n.ready); mutex_lock(&match->notify_lock); if (err != 0) { /* * Check to see if the notifcation got picked up and * whether we should switch to wait killable. */ if (!wait_killable && should_sleep_killable(match, &n)) continue; goto interrupted; } addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); /* Check if we were woken up by a addfd message */ if (addfd) seccomp_handle_addfd(addfd, &n); } while (n.state != SECCOMP_NOTIFY_REPLIED); ret = n.val; err = n.error; flags = n.flags; interrupted: /* If there were any pending addfd calls, clear them out */ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { /* The process went away before we got a chance to handle it */ addfd->ret = -ESRCH; list_del_init(&addfd->list); complete(&addfd->completion); } /* * Note that it's possible the listener died in between the time when * we were notified of a response (or a signal) and when we were able to * re-acquire the lock, so only delete from the list if the * notification actually exists. * * Also note that this test is only valid because there's no way to * *reattach* to a notifier right now. If one is added, we'll need to * keep track of the notif itself and make sure they match here. */ if (match->notif) list_del(&n.list); out: mutex_unlock(&match->notify_lock); /* Userspace requests to continue the syscall. */ if (flags & SECCOMP_USER_NOTIF_FLAG_CONTINUE) return 0; syscall_set_return_value(current, current_pt_regs(), err, ret); return -1; } static int __seccomp_filter(int this_syscall, const bool recheck_after_trace) { u32 filter_ret, action; struct seccomp_data sd; struct seccomp_filter *match = NULL; int data; /* * Make sure that any changes to mode from another thread have * been seen after SYSCALL_WORK_SECCOMP was seen. */ smp_rmb(); populate_seccomp_data(&sd); filter_ret = seccomp_run_filters(&sd, &match); data = filter_ret & SECCOMP_RET_DATA; action = filter_ret & SECCOMP_RET_ACTION_FULL; switch (action) { case SECCOMP_RET_ERRNO: /* Set low-order bits as an errno, capped at MAX_ERRNO. */ if (data > MAX_ERRNO) data = MAX_ERRNO; syscall_set_return_value(current, current_pt_regs(), -data, 0); goto skip; case SECCOMP_RET_TRAP: /* Show the handler the original registers. */ syscall_rollback(current, current_pt_regs()); /* Let the filter pass back 16 bits of data. */ force_sig_seccomp(this_syscall, data, false); goto skip; case SECCOMP_RET_TRACE: /* We've been put in this state by the ptracer already. */ if (recheck_after_trace) return 0; /* ENOSYS these calls if there is no tracer attached. */ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) { syscall_set_return_value(current, current_pt_regs(), -ENOSYS, 0); goto skip; } /* Allow the BPF to provide the event message */ ptrace_event(PTRACE_EVENT_SECCOMP, data); /* * The delivery of a fatal signal during event * notification may silently skip tracer notification, * which could leave us with a potentially unmodified * syscall that the tracer would have liked to have * changed. Since the process is about to die, we just * force the syscall to be skipped and let the signal * kill the process and correctly handle any tracer exit * notifications. */ if (fatal_signal_pending(current))