Total coverage: 438759 (25%)of 1785310
10207 9984 9970 6364 430 455 46 46 100 118 10286 10278 10288 10278 10319 10283 10281 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 // SPDX-License-Identifier: GPL-2.0 #include <linux/compiler.h> #include <linux/export.h> #include <linux/fault-inject-usercopy.h> #include <linux/kasan-checks.h> #include <linux/thread_info.h> #include <linux/uaccess.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <asm/byteorder.h> #include <asm/word-at-a-time.h> #ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS #define IS_UNALIGNED(src, dst) 0 #else #define IS_UNALIGNED(src, dst) \ (((long) dst | (long) src) & (sizeof(long) - 1)) #endif /* * Do a strncpy, return length of string without final '\0'. * 'count' is the user-supplied count (return 'count' if we * hit it), 'max' is the address space maximum (and we return * -EFAULT if we hit it). */ static __always_inline long do_strncpy_from_user(char *dst, const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; unsigned long res = 0; if (IS_UNALIGNED(src, dst)) goto byte_at_a_time; while (max >= sizeof(unsigned long)) { unsigned long c, data, mask; /* Fall back to byte-at-a-time if we get a page fault */ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); /* * Note that we mask out the bytes following the NUL. This is * important to do because string oblivious code may read past * the NUL. For those routines, we don't want to give them * potentially random bytes after the NUL in `src`. * * One example of such code is BPF map keys. BPF treats map keys * as an opaque set of bytes. Without the post-NUL mask, any BPF * maps keyed by strings returned from strncpy_from_user() may * have multiple entries for semantically identical strings. */ if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); mask = zero_bytemask(data); *(unsigned long *)(dst+res) = c & mask; return res + find_zero(data); } *(unsigned long *)(dst+res) = c; res += sizeof(unsigned long); max -= sizeof(unsigned long); } byte_at_a_time: while (max) { char c; unsafe_get_user(c,src+res, efault); dst[res] = c; if (!c) return res; res++; max--; } /* * Uhhuh. We hit 'max'. But was that the user-specified maximum * too? If so, that's ok - we got as much as the user asked for. */ if (res >= count) return res; /* * Nope: we hit the address space limit, and we still had more * characters the caller would have wanted. That's an EFAULT. */ efault: return -EFAULT; } /** * strncpy_from_user: - Copy a NUL terminated string from userspace. * @dst: Destination address, in kernel space. This buffer must be at * least @count bytes long. * @src: Source address, in user space. * @count: Maximum number of bytes to copy, including the trailing NUL. * * Copies a NUL-terminated string from userspace to kernel space. * * On success, returns the length of the string (not including the trailing * NUL). * * If access to userspace fails, returns -EFAULT (some data may have been * copied). * * If @count is smaller than the length of the string, copies @count bytes * and returns @count. */ long strncpy_from_user(char *dst, const char __user *src, long count) { unsigned long max_addr, src_addr; might_fault(); if (should_fail_usercopy()) return -EFAULT; if (unlikely(count <= 0)) return 0; kasan_check_write(dst, count); check_object_size(dst, count, false); if (can_do_masked_user_access()) { long retval; src = masked_user_access_begin(src); retval = do_strncpy_from_user(dst, src, count, count); user_read_access_end(); return retval; } max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { unsigned long max = max_addr - src_addr; long retval; /* * Truncate 'max' to the user-specified limit, so that * we only have one limit we need to check in the loop */ if (max > count) max = count; if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); return retval; } } return -EFAULT; } EXPORT_SYMBOL(strncpy_from_user);
22012 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_INTERNAL_H #define __X86_KERNEL_FPU_INTERNAL_H extern struct fpstate init_fpstate; /* CPU feature check wrappers */ static __always_inline __pure bool use_xsave(void) { return cpu_feature_enabled(X86_FEATURE_XSAVE); } static __always_inline __pure bool use_fxsr(void) { return cpu_feature_enabled(X86_FEATURE_FXSR); } #ifdef CONFIG_X86_DEBUG_FPU # define WARN_ON_FPU(x) WARN_ON_ONCE(x) #else # define WARN_ON_FPU(x) ({ BUILD_BUG_ON_INVALID(x); 0; }) #endif /* Used in init.c */ extern void fpstate_init_user(struct fpstate *fpstate); extern void fpstate_reset(struct fpu *fpu); #endif
2 2 2 2 1 1 1 2 13 1 1 2 2 2 2 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 1 4 3 3 2 1 1 1 1 3 3 3 3 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 2 2 2 2 2 2 2 2 2 1 1 1 1 39 40 20 19 18 19 20 40 19 18 40 3 1 1 1 1 1 1 1 3 3 1 1 1 1 2 2 2 1 1 1 1 1 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 1 1 1 3 3 38 38 1 1 32 15 15 12 12 1 1 1 3 3 3 3 3 2 2 2 3 2 2 2 2 2 2 3 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. */ /* * Quota change tags are associated with each transaction that allocates or * deallocates space. Those changes are accumulated locally to each node (in a * per-node file) and then are periodically synced to the quota file. This * avoids the bottleneck of constantly touching the quota file, but introduces * fuzziness in the current usage value of IDs that are being used on different * nodes in the cluster simultaneously. So, it is possible for a user on * multiple nodes to overrun their quota, but that overrun is controlable. * Since quota tags are part of transactions, there is no need for a quota check * program to be run on node crashes or anything like that. * * There are couple of knobs that let the administrator manage the quota * fuzziness. "quota_quantum" sets the maximum time a quota change can be * sitting on one node before being synced to the quota file. (The default is * 60 seconds.) Another knob, "quota_scale" controls how quickly the frequency * of quota file syncs increases as the user moves closer to their limit. The * more frequent the syncs, the more accurate the quota enforcement, but that * means that there is more contention between the nodes for the quota file. * The default value is one. This sets the maximum theoretical quota overrun * (with infinite node with infinite bandwidth) to twice the user's limit. (In * practice, the maximum overrun you see should be much less.) A "quota_scale" * number greater than one makes quota syncs more frequent and reduces the * maximum overrun. Numbers less than one (but greater than zero) make quota * syncs less frequent. * * GFS quotas also use per-ID Lock Value Blocks (LVBs) to cache the contents of * the quota file, so it is not being constantly read. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/sched.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/spinlock.h> #include <linux/completion.h> #include <linux/buffer_head.h> #include <linux/sort.h> #include <linux/fs.h> #include <linux/bio.h> #include <linux/gfs2_ondisk.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/quota.h> #include <linux/dqblk_xfs.h> #include <linux/lockref.h> #include <linux/list_lru.h> #include <linux/rcupdate.h> #include <linux/rculist_bl.h> #include <linux/bit_spinlock.h> #include <linux/jhash.h> #include <linux/vmalloc.h> #include "gfs2.h" #include "incore.h" #include "bmap.h" #include "glock.h" #include "glops.h" #include "log.h" #include "meta_io.h" #include "quota.h" #include "rgrp.h" #include "super.h" #include "trans.h" #include "inode.h" #include "util.h" #define GFS2_QD_HASH_SHIFT 12 #define GFS2_QD_HASH_SIZE BIT(GFS2_QD_HASH_SHIFT) #define GFS2_QD_HASH_MASK (GFS2_QD_HASH_SIZE - 1) /* Lock order: qd_lock -> bucket lock -> qd->lockref.lock -> lru lock */ /* -> sd_bitmap_lock */ static DEFINE_SPINLOCK(qd_lock); struct list_lru gfs2_qd_lru; static struct hlist_bl_head qd_hash_table[GFS2_QD_HASH_SIZE]; static unsigned int gfs2_qd_hash(const struct gfs2_sbd *sdp, const struct kqid qid) { unsigned int h; h = jhash(&sdp, sizeof(struct gfs2_sbd *), 0); h = jhash(&qid, sizeof(struct kqid), h); return h & GFS2_QD_HASH_MASK; } static inline void spin_lock_bucket(unsigned int hash) { hlist_bl_lock(&qd_hash_table[hash]); } static inline void spin_unlock_bucket(unsigned int hash) { hlist_bl_unlock(&qd_hash_table[hash]); } static void gfs2_qd_dealloc(struct rcu_head *rcu) { struct gfs2_quota_data *qd = container_of(rcu, struct gfs2_quota_data, qd_rcu); struct gfs2_sbd *sdp = qd->qd_sbd; kmem_cache_free(gfs2_quotad_cachep, qd); if (atomic_dec_and_test(&sdp->sd_quota_count)) wake_up(&sdp->sd_kill_wait); } static void gfs2_qd_dispose(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&qd_lock); list_del(&qd->qd_list); spin_unlock(&qd_lock); spin_lock_bucket(qd->qd_hash); hlist_bl_del_rcu(&qd->qd_hlist); spin_unlock_bucket(qd->qd_hash); if (!gfs2_withdrawing_or_withdrawn(sdp)) { gfs2_assert_warn(sdp, !qd->qd_change); gfs2_assert_warn(sdp, !qd->qd_slot_ref); gfs2_assert_warn(sdp, !qd->qd_bh_count); } gfs2_glock_put(qd->qd_gl); call_rcu(&qd->qd_rcu, gfs2_qd_dealloc); } static void gfs2_qd_list_dispose(struct list_head *list) { struct gfs2_quota_data *qd; while (!list_empty(list)) { qd = list_first_entry(list, struct gfs2_quota_data, qd_lru); list_del(&qd->qd_lru); gfs2_qd_dispose(qd); } } static enum lru_status gfs2_qd_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); enum lru_status status; if (!spin_trylock(&qd->qd_lockref.lock)) return LRU_SKIP; status = LRU_SKIP; if (qd->qd_lockref.count == 0) { lockref_mark_dead(&qd->qd_lockref); list_lru_isolate_move(lru, &qd->qd_lru, dispose); status = LRU_REMOVED; } spin_unlock(&qd->qd_lockref.lock); return status; } static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { LIST_HEAD(dispose); unsigned long freed; if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; freed = list_lru_shrink_walk(&gfs2_qd_lru, sc, gfs2_qd_isolate, &dispose); gfs2_qd_list_dispose(&dispose); return freed; } static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } static struct shrinker *gfs2_qd_shrinker; int __init gfs2_qd_shrinker_init(void) { gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); if (!gfs2_qd_shrinker) return -ENOMEM; gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; shrinker_register(gfs2_qd_shrinker); return 0; } void gfs2_qd_shrinker_exit(void) { shrinker_free(gfs2_qd_shrinker); } static u64 qd2index(struct gfs2_quota_data *qd) { struct kqid qid = qd->qd_id; return (2 * (u64)from_kqid(&init_user_ns, qid)) + ((qid.type == USRQUOTA) ? 0 : 1); } static u64 qd2offset(struct gfs2_quota_data *qd) { return qd2index(qd) * sizeof(struct gfs2_quota); } static struct gfs2_quota_data *qd_alloc(unsigned hash, struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; int error; qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) return NULL; qd->qd_sbd = sdp; lockref_init(&qd->qd_lockref); qd->qd_id = qid; qd->qd_slot = -1; INIT_LIST_HEAD(&qd->qd_lru); qd->qd_hash = hash; error = gfs2_glock_get(sdp, qd2index(qd), &gfs2_quota_glops, CREATE, &qd->qd_gl); if (error) goto fail; return qd; fail: kmem_cache_free(gfs2_quotad_cachep, qd); return NULL; } static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash, const struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct hlist_bl_node *h; hlist_bl_for_each_entry_rcu(qd, h, &qd_hash_table[hash], qd_hlist) { if (!qid_eq(qd->qd_id, qid)) continue; if (qd->qd_sbd != sdp) continue; if (lockref_get_not_dead(&qd->qd_lockref)) { list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); return qd; } } return NULL; } static int qd_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { struct gfs2_quota_data *qd, *new_qd; unsigned int hash = gfs2_qd_hash(sdp, qid); rcu_read_lock(); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); rcu_read_unlock(); if (qd) return 0; new_qd = qd_alloc(hash, sdp, qid); if (!new_qd) return -ENOMEM; spin_lock(&qd_lock); spin_lock_bucket(hash); *qdp = qd = gfs2_qd_search_bucket(hash, sdp, qid); if (qd == NULL) { *qdp = new_qd; list_add(&new_qd->qd_list, &sdp->sd_quota_list); hlist_bl_add_head_rcu(&new_qd->qd_hlist, &qd_hash_table[hash]); atomic_inc(&sdp->sd_quota_count); } spin_unlock_bucket(hash); spin_unlock(&qd_lock); if (qd) { gfs2_glock_put(new_qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, new_qd); } return 0; } static void __qd_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; gfs2_assert(sdp, qd->qd_lockref.count > 0); qd->qd_lockref.count++; } static void qd_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp; if (lockref_put_or_lock(&qd->qd_lockref)) return; BUG_ON(__lockref_is_dead(&qd->qd_lockref)); sdp = qd->qd_sbd; if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) { lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); gfs2_qd_dispose(qd); return; } qd->qd_lockref.count = 0; list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru); spin_unlock(&qd->qd_lockref.lock); } static int slot_get(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; unsigned int bit; int error = 0; spin_lock(&sdp->sd_bitmap_lock); if (qd->qd_slot_ref == 0) { bit = find_first_zero_bit(sdp->sd_quota_bitmap, sdp->sd_quota_slots); if (bit >= sdp->sd_quota_slots) { error = -ENOSPC; goto out; } set_bit(bit, sdp->sd_quota_bitmap); qd->qd_slot = bit; } qd->qd_slot_ref++; out: spin_unlock(&sdp->sd_bitmap_lock); return error; } static void slot_hold(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_ref); qd->qd_slot_ref++; spin_unlock(&sdp->sd_bitmap_lock); } static void slot_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; spin_lock(&sdp->sd_bitmap_lock); gfs2_assert(sdp, qd->qd_slot_ref); if (!--qd->qd_slot_ref) { BUG_ON(!test_and_clear_bit(qd->qd_slot, sdp->sd_quota_bitmap)); qd->qd_slot = -1; } spin_unlock(&sdp->sd_bitmap_lock); } static int bh_get(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; struct inode *inode = sdp->sd_qc_inode; struct gfs2_inode *ip = GFS2_I(inode); unsigned int block, offset; struct buffer_head *bh = NULL; struct iomap iomap = { }; int error; spin_lock(&qd->qd_lockref.lock); if (qd->qd_bh_count) { qd->qd_bh_count++; spin_unlock(&qd->qd_lockref.lock); return 0; } spin_unlock(&qd->qd_lockref.lock); block = qd->qd_slot / sdp->sd_qc_per_block; offset = qd->qd_slot % sdp->sd_qc_per_block; error = gfs2_iomap_get(inode, (loff_t)block << inode->i_blkbits, i_blocksize(inode), &iomap); if (error) return error; error = -ENOENT; if (iomap.type != IOMAP_MAPPED) return error; error = gfs2_meta_read(ip->i_gl, iomap.addr >> inode->i_blkbits, DIO_WAIT, 0, &bh); if (error) return error; error = -EIO; if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) goto out; spin_lock(&qd->qd_lockref.lock); if (qd->qd_bh == NULL) { qd->qd_bh = bh; qd->qd_bh_qc = (struct gfs2_quota_change *) (bh->b_data + sizeof(struct gfs2_meta_header) + offset * sizeof(struct gfs2_quota_change)); bh = NULL; } qd->qd_bh_count++; spin_unlock(&qd->qd_lockref.lock); error = 0; out: brelse(bh); return error; } static void bh_put(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; struct buffer_head *bh = NULL; spin_lock(&qd->qd_lockref.lock); gfs2_assert(sdp, qd->qd_bh_count); if (!--qd->qd_bh_count) { bh = qd->qd_bh; qd->qd_bh = NULL; qd->qd_bh_qc = NULL; } spin_unlock(&qd->qd_lockref.lock); brelse(bh); } static bool qd_grab_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd, u64 sync_gen) { bool ret = false; spin_lock(&qd->qd_lockref.lock); if (test_bit(QDF_LOCKED, &qd->qd_flags) || !test_bit(QDF_CHANGE, &qd->qd_flags) || qd->qd_sync_gen >= sync_gen) goto out; if (__lockref_is_dead(&qd->qd_lockref)) goto out; qd->qd_lockref.count++; list_move_tail(&qd->qd_list, &sdp->sd_quota_list); set_bit(QDF_LOCKED, &qd->qd_flags); qd->qd_change_sync = qd->qd_change; slot_hold(qd); ret = true; out: spin_unlock(&qd->qd_lockref.lock); return ret; } static void qd_ungrab_sync(struct gfs2_quota_data *qd) { clear_bit(QDF_LOCKED, &qd->qd_flags); slot_put(qd); qd_put(qd); } static void qdsb_put(struct gfs2_quota_data *qd) { bh_put(qd); slot_put(qd); qd_put(qd); } static void qd_unlock(struct gfs2_quota_data *qd) { spin_lock(&qd->qd_lockref.lock); gfs2_assert_warn(qd->qd_sbd, test_bit(QDF_LOCKED, &qd->qd_flags)); clear_bit(QDF_LOCKED, &qd->qd_flags); spin_unlock(&qd->qd_lockref.lock); qdsb_put(qd); } static int qdsb_get(struct gfs2_sbd *sdp, struct kqid qid, struct gfs2_quota_data **qdp) { int error; error = qd_get(sdp, qid, qdp); if (error) return error; error = slot_get(*qdp); if (error) goto fail; error = bh_get(*qdp); if (error) goto fail_slot; return 0; fail_slot: slot_put(*qdp); fail: qd_put(*qdp); return error; } /** * gfs2_qa_get - make sure we have a quota allocations data structure, * if necessary * @ip: the inode for this reservation */ int gfs2_qa_get(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; spin_lock(&inode->i_lock); if (ip->i_qadata == NULL) { struct gfs2_qadata *tmp; spin_unlock(&inode->i_lock); tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); if (!tmp) return -ENOMEM; spin_lock(&inode->i_lock); if (ip->i_qadata == NULL) ip->i_qadata = tmp; else kmem_cache_free(gfs2_qadata_cachep, tmp); } ip->i_qadata->qa_ref++; spin_unlock(&inode->i_lock); return 0; } void gfs2_qa_put(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; spin_lock(&inode->i_lock); if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) { kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); ip->i_qadata = NULL; } spin_unlock(&inode->i_lock); } int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data **qd; int error; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; error = gfs2_qa_get(ip); if (error) return error; qd = ip->i_qadata->qa_qd; if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) { error = -EIO; gfs2_qa_put(ip); goto out; } error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && !uid_eq(uid, ip->i_inode.i_uid)) { error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; } if (!gid_eq(gid, NO_GID_QUOTA_CHANGE) && !gid_eq(gid, ip->i_inode.i_gid)) { error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out_unhold; ip->i_qadata->qa_qd_num++; qd++; } out_unhold: if (error) gfs2_quota_unhold(ip); out: return error; } void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); u32 x; if (ip->i_qadata == NULL) return; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qdsb_put(ip->i_qadata->qa_qd[x]); ip->i_qadata->qa_qd[x] = NULL; } ip->i_qadata->qa_qd_num = 0; gfs2_qa_put(ip); } static int sort_qd(const void *a, const void *b) { const struct gfs2_quota_data *qd_a = *(const struct gfs2_quota_data **)a; const struct gfs2_quota_data *qd_b = *(const struct gfs2_quota_data **)b; if (qid_lt(qd_a->qd_id, qd_b->qd_id)) return -1; if (qid_lt(qd_b->qd_id, qd_a->qd_id)) return 1; return 0; } static void do_qc(struct gfs2_quota_data *qd, s64 change) { struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); struct gfs2_quota_change *qc = qd->qd_bh_qc; bool needs_put = false; s64 x; gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); /* * The QDF_CHANGE flag indicates that the slot in the quota change file * is used. Here, we use the value of qc->qc_change when the slot is * used, and we assume a value of 0 otherwise. */ spin_lock(&qd->qd_lockref.lock); x = 0; if (test_bit(QDF_CHANGE, &qd->qd_flags)) x = be64_to_cpu(qc->qc_change); x += change; qd->qd_change += change; if (!x && test_bit(QDF_CHANGE, &qd->qd_flags)) { /* The slot in the quota change file becomes unused. */ clear_bit(QDF_CHANGE, &qd->qd_flags); qc->qc_flags = 0; qc->qc_id = 0; needs_put = true; } else if (x && !test_bit(QDF_CHANGE, &qd->qd_flags)) { /* The slot in the quota change file becomes used. */ set_bit(QDF_CHANGE, &qd->qd_flags); __qd_hold(qd); slot_hold(qd); qc->qc_flags = 0; if (qd->qd_id.type == USRQUOTA) qc->qc_flags = cpu_to_be32(GFS2_QCF_USER); qc->qc_id = cpu_to_be32(from_kqid(&init_user_ns, qd->qd_id)); } qc->qc_change = cpu_to_be64(x); spin_unlock(&qd->qd_lockref.lock); if (needs_put) { slot_put(qd); qd_put(qd); } if (change < 0) /* Reset quiet flag if we freed some blocks */ clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); } static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, unsigned off, void *buf, unsigned bytes) { struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; struct address_space *mapping = inode->i_mapping; struct folio *folio; struct buffer_head *bh; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; unsigned to_write = bytes, pg_off = off; blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; folio = filemap_grab_folio(mapping, index); if (IS_ERR(folio)) return PTR_ERR(folio); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, bsize, 0); for (;;) { /* Find the beginning block within the folio */ if (pg_off >= ((bnum * bsize) + bsize)) { bh = bh->b_this_page; bnum++; blk++; continue; } if (!buffer_mapped(bh)) { gfs2_block_map(inode, blk, bh, 1); if (!buffer_mapped(bh)) goto unlock_out; /* If it's a newly allocated disk block, zero it */ if (buffer_new(bh)) folio_zero_range(folio, bnum * bsize, bh->b_size); } if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (bh_read(bh, REQ_META | REQ_PRIO) < 0) goto unlock_out; gfs2_trans_add_data(ip->i_gl, bh); /* If we need to write to the next block as well */ if (to_write > (bsize - boff)) { pg_off += (bsize - boff); to_write -= (bsize - boff); boff = pg_off % bsize; continue; } break; } /* Write to the folio, now that we have setup the buffer(s) */ memcpy_to_folio(folio, off, buf, bytes); flush_dcache_folio(folio); folio_unlock(folio); folio_put(folio); return 0; unlock_out: folio_unlock(folio); folio_put(folio); return -EIO; } static int gfs2_write_disk_quota(struct gfs2_sbd *sdp, struct gfs2_quota *qp, loff_t loc) { unsigned long pg_beg; unsigned pg_off, nbytes, overflow = 0; int error; void *ptr; nbytes = sizeof(struct gfs2_quota); pg_beg = loc >> PAGE_SHIFT; pg_off = offset_in_page(loc); /* If the quota straddles a page boundary, split the write in two */ if ((pg_off + nbytes) > PAGE_SIZE) overflow = (pg_off + nbytes) - PAGE_SIZE; ptr = qp; error = gfs2_write_buf_to_page(sdp, pg_beg, pg_off, ptr, nbytes - overflow); /* If there's an overflow, write the remaining bytes to the next page */ if (!error && overflow) error = gfs2_write_buf_to_page(sdp, pg_beg + 1, 0, ptr + nbytes - overflow, overflow); return error; } /** * gfs2_adjust_quota - adjust record of current block usage * @sdp: The superblock * @loc: Offset of the entry in the quota file * @change: The amount of usage change to record * @qd: The quota data * @fdq: The updated limits to record * * This function was mostly borrowed from gfs2_block_truncate_page which was * in turn mostly borrowed from ext3 * * Returns: 0 or -ve on error */ static int gfs2_adjust_quota(struct gfs2_sbd *sdp, loff_t loc, s64 change, struct gfs2_quota_data *qd, struct qc_dqblk *fdq) { struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; struct gfs2_quota q; int err; u64 size; if (gfs2_is_stuffed(ip)) { err = gfs2_unstuff_dinode(ip); if (err) return err; } memset(&q, 0, sizeof(struct gfs2_quota)); err = gfs2_internal_read(ip, (char *)&q, &loc, sizeof(q)); if (err < 0) return err; loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ be64_add_cpu(&q.qu_value, change); if (((s64)be64_to_cpu(q.qu_value)) < 0) q.qu_value = 0; /* Never go negative on quota usage */ spin_lock(&qd->qd_lockref.lock); qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & QC_SPC_SOFT) { q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_warn = q.qu_warn; } if (fdq->d_fieldmask & QC_SPC_HARD) { q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_limit = q.qu_limit; } if (fdq->d_fieldmask & QC_SPACE) { q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift); qd->qd_qb.qb_value = q.qu_value; } } spin_unlock(&qd->qd_lockref.lock); err = gfs2_write_disk_quota(sdp, &q, loc); if (!err) { size = loc + sizeof(struct gfs2_quota); if (size > inode->i_size) i_size_write(inode, size); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); set_bit(QDF_REFRESH, &qd->qd_flags); } return err; } static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda, u64 sync_gen) { struct gfs2_sbd *sdp = (*qda)->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_alloc_parms ap = {}; unsigned int data_blocks, ind_blocks; struct gfs2_holder *ghs, i_gh; unsigned int qx, x; struct gfs2_quota_data *qd; unsigned reserved; loff_t offset; unsigned int nalloc = 0, blocks; int error; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); ghs = kmalloc_array(num_qd, sizeof(struct gfs2_holder), GFP_NOFS); if (!ghs) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); if (error) goto out_dq; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) goto out_dq; for (x = 0; x < num_qd; x++) { offset = qd2offset(qda[x]); if (gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota))) nalloc++; } /* * 1 blk for unstuffing inode if stuffed. We add this extra * block to the reservation unconditionally. If the inode * doesn't need unstuffing, the block will be released to the * rgrp since it won't be allocated during the transaction */ /* +3 in the end for unstuffing block, inode size update block * and another block in case quota straddles page boundary and * two blocks need to be updated instead of 1 */ blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; reserved = 1 + (nalloc * (data_blocks + ind_blocks)); ap.target = reserved; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_alloc; if (nalloc) blocks += gfs2_rg_blocks(ip, reserved) + nalloc * ind_blocks + RES_STATFS; error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto out_ipres; for (x = 0; x < num_qd; x++) { qd = qda[x]; offset = qd2offset(qd); error = gfs2_adjust_quota(sdp, offset, qd->qd_change_sync, qd, NULL); if (error) goto out_end_trans; do_qc(qd, -qd->qd_change_sync); set_bit(QDF_REFRESH, &qd->qd_flags); } out_end_trans: gfs2_trans_end(sdp); out_ipres: gfs2_inplace_release(ip); out_alloc: gfs2_glock_dq_uninit(&i_gh); out_dq: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, GFS2_LOG_HEAD_FLUSH_NORMAL | GFS2_LFC_DO_SYNC); if (!error) { for (x = 0; x < num_qd; x++) { qd = qda[x]; spin_lock(&qd->qd_lockref.lock); if (qd->qd_sync_gen < sync_gen) qd->qd_sync_gen = sync_gen; spin_unlock(&qd->qd_lockref.lock); } } return error; } static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) { struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_quota q; struct gfs2_quota_lvb *qlvb; loff_t pos; int error; memset(&q, 0, sizeof(struct gfs2_quota)); pos = qd2offset(qd); error = gfs2_internal_read(ip, (char *)&q, &pos, sizeof(q)); if (error < 0) return error; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); qlvb->__pad = 0; qlvb->qb_limit = q.qu_limit; qlvb->qb_warn = q.qu_warn; qlvb->qb_value = q.qu_value; spin_lock(&qd->qd_lockref.lock); qd->qd_qb = *qlvb; spin_unlock(&qd->qd_lockref.lock); return 0; } static int do_glock(struct gfs2_quota_data *qd, int force_refresh, struct gfs2_holder *q_gh) { struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_holder i_gh; int error; gfs2_assert_warn(sdp, sdp == qd->qd_gl->gl_name.ln_sbd); restart: error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_SHARED, 0, q_gh); if (error) return error; if (test_and_clear_bit(QDF_REFRESH, &qd->qd_flags)) force_refresh = FORCE; spin_lock(&qd->qd_lockref.lock); qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; spin_unlock(&qd->qd_lockref.lock); if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { gfs2_glock_dq_uninit(q_gh); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, q_gh); if (error) return error; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); if (error) goto fail; error = update_qd(sdp, qd); if (error) goto fail_gunlock; gfs2_glock_dq_uninit(&i_gh); gfs2_glock_dq_uninit(q_gh); force_refresh = 0; goto restart; } return 0; fail_gunlock: gfs2_glock_dq_uninit(&i_gh); fail: gfs2_glock_dq_uninit(q_gh); return error; } int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; u32 x; int error; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return 0; error = gfs2_quota_hold(ip, uid, gid); if (error) return error; sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num, sizeof(struct gfs2_quota_data *), sort_qd, NULL); for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]); if (error) break; } if (!error) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); gfs2_quota_unhold(ip); } return error; } static bool need_sync(struct gfs2_quota_data *qd) { struct gfs2_sbd *sdp = qd->qd_sbd; struct gfs2_tune *gt = &sdp->sd_tune; s64 value, change, limit; unsigned int num, den; int ret = false; spin_lock(&qd->qd_lockref.lock); if (!qd->qd_qb.qb_limit) goto out; change = qd->qd_change; if (change <= 0) goto out; value = (s64)be64_to_cpu(qd->qd_qb.qb_value); limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); if (value >= limit) goto out; spin_lock(&gt->gt_spin); num = gt->gt_quota_scale_num; den = gt->gt_quota_scale_den; spin_unlock(&gt->gt_spin); change *= gfs2_jindex_size(sdp) * num; change = div_s64(change, den); if (value + change < limit) goto out; ret = true; out: spin_unlock(&qd->qd_lockref.lock); return ret; } void gfs2_quota_unlock(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[2 * GFS2_MAXQUOTAS]; unsigned int count = 0; u32 x; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; bool sync; int error; qd = ip->i_qadata->qa_qd[x]; sync = need_sync(qd); gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); if (!sync) continue; spin_lock(&qd_lock); sync = qd_grab_sync(sdp, qd, U64_MAX); spin_unlock(&qd_lock); if (!sync) continue; gfs2_assert_warn(sdp, qd->qd_change_sync); error = bh_get(qd); if (error) { qd_ungrab_sync(qd); continue; } qda[count++] = qd; } if (count) { u64 sync_gen = READ_ONCE(sdp->sd_quota_sync_gen); do_sync(count, qda, sync_gen); for (x = 0; x < count; x++) qd_unlock(qda[x]); } gfs2_quota_unhold(ip); } #define MAX_LINE 256 static void print_message(struct gfs2_quota_data *qd, char *type) { struct gfs2_sbd *sdp = qd->qd_sbd; if (sdp->sd_args.ar_quota != GFS2_QUOTA_QUIET) { fs_info(sdp, "quota %s for %s %u\n", type, (qd->qd_id.type == USRQUOTA) ? "user" : "group", from_kqid(&init_user_ns, qd->qd_id)); } } /** * gfs2_quota_check - check if allocating new blocks will exceed quota * @ip: The inode for which this check is being performed * @uid: The uid to check against * @gid: The gid to check against * @ap: The allocation parameters. ap->target contains the requested * blocks. ap->min_target, if set, contains the minimum blks * requested. * * Returns: 0 on success. * min_req = ap->min_target ? ap->min_target : ap->target; * quota must allow at least min_req blks for success and * ap->allowed is set to the number of blocks allowed * * -EDQUOT otherwise, quota violation. ap->allowed is set to number * of blocks available. */ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; s64 value, warn, limit; u32 x; int error = 0; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ if (!test_bit(GIF_QD_LOCKED, &ip->i_flags)) return 0; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid)))) continue; spin_lock(&qd->qd_lockref.lock); warn = (s64)be64_to_cpu(qd->qd_qb.qb_warn); limit = (s64)be64_to_cpu(qd->qd_qb.qb_limit); value = (s64)be64_to_cpu(qd->qd_qb.qb_value); value += qd->qd_change; spin_unlock(&qd->qd_lockref.lock); if (limit > 0 && (limit - value) < ap->allowed) ap->allowed = limit - value; /* If we can't meet the target */ if (limit && limit < (value + (s64)ap->target)) { /* If no min_target specified or we don't meet * min_target, return -EDQUOT */ if (!ap->min_target || ap->min_target > ap->allowed) { if (!test_and_set_bit(QDF_QMSG_QUIET, &qd->qd_flags)) { print_message(qd, "exceeded"); quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BHARDWARN); } error = -EDQUOT; break; } } else if (warn && warn < value && time_after_eq(jiffies, qd->qd_last_warn + gfs2_tune_get(sdp, gt_quota_warn_period) * HZ)) { quota_send_warning(qd->qd_id, sdp->sd_vfs->s_dev, QUOTA_NL_BSOFTWARN); print_message(qd, "warning"); error = 0; qd->qd_last_warn = jiffies; } } return error; } void gfs2_quota_change(struct gfs2_inode *ip, s64 change, kuid_t uid, kgid_t gid) { struct gfs2_quota_data *qd; u32 x; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF || gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; if (gfs2_assert_withdraw(sdp, ip->i_qadata && ip->i_qadata->qa_ref > 0)) return; for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { qd = ip->i_qadata->qa_qd[x]; if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid))) { do_qc(qd, change); } } } int gfs2_quota_sync(struct super_block *sb, int type) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_data **qda; unsigned int max_qd = PAGE_SIZE / sizeof(struct gfs2_holder); u64 sync_gen; int error = 0; if (sb_rdonly(sdp->sd_vfs)) return 0; qda = kcalloc(max_qd, sizeof(struct gfs2_quota_data *), GFP_KERNEL); if (!qda) return -ENOMEM; mutex_lock(&sdp->sd_quota_sync_mutex); sync_gen = sdp->sd_quota_sync_gen + 1; do { struct gfs2_quota_data *iter; unsigned int num_qd = 0; unsigned int x; spin_lock(&qd_lock); list_for_each_entry(iter, &sdp->sd_quota_list, qd_list) { if (qd_grab_sync(sdp, iter, sync_gen)) { qda[num_qd++] = iter; if (num_qd == max_qd) break; } } spin_unlock(&qd_lock); if (!num_qd) break; for (x = 0; x < num_qd; x++) { error = bh_get(qda[x]); if (!error) continue; while (x < num_qd) qd_ungrab_sync(qda[--num_qd]); break; } if (!error) { WRITE_ONCE(sdp->sd_quota_sync_gen, sync_gen); error = do_sync(num_qd, qda, sync_gen); } for (x = 0; x < num_qd; x++) qd_unlock(qda[x]); } while (!error); mutex_unlock(&sdp->sd_quota_sync_mutex); kfree(qda); return error; } int gfs2_quota_refresh(struct gfs2_sbd *sdp, struct kqid qid) { struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; error = qd_get(sdp, qid, &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); if (!error) gfs2_glock_dq_uninit(&q_gh); qd_put(qd); return error; } int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); u64 size = i_size_read(sdp->sd_qc_inode); unsigned int blocks = size >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; unsigned int hash; unsigned int bm_size; struct buffer_head *bh; u64 dblock; u32 extlen = 0; int error; if (gfs2_check_internal_file_size(sdp->sd_qc_inode, 1, 64 << 20)) return -EIO; sdp->sd_quota_slots = blocks * sdp->sd_qc_per_block; bm_size = DIV_ROUND_UP(sdp->sd_quota_slots, 8 * sizeof(unsigned long)); bm_size *= sizeof(unsigned long); error = -ENOMEM; sdp->sd_quota_bitmap = kzalloc(bm_size, GFP_NOFS | __GFP_NOWARN); if (sdp->sd_quota_bitmap == NULL) sdp->sd_quota_bitmap = __vmalloc(bm_size, GFP_NOFS | __GFP_ZERO); if (!sdp->sd_quota_bitmap) return error; for (x = 0; x < blocks; x++) { struct gfs2_quota_change *qc; unsigned int y; if (!extlen) { extlen = 32; error = gfs2_get_extent(&ip->i_inode, x, &dblock, &extlen); if (error) goto fail; } error = -EIO; bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); if (!bh) goto fail; if (gfs2_metatype_check(sdp, bh, GFS2_METATYPE_QC)) goto fail_brelse; qc = (struct gfs2_quota_change *)(bh->b_data + sizeof(struct gfs2_meta_header)); for (y = 0; y < sdp->sd_qc_per_block && slot < sdp->sd_quota_slots; y++, slot++) { struct gfs2_quota_data *old_qd, *qd; s64 qc_change = be64_to_cpu(qc->qc_change); u32 qc_flags = be32_to_cpu(qc->qc_flags); enum quota_type qtype = (qc_flags & GFS2_QCF_USER) ? USRQUOTA : GRPQUOTA; struct kqid qc_id = make_kqid(&init_user_ns, qtype, be32_to_cpu(qc->qc_id)); qc++; if (!qc_change) continue; hash = gfs2_qd_hash(sdp, qc_id); qd = qd_alloc(hash, sdp, qc_id); if (qd == NULL) goto fail_brelse; qd->qd_lockref.count = 0; set_bit(QDF_CHANGE, &qd->qd_flags); qd->qd_change = qc_change; qd->qd_slot = slot; qd->qd_slot_ref = 1; spin_lock(&qd_lock); spin_lock_bucket(hash); old_qd = gfs2_qd_search_bucket(hash, sdp, qc_id); if (old_qd) { fs_err(sdp, "Corruption found in quota_change%u" "file: duplicate identifier in " "slot %u\n", sdp->sd_jdesc->jd_jid, slot); spin_unlock_bucket(hash); spin_unlock(&qd_lock); qd_put(old_qd); gfs2_glock_put(qd->qd_gl); kmem_cache_free(gfs2_quotad_cachep, qd); /* zero out the duplicate slot */ lock_buffer(bh); memset(qc, 0, sizeof(*qc)); mark_buffer_dirty(bh); unlock_buffer(bh); continue; } BUG_ON(test_and_set_bit(slot, sdp->sd_quota_bitmap)); list_add(&qd->qd_list, &sdp->sd_quota_list); atomic_inc(&sdp->sd_quota_count); hlist_bl_add_head_rcu(&qd->qd_hlist, &qd_hash_table[hash]); spin_unlock_bucket(hash); spin_unlock(&qd_lock); found++; } if (buffer_dirty(bh)) sync_dirty_buffer(bh); brelse(bh); dblock++; extlen--; } if (found) fs_info(sdp, "found %u quota changes\n", found); return 0; fail_brelse: if (buffer_dirty(bh)) sync_dirty_buffer(bh); brelse(bh); fail: gfs2_quota_cleanup(sdp); return error; } void gfs2_quota_cleanup(struct gfs2_sbd *sdp) { struct gfs2_quota_data *qd; LIST_HEAD(dispose); int count; BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)); spin_lock(&qd_lock); list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) { spin_lock(&qd->qd_lockref.lock); if (qd->qd_lockref.count != 0) { spin_unlock(&qd->qd_lockref.lock); continue; } lockref_mark_dead(&qd->qd_lockref); spin_unlock(&qd->qd_lockref.lock); list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru); list_add(&qd->qd_lru, &dispose); } spin_unlock(&qd_lock); gfs2_qd_list_dispose(&dispose); wait_event_timeout(sdp->sd_kill_wait, (count = atomic_read(&sdp->sd_quota_count)) == 0, HZ * 60); if (count != 0) fs_err(sdp, "%d left-over quota data objects\n", count); kvfree(sdp->sd_quota_bitmap); sdp->sd_quota_bitmap = NULL; } static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) { if (error == 0 || error == -EROFS) return; if (!gfs2_withdrawing_or_withdrawn(sdp)) { if (!cmpxchg(&sdp->sd_log_error, 0, error)) fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); wake_up(&sdp->sd_logd_waitq); } } static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, int (*fxn)(struct super_block *sb, int type), unsigned long t, unsigned long *timeo, unsigned int *new_timeo) { if (t >= *timeo) { int error = fxn(sdp->sd_vfs, 0); quotad_error(sdp, msg, error); *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; } else { *timeo -= t; } } void gfs2_wake_up_statfs(struct gfs2_sbd *sdp) { if (!sdp->sd_statfs_force_sync) { sdp->sd_statfs_force_sync = 1; wake_up(&sdp->sd_quota_wait); } } /** * gfs2_quotad - Write cached quota changes into the quota file * @data: Pointer to GFS2 superblock * */ int gfs2_quotad(void *data) { struct gfs2_sbd *sdp = data; struct gfs2_tune *tune = &sdp->sd_tune; unsigned long statfs_timeo = 0; unsigned long quotad_timeo = 0; unsigned long t = 0; set_freezable(); while (!kthread_should_stop()) { if (gfs2_withdrawing_or_withdrawn(sdp)) break; /* Update the master statfs file */ if (sdp->sd_statfs_force_sync) { int error = gfs2_statfs_sync(sdp->sd_vfs, 0); quotad_error(sdp, "statfs", error); statfs_timeo = gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; } else quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, &statfs_timeo, &tune->gt_statfs_quantum); /* Update quota file */ quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, &quotad_timeo, &tune->gt_quota_quantum); t = min(quotad_timeo, statfs_timeo); t = wait_event_freezable_timeout(sdp->sd_quota_wait, sdp->sd_statfs_force_sync || gfs2_withdrawing_or_withdrawn(sdp) || kthread_should_stop(), t); if (sdp->sd_statfs_force_sync) t = 0; } return 0; } static int gfs2_quota_get_state(struct super_block *sb, struct qc_state *state) { struct gfs2_sbd *sdp = sb->s_fs_info; memset(state, 0, sizeof(*state)); switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_QUIET: fallthrough; case GFS2_QUOTA_ON: state->s_state[USRQUOTA].flags |= QCI_LIMITS_ENFORCED; state->s_state[GRPQUOTA].flags |= QCI_LIMITS_ENFORCED; fallthrough; case GFS2_QUOTA_ACCOUNT: state->s_state[USRQUOTA].flags |= QCI_ACCT_ENABLED | QCI_SYSFILE; state->s_state[GRPQUOTA].flags |= QCI_ACCT_ENABLED | QCI_SYSFILE; break; case GFS2_QUOTA_OFF: break; } if (sdp->sd_quota_inode) { state->s_state[USRQUOTA].ino = GFS2_I(sdp->sd_quota_inode)->i_no_addr; state->s_state[USRQUOTA].blocks = sdp->sd_quota_inode->i_blocks; } state->s_state[USRQUOTA].nextents = 1; /* unsupported */ state->s_state[GRPQUOTA] = state->s_state[USRQUOTA]; state->s_incoredqs = list_lru_count(&gfs2_qd_lru); return 0; } static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_quota_lvb *qlvb; struct gfs2_quota_data *qd; struct gfs2_holder q_gh; int error; memset(fdq, 0, sizeof(*fdq)); if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ if ((qid.type != USRQUOTA) && (qid.type != GRPQUOTA)) return -EINVAL; error = qd_get(sdp, qid, &qd); if (error) return error; error = do_glock(qd, FORCE, &q_gh); if (error) goto out; qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift; fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift; fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift; gfs2_glock_dq_uninit(&q_gh); out: qd_put(qd); return error; } /* GFS2 only supports a subset of the XFS fields */ #define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE) static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, struct qc_dqblk *fdq) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct gfs2_quota_data *qd; struct gfs2_holder q_gh, i_gh; unsigned int data_blocks, ind_blocks; unsigned int blocks = 0; int alloc_required; loff_t offset; int error; if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) return -ESRCH; /* Crazy XFS error code */ if ((qid.type != USRQUOTA) && (qid.type != GRPQUOTA)) return -EINVAL; if (fdq->d_fieldmask & ~GFS2_FIELDMASK) return -EINVAL; error = qd_get(sdp, qid, &qd); if (error) return error; error = gfs2_qa_get(ip); if (error) goto out_put; inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); if (error) goto out_q; /* Check for existing entry, if none then alloc new blocks */ error = update_qd(sdp, qd); if (error) goto out_i; /* If nothing has changed, this is a no-op */ if ((fdq->d_fieldmask & QC_SPC_SOFT) && ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) fdq->d_fieldmask ^= QC_SPC_SOFT; if ((fdq->d_fieldmask & QC_SPC_HARD) && ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) fdq->d_fieldmask ^= QC_SPC_HARD; if ((fdq->d_fieldmask & QC_SPACE) && ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value))) fdq->d_fieldmask ^= QC_SPACE; if (fdq->d_fieldmask == 0) goto out_i; offset = qd2offset(qd); alloc_required = gfs2_write_alloc_required(ip, offset, sizeof(struct gfs2_quota)); if (gfs2_is_stuffed(ip)) alloc_required = 1; if (alloc_required) { struct gfs2_alloc_parms ap = {}; gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), &data_blocks, &ind_blocks); blocks = 1 + data_blocks + ind_blocks; ap.target = blocks; error = gfs2_inplace_reserve(ip, &ap); if (error) goto out_i; blocks += gfs2_rg_blocks(ip, blocks); } /* Some quotas span block boundaries and can update two blocks, adding an extra block to the transaction to handle such quotas */ error = gfs2_trans_begin(sdp, blocks + RES_DINODE + 2, 0); if (error) goto out_release; /* Apply changes */ error = gfs2_adjust_quota(sdp, offset, 0, qd, fdq); if (!error) clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); gfs2_trans_end(sdp); out_release: if (alloc_required) gfs2_inplace_release(ip); out_i: gfs2_glock_dq_uninit(&i_gh); out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: gfs2_qa_put(ip); inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; } const struct quotactl_ops gfs2_quotactl_ops = { .quota_sync = gfs2_quota_sync, .get_state = gfs2_quota_get_state, .get_dqblk = gfs2_get_dqblk, .set_dqblk = gfs2_set_dqblk, }; void __init gfs2_quota_hash_init(void) { unsigned i; for(i = 0; i < GFS2_QD_HASH_SIZE; i++) INIT_HLIST_BL_HEAD(&qd_hash_table[i]); }
169 246 207 239 64 114 51 25 88 36 205 39 2 37 13 18 1 74 2 15 35 1 86 9 112 61 99 22 80 15 23 41 77 1 65 28 12 36 1 59 103 9 152 23 98 98 59 125 207 69 44 7 30 30 1 10 60 21 35 25 2 2 7 46 43 43 43 43 147 147 26 2 2 147 148 20 146 148 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 /* * net/tipc/msg.h: Include file for TIPC message header routines * * Copyright (c) 2000-2007, 2014-2017 Ericsson AB * Copyright (c) 2005-2008, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #ifndef _TIPC_MSG_H #define _TIPC_MSG_H #include <linux/tipc.h> #include "core.h" /* * Constants and routines used to read and write TIPC payload message headers * * Note: Some items are also used with TIPC internal message headers */ #define TIPC_VERSION 2 struct plist; /* * Payload message users are defined in TIPC's public API: * - TIPC_LOW_IMPORTANCE * - TIPC_MEDIUM_IMPORTANCE * - TIPC_HIGH_IMPORTANCE * - TIPC_CRITICAL_IMPORTANCE */ #define TIPC_SYSTEM_IMPORTANCE 4 /* * Payload message types */ #define TIPC_CONN_MSG 0 #define TIPC_MCAST_MSG 1 #define TIPC_NAMED_MSG 2 #define TIPC_DIRECT_MSG 3 #define TIPC_GRP_MEMBER_EVT 4 #define TIPC_GRP_BCAST_MSG 5 #define TIPC_GRP_MCAST_MSG 6 #define TIPC_GRP_UCAST_MSG 7 /* * Internal message users */ #define BCAST_PROTOCOL 5 #define MSG_BUNDLER 6 #define LINK_PROTOCOL 7 #define CONN_MANAGER 8 #define GROUP_PROTOCOL 9 #define TUNNEL_PROTOCOL 10 #define NAME_DISTRIBUTOR 11 #define MSG_FRAGMENTER 12 #define LINK_CONFIG 13 #define MSG_CRYPTO 14 #define SOCK_WAKEUP 14 /* pseudo user */ #define TOP_SRV 15 /* pseudo user */ /* * Message header sizes */ #define SHORT_H_SIZE 24 /* In-cluster basic payload message */ #define BASIC_H_SIZE 32 /* Basic payload message */ #define NAMED_H_SIZE 40 /* Named payload message */ #define MCAST_H_SIZE 44 /* Multicast payload message */ #define GROUP_H_SIZE 44 /* Group payload message */ #define INT_H_SIZE 40 /* Internal messages */ #define MIN_H_SIZE 24 /* Smallest legal TIPC header size */ #define MAX_H_SIZE 60 /* Largest possible TIPC header size */ #define MAX_MSG_SIZE (MAX_H_SIZE + TIPC_MAX_USER_MSG_SIZE) #define TIPC_MEDIA_INFO_OFFSET 5 extern const int one_page_mtu; struct tipc_skb_cb { union { struct { struct sk_buff *tail; unsigned long nxt_retr; unsigned long retr_stamp; u32 bytes_read; u32 orig_member; u16 chain_imp; u16 ackers; u16 retr_cnt; } __packed; #ifdef CONFIG_TIPC_CRYPTO struct { struct tipc_crypto *rx; struct tipc_aead *last; u8 recurs; } tx_clone_ctx __packed; #endif } __packed; union { struct { u8 validated:1; #ifdef CONFIG_TIPC_CRYPTO u8 encrypted:1; u8 decrypted:1; #define SKB_PROBING 1 #define SKB_GRACING 2 u8 xmit_type:2; u8 tx_clone_deferred:1; #endif }; u8 flags; }; u8 reserved; #ifdef CONFIG_TIPC_CRYPTO void *crypto_ctx; #endif } __packed; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) struct tipc_msg { __be32 hdr[15]; }; /* struct tipc_gap_ack - TIPC Gap ACK block * @ack: seqno of the last consecutive packet in link deferdq * @gap: number of gap packets since the last ack * * E.g: * link deferdq: 1 2 3 4 10 11 13 14 15 20 * --> Gap ACK blocks: <4, 5>, <11, 1>, <15, 4>, <20, 0> */ struct tipc_gap_ack { __be16 ack; __be16 gap; }; /* struct tipc_gap_ack_blks * @len: actual length of the record * @ugack_cnt: number of Gap ACK blocks for unicast (following the broadcast * ones) * @start_index: starting index for "valid" broadcast Gap ACK blocks * @bgack_cnt: number of Gap ACK blocks for broadcast in the record * @gacks: array of Gap ACK blocks * * 31 16 15 0 * +-------------+-------------+-------------+-------------+ * | bgack_cnt | ugack_cnt | len | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > bc gacks * : : : | * +-------------+-------------+-------------+-------------+ - * | gap | ack | | * +-------------+-------------+-------------+-------------+ > uc gacks * : : : | * +-------------+-------------+-------------+-------------+ - */ struct tipc_gap_ack_blks { __be16 len; union { u8 ugack_cnt; u8 start_index; }; u8 bgack_cnt; struct tipc_gap_ack gacks[]; }; #define MAX_GAP_ACK_BLKS 128 #define MAX_GAP_ACK_BLKS_SZ (sizeof(struct tipc_gap_ack_blks) + \ sizeof(struct tipc_gap_ack) * MAX_GAP_ACK_BLKS) static inline struct tipc_msg *buf_msg(struct sk_buff *skb) { return (struct tipc_msg *)skb->data; } static inline u32 msg_word(struct tipc_msg *m, u32 pos) { return ntohl(m->hdr[pos]); } static inline void msg_set_word(struct tipc_msg *m, u32 w, u32 val) { m->hdr[w] = htonl(val); } static inline u32 msg_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask) { return (msg_word(m, w) >> pos) & mask; } static inline void msg_set_bits(struct tipc_msg *m, u32 w, u32 pos, u32 mask, u32 val) { val = (val & mask) << pos; mask = mask << pos; m->hdr[w] &= ~htonl(mask); m->hdr[w] |= htonl(val); } /* * Word 0 */ static inline u32 msg_version(struct tipc_msg *m) { return msg_bits(m, 0, 29, 7); } static inline void msg_set_version(struct tipc_msg *m) { msg_set_bits(m, 0, 29, 7, TIPC_VERSION); } static inline u32 msg_user(struct tipc_msg *m) { return msg_bits(m, 0, 25, 0xf); } static inline u32 msg_isdata(struct tipc_msg *m) { return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 25, 0xf, n); } static inline u32 msg_hdr_sz(struct tipc_msg *m) { return msg_bits(m, 0, 21, 0xf) << 2; } static inline void msg_set_hdr_sz(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 21, 0xf, n>>2); } static inline u32 msg_size(struct tipc_msg *m) { return msg_bits(m, 0, 0, 0x1ffff); } static inline u32 msg_blocks(struct tipc_msg *m) { return (msg_size(m) / 1024) + 1; } static inline u32 msg_data_sz(struct tipc_msg *m) { return msg_size(m) - msg_hdr_sz(m); } static inline int msg_non_seq(struct tipc_msg *m) { return msg_bits(m, 0, 20, 1); } static inline void msg_set_non_seq(struct tipc_msg *m, u32 n) { msg_set_bits(m, 0, 20, 1, n); } static inline int msg_is_syn(struct tipc_msg *m) { return msg_bits(m, 0, 17, 1); } static inline void msg_set_syn(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 17, 1, d); } static inline int msg_dest_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_dest_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_is_keepalive(struct tipc_msg *m) { return msg_bits(m, 0, 19, 1); } static inline void msg_set_is_keepalive(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 19, 1, d); } static inline int msg_src_droppable(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_src_droppable(struct tipc_msg *m, u32 d) { msg_set_bits(m, 0, 18, 1, d); } static inline int msg_ack_required(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_ack_required(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline int msg_nagle_ack(struct tipc_msg *m) { return msg_bits(m, 0, 18, 1); } static inline void msg_set_nagle_ack(struct tipc_msg *m) { msg_set_bits(m, 0, 18, 1, 1); } static inline bool msg_is_rcast(struct tipc_msg *m) { return msg_bits(m, 0, 18, 0x1); } static inline void msg_set_is_rcast(struct tipc_msg *m, bool d) { msg_set_bits(m, 0, 18, 0x1, d); } static inline void msg_set_size(struct tipc_msg *m, u32 sz) { m->hdr[0] = htonl((msg_word(m, 0) & ~0x1ffff) | sz); } static inline unchar *msg_data(struct tipc_msg *m) { return ((unchar *)m) + msg_hdr_sz(m); } static inline struct tipc_msg *msg_inner_hdr(struct tipc_msg *m) { return (struct tipc_msg *)msg_data(m); } /* * Word 1 */ static inline u32 msg_type(struct tipc_msg *m) { return msg_bits(m, 1, 29, 0x7); } static inline void msg_set_type(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 29, 0x7, n); } static inline int msg_in_group(struct tipc_msg *m) { int mtyp = msg_type(m); return mtyp >= TIPC_GRP_MEMBER_EVT && mtyp <= TIPC_GRP_UCAST_MSG; } static inline bool msg_is_grp_evt(struct tipc_msg *m) { return msg_type(m) == TIPC_GRP_MEMBER_EVT; } static inline u32 msg_named(struct tipc_msg *m) { return msg_type(m) == TIPC_NAMED_MSG; } static inline u32 msg_mcast(struct tipc_msg *m) { int mtyp = msg_type(m); return ((mtyp == TIPC_MCAST_MSG) || (mtyp == TIPC_GRP_BCAST_MSG) || (mtyp == TIPC_GRP_MCAST_MSG)); } static inline u32 msg_connected(struct tipc_msg *m) { return msg_type(m) == TIPC_CONN_MSG; } static inline u32 msg_direct(struct tipc_msg *m) { return msg_type(m) == TIPC_DIRECT_MSG; } static inline u32 msg_errcode(struct tipc_msg *m) { return msg_bits(m, 1, 25, 0xf); } static inline void msg_set_errcode(struct tipc_msg *m, u32 err) { msg_set_bits(m, 1, 25, 0xf, err); } static inline void msg_set_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 28, 0x1, 1); } static inline u32 msg_is_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 28, 0x1); } static inline void msg_set_last_bulk(struct tipc_msg *m) { msg_set_bits(m, 1, 27, 0x1, 1); } static inline u32 msg_is_last_bulk(struct tipc_msg *m) { return msg_bits(m, 1, 27, 0x1); } static inline void msg_set_non_legacy(struct tipc_msg *m) { msg_set_bits(m, 1, 26, 0x1, 1); } static inline u32 msg_is_legacy(struct tipc_msg *m) { return !msg_bits(m, 1, 26, 0x1); } static inline u32 msg_reroute_cnt(struct tipc_msg *m) { return msg_bits(m, 1, 21, 0xf); } static inline void msg_incr_reroute_cnt(struct tipc_msg *m) { msg_set_bits(m, 1, 21, 0xf, msg_reroute_cnt(m) + 1); } static inline u32 msg_lookup_scope(struct tipc_msg *m) { return msg_bits(m, 1, 19, 0x3); } static inline void msg_set_lookup_scope(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 19, 0x3, n); } static inline u16 msg_bcast_ack(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_bcast_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* Note: reusing bits in word 1 for ACTIVATE_MSG only, to re-synch * link peer session number */ static inline bool msg_dest_session_valid(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1); } static inline void msg_set_dest_session_valid(struct tipc_msg *m, bool valid) { msg_set_bits(m, 1, 16, 0x1, valid); } static inline u16 msg_dest_session(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_dest_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 1, 0, 0xffff, n); } /* * Word 2 */ static inline u16 msg_ack(struct tipc_msg *m) { return msg_bits(m, 2, 16, 0xffff); } static inline void msg_set_ack(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u16 msg_seqno(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Words 3-10 */ static inline u32 msg_importance(struct tipc_msg *m) { int usr = msg_user(m); if (likely((usr <= TIPC_CRITICAL_IMPORTANCE) && !msg_errcode(m))) return usr; if ((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER)) return msg_bits(m, 9, 0, 0x7); return TIPC_SYSTEM_IMPORTANCE; } static inline void msg_set_importance(struct tipc_msg *m, u32 i) { int usr = msg_user(m); if (likely((usr == MSG_FRAGMENTER) || (usr == MSG_BUNDLER))) msg_set_bits(m, 9, 0, 0x7, i); else if (i < TIPC_SYSTEM_IMPORTANCE) msg_set_user(m, i); else pr_warn("Trying to set illegal importance in message\n"); } static inline u32 msg_prevnode(struct tipc_msg *m) { return msg_word(m, 3); } static inline void msg_set_prevnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 3, a); } static inline u32 msg_origport(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = msg_inner_hdr(m); return msg_word(m, 4); } static inline void msg_set_origport(struct tipc_msg *m, u32 p) { msg_set_word(m, 4, p); } static inline u16 msg_named_seqno(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_named_seqno(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_destport(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_destport(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline u32 msg_mc_netid(struct tipc_msg *m) { return msg_word(m, 5); } static inline void msg_set_mc_netid(struct tipc_msg *m, u32 p) { msg_set_word(m, 5, p); } static inline int msg_short(struct tipc_msg *m) { return msg_hdr_sz(m) == SHORT_H_SIZE; } static inline u32 msg_orignode(struct tipc_msg *m) { if (likely(msg_short(m))) return msg_prevnode(m); return msg_word(m, 6); } static inline void msg_set_orignode(struct tipc_msg *m, u32 a) { msg_set_word(m, 6, a); } static inline u32 msg_destnode(struct tipc_msg *m) { return msg_word(m, 7); } static inline void msg_set_destnode(struct tipc_msg *m, u32 a) { msg_set_word(m, 7, a); } static inline u32 msg_nametype(struct tipc_msg *m) { return msg_word(m, 8); } static inline void msg_set_nametype(struct tipc_msg *m, u32 n) { msg_set_word(m, 8, n); } static inline u32 msg_nameinst(struct tipc_msg *m) { return msg_word(m, 9); } static inline u32 msg_namelower(struct tipc_msg *m) { return msg_nameinst(m); } static inline void msg_set_namelower(struct tipc_msg *m, u32 n) { msg_set_word(m, 9, n); } static inline void msg_set_nameinst(struct tipc_msg *m, u32 n) { msg_set_namelower(m, n); } static inline u32 msg_nameupper(struct tipc_msg *m) { return msg_word(m, 10); } static inline void msg_set_nameupper(struct tipc_msg *m, u32 n) { msg_set_word(m, 10, n); } /* * Constants and routines used to read and write TIPC internal message headers */ /* * Connection management protocol message types */ #define CONN_PROBE 0 #define CONN_PROBE_REPLY 1 #define CONN_ACK 2 /* * Name distributor message types */ #define PUBLICATION 0 #define WITHDRAWAL 1 /* * Segmentation message types */ #define FIRST_FRAGMENT 0 #define FRAGMENT 1 #define LAST_FRAGMENT 2 /* * Link management protocol message types */ #define STATE_MSG 0 #define RESET_MSG 1 #define ACTIVATE_MSG 2 /* * Changeover tunnel message types */ #define SYNCH_MSG 0 #define FAILOVER_MSG 1 /* * Config protocol message types */ #define DSC_REQ_MSG 0 #define DSC_RESP_MSG 1 #define DSC_TRIAL_MSG 2 #define DSC_TRIAL_FAIL_MSG 3 /* * Group protocol message types */ #define GRP_JOIN_MSG 0 #define GRP_LEAVE_MSG 1 #define GRP_ADV_MSG 2 #define GRP_ACK_MSG 3 #define GRP_RECLAIM_MSG 4 #define GRP_REMIT_MSG 5 /* Crypto message types */ #define KEY_DISTR_MSG 0 /* * Word 1 */ static inline u32 msg_seq_gap(struct tipc_msg *m) { return msg_bits(m, 1, 16, 0x1fff); } static inline void msg_set_seq_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 16, 0x1fff, n); } static inline u32 msg_node_sig(struct tipc_msg *m) { return msg_bits(m, 1, 0, 0xffff); } static inline void msg_set_node_sig(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 0, 0xffff, n); } static inline u32 msg_node_capabilities(struct tipc_msg *m) { return msg_bits(m, 1, 15, 0x1fff); } static inline void msg_set_node_capabilities(struct tipc_msg *m, u32 n) { msg_set_bits(m, 1, 15, 0x1fff, n); } /* * Word 2 */ static inline u32 msg_dest_domain(struct tipc_msg *m) { return msg_word(m, 2); } static inline void msg_set_dest_domain(struct tipc_msg *m, u32 n) { msg_set_word(m, 2, n); } static inline void msg_set_bcgap_after(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 16, 0xffff, n); } static inline u32 msg_bcgap_to(struct tipc_msg *m) { return msg_bits(m, 2, 0, 0xffff); } static inline void msg_set_bcgap_to(struct tipc_msg *m, u32 n) { msg_set_bits(m, 2, 0, 0xffff, n); } /* * Word 4 */ static inline u32 msg_last_bcast(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline u32 msg_bc_snd_nxt(struct tipc_msg *m) { return msg_last_bcast(m) + 1; } static inline void msg_set_last_bcast(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u32 msg_nof_fragms(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_nof_fragms(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_fragm_no(struct tipc_msg *m) { return msg_bits(m, 4, 16, 0xffff); } static inline void msg_set_fragm_no(struct tipc_msg *m, u32 n) { msg_set_bits(m, 4, 16, 0xffff, n); } static inline u16 msg_next_sent(struct tipc_msg *m) { return msg_bits(m, 4, 0, 0xffff); } static inline void msg_set_next_sent(struct tipc_msg *m, u16 n) { msg_set_bits(m, 4, 0, 0xffff, n); } static inline u32 msg_bc_netid(struct tipc_msg *m) { return msg_word(m, 4); } static inline void msg_set_bc_netid(struct tipc_msg *m, u32 id) { msg_set_word(m, 4, id); } static inline u32 msg_link_selector(struct tipc_msg *m) { if (msg_user(m) == MSG_FRAGMENTER) m = (void *)msg_data(m); return msg_bits(m, 4, 0, 1); } /* * Word 5 */ static inline u16 msg_session(struct tipc_msg *m) { return msg_bits(m, 5, 16, 0xffff); } static inline void msg_set_session(struct tipc_msg *m, u16 n) { msg_set_bits(m, 5, 16, 0xffff, n); } static inline u32 msg_probe(struct tipc_msg *m) { return msg_bits(m, 5, 0, 1); } static inline void msg_set_probe(struct tipc_msg *m, u32 val) { msg_set_bits(m, 5, 0, 1, val); } static inline char msg_net_plane(struct tipc_msg *m) { return msg_bits(m, 5, 1, 7) + 'A'; } static inline void msg_set_net_plane(struct tipc_msg *m, char n) { msg_set_bits(m, 5, 1, 7, (n - 'A')); } static inline u32 msg_linkprio(struct tipc_msg *m) { return msg_bits(m, 5, 4, 0x1f); } static inline void msg_set_linkprio(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 4, 0x1f, n); } static inline u32 msg_bearer_id(struct tipc_msg *m) { return msg_bits(m, 5, 9, 0x7); } static inline void msg_set_bearer_id(struct tipc_msg *m, u32 n) { msg_set_bits(m, 5, 9, 0x7, n); } static inline u32 msg_redundant_link(struct tipc_msg *m) { return msg_bits(m, 5, 12, 0x1); } static inline void msg_set_redundant_link(struct tipc_msg *m, u32 r) { msg_set_bits(m, 5, 12, 0x1, r); } static inline u32 msg_peer_stopping(struct tipc_msg *m) { return msg_bits(m, 5, 13, 0x1); } static inline void msg_set_peer_stopping(struct tipc_msg *m, u32 s) { msg_set_bits(m, 5, 13, 0x1, s); } static inline bool msg_bc_ack_invalid(struct tipc_msg *m) { switch (msg_user(m)) { case BCAST_PROTOCOL: case NAME_DISTRIBUTOR: case LINK_PROTOCOL: return msg_bits(m, 5, 14, 0x1); default: return false; } } static inline void msg_set_bc_ack_invalid(struct tipc_msg *m, bool invalid) { msg_set_bits(m, 5, 14, 0x1, invalid); } static inline char *msg_media_addr(struct tipc_msg *m) { return (char *)&m->hdr[TIPC_MEDIA_INFO_OFFSET]; } static inline u32 msg_bc_gap(struct tipc_msg *m) { return msg_bits(m, 8, 0, 0x3ff); } static inline void msg_set_bc_gap(struct tipc_msg *m, u32 n) { msg_set_bits(m, 8, 0, 0x3ff, n); } /* * Word 9 */ static inline u16 msg_msgcnt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_msgcnt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u32 msg_conn_ack(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_conn_ack(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_adv_win(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_adv_win(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u32 msg_max_pkt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 16, 0xffff, (n / 4)); } static inline u32 msg_link_tolerance(struct tipc_msg *m) { return msg_bits(m, 9, 0, 0xffff); } static inline void msg_set_link_tolerance(struct tipc_msg *m, u32 n) { msg_set_bits(m, 9, 0, 0xffff, n); } static inline u16 msg_grp_bc_syncpt(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_syncpt(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_bc_acked(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_bc_acked(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } static inline u16 msg_grp_remitted(struct tipc_msg *m) { return msg_bits(m, 9, 16, 0xffff); } static inline void msg_set_grp_remitted(struct tipc_msg *m, u16 n) { msg_set_bits(m, 9, 16, 0xffff, n); } /* Word 10 */ static inline u16 msg_grp_evt(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x3); } static inline void msg_set_grp_evt(struct tipc_msg *m, int n) { msg_set_bits(m, 10, 0, 0x3, n); } static inline u16 msg_grp_bc_ack_req(struct tipc_msg *m) { return msg_bits(m, 10, 0, 0x1); } static inline void msg_set_grp_bc_ack_req(struct tipc_msg *m, bool n) { msg_set_bits(m, 10, 0, 0x1, n); } static inline u16 msg_grp_bc_seqno(struct tipc_msg *m) { return msg_bits(m, 10, 16, 0xffff); } static inline void msg_set_grp_bc_seqno(struct tipc_msg *m, u32 n) { msg_set_bits(m, 10, 16, 0xffff, n); } static inline bool msg_peer_link_is_up(struct tipc_msg *m) { if (likely(msg_user(m) != LINK_PROTOCOL)) return true; if (msg_type(m) == STATE_MSG) return true; return false; } static inline bool msg_peer_node_is_up(struct tipc_msg *m) { if (msg_peer_link_is_up(m)) return true; return msg_redundant_link(m); } static inline bool msg_is_reset(struct tipc_msg *hdr) { return (msg_user(hdr) == LINK_PROTOCOL) && (msg_type(hdr) == RESET_MSG); } /* Word 13 */ static inline void msg_set_peer_net_hash(struct tipc_msg *m, u32 n) { msg_set_word(m, 13, n); } static inline u32 msg_peer_net_hash(struct tipc_msg *m) { return msg_word(m, 13); } /* Word 14 */ static inline u32 msg_sugg_node_addr(struct tipc_msg *m) { return msg_word(m, 14); } static inline void msg_set_sugg_node_addr(struct tipc_msg *m, u32 n) { msg_set_word(m, 14, n); } static inline void msg_set_node_id(struct tipc_msg *hdr, u8 *id) { memcpy(msg_data(hdr), id, 16); } static inline u8 *msg_node_id(struct tipc_msg *hdr) { return (u8 *)msg_data(hdr); } struct sk_buff *tipc_buf_acquire(u32 size, gfp_t gfp); bool tipc_msg_validate(struct sk_buff **_skb); bool tipc_msg_reverse(u32 own_addr, struct sk_buff **skb, int err); void tipc_skb_reject(struct net *net, int err, struct sk_buff *skb, struct sk_buff_head *xmitq); void tipc_msg_init(u32 own_addr, struct tipc_msg *m, u32 user, u32 type, u32 hsize, u32 destnode); struct sk_buff *tipc_msg_create(uint user, uint type, uint hdr_sz, uint data_sz, u32 dnode, u32 onode, u32 dport, u32 oport, int errcode); int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf); bool tipc_msg_try_bundle(struct sk_buff *tskb, struct sk_buff **skb, u32 mss, u32 dnode, bool *new_bundle); bool tipc_msg_extract(struct sk_buff *skb, struct sk_buff **iskb, int *pos); int tipc_msg_fragment(struct sk_buff *skb, const struct tipc_msg *hdr, int pktmax, struct sk_buff_head *frags); int tipc_msg_build(struct tipc_msg *mhdr, struct msghdr *m, int offset, int dsz, int mtu, struct sk_buff_head *list); int tipc_msg_append(struct tipc_msg *hdr, struct msghdr *m, int dlen, int mss, struct sk_buff_head *txq); bool tipc_msg_lookup_dest(struct net *net, struct sk_buff *skb, int *err); bool tipc_msg_assemble(struct sk_buff_head *list); bool tipc_msg_reassemble(struct sk_buff_head *list, struct sk_buff_head *rcvq); bool tipc_msg_pskb_copy(u32 dst, struct sk_buff_head *msg, struct sk_buff_head *cpy); bool __tipc_skb_queue_sorted(struct sk_buff_head *list, u16 seqno, struct sk_buff *skb); bool tipc_msg_skb_clone(struct sk_buff_head *msg, struct sk_buff_head *cpy); static inline u16 buf_seqno(struct sk_buff *skb) { return msg_seqno(buf_msg(skb)); } static inline int buf_roundup_len(struct sk_buff *skb) { return (skb->len / 1024 + 1) * 1024; } /* tipc_skb_peek(): peek and reserve first buffer in list * @list: list to be peeked in * Returns pointer to first buffer in list, if any */ static inline struct sk_buff *tipc_skb_peek(struct sk_buff_head *list, spinlock_t *lock) { struct sk_buff *skb; spin_lock_bh(lock); skb = skb_peek(list); if (skb) skb_get(skb); spin_unlock_bh(lock); return skb; } /* tipc_skb_peek_port(): find a destination port, ignoring all destinations * up to and including 'filter'. * Note: ignoring previously tried destinations minimizes the risk of * contention on the socket lock * @list: list to be peeked in * @filter: last destination to be ignored from search * Returns a destination port number, of applicable. */ static inline u32 tipc_skb_peek_port(struct sk_buff_head *list, u32 filter) { struct sk_buff *skb; u32 dport = 0; bool ignore = true; spin_lock_bh(&list->lock); skb_queue_walk(list, skb) { dport = msg_destport(buf_msg(skb)); if (!filter || skb_queue_is_last(list, skb)) break; if (dport == filter) ignore = false; else if (!ignore) break; } spin_unlock_bh(&list->lock); return dport; } /* tipc_skb_dequeue(): unlink first buffer with dest 'dport' from list * @list: list to be unlinked from * @dport: selection criteria for buffer to unlink */ static inline struct sk_buff *tipc_skb_dequeue(struct sk_buff_head *list, u32 dport) { struct sk_buff *_skb, *tmp, *skb = NULL; spin_lock_bh(&list->lock); skb_queue_walk_safe(list, _skb, tmp) { if (msg_destport(buf_msg(_skb)) == dport) { __skb_unlink(_skb, list); skb = _skb; break; } } spin_unlock_bh(&list->lock); return skb; } /* tipc_skb_queue_splice_tail - append an skb list to lock protected list * @list: the new list to append. Not lock protected * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail(struct sk_buff_head *list, struct sk_buff_head *head) { spin_lock_bh(&head->lock); skb_queue_splice_tail(list, head); spin_unlock_bh(&head->lock); } /* tipc_skb_queue_splice_tail_init - merge two lock protected skb lists * @list: the new list to add. Lock protected. Will be reinitialized * @head: target list. Lock protected. */ static inline void tipc_skb_queue_splice_tail_init(struct sk_buff_head *list, struct sk_buff_head *head) { struct sk_buff_head tmp; __skb_queue_head_init(&tmp); spin_lock_bh(&list->lock); skb_queue_splice_tail_init(list, &tmp); spin_unlock_bh(&list->lock); tipc_skb_queue_splice_tail(&tmp, head); } /* __tipc_skb_dequeue() - dequeue the head skb according to expected seqno * @list: list to be dequeued from * @seqno: seqno of the expected msg * * returns skb dequeued from the list if its seqno is less than or equal to * the expected one, otherwise the skb is still hold * * Note: must be used with appropriate locks held only */ static inline struct sk_buff *__tipc_skb_dequeue(struct sk_buff_head *list, u16 seqno) { struct sk_buff *skb = skb_peek(list); if (skb && less_eq(buf_seqno(skb), seqno)) { __skb_unlink(skb, list); return skb; } return NULL; } #endif
42 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 /* SPDX-License-Identifier: GPL-2.0 */ /* * linux/include/linux/nfs_fs.h * * Copyright (C) 1992 Rick Sladkey * * OS-specific nfs filesystem definitions and declarations */ #ifndef _LINUX_NFS_FS_H #define _LINUX_NFS_FS_H #include <uapi/linux/nfs_fs.h> /* * Enable dprintk() debugging support for nfs client. */ #ifdef CONFIG_NFS_DEBUG # define NFS_DEBUG #endif #include <linux/in.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/rbtree.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/wait.h> #include <linux/sunrpc/debug.h> #include <linux/sunrpc/auth.h> #include <linux/sunrpc/clnt.h> #ifdef CONFIG_NFS_FSCACHE #include <linux/netfs.h> #endif #include <linux/nfs.h> #include <linux/nfs2.h> #include <linux/nfs3.h> #include <linux/nfs4.h> #include <linux/nfs_xdr.h> #include <linux/nfs_fs_sb.h> #include <linux/mempool.h> /* * These are the default for number of transports to different server IPs */ #define NFS_MAX_TRANSPORTS 16 /* * Size of the NFS directory verifier */ #define NFS_DIR_VERIFIER_SIZE 2 /* * NFSv3/v4 Access mode cache entry */ struct nfs_access_entry { struct rb_node rb_node; struct list_head lru; kuid_t fsuid; kgid_t fsgid; struct group_info *group_info; u64 timestamp; __u32 mask; struct rcu_head rcu_head; }; struct nfs_lock_context { refcount_t count; struct list_head list; struct nfs_open_context *open_context; fl_owner_t lockowner; atomic_t io_count; struct rcu_head rcu_head; }; struct nfs_file_localio { struct nfsd_file __rcu *ro_file; struct nfsd_file __rcu *rw_file; struct list_head list; void __rcu *nfs_uuid; /* opaque pointer to 'nfs_uuid_t' */ }; static inline void nfs_localio_file_init(struct nfs_file_localio *nfl) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) nfl->ro_file = NULL; nfl->rw_file = NULL; INIT_LIST_HEAD(&nfl->list); nfl->nfs_uuid = NULL; #endif } struct nfs4_state; struct nfs_open_context { struct nfs_lock_context lock_context; fl_owner_t flock_owner; struct dentry *dentry; const struct cred *cred; struct rpc_cred __rcu *ll_cred; /* low-level cred - use to check for expiry */ struct nfs4_state *state; fmode_t mode; int error; unsigned long flags; #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) struct nfs4_threshold *mdsthreshold; struct list_head list; struct rcu_head rcu_head; struct nfs_file_localio nfl; }; struct nfs_open_dir_context { struct list_head list; atomic_t cache_hits; atomic_t cache_misses; unsigned long attr_gencount; __be32 verf[NFS_DIR_VERIFIER_SIZE]; __u64 dir_cookie; __u64 last_cookie; pgoff_t page_index; unsigned int dtsize; bool force_clear; bool eof; struct rcu_head rcu_head; }; /* * NFSv4 delegation */ struct nfs_delegation; struct posix_acl; struct nfs4_xattr_cache; /* * nfs fs inode data in memory */ struct nfs_inode { /* * The 64bit 'inode number' */ __u64 fileid; /* * NFS file handle */ struct nfs_fh fh; /* * Various flags */ unsigned long flags; /* atomic bit ops */ unsigned long cache_validity; /* bit mask */ /* * NFS Attributes not included in struct inode */ struct timespec64 btime; /* * read_cache_jiffies is when we started read-caching this inode. * attrtimeo is for how long the cached information is assumed * to be valid. A successful attribute revalidation doubles * attrtimeo (up to acregmax/acdirmax), a failure resets it to * acregmin/acdirmin. * * We need to revalidate the cached attrs for this inode if * * jiffies - read_cache_jiffies >= attrtimeo * * Please note the comparison is greater than or equal * so that zero timeout values can be specified. */ unsigned long read_cache_jiffies; unsigned long attrtimeo; unsigned long attrtimeo_timestamp; unsigned long attr_gencount; struct rb_root access_cache; struct list_head access_cache_entry_lru; struct list_head access_cache_inode_lru; union { /* Directory */ struct { /* "Generation counter" for the attribute cache. * This is bumped whenever we update the metadata * on the server. */ unsigned long cache_change_attribute; /* * This is the cookie verifier used for NFSv3 readdir * operations */ __be32 cookieverf[NFS_DIR_VERIFIER_SIZE]; /* Readers: in-flight sillydelete RPC calls */ /* Writers: rmdir */ struct rw_semaphore rmdir_sem; }; /* Regular file */ struct { atomic_long_t nrequests; atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; }; /* Open contexts for shared mmap writes */ struct list_head open_files; /* Keep track of out-of-order replies. * The ooo array contains start/end pairs of * numbers from the changeid sequence when * the inode's iversion has been updated. * It also contains end/start pair (i.e. reverse order) * of sections of the changeid sequence that have * been seen in replies from the server. * Normally these should match and when both * A:B and B:A are found in ooo, they are both removed. * And if a reply with A:B causes an iversion update * of A:B, then neither are added. * When a reply has pre_change that doesn't match * iversion, then the changeid pair and any consequent * change in iversion ARE added. Later replies * might fill in the gaps, or possibly a gap is caused * by a change from another client. * When a file or directory is opened, if the ooo table * is not empty, then we assume the gaps were due to * another client and we invalidate the cached data. * * We can only track a limited number of concurrent gaps. * Currently that limit is 16. * We allocate the table on demand. If there is insufficient * memory, then we probably cannot cache the file anyway * so there is no loss. */ struct { int cnt; struct { u64 start, end; } gap[16]; } *ooo; #if IS_ENABLED(CONFIG_NFS_V4) struct nfs4_cached_acl *nfs4_acl; /* NFSv4 state */ struct list_head open_states; struct nfs_delegation __rcu *delegation; struct rw_semaphore rwsem; /* pNFS layout information */ struct pnfs_layout_hdr *layout; #endif /* CONFIG_NFS_V4*/ /* how many bytes have been written/read and how many bytes queued up */ __u64 write_io; __u64 read_io; #ifdef CONFIG_NFS_V4_2 struct nfs4_xattr_cache *xattr_cache; #endif union { struct inode vfs_inode; #ifdef CONFIG_NFS_FSCACHE struct netfs_inode netfs; /* netfs context and VFS inode */ #endif }; }; struct nfs4_copy_state { struct list_head copies; struct list_head src_copies; nfs4_stateid stateid; struct completion completion; uint64_t count; struct nfs_writeverf verf; int error; int flags; struct nfs4_state *parent_src_state; struct nfs4_state *parent_dst_state; }; /* * Access bit flags */ #define NFS_ACCESS_READ 0x0001 #define NFS_ACCESS_LOOKUP 0x0002 #define NFS_ACCESS_MODIFY 0x0004 #define NFS_ACCESS_EXTEND 0x0008 #define NFS_ACCESS_DELETE 0x0010 #define NFS_ACCESS_EXECUTE 0x0020 #define NFS_ACCESS_XAREAD 0x0040 #define NFS_ACCESS_XAWRITE 0x0080 #define NFS_ACCESS_XALIST 0x0100 /* * Cache validity bit flags */ #define NFS_INO_INVALID_DATA BIT(1) /* cached data is invalid */ #define NFS_INO_INVALID_ATIME BIT(2) /* cached atime is invalid */ #define NFS_INO_INVALID_ACCESS BIT(3) /* cached access cred invalid */ #define NFS_INO_INVALID_ACL BIT(4) /* cached acls are invalid */ #define NFS_INO_REVAL_FORCED BIT(6) /* force revalidation ignoring a delegation */ #define NFS_INO_INVALID_LABEL BIT(7) /* cached label is invalid */ #define NFS_INO_INVALID_CHANGE BIT(8) /* cached change is invalid */ #define NFS_INO_INVALID_CTIME BIT(9) /* cached ctime is invalid */ #define NFS_INO_INVALID_MTIME BIT(10) /* cached mtime is invalid */ #define NFS_INO_INVALID_SIZE BIT(11) /* cached size is invalid */ #define NFS_INO_INVALID_OTHER BIT(12) /* other attrs are invalid */ #define NFS_INO_DATA_INVAL_DEFER \ BIT(13) /* Deferred cache invalidation */ #define NFS_INO_INVALID_BLOCKS BIT(14) /* cached blocks are invalid */ #define NFS_INO_INVALID_XATTR BIT(15) /* xattrs are invalid */ #define NFS_INO_INVALID_NLINK BIT(16) /* cached nlinks is invalid */ #define NFS_INO_INVALID_MODE BIT(17) /* cached mode is invalid */ #define NFS_INO_INVALID_BTIME BIT(18) /* cached btime is invalid */ #define NFS_INO_INVALID_ATTR (NFS_INO_INVALID_CHANGE \ | NFS_INO_INVALID_CTIME \ | NFS_INO_INVALID_MTIME \ | NFS_INO_INVALID_BTIME \ | NFS_INO_INVALID_SIZE \ | NFS_INO_INVALID_NLINK \ | NFS_INO_INVALID_MODE \ | NFS_INO_INVALID_OTHER) /* inode metadata is invalid */ /* * Bit offsets in flags field */ #define NFS_INO_STALE (1) /* possible stale inode */ #define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */ #define NFS_INO_INVALIDATING (3) /* inode is being invalidated */ #define NFS_INO_PRESERVE_UNLINKED (4) /* preserve file if removed while open */ #define NFS_INO_LAYOUTCOMMIT (9) /* layoutcommit required */ #define NFS_INO_LAYOUTCOMMITTING (10) /* layoutcommit inflight */ #define NFS_INO_LAYOUTSTATS (11) /* layoutstats inflight */ #define NFS_INO_ODIRECT (12) /* I/O setting is O_DIRECT */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { return container_of(inode, struct nfs_inode, vfs_inode); } static inline struct nfs_server *NFS_SB(const struct super_block *s) { return (struct nfs_server *)(s->s_fs_info); } static inline struct nfs_fh *NFS_FH(const struct inode *inode) { return &NFS_I(inode)->fh; } static inline struct nfs_server *NFS_SERVER(const struct inode *inode) { return NFS_SB(inode->i_sb); } static inline struct rpc_clnt *NFS_CLIENT(const struct inode *inode) { return NFS_SERVER(inode)->client; } static inline const struct nfs_rpc_ops *NFS_PROTO(const struct inode *inode) { return NFS_SERVER(inode)->nfs_client->rpc_ops; } static inline unsigned NFS_MINATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); return S_ISDIR(inode->i_mode) ? nfss->acdirmin : nfss->acregmin; } static inline unsigned NFS_MAXATTRTIMEO(const struct inode *inode) { struct nfs_server *nfss = NFS_SERVER(inode); return S_ISDIR(inode->i_mode) ? nfss->acdirmax : nfss->acregmax; } static inline int NFS_STALE(const struct inode *inode) { return test_bit(NFS_INO_STALE, &NFS_I(inode)->flags); } static inline __u64 NFS_FILEID(const struct inode *inode) { return NFS_I(inode)->fileid; } static inline void set_nfs_fileid(struct inode *inode, __u64 fileid) { NFS_I(inode)->fileid = fileid; } static inline void nfs_mark_for_revalidate(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_SIZE; if (S_ISDIR(inode->i_mode)) nfsi->cache_validity |= NFS_INO_INVALID_DATA; spin_unlock(&inode->i_lock); } static inline int nfs_server_capable(const struct inode *inode, int cap) { return NFS_SERVER(inode)->caps & cap; } /** * nfs_save_change_attribute - Returns the inode attribute change cookie * @dir - pointer to parent directory inode * The "cache change attribute" is updated when we need to revalidate * our dentry cache after a directory was seen to change on the server. */ static inline unsigned long nfs_save_change_attribute(struct inode *dir) { return NFS_I(dir)->cache_change_attribute; } /* * linux/fs/nfs/inode.c */ extern int nfs_sync_mapping(struct address_space *mapping); extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping); extern void nfs_zap_caches(struct inode *); extern void nfs_set_inode_stale(struct inode *inode); extern void nfs_invalidate_atime(struct inode *); extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *, struct nfs_fattr *); struct inode *nfs_ilookup(struct super_block *sb, struct nfs_fattr *, struct nfs_fh *); extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void nfs_access_add_cache(struct inode *, struct nfs_access_entry *, const struct cred *); extern void nfs_access_set_mask(struct nfs_access_entry *, u32); extern int nfs_permission(struct mnt_idmap *, struct inode *, int); extern int nfs_open(struct inode *, struct file *); extern int nfs_attribute_cache_expired(struct inode *inode); extern int nfs_revalidate_inode(struct inode *inode, unsigned long flags); extern int __nfs_revalidate_inode(struct nfs_server *, struct inode *); extern int nfs_clear_invalid_mapping(struct address_space *mapping); extern bool nfs_mapping_need_revalidate_inode(struct inode *inode); extern int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping); extern int nfs_revalidate_mapping_rcu(struct inode *inode); extern int nfs_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *); extern void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode); extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode, struct file *filp); extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx); extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx); extern void nfs_file_clear_open_context(struct file *flip); extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx); extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); extern void nfs_fattr_set_barrier(struct nfs_fattr *fattr); extern unsigned long nfs_inc_attr_generation_counter(void); extern struct nfs_fattr *nfs_alloc_fattr(void); extern struct nfs_fattr *nfs_alloc_fattr_with_label(struct nfs_server *server); static inline void nfs4_label_free(struct nfs4_label *label) { #ifdef CONFIG_NFS_V4_SECURITY_LABEL if (label) { kfree(label->label); kfree(label); } #endif } static inline void nfs_free_fattr(const struct nfs_fattr *fattr) { if (fattr) nfs4_label_free(fattr->label); kfree(fattr); } extern struct nfs_fh *nfs_alloc_fhandle(void); static inline void nfs_free_fhandle(const struct nfs_fh *fh) { kfree(fh); } #ifdef NFS_DEBUG extern u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh); static inline u32 nfs_display_fhandle_hash(const struct nfs_fh *fh) { return _nfs_display_fhandle_hash(fh); } extern void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption); #define nfs_display_fhandle(fh, caption) \ do { \ if (unlikely(nfs_debug & NFSDBG_FACILITY)) \ _nfs_display_fhandle(fh, caption); \ } while (0) #else static inline u32 nfs_display_fhandle_hash(const struct nfs_fh *fh) { return 0; } static inline void nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) { } #endif /* * linux/fs/nfs/nfsroot.c */ extern int nfs_root_data(char **root_device, char **root_data); /*__init*/ /* linux/net/ipv4/ipconfig.c: trims ip addr off front of name, too. */ extern __be32 root_nfs_parse_addr(char *name); /*__init*/ /* * linux/fs/nfs/file.c */ extern const struct file_operations nfs_file_operations; #if IS_ENABLED(CONFIG_NFS_V4) extern const struct file_operations nfs4_file_operations; #endif /* CONFIG_NFS_V4 */ extern const struct address_space_operations nfs_file_aops; extern const struct address_space_operations nfs_dir_aops; static inline struct nfs_open_context *nfs_file_open_context(struct file *filp) { return filp->private_data; } static inline const struct cred *nfs_file_cred(struct file *file) { if (file != NULL) { struct nfs_open_context *ctx = nfs_file_open_context(file); if (ctx) return ctx->cred; } return NULL; } /* * linux/fs/nfs/direct.c */ int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter); ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, bool swap); ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, bool swap); /* * linux/fs/nfs/dir.c */ extern const struct file_operations nfs_dir_operations; extern const struct dentry_operations nfs_dentry_operations; extern void nfs_force_lookup_revalidate(struct inode *dir); extern void nfs_set_verifier(struct dentry * dentry, unsigned long verf); #if IS_ENABLED(CONFIG_NFS_V4) extern void nfs_clear_verifier_delegated(struct inode *inode); #endif /* IS_ENABLED(CONFIG_NFS_V4) */ extern struct dentry *nfs_add_or_obtain(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_instantiate(struct dentry *dentry, struct nfs_fh *fh, struct nfs_fattr *fattr); extern int nfs_may_open(struct inode *inode, const struct cred *cred, int openflags); extern void nfs_access_zap_cache(struct inode *inode); extern int nfs_access_get_cached(struct inode *inode, const struct cred *cred, u32 *mask, bool may_block); extern int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode); /* * linux/fs/nfs/symlink.c */ extern const struct inode_operations nfs_symlink_inode_operations; /* * linux/fs/nfs/sysctl.c */ #ifdef CONFIG_SYSCTL extern int nfs_register_sysctl(void); extern void nfs_unregister_sysctl(void); #else #define nfs_register_sysctl() 0 #define nfs_unregister_sysctl() do { } while(0) #endif /* * linux/fs/nfs/namespace.c */ extern const struct inode_operations nfs_mountpoint_inode_operations; extern const struct inode_operations nfs_referral_inode_operations; extern int nfs_mountpoint_expiry_timeout; extern void nfs_release_automount_timer(void); /* * linux/fs/nfs/unlink.c */ extern void nfs_complete_unlink(struct dentry *dentry, struct inode *); /* * linux/fs/nfs/write.c */ extern int nfs_congestion_kb; extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct folio *folio); extern int nfs_update_folio(struct file *file, struct folio *folio, unsigned int offset, unsigned int count); /* * Try to write back everything synchronously (but check the * return value!) */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_folio(struct inode *inode, struct folio *folio); int nfs_wb_folio_cancel(struct inode *inode, struct folio *folio); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_commit_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_commit_data *data); void nfs_commit_begin(struct nfs_mds_commit_info *cinfo); bool nfs_commit_end(struct nfs_mds_commit_info *cinfo); static inline bool nfs_have_writebacks(const struct inode *inode) { if (S_ISREG(inode->i_mode)) return atomic_long_read(&NFS_I(inode)->nrequests) != 0; return false; } /* * linux/fs/nfs/read.c */ int nfs_read_folio(struct file *, struct folio *); void nfs_readahead(struct readahead_control *); /* * inline functions */ static inline loff_t nfs_size_to_loff_t(__u64 size) { return min_t(u64, size, OFFSET_MAX); } static inline ino_t nfs_fileid_to_ino_t(u64 fileid) { ino_t ino = (ino_t) fileid; if (sizeof(ino_t) < sizeof(u64)) ino ^= fileid >> (sizeof(u64)-sizeof(ino_t)) * 8; return ino; } static inline void nfs_ooo_clear(struct nfs_inode *nfsi) { nfsi->cache_validity &= ~NFS_INO_DATA_INVAL_DEFER; kfree(nfsi->ooo); nfsi->ooo = NULL; } static inline bool nfs_ooo_test(struct nfs_inode *nfsi) { return (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER) || (nfsi->ooo && nfsi->ooo->cnt > 0); } #define NFS_JUKEBOX_RETRY_TIME (5 * HZ) /* We need to block new opens while a file is being unlinked. * If it is opened *before* we decide to unlink, we will silly-rename * instead. If it is opened *after*, then we need to create or will fail. * If we allow the two to race, we could end up with a file that is open * but deleted on the server resulting in ESTALE. * So use ->d_fsdata to record when the unlink is happening * and block dentry revalidation while it is set. */ #define NFS_FSDATA_BLOCKED ((void*)1) # undef ifdebug # ifdef NFS_DEBUG # define ifdebug(fac) if (unlikely(nfs_debug & NFSDBG_##fac)) # define NFS_IFDEBUG(x) x # else # define ifdebug(fac) if (0) # define NFS_IFDEBUG(x) # endif #endif
722 108 709 707 709 186 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kvm_host.h> #include <linux/kvm_irqfd.h> #include <asm/irq_remapping.h> #include <asm/cpu.h> #include "lapic.h" #include "irq.h" #include "posted_intr.h" #include "trace.h" #include "vmx.h" #include "tdx.h" /* * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. * The vCPUs posted interrupt descriptor is updated at the same time to set its * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices * wake the target vCPUs. vCPUs are removed from the list and the notification * vector is reset when the vCPU is scheduled in. */ static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); /* * Protect the per-CPU list with a per-CPU spinlock to handle task migration. * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the * ->sched_in() path will need to take the vCPU off the list of the _previous_ * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will * occur if a wakeup IRQ arrives and attempts to acquire the lock. */ static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); #define PI_LOCK_SCHED_OUT SINGLE_DEPTH_NESTING static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) { return &(to_vt(vcpu)->pi_desc); } static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) { /* * PID.ON can be set at any time by a different vCPU or by hardware, * e.g. a device. PID.control must be written atomically, and the * update must be retried with a fresh snapshot an ON change causes * the cmpxchg to fail. */ if (!try_cmpxchg64(&pi_desc->control, pold, new)) return -EBUSY; return 0; } void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; unsigned long flags; unsigned int dest; /* * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and * PI.SN up-to-date even if there is no assigned device or if APICv is * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. */ if (!enable_apicv || !lapic_in_kernel(vcpu)) return; /* * If the vCPU wasn't on the wakeup list and wasn't migrated, then the * full update can be skipped as neither the vector nor the destination * needs to be changed. Clear SN even if there is no assigned device, * again for simplicity. */ if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { if (pi_test_and_clear_sn(pi_desc)) goto after_clear_sn; return; } local_irq_save(flags); /* * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup * list of the _previous_ pCPU, which will not be the same as the * current pCPU if the task was migrated. */ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu); /* * In addition to taking the wakeup lock for the regular/IRQ * context, tell lockdep it is being taken for the "sched out" * context as well. vCPU loads happens in task context, and * this is taking the lock of the *previous* CPU, i.e. can race * with both the scheduler and the wakeup handler. */ raw_spin_lock(spinlock); spin_acquire(&spinlock->dep_map, PI_LOCK_SCHED_OUT, 0, _RET_IP_); list_del(&vt->pi_wakeup_list); spin_release(&spinlock->dep_map, _RET_IP_); raw_spin_unlock(spinlock); } dest = cpu_physical_id(cpu); if (!x2apic_mode) dest = (dest << 8) & 0xFF00; old.control = READ_ONCE(pi_desc->control); do { new.control = old.control; /* * Clear SN (as above) and refresh the destination APIC ID to * handle task migration (@cpu != vcpu->cpu). */ new.ndst = dest; __pi_clear_sn(&new); /* * Restore the notification vector; in the blocking case, the * descriptor was modified on "put" to use the wakeup vector. */ new.nv = POSTED_INTR_VECTOR; } while (pi_try_set_control(pi_desc, &old.control, new.control)); local_irq_restore(flags); after_clear_sn: /* * Clear SN before reading the bitmap. The VT-d firmware * writes the bitmap and reads SN atomically (5.2.3 in the * spec), so it doesn't really have a memory barrier that * pairs with this, but we cannot do that and we need one. */ smp_mb__after_atomic(); if (!pi_is_pir_empty(pi_desc)) pi_set_on(pi_desc); } static bool vmx_can_use_vtd_pi(struct kvm *kvm) { /* * Note, reading the number of possible bypass IRQs can race with a * bypass IRQ being attached to the VM. vmx_pi_start_bypass() ensures * blockng vCPUs will see an elevated count or get KVM_REQ_UNBLOCK. */ return irqchip_in_kernel(kvm) && kvm_arch_has_irq_bypass() && READ_ONCE(kvm->arch.nr_possible_bypass_irqs); } /* * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set * WAKEUP as the notification vector in the PI descriptor. */ static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); struct vcpu_vt *vt = to_vt(vcpu); struct pi_desc old, new; lockdep_assert_irqs_disabled(); /* * Acquire the wakeup lock using the "sched out" context to workaround * a lockdep false positive. When this is called, schedule() holds * various per-CPU scheduler locks. When the wakeup handler runs, it * holds this CPU's wakeup lock while calling try_to_wake_up(), which * can eventually take the aforementioned scheduler locks, which causes * lockdep to assume there is deadlock. * * Deadlock can't actually occur because IRQs are disabled for the * entirety of the sched_out critical section, i.e. the wakeup handler * can't run while the scheduler locks are held. */ raw_spin_lock_nested(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu), PI_LOCK_SCHED_OUT); list_add_tail(&vt->pi_wakeup_list, &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); WARN(pi_test_sn(pi_desc), "PI descriptor SN field set before blocking"); old.control = READ_ONCE(pi_desc->control); do { /* set 'NV' to 'wakeup vector' */ new.control = old.control; new.nv = POSTED_INTR_WAKEUP_VECTOR; } while (pi_try_set_control(pi_desc, &old.control, new.control)); /* * Send a wakeup IPI to this CPU if an interrupt may have been posted * before the notification vector was updated, in which case the IRQ * will arrive on the non-wakeup vector. An IPI is needed as calling * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not * enabled until it is safe to call try_to_wake_up() on the task being * scheduled out). */ if (pi_test_on(&new)) __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); } static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) { /* * The default posted interrupt vector does nothing when * invoked outside guest mode. Return whether a blocked vCPU * can be the target of posted interrupts, as is the case when * using either IPI virtualization or VT-d PI, so that the * notification vector is switched to the one that calls * back to the pi_wakeup_handler() function. */ return (vmx_can_use_ipiv(vcpu) && !is_td_vcpu(vcpu)) || vmx_can_use_vtd_pi(vcpu->kvm); } void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); if (!vmx_needs_pi_wakeup(vcpu)) return; /* * If the vCPU is blocking with IRQs enabled and ISN'T being preempted, * enable the wakeup handler so that notification IRQ wakes the vCPU as * expected. There is no need to enable the wakeup handler if the vCPU * is preempted between setting its wait state and manually scheduling * out, as the task is still runnable, i.e. doesn't need a wake event * from KVM to be scheduled in. * * If the wakeup handler isn't being enabled, Suppress Notifications as * the cost of propagating PIR.IRR to PID.ON is negligible compared to * the cost of a spurious IRQ, and vCPU put/load is a slow path. */ if (!vcpu->preempted && kvm_vcpu_is_blocking(vcpu) && ((is_td_vcpu(vcpu) && tdx_interrupt_allowed(vcpu)) || (!is_td_vcpu(vcpu) && !vmx_interrupt_blocked(vcpu)))) pi_enable_wakeup_handler(vcpu); else pi_set_sn(pi_desc); } /* * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. */ void pi_wakeup_handler(void) { int cpu = smp_processor_id(); struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); struct vcpu_vt *vt; raw_spin_lock(spinlock); list_for_each_entry(vt, wakeup_list, pi_wakeup_list) { if (pi_test_on(&vt->pi_desc)) kvm_vcpu_wake_up(vt_to_vcpu(vt)); } raw_spin_unlock(spinlock); } void __init pi_init_cpu(int cpu) { INIT_LIST_HEAD(&per_cpu(wakeup_vcpus_on_cpu, cpu)); raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); } void pi_apicv_pre_state_restore(struct kvm_vcpu *vcpu) { struct pi_desc *pi = vcpu_to_pi_desc(vcpu); pi_clear_on(pi); memset(pi->pir, 0, sizeof(pi->pir)); } bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) { struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); return pi_test_on(pi_desc) || (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); } /* * Kick all vCPUs when the first possible bypass IRQ is attached to a VM, as * blocking vCPUs may scheduled out without reconfiguring PID.NV to the wakeup * vector, i.e. if the bypass IRQ came along after vmx_vcpu_pi_put(). */ void vmx_pi_start_bypass(struct kvm *kvm) { if (WARN_ON_ONCE(!vmx_can_use_vtd_pi(kvm))) return; kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); } int vmx_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, struct kvm_vcpu *vcpu, u32 vector) { if (vcpu) { struct intel_iommu_pi_data pi_data = { .pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)), .vector = vector, }; return irq_set_vcpu_affinity(host_irq, &pi_data); } else { return irq_set_vcpu_affinity(host_irq, NULL); } }
6 4 3 6 3 2 1 1 1 1 3 2 5 3 2 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2019 Facebook */ #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/btf.h> #include <linux/filter.h> #include <linux/slab.h> #include <linux/numa.h> #include <linux/seq_file.h> #include <linux/refcount.h> #include <linux/mutex.h> #include <linux/btf_ids.h> #include <linux/rcupdate_wait.h> #include <linux/poll.h> struct bpf_struct_ops_value { struct bpf_struct_ops_common_value common; char data[] ____cacheline_aligned_in_smp; }; #define MAX_TRAMP_IMAGE_PAGES 8 struct bpf_struct_ops_map { struct bpf_map map; const struct bpf_struct_ops_desc *st_ops_desc; /* protect map_update */ struct mutex lock; /* link has all the bpf_links that is populated * to the func ptr of the kernel's struct * (in kvalue.data). */ struct bpf_link **links; /* ksyms for bpf trampolines */ struct bpf_ksym **ksyms; u32 funcs_cnt; u32 image_pages_cnt; /* image_pages is an array of pages that has all the trampolines * that stores the func args before calling the bpf_prog. */ void *image_pages[MAX_TRAMP_IMAGE_PAGES]; /* The owner moduler's btf. */ struct btf *btf; /* uvalue->data stores the kernel struct * (e.g. tcp_congestion_ops) that is more useful * to userspace than the kvalue. For example, * the bpf_prog's id is stored instead of the kernel * address of a func ptr. */ struct bpf_struct_ops_value *uvalue; /* kvalue.data stores the actual kernel's struct * (e.g. tcp_congestion_ops) that will be * registered to the kernel subsystem. */ struct bpf_struct_ops_value kvalue; }; struct bpf_struct_ops_link { struct bpf_link link; struct bpf_map __rcu *map; wait_queue_head_t wait_hup; }; static DEFINE_MUTEX(update_mutex); #define VALUE_PREFIX "bpf_struct_ops_" #define VALUE_PREFIX_LEN (sizeof(VALUE_PREFIX) - 1) const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = { }; const struct bpf_prog_ops bpf_struct_ops_prog_ops = { #ifdef CONFIG_NET .test_run = bpf_struct_ops_test_run, #endif }; BTF_ID_LIST(st_ops_ids) BTF_ID(struct, module) BTF_ID(struct, bpf_struct_ops_common_value) enum { IDX_MODULE_ID, IDX_ST_OPS_COMMON_VALUE_ID, }; extern struct btf *btf_vmlinux; static bool is_valid_value_type(struct btf *btf, s32 value_id, const struct btf_type *type, const char *value_name) { const struct btf_type *common_value_type; const struct btf_member *member; const struct btf_type *vt, *mt; vt = btf_type_by_id(btf, value_id); if (btf_vlen(vt) != 2) { pr_warn("The number of %s's members should be 2, but we get %d\n", value_name, btf_vlen(vt)); return false; } member = btf_type_member(vt); mt = btf_type_by_id(btf, member->type); common_value_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_ST_OPS_COMMON_VALUE_ID]); if (mt != common_value_type) { pr_warn("The first member of %s should be bpf_struct_ops_common_value\n", value_name); return false; } member++; mt = btf_type_by_id(btf, member->type); if (mt != type) { pr_warn("The second member of %s should be %s\n", value_name, btf_name_by_offset(btf, type->name_off)); return false; } return true; } static void *bpf_struct_ops_image_alloc(void) { void *image; int err; err = bpf_jit_charge_modmem(PAGE_SIZE); if (err) return ERR_PTR(err); image = arch_alloc_bpf_trampoline(PAGE_SIZE); if (!image) { bpf_jit_uncharge_modmem(PAGE_SIZE); return ERR_PTR(-ENOMEM); } return image; } void bpf_struct_ops_image_free(void *image) { if (image) { arch_free_bpf_trampoline(image, PAGE_SIZE); bpf_jit_uncharge_modmem(PAGE_SIZE); } } #define MAYBE_NULL_SUFFIX "__nullable" #define REFCOUNTED_SUFFIX "__ref" /* Prepare argument info for every nullable argument of a member of a * struct_ops type. * * Initialize a struct bpf_struct_ops_arg_info according to type info of * the arguments of a stub function. (Check kCFI for more information about * stub functions.) * * Each member in the struct_ops type has a struct bpf_struct_ops_arg_info * to provide an array of struct bpf_ctx_arg_aux, which in turn provides * the information that used by the verifier to check the arguments of the * BPF struct_ops program assigned to the member. Here, we only care about * the arguments that are marked as __nullable. * * The array of struct bpf_ctx_arg_aux is eventually assigned to * prog->aux->ctx_arg_info of BPF struct_ops programs and passed to the * verifier. (See check_struct_ops_btf_id()) * * arg_info->info will be the list of struct bpf_ctx_arg_aux if success. If * fails, it will be kept untouched. */ static int prepare_arg_info(struct btf *btf, const char *st_ops_name, const char *member_name, const struct btf_type *func_proto, void *stub_func_addr, struct bpf_struct_ops_arg_info *arg_info) { const struct btf_type *stub_func_proto, *pointed_type; bool is_nullable = false, is_refcounted = false; const struct btf_param *stub_args, *args; struct bpf_ctx_arg_aux *info, *info_buf; u32 nargs, arg_no, info_cnt = 0; char ksym[KSYM_SYMBOL_LEN]; const char *stub_fname; const char *suffix; s32 stub_func_id; u32 arg_btf_id; int offset; stub_fname = kallsyms_lookup((unsigned long)stub_func_addr, NULL, NULL, NULL, ksym); if (!stub_fname) { pr_warn("Cannot find the stub function name for the %s in struct %s\n", member_name, st_ops_name); return -ENOENT; } stub_func_id = btf_find_by_name_kind(btf, stub_fname, BTF_KIND_FUNC); if (stub_func_id < 0) { pr_warn("Cannot find the stub function %s in btf\n", stub_fname); return -ENOENT; } stub_func_proto = btf_type_by_id(btf, stub_func_id); stub_func_proto = btf_type_by_id(btf, stub_func_proto->type); /* Check if the number of arguments of the stub function is the same * as the number of arguments of the function pointer. */ nargs = btf_type_vlen(func_proto); if (nargs != btf_type_vlen(stub_func_proto)) { pr_warn("the number of arguments of the stub function %s does not match the number of arguments of the member %s of struct %s\n", stub_fname, member_name, st_ops_name); return -EINVAL; } if (!nargs) return 0; args = btf_params(func_proto); stub_args = btf_params(stub_func_proto); info_buf = kcalloc(nargs, sizeof(*info_buf), GFP_KERNEL); if (!info_buf) return -ENOMEM; /* Prepare info for every nullable argument */ info = info_buf; for (arg_no = 0; arg_no < nargs; arg_no++) { /* Skip arguments that is not suffixed with * "__nullable or __ref". */ is_nullable = btf_param_match_suffix(btf, &stub_args[arg_no], MAYBE_NULL_SUFFIX); is_refcounted = btf_param_match_suffix(btf, &stub_args[arg_no], REFCOUNTED_SUFFIX); if (is_nullable) suffix = MAYBE_NULL_SUFFIX; else if (is_refcounted) suffix = REFCOUNTED_SUFFIX; else continue; /* Should be a pointer to struct */ pointed_type = btf_type_resolve_ptr(btf, args[arg_no].type, &arg_btf_id); if (!pointed_type || !btf_type_is_struct(pointed_type)) { pr_warn("stub function %s has %s tagging to an unsupported type\n", stub_fname, suffix); goto err_out; } offset = btf_ctx_arg_offset(btf, func_proto, arg_no); if (offset < 0) { pr_warn("stub function %s has an invalid trampoline ctx offset for arg#%u\n", stub_fname, arg_no); goto err_out; } if (args[arg_no].type != stub_args[arg_no].type) { pr_warn("arg#%u type in stub function %s does not match with its original func_proto\n", arg_no, stub_fname); goto err_out; } /* Fill the information of the new argument */ info->btf_id = arg_btf_id; info->btf = btf; info->offset = offset; if (is_nullable) { info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID | PTR_MAYBE_NULL; } else if (is_refcounted) { info->reg_type = PTR_TRUSTED | PTR_TO_BTF_ID; info->refcounted = true; } info++; info_cnt++; } if (info_cnt) { arg_info->info = info_buf; arg_info->cnt = info_cnt; } else { kfree(info_buf); } return 0; err_out: kfree(info_buf); return -EINVAL; } /* Clean up the arg_info in a struct bpf_struct_ops_desc. */ void bpf_struct_ops_desc_release(struct bpf_struct_ops_desc *st_ops_desc) { struct bpf_struct_ops_arg_info *arg_info; int i; arg_info = st_ops_desc->arg_info; for (i = 0; i < btf_type_vlen(st_ops_desc->type); i++) kfree(arg_info[i].info); kfree(arg_info); } static bool is_module_member(const struct btf *btf, u32 id) { const struct btf_type *t; t = btf_type_resolve_ptr(btf, id, NULL); if (!t) return false; if (!__btf_type_is_struct(t) && !btf_type_is_fwd(t)) return false; return !strcmp(btf_name_by_offset(btf, t->name_off), "module"); } int bpf_struct_ops_supported(const struct bpf_struct_ops *st_ops, u32 moff) { void *func_ptr = *(void **)(st_ops->cfi_stubs + moff); return func_ptr ? 0 : -ENOTSUPP; } int bpf_struct_ops_desc_init(struct bpf_struct_ops_desc *st_ops_desc, struct btf *btf, struct bpf_verifier_log *log) { struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; struct bpf_struct_ops_arg_info *arg_info; const struct btf_member *member; const struct btf_type *t; s32 type_id, value_id; char value_name[128]; const char *mname; int i, err; if (strlen(st_ops->name) + VALUE_PREFIX_LEN >= sizeof(value_name)) { pr_warn("struct_ops name %s is too long\n", st_ops->name); return -EINVAL; } sprintf(value_name, "%s%s", VALUE_PREFIX, st_ops->name); if (!st_ops->cfi_stubs) { pr_warn("struct_ops for %s has no cfi_stubs\n", st_ops->name); return -EINVAL; } type_id = btf_find_by_name_kind(btf, st_ops->name, BTF_KIND_STRUCT); if (type_id < 0) { pr_warn("Cannot find struct %s in %s\n", st_ops->name, btf_get_name(btf)); return -EINVAL; } t = btf_type_by_id(btf, type_id); if (btf_type_vlen(t) > BPF_STRUCT_OPS_MAX_NR_MEMBERS) { pr_warn("Cannot support #%u members in struct %s\n", btf_type_vlen(t), st_ops->name); return -EINVAL; } value_id = btf_find_by_name_kind(btf, value_name, BTF_KIND_STRUCT); if (value_id < 0) { pr_warn("Cannot find struct %s in %s\n", value_name, btf_get_name(btf)); return -EINVAL; } if (!is_valid_value_type(btf, value_id, t, value_name)) return -EINVAL; arg_info = kcalloc(btf_type_vlen(t), sizeof(*arg_info), GFP_KERNEL); if (!arg_info) return -ENOMEM; st_ops_desc->arg_info = arg_info; st_ops_desc->type = t; st_ops_desc->type_id = type_id; st_ops_desc->value_id = value_id; st_ops_desc->value_type = btf_type_by_id(btf, value_id); for_each_member(i, t, member) { const struct btf_type *func_proto, *ret_type; void **stub_func_addr; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(btf, member->name_off); if (!*mname) { pr_warn("anon member in struct %s is not supported\n", st_ops->name); err = -EOPNOTSUPP; goto errout; } if (__btf_member_bitfield_size(t, member)) { pr_warn("bit field member %s in struct %s is not supported\n", mname, st_ops->name); err = -EOPNOTSUPP; goto errout; } if (!st_ops_ids[IDX_MODULE_ID] && is_module_member(btf, member->type)) { pr_warn("'struct module' btf id not found. Is CONFIG_MODULES enabled? bpf_struct_ops '%s' needs module support.\n", st_ops->name); err = -EOPNOTSUPP; goto errout; } func_proto = btf_type_resolve_func_ptr(btf, member->type, NULL); /* The member is not a function pointer or * the function pointer is not supported. */ if (!func_proto || bpf_struct_ops_supported(st_ops, moff)) continue; if (func_proto->type) { ret_type = btf_type_resolve_ptr(btf, func_proto->type, NULL); if (ret_type && !__btf_type_is_struct(ret_type)) { pr_warn("func ptr %s in struct %s returns non-struct pointer, which is not supported\n", mname, st_ops->name); err = -EOPNOTSUPP; goto errout; } } if (btf_distill_func_proto(log, btf, func_proto, mname, &st_ops->func_models[i])) { pr_warn("Error in parsing func ptr %s in struct %s\n", mname, st_ops->name); err = -EINVAL; goto errout; } stub_func_addr = *(void **)(st_ops->cfi_stubs + moff); err = prepare_arg_info(btf, st_ops->name, mname, func_proto, stub_func_addr, arg_info + i); if (err) goto errout; } if (st_ops->init(btf)) { pr_warn("Error in init bpf_struct_ops %s\n", st_ops->name); err = -EINVAL; goto errout; } return 0; errout: bpf_struct_ops_desc_release(st_ops_desc); return err; } static int bpf_struct_ops_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { if (key && *(u32 *)key == 0) return -ENOENT; *(u32 *)next_key = 0; return 0; } int bpf_struct_ops_map_sys_lookup_elem(struct bpf_map *map, void *key, void *value) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; struct bpf_struct_ops_value *uvalue, *kvalue; enum bpf_struct_ops_state state; s64 refcnt; if (unlikely(*(u32 *)key != 0)) return -ENOENT; kvalue = &st_map->kvalue; /* Pair with smp_store_release() during map_update */ state = smp_load_acquire(&kvalue->common.state); if (state == BPF_STRUCT_OPS_STATE_INIT) { memset(value, 0, map->value_size); return 0; } /* No lock is needed. state and refcnt do not need * to be updated together under atomic context. */ uvalue = value; memcpy(uvalue, st_map->uvalue, map->value_size); uvalue->common.state = state; /* This value offers the user space a general estimate of how * many sockets are still utilizing this struct_ops for TCP * congestion control. The number might not be exact, but it * should sufficiently meet our present goals. */ refcnt = atomic64_read(&map->refcnt) - atomic64_read(&map->usercnt); refcount_set(&uvalue->common.refcnt, max_t(s64, refcnt, 0)); return 0; } static void *bpf_struct_ops_map_lookup_elem(struct bpf_map *map, void *key) { return ERR_PTR(-EINVAL); } static void bpf_struct_ops_map_put_progs(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->links[i]) break; bpf_link_put(st_map->links[i]); st_map->links[i] = NULL; } } static void bpf_struct_ops_map_free_image(struct bpf_struct_ops_map *st_map) { int i; for (i = 0; i < st_map->image_pages_cnt; i++) bpf_struct_ops_image_free(st_map->image_pages[i]); st_map->image_pages_cnt = 0; } static int check_zero_holes(const struct btf *btf, const struct btf_type *t, void *data) { const struct btf_member *member; u32 i, moff, msize, prev_mend = 0; const struct btf_type *mtype; for_each_member(i, t, member) { moff = __btf_member_bit_offset(t, member) / 8; if (moff > prev_mend && memchr_inv(data + prev_mend, 0, moff - prev_mend)) return -EINVAL; mtype = btf_type_by_id(btf, member->type); mtype = btf_resolve_size(btf, mtype, &msize); if (IS_ERR(mtype)) return PTR_ERR(mtype); prev_mend = moff + msize; } if (t->size > prev_mend && memchr_inv(data + prev_mend, 0, t->size - prev_mend)) return -EINVAL; return 0; } static void bpf_struct_ops_link_release(struct bpf_link *link) { } static void bpf_struct_ops_link_dealloc(struct bpf_link *link) { struct bpf_tramp_link *tlink = container_of(link, struct bpf_tramp_link, link); kfree(tlink); } const struct bpf_link_ops bpf_struct_ops_link_lops = { .release = bpf_struct_ops_link_release, .dealloc = bpf_struct_ops_link_dealloc, }; int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks, struct bpf_tramp_link *link, const struct btf_func_model *model, void *stub_func, void **_image, u32 *_image_off, bool allow_alloc) { u32 image_off = *_image_off, flags = BPF_TRAMP_F_INDIRECT; void *image = *_image; int size; tlinks[BPF_TRAMP_FENTRY].links[0] = link; tlinks[BPF_TRAMP_FENTRY].nr_links = 1; if (model->ret_size > 0) flags |= BPF_TRAMP_F_RET_FENTRY_RET; size = arch_bpf_trampoline_size(model, flags, tlinks, stub_func); if (size <= 0) return size ? : -EFAULT; /* Allocate image buffer if necessary */ if (!image || size > PAGE_SIZE - image_off) { if (!allow_alloc) return -E2BIG; image = bpf_struct_ops_image_alloc(); if (IS_ERR(image)) return PTR_ERR(image); image_off = 0; } size = arch_prepare_bpf_trampoline(NULL, image + image_off, image + image_off + size, model, flags, tlinks, stub_func); if (size <= 0) { if (image != *_image) bpf_struct_ops_image_free(image); return size ? : -EFAULT; } *_image = image; *_image_off = image_off + size; return 0; } static void bpf_struct_ops_ksym_init(const char *tname, const char *mname, void *image, unsigned int size, struct bpf_ksym *ksym) { snprintf(ksym->name, KSYM_NAME_LEN, "bpf__%s_%s", tname, mname); INIT_LIST_HEAD_RCU(&ksym->lnode); bpf_image_ksym_init(image, size, ksym); } static void bpf_struct_ops_map_add_ksyms(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->ksyms[i]) break; bpf_image_ksym_add(st_map->ksyms[i]); } } static void bpf_struct_ops_map_del_ksyms(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->ksyms[i]) break; bpf_image_ksym_del(st_map->ksyms[i]); } } static void bpf_struct_ops_map_free_ksyms(struct bpf_struct_ops_map *st_map) { u32 i; for (i = 0; i < st_map->funcs_cnt; i++) { if (!st_map->ksyms[i]) break; kfree(st_map->ksyms[i]); st_map->ksyms[i] = NULL; } } static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc; const struct bpf_struct_ops *st_ops = st_ops_desc->st_ops; struct bpf_struct_ops_value *uvalue, *kvalue; const struct btf_type *module_type; const struct btf_member *member; const struct btf_type *t = st_ops_desc->type; struct bpf_tramp_links *tlinks; void *udata, *kdata; int prog_fd, err; u32 i, trampoline_start, image_off = 0; void *cur_image = NULL, *image = NULL; struct bpf_link **plink; struct bpf_ksym **pksym; const char *tname, *mname; if (flags) return -EINVAL; if (*(u32 *)key != 0) return -E2BIG; err = check_zero_holes(st_map->btf, st_ops_desc->value_type, value); if (err) return err; uvalue = value; err = check_zero_holes(st_map->btf, t, uvalue->data); if (err) return err; if (uvalue->common.state || refcount_read(&uvalue->common.refcnt)) return -EINVAL; tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL); if (!tlinks) return -ENOMEM; uvalue = (struct bpf_struct_ops_value *)st_map->uvalue; kvalue = (struct bpf_struct_ops_value *)&st_map->kvalue; mutex_lock(&st_map->lock); if (kvalue->common.state != BPF_STRUCT_OPS_STATE_INIT) { err = -EBUSY; goto unlock; } memcpy(uvalue, value, map->value_size); udata = &uvalue->data; kdata = &kvalue->data; plink = st_map->links; pksym = st_map->ksyms; tname = btf_name_by_offset(st_map->btf, t->name_off); module_type = btf_type_by_id(btf_vmlinux, st_ops_ids[IDX_MODULE_ID]); for_each_member(i, t, member) { const struct btf_type *mtype, *ptype; struct bpf_prog *prog; struct bpf_tramp_link *link; struct bpf_ksym *ksym; u32 moff; moff = __btf_member_bit_offset(t, member) / 8; mname = btf_name_by_offset(st_map->btf, member->name_off); ptype = btf_type_resolve_ptr(st_map->btf, member->type, NULL); if (ptype == module_type) { if (*(void **)(udata + moff)) goto reset_unlock; *(void **)(kdata + moff) = BPF_MODULE_OWNER; continue; } err = st_ops->init_member(t, member, kdata, udata); if (err < 0) goto reset_unlock; /* The ->init_member() has handled this member */ if (err > 0) continue; /* If st_ops->init_member does not handle it, * we will only handle func ptrs and zero-ed members * here. Reject everything else. */ /* All non func ptr member must be 0 */ if (!ptype || !btf_type_is_func_proto(ptype)) { u32 msize; mtype = btf_type_by_id(st_map->btf, member->type); mtype = btf_resolve_size(st_map->btf, mtype, &msize); if (IS_ERR(mtype)) { err = PTR_ERR(mtype); goto reset_unlock; } if (memchr_inv(udata + moff, 0, msize)) { err = -EINVAL; goto reset_unlock; } continue; } prog_fd = (int)(*(unsigned long *)(udata + moff)); /* Similar check as the attr->attach_prog_fd */ if (!prog_fd) continue; prog = bpf_prog_get(prog_fd); if (IS_ERR(prog)) { err = PTR_ERR(prog); goto reset_unlock; } if (prog->type != BPF_PROG_TYPE_STRUCT_OPS || prog->aux->attach_btf_id != st_ops_desc->type_id || prog->expected_attach_type != i) { bpf_prog_put(prog); err = -EINVAL; goto reset_unlock; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { bpf_prog_put(prog); err = -ENOMEM; goto reset_unlock; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_link_lops, prog, prog->expected_attach_type); *plink++ = &link->link; ksym = kzalloc(sizeof(*ksym), GFP_USER); if (!ksym) { err = -ENOMEM; goto reset_unlock; } *pksym++ = ksym; trampoline_start = image_off; err = bpf_struct_ops_prepare_trampoline(tlinks, link, &st_ops->func_models[i], *(void **)(st_ops->cfi_stubs + moff), &image, &image_off, st_map->image_pages_cnt < MAX_TRAMP_IMAGE_PAGES); if (err) goto reset_unlock; if (cur_image != image) { st_map->image_pages[st_map->image_pages_cnt++] = image; cur_image = image; trampoline_start = 0; } *(void **)(kdata + moff) = image + trampoline_start + cfi_get_offset(); /* put prog_id to udata */ *(unsigned long *)(udata + moff) = prog->aux->id; /* init ksym for this trampoline */ bpf_struct_ops_ksym_init(tname, mname, image + trampoline_start, image_off - trampoline_start, ksym); } if (st_ops->validate) { err = st_ops->validate(kdata); if (err) goto reset_unlock; } for (i = 0; i < st_map->image_pages_cnt; i++) { err = arch_protect_bpf_trampoline(st_map->image_pages[i], PAGE_SIZE); if (err) goto reset_unlock; } if (st_map->map.map_flags & BPF_F_LINK) { err = 0; /* Let bpf_link handle registration & unregistration. * * Pair with smp_load_acquire() during lookup_elem(). */ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_READY); goto unlock; } err = st_ops->reg(kdata, NULL); if (likely(!err)) { /* This refcnt increment on the map here after * 'st_ops->reg()' is secure since the state of the * map must be set to INIT at this moment, and thus * bpf_struct_ops_map_delete_elem() can't unregister * or transition it to TOBEFREE concurrently. */ bpf_map_inc(map); /* Pair with smp_load_acquire() during lookup_elem(). * It ensures the above udata updates (e.g. prog->aux->id) * can be seen once BPF_STRUCT_OPS_STATE_INUSE is set. */ smp_store_release(&kvalue->common.state, BPF_STRUCT_OPS_STATE_INUSE); goto unlock; } /* Error during st_ops->reg(). Can happen if this struct_ops needs to be * verified as a whole, after all init_member() calls. Can also happen if * there was a race in registering the struct_ops (under the same name) to * a sub-system through different struct_ops's maps. */ reset_unlock: bpf_struct_ops_map_free_ksyms(st_map); bpf_struct_ops_map_free_image(st_map); bpf_struct_ops_map_put_progs(st_map); memset(uvalue, 0, map->value_size); memset(kvalue, 0, map->value_size); unlock: kfree(tlinks); mutex_unlock(&st_map->lock); if (!err) bpf_struct_ops_map_add_ksyms(st_map); return err; } static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) { enum bpf_struct_ops_state prev_state; struct bpf_struct_ops_map *st_map; st_map = (struct bpf_struct_ops_map *)map; if (st_map->map.map_flags & BPF_F_LINK) return -EOPNOTSUPP; prev_state = cmpxchg(&st_map->kvalue.common.state, BPF_STRUCT_OPS_STATE_INUSE, BPF_STRUCT_OPS_STATE_TOBEFREE); switch (prev_state) { case BPF_STRUCT_OPS_STATE_INUSE: st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL); bpf_map_put(map); return 0; case BPF_STRUCT_OPS_STATE_TOBEFREE: return -EINPROGRESS; case BPF_STRUCT_OPS_STATE_INIT: return -ENOENT; default: WARN_ON_ONCE(1); /* Should never happen. Treat it as not found. */ return -ENOENT; } } static void bpf_struct_ops_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; void *value; int err; value = kmalloc(map->value_size, GFP_USER | __GFP_NOWARN); if (!value) return; err = bpf_struct_ops_map_sys_lookup_elem(map, key, value); if (!err) { btf_type_seq_show(st_map->btf, map->btf_vmlinux_value_type_id, value, m); seq_putc(m, '\n'); } kfree(value); } static void __bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; if (st_map->links) bpf_struct_ops_map_put_progs(st_map); if (st_map->ksyms) bpf_struct_ops_map_free_ksyms(st_map); bpf_map_area_free(st_map->links); bpf_map_area_free(st_map->ksyms); bpf_struct_ops_map_free_image(st_map); bpf_map_area_free(st_map->uvalue); bpf_map_area_free(st_map); } static void bpf_struct_ops_map_free(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; /* st_ops->owner was acquired during map_alloc to implicitly holds * the btf's refcnt. The acquire was only done when btf_is_module() * st_map->btf cannot be NULL here. */ if (btf_is_module(st_map->btf)) module_put(st_map->st_ops_desc->st_ops->owner); bpf_struct_ops_map_del_ksyms(st_map); /* The struct_ops's function may switch to another struct_ops. * * For example, bpf_tcp_cc_x->init() may switch to * another tcp_cc_y by calling * setsockopt(TCP_CONGESTION, "tcp_cc_y"). * During the switch, bpf_struct_ops_put(tcp_cc_x) is called * and its refcount may reach 0 which then free its * trampoline image while tcp_cc_x is still running. * * A vanilla rcu gp is to wait for all bpf-tcp-cc prog * to finish. bpf-tcp-cc prog is non sleepable. * A rcu_tasks gp is to wait for the last few insn * in the tramopline image to finish before releasing * the trampoline image. */ synchronize_rcu_mult(call_rcu, call_rcu_tasks); __bpf_struct_ops_map_free(map); } static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) { if (attr->key_size != sizeof(unsigned int) || attr->max_entries != 1 || (attr->map_flags & ~(BPF_F_LINK | BPF_F_VTYPE_BTF_OBJ_FD)) || !attr->btf_vmlinux_value_type_id) return -EINVAL; return 0; } static u32 count_func_ptrs(const struct btf *btf, const struct btf_type *t) { int i; u32 count; const struct btf_member *member; count = 0; for_each_member(i, t, member) if (btf_type_resolve_func_ptr(btf, member->type, NULL)) count++; return count; } static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops_desc *st_ops_desc; size_t st_map_size; struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; struct module *mod = NULL; struct bpf_map *map; struct btf *btf; int ret; if (attr->map_flags & BPF_F_VTYPE_BTF_OBJ_FD) { /* The map holds btf for its whole life time. */ btf = btf_get_by_fd(attr->value_type_btf_obj_fd); if (IS_ERR(btf)) return ERR_CAST(btf); if (!btf_is_module(btf)) { btf_put(btf); return ERR_PTR(-EINVAL); } mod = btf_try_get_module(btf); /* mod holds a refcnt to btf. We don't need an extra refcnt * here. */ btf_put(btf); if (!mod) return ERR_PTR(-EINVAL); } else { btf = bpf_get_btf_vmlinux(); if (IS_ERR(btf)) return ERR_CAST(btf); if (!btf) return ERR_PTR(-ENOTSUPP); } st_ops_desc = bpf_struct_ops_find_value(btf, attr->btf_vmlinux_value_type_id); if (!st_ops_desc) { ret = -ENOTSUPP; goto errout; } vt = st_ops_desc->value_type; if (attr->value_size != vt->size) { ret = -EINVAL; goto errout; } t = st_ops_desc->type; st_map_size = sizeof(*st_map) + /* kvalue stores the * struct bpf_struct_ops_tcp_congestions_ops */ (vt->size - sizeof(struct bpf_struct_ops_value)); st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); if (!st_map) { ret = -ENOMEM; goto errout; } st_map->st_ops_desc = st_ops_desc; map = &st_map->map; st_map->uvalue = bpf_map_area_alloc(vt->size, NUMA_NO_NODE); st_map->funcs_cnt = count_func_ptrs(btf, t); st_map->links = bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_link *), NUMA_NO_NODE); st_map->ksyms = bpf_map_area_alloc(st_map->funcs_cnt * sizeof(struct bpf_ksym *), NUMA_NO_NODE); if (!st_map->uvalue || !st_map->links || !st_map->ksyms) { ret = -ENOMEM; goto errout_free; } st_map->btf = btf; mutex_init(&st_map->lock); bpf_map_init_from_attr(map, attr); return map; errout_free: __bpf_struct_ops_map_free(map); errout: module_put(mod); return ERR_PTR(ret); } static u64 bpf_struct_ops_map_mem_usage(const struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; const struct bpf_struct_ops_desc *st_ops_desc = st_map->st_ops_desc; const struct btf_type *vt = st_ops_desc->value_type; u64 usage; usage = sizeof(*st_map) + vt->size - sizeof(struct bpf_struct_ops_value); usage += vt->size; usage += st_map->funcs_cnt * sizeof(struct bpf_link *); usage += st_map->funcs_cnt * sizeof(struct bpf_ksym *); usage += PAGE_SIZE; return usage; } BTF_ID_LIST_SINGLE(bpf_struct_ops_map_btf_ids, struct, bpf_struct_ops_map) const struct bpf_map_ops bpf_struct_ops_map_ops = { .map_alloc_check = bpf_struct_ops_map_alloc_check, .map_alloc = bpf_struct_ops_map_alloc, .map_free = bpf_struct_ops_map_free, .map_get_next_key = bpf_struct_ops_map_get_next_key, .map_lookup_elem = bpf_struct_ops_map_lookup_elem, .map_delete_elem = bpf_struct_ops_map_delete_elem, .map_update_elem = bpf_struct_ops_map_update_elem, .map_seq_show_elem = bpf_struct_ops_map_seq_show_elem, .map_mem_usage = bpf_struct_ops_map_mem_usage, .map_btf_id = &bpf_struct_ops_map_btf_ids[0], }; /* "const void *" because some subsystem is * passing a const (e.g. const struct tcp_congestion_ops *) */ bool bpf_struct_ops_get(const void *kdata) { struct bpf_struct_ops_value *kvalue; struct bpf_struct_ops_map *st_map; struct bpf_map *map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); map = __bpf_map_inc_not_zero(&st_map->map, false); return !IS_ERR(map); } void bpf_struct_ops_put(const void *kdata) { struct bpf_struct_ops_value *kvalue; struct bpf_struct_ops_map *st_map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); bpf_map_put(&st_map->map); } u32 bpf_struct_ops_id(const void *kdata) { struct bpf_struct_ops_value *kvalue; struct bpf_struct_ops_map *st_map; kvalue = container_of(kdata, struct bpf_struct_ops_value, data); st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue); return st_map->map.id; } EXPORT_SYMBOL_GPL(bpf_struct_ops_id); static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; return map->map_type == BPF_MAP_TYPE_STRUCT_OPS && map->map_flags & BPF_F_LINK && /* Pair with smp_store_release() during map_update */ smp_load_acquire(&st_map->kvalue.common.state) == BPF_STRUCT_OPS_STATE_READY; } static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) { struct bpf_struct_ops_link *st_link; struct bpf_struct_ops_map *st_map; st_link = container_of(link, struct bpf_struct_ops_link, link); st_map = (struct bpf_struct_ops_map *) rcu_dereference_protected(st_link->map, true); if (st_map) { st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); bpf_map_put(&st_map->map); } kfree(st_link); } static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { struct bpf_struct_ops_link *st_link; struct bpf_map *map; st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); if (map) seq_printf(seq, "map_id:\t%d\n", map->id); rcu_read_unlock(); } static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, struct bpf_link_info *info) { struct bpf_struct_ops_link *st_link; struct bpf_map *map; st_link = container_of(link, struct bpf_struct_ops_link, link); rcu_read_lock(); map = rcu_dereference(st_link->map); if (map) info->struct_ops.map_id = map->id; rcu_read_unlock(); return 0; } static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map *new_map, struct bpf_map *expected_old_map) { struct bpf_struct_ops_map *st_map, *old_st_map; struct bpf_map *old_map; struct bpf_struct_ops_link *st_link; int err; st_link = container_of(link, struct bpf_struct_ops_link, link); st_map = container_of(new_map, struct bpf_struct_ops_map, map); if (!bpf_struct_ops_valid_to_reg(new_map)) return -EINVAL; if (!st_map->st_ops_desc->st_ops->update) return -EOPNOTSUPP; mutex_lock(&update_mutex); old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); if (!old_map) { err = -ENOLINK; goto err_out; } if (expected_old_map && old_map != expected_old_map) { err = -EPERM; goto err_out; } old_st_map = container_of(old_map, struct bpf_struct_ops_map, map); /* The new and old struct_ops must be the same type. */ if (st_map->st_ops_desc != old_st_map->st_ops_desc) { err = -EINVAL; goto err_out; } err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link); if (err) goto err_out; bpf_map_inc(new_map); rcu_assign_pointer(st_link->map, new_map); bpf_map_put(old_map); err_out: mutex_unlock(&update_mutex); return err; } static int bpf_struct_ops_map_link_detach(struct bpf_link *link) { struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); struct bpf_struct_ops_map *st_map; struct bpf_map *map; mutex_lock(&update_mutex); map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); if (!map) { mutex_unlock(&update_mutex); return 0; } st_map = container_of(map, struct bpf_struct_ops_map, map); st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); RCU_INIT_POINTER(st_link->map, NULL); /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or * bpf_map_inc() in bpf_struct_ops_map_link_update(). */ bpf_map_put(&st_map->map); mutex_unlock(&update_mutex); wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); return 0; } static __poll_t bpf_struct_ops_map_link_poll(struct file *file, struct poll_table_struct *pts) { struct bpf_struct_ops_link *st_link = file->private_data; poll_wait(file, &st_link->wait_hup, pts); return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP; } static const struct bpf_link_ops bpf_struct_ops_map_lops = { .dealloc = bpf_struct_ops_map_link_dealloc, .detach = bpf_struct_ops_map_link_detach, .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, .fill_link_info = bpf_struct_ops_map_link_fill_link_info, .update_map = bpf_struct_ops_map_link_update, .poll = bpf_struct_ops_map_link_poll, }; int bpf_struct_ops_link_create(union bpf_attr *attr) { struct bpf_struct_ops_link *link = NULL; struct bpf_link_primer link_primer; struct bpf_struct_ops_map *st_map; struct bpf_map *map; int err; map = bpf_map_get(attr->link_create.map_fd); if (IS_ERR(map)) return PTR_ERR(map); st_map = (struct bpf_struct_ops_map *)map; if (!bpf_struct_ops_valid_to_reg(map)) { err = -EINVAL; goto err_out; } link = kzalloc(sizeof(*link), GFP_USER); if (!link) { err = -ENOMEM; goto err_out; } bpf_link_init(&link->link, BPF_LINK_TYPE_STRUCT_OPS, &bpf_struct_ops_map_lops, NULL, attr->link_create.attach_type); err = bpf_link_prime(&link->link, &link_primer); if (err) goto err_out; init_waitqueue_head(&link->wait_hup); /* Hold the update_mutex such that the subsystem cannot * do link->ops->detach() before the link is fully initialized. */ mutex_lock(&update_mutex); err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); if (err) { mutex_unlock(&update_mutex); bpf_link_cleanup(&link_primer); link = NULL; goto err_out; } RCU_INIT_POINTER(link->map, map); mutex_unlock(&update_mutex); return bpf_link_settle(&link_primer); err_out: bpf_map_put(map); kfree(link); return err; } void bpf_map_struct_ops_info_fill(struct bpf_map_info *info, struct bpf_map *map) { struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map; info->btf_vmlinux_id = btf_obj_id(st_map->btf); }
22 22 5 5 5 5 3 22 7 31 30 30 7 7 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* * linux/fs/nls/nls_koi8-ru.c * * Charset koi8-ru translation based on charset koi8-u. * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static struct nls_table *p_nls; static int uni2char(const wchar_t uni, unsigned char *out, int boundlen) { if (boundlen <= 0) return -ENAMETOOLONG; if ((uni & 0xffaf) == 0x040e || (uni & 0xffce) == 0x254c) { /* koi8-ru and koi8-u differ only on two characters */ if (uni == 0x040e) out[0] = 0xbe; else if (uni == 0x045e) out[0] = 0xae; else if (uni == 0x255d || uni == 0x256c) return 0; else return p_nls->uni2char(uni, out, boundlen); return 1; } else /* fast path */ return p_nls->uni2char(uni, out, boundlen); } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; if ((*rawstring & 0xef) != 0xae) { /* koi8-ru and koi8-u differ only on two characters */ *uni = (*rawstring & 0x10) ? 0x040e : 0x045e; return 1; } n = p_nls->char2uni(rawstring, boundlen, uni); return n; } static struct nls_table table = { .charset = "koi8-ru", .uni2char = uni2char, .char2uni = char2uni, }; static int __init init_nls_koi8_ru(void) { p_nls = load_nls("koi8-u"); if (p_nls) { table.charset2upper = p_nls->charset2upper; table.charset2lower = p_nls->charset2lower; return register_nls(&table); } return -EINVAL; } static void __exit exit_nls_koi8_ru(void) { unregister_nls(&table); unload_nls(p_nls); } module_init(init_nls_koi8_ru) module_exit(exit_nls_koi8_ru) MODULE_DESCRIPTION("NLS KOI8-RU (Belarusian)"); MODULE_LICENSE("Dual BSD/GPL");
209 209 209 209 209 209 209 208 209 209 209 20 20 20 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2014 Red Hat, Inc. * All Rights Reserved. */ #include "xfs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_sysfs.h" #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" #include "xfs_zones.h" struct xfs_sysfs_attr { struct attribute attr; ssize_t (*show)(struct kobject *kobject, char *buf); ssize_t (*store)(struct kobject *kobject, const char *buf, size_t count); }; static inline struct xfs_sysfs_attr * to_attr(struct attribute *attr) { return container_of(attr, struct xfs_sysfs_attr, attr); } #define XFS_SYSFS_ATTR_RW(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RW(name) #define XFS_SYSFS_ATTR_RO(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_RO(name) #define XFS_SYSFS_ATTR_WO(name) \ static struct xfs_sysfs_attr xfs_sysfs_attr_##name = __ATTR_WO(name) #define ATTR_LIST(name) &xfs_sysfs_attr_##name.attr STATIC ssize_t xfs_sysfs_object_show( struct kobject *kobject, struct attribute *attr, char *buf) { struct xfs_sysfs_attr *xfs_attr = to_attr(attr); return xfs_attr->show ? xfs_attr->show(kobject, buf) : 0; } STATIC ssize_t xfs_sysfs_object_store( struct kobject *kobject, struct attribute *attr, const char *buf, size_t count) { struct xfs_sysfs_attr *xfs_attr = to_attr(attr); return xfs_attr->store ? xfs_attr->store(kobject, buf, count) : 0; } static const struct sysfs_ops xfs_sysfs_ops = { .show = xfs_sysfs_object_show, .store = xfs_sysfs_object_store, }; static struct attribute *xfs_mp_attrs[] = { NULL, }; ATTRIBUTE_GROUPS(xfs_mp); static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, }; #ifdef DEBUG /* debug */ STATIC ssize_t bug_on_assert_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val == 1) xfs_globals.bug_on_assert = true; else if (val == 0) xfs_globals.bug_on_assert = false; else return -EINVAL; return count; } STATIC ssize_t bug_on_assert_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.bug_on_assert); } XFS_SYSFS_ATTR_RW(bug_on_assert); STATIC ssize_t log_recovery_delay_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < 0 || val > 60) return -EINVAL; xfs_globals.log_recovery_delay = val; return count; } STATIC ssize_t log_recovery_delay_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.log_recovery_delay); } XFS_SYSFS_ATTR_RW(log_recovery_delay); STATIC ssize_t mount_delay_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < 0 || val > 60) return -EINVAL; xfs_globals.mount_delay = val; return count; } STATIC ssize_t mount_delay_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.mount_delay); } XFS_SYSFS_ATTR_RW(mount_delay); static ssize_t always_cow_store( struct kobject *kobject, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &xfs_globals.always_cow); if (ret < 0) return ret; return count; } static ssize_t always_cow_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.always_cow); } XFS_SYSFS_ATTR_RW(always_cow); /* * Override how many threads the parallel work queue is allowed to create. * This has to be a debug-only global (instead of an errortag) because one of * the main users of parallel workqueues is mount time quotacheck. */ STATIC ssize_t pwork_threads_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < -1 || val > num_possible_cpus()) return -EINVAL; xfs_globals.pwork_threads = val; return count; } STATIC ssize_t pwork_threads_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%d\n", xfs_globals.pwork_threads); } XFS_SYSFS_ATTR_RW(pwork_threads); /* * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on * V5 filesystems. As a result, the intermediate progress of all setxattr and * removexattr operations are tracked via the log and can be restarted during * recovery. This is useful for testing xattr recovery prior to merging of the * parent pointer feature which requires it to maintain consistency, and may be * enabled for userspace xattrs in the future. */ static ssize_t larp_store( struct kobject *kobject, const char *buf, size_t count) { ssize_t ret; ret = kstrtobool(buf, &xfs_globals.larp); if (ret < 0) return ret; return count; } STATIC ssize_t larp_show( struct kobject *kobject, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.larp); } XFS_SYSFS_ATTR_RW(larp); STATIC ssize_t bload_leaf_slack_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; xfs_globals.bload_leaf_slack = val; return count; } STATIC ssize_t bload_leaf_slack_show( struct kobject *kobject, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack); } XFS_SYSFS_ATTR_RW(bload_leaf_slack); STATIC ssize_t bload_node_slack_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; xfs_globals.bload_node_slack = val; return count; } STATIC ssize_t bload_node_slack_show( struct kobject *kobject, char *buf) { return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack); } XFS_SYSFS_ATTR_RW(bload_node_slack); static struct attribute *xfs_dbg_attrs[] = { ATTR_LIST(bug_on_assert), ATTR_LIST(log_recovery_delay), ATTR_LIST(mount_delay), ATTR_LIST(always_cow), ATTR_LIST(pwork_threads), ATTR_LIST(larp), ATTR_LIST(bload_leaf_slack), ATTR_LIST(bload_node_slack), NULL, }; ATTRIBUTE_GROUPS(xfs_dbg); const struct kobj_type xfs_dbg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_dbg_groups, }; #endif /* DEBUG */ /* stats */ static inline struct xstats * to_xstats(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xstats, xs_kobj); } STATIC ssize_t stats_show( struct kobject *kobject, char *buf) { struct xstats *stats = to_xstats(kobject); return xfs_stats_format(stats->xs_stats, buf); } XFS_SYSFS_ATTR_RO(stats); STATIC ssize_t stats_clear_store( struct kobject *kobject, const char *buf, size_t count) { int ret; int val; struct xstats *stats = to_xstats(kobject); ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val != 1) return -EINVAL; xfs_stats_clearall(stats->xs_stats); return count; } XFS_SYSFS_ATTR_WO(stats_clear); static struct attribute *xfs_stats_attrs[] = { ATTR_LIST(stats), ATTR_LIST(stats_clear), NULL, }; ATTRIBUTE_GROUPS(xfs_stats); const struct kobj_type xfs_stats_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_stats_groups, }; /* xlog */ static inline struct xlog * to_xlog(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xlog, l_kobj); } STATIC ssize_t log_head_lsn_show( struct kobject *kobject, char *buf) { int cycle; int block; struct xlog *log = to_xlog(kobject); spin_lock(&log->l_icloglock); cycle = log->l_curr_cycle; block = log->l_curr_block; spin_unlock(&log->l_icloglock); return sysfs_emit(buf, "%d:%d\n", cycle, block); } XFS_SYSFS_ATTR_RO(log_head_lsn); STATIC ssize_t log_tail_lsn_show( struct kobject *kobject, char *buf) { int cycle; int block; struct xlog *log = to_xlog(kobject); xlog_crack_atomic_lsn(&log->l_tail_lsn, &cycle, &block); return sysfs_emit(buf, "%d:%d\n", cycle, block); } XFS_SYSFS_ATTR_RO(log_tail_lsn); STATIC ssize_t reserve_grant_head_bytes_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%lld\n", atomic64_read(&to_xlog(kobject)->l_reserve_head.grant)); } XFS_SYSFS_ATTR_RO(reserve_grant_head_bytes); STATIC ssize_t write_grant_head_bytes_show( struct kobject *kobject, char *buf) { return sysfs_emit(buf, "%lld\n", atomic64_read(&to_xlog(kobject)->l_write_head.grant)); } XFS_SYSFS_ATTR_RO(write_grant_head_bytes); static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head_bytes), ATTR_LIST(write_grant_head_bytes), NULL, }; ATTRIBUTE_GROUPS(xfs_log); const struct kobj_type xfs_log_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_log_groups, }; /* * Metadata IO error configuration * * The sysfs structure here is: * ...xfs/<dev>/error/<class>/<errno>/<error_attrs> * * where <class> allows us to discriminate between data IO and metadata IO, * and any other future type of IO (e.g. special inode or directory error * handling) we care to support. */ static inline struct xfs_error_cfg * to_error_cfg(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xfs_error_cfg, kobj); } static inline struct xfs_mount * err_to_mp(struct kobject *kobject) { struct xfs_kobj *kobj = to_kobj(kobject); return container_of(kobj, struct xfs_mount, m_error_kobj); } static ssize_t max_retries_show( struct kobject *kobject, char *buf) { int retries; struct xfs_error_cfg *cfg = to_error_cfg(kobject); if (cfg->max_retries == XFS_ERR_RETRY_FOREVER) retries = -1; else retries = cfg->max_retries; return sysfs_emit(buf, "%d\n", retries); } static ssize_t max_retries_store( struct kobject *kobject, const char *buf, size_t count) { struct xfs_error_cfg *cfg = to_error_cfg(kobject); int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < -1) return -EINVAL; if (val == -1) cfg->max_retries = XFS_ERR_RETRY_FOREVER; else cfg->max_retries = val; return count; } XFS_SYSFS_ATTR_RW(max_retries); static ssize_t retry_timeout_seconds_show( struct kobject *kobject, char *buf) { int timeout; struct xfs_error_cfg *cfg = to_error_cfg(kobject); if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER) timeout = -1; else timeout = jiffies_to_msecs(cfg->retry_timeout) / MSEC_PER_SEC; return sysfs_emit(buf, "%d\n", timeout); } static ssize_t retry_timeout_seconds_store( struct kobject *kobject, const char *buf, size_t count) { struct xfs_error_cfg *cfg = to_error_cfg(kobject); int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; /* 1 day timeout maximum, -1 means infinite */ if (val < -1 || val > 86400) return -EINVAL; if (val == -1) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else { cfg->retry_timeout = secs_to_jiffies(val); ASSERT(secs_to_jiffies(val) < LONG_MAX); } return count; } XFS_SYSFS_ATTR_RW(retry_timeout_seconds); static ssize_t fail_at_unmount_show( struct kobject *kobject, char *buf) { struct xfs_mount *mp = err_to_mp(kobject); return sysfs_emit(buf, "%d\n", mp->m_fail_unmount); } static ssize_t fail_at_unmount_store( struct kobject *kobject, const char *buf, size_t count) { struct xfs_mount *mp = err_to_mp(kobject); int ret; int val; ret = kstrtoint(buf, 0, &val); if (ret) return ret; if (val < 0 || val > 1) return -EINVAL; mp->m_fail_unmount = val; return count; } XFS_SYSFS_ATTR_RW(fail_at_unmount); static struct attribute *xfs_error_attrs[] = { ATTR_LIST(max_retries), ATTR_LIST(retry_timeout_seconds), NULL, }; ATTRIBUTE_GROUPS(xfs_error); static const struct kobj_type xfs_error_cfg_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_error_groups, }; static const struct kobj_type xfs_error_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, }; /* * Error initialization tables. These need to be ordered in the same * order as the enums used to index the array. All class init tables need to * define a "default" behaviour as the first entry, all other entries can be * empty. */ struct xfs_error_init { char *name; int max_retries; int retry_timeout; /* in seconds */ }; static const struct xfs_error_init xfs_error_meta_init[XFS_ERR_ERRNO_MAX] = { { .name = "default", .max_retries = XFS_ERR_RETRY_FOREVER, .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "EIO", .max_retries = XFS_ERR_RETRY_FOREVER, .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENOSPC", .max_retries = XFS_ERR_RETRY_FOREVER, .retry_timeout = XFS_ERR_RETRY_FOREVER, }, { .name = "ENODEV", .max_retries = 0, /* We can't recover from devices disappearing */ .retry_timeout = 0, }, }; static int xfs_error_sysfs_init_class( struct xfs_mount *mp, int class, const char *parent_name, struct xfs_kobj *parent_kobj, const struct xfs_error_init init[]) { struct xfs_error_cfg *cfg; int error; int i; ASSERT(class < XFS_ERR_CLASS_MAX); error = xfs_sysfs_init(parent_kobj, &xfs_error_ktype, &mp->m_error_kobj, parent_name); if (error) return error; for (i = 0; i < XFS_ERR_ERRNO_MAX; i++) { cfg = &mp->m_error_cfg[class][i]; error = xfs_sysfs_init(&cfg->kobj, &xfs_error_cfg_ktype, parent_kobj, init[i].name); if (error) goto out_error; cfg->max_retries = init[i].max_retries; if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else cfg->retry_timeout = secs_to_jiffies(init[i].retry_timeout); } return 0; out_error: /* unwind the entries that succeeded */ for (i--; i >= 0; i--) { cfg = &mp->m_error_cfg[class][i]; xfs_sysfs_del(&cfg->kobj); } xfs_sysfs_del(parent_kobj); return error; } static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) { return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); } static ssize_t max_open_zones_show( struct kobject *kobj, char *buf) { /* only report the open zones available for user data */ return sysfs_emit(buf, "%u\n", zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); } XFS_SYSFS_ATTR_RO(max_open_zones); static ssize_t zonegc_low_space_store( struct kobject *kobj, const char *buf, size_t count) { int ret; unsigned int val; ret = kstrtouint(buf, 0, &val); if (ret) return ret; if (val > 100) return -EINVAL; zoned_to_mp(kobj)->m_zonegc_low_space = val; return count; } static ssize_t zonegc_low_space_show( struct kobject *kobj, char *buf) { return sysfs_emit(buf, "%u\n", zoned_to_mp(kobj)->m_zonegc_low_space); } XFS_SYSFS_ATTR_RW(zonegc_low_space); static struct attribute *xfs_zoned_attrs[] = { ATTR_LIST(max_open_zones), ATTR_LIST(zonegc_low_space), NULL, }; ATTRIBUTE_GROUPS(xfs_zoned); static const struct kobj_type xfs_zoned_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_zoned_groups, }; int xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; super_set_sysfs_name_id(mp->m_super); /* .../xfs/<dev>/ */ error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_super->s_id); if (error) return error; /* .../xfs/<dev>/stats/ */ error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, &mp->m_kobj, "stats"); if (error) goto out_remove_fsdir; /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) goto out_remove_stats_dir; /* .../xfs/<dev>/error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) goto out_remove_error_dir; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) goto out_remove_error_dir; if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { /* .../xfs/<dev>/zoned/ */ error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, &mp->m_kobj, "zoned"); if (error) goto out_remove_error_dir; } return 0; out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); out_remove_stats_dir: xfs_sysfs_del(&mp->m_stats.xs_kobj); out_remove_fsdir: xfs_sysfs_del(&mp->m_kobj); return error; } void xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) xfs_sysfs_del(&mp->m_zoned_kobj); for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; xfs_sysfs_del(&cfg->kobj); } } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); xfs_sysfs_del(&mp->m_stats.xs_kobj); xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * xfs_error_get_cfg( struct xfs_mount *mp, int error_class, int error) { struct xfs_error_cfg *cfg; if (error < 0) error = -error; switch (error) { case EIO: cfg = &mp->m_error_cfg[error_class][XFS_ERR_EIO]; break; case ENOSPC: cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENOSPC]; break; case ENODEV: cfg = &mp->m_error_cfg[error_class][XFS_ERR_ENODEV]; break; default: cfg = &mp->m_error_cfg[error_class][XFS_ERR_DEFAULT]; break; } return cfg; }
15 2 13 2 11 11 11 3 8 7 8 1 8 2 8 15 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "IPsec: " fmt #include <crypto/hash.h> #include <crypto/utils.h> #include <linux/err.h> #include <linux/module.h> #include <linux/slab.h> #include <net/ip.h> #include <net/xfrm.h> #include <net/ah.h> #include <linux/crypto.h> #include <linux/pfkeyv2.h> #include <linux/scatterlist.h> #include <net/icmp.h> #include <net/protocol.h> struct ah_skb_cb { struct xfrm_skb_cb xfrm; void *tmp; }; #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { unsigned int len; len = size + crypto_ahash_digestsize(ahash); len = ALIGN(len, crypto_tfm_ctx_alignment()); len += sizeof(struct ahash_request) + crypto_ahash_reqsize(ahash); len = ALIGN(len, __alignof__(struct scatterlist)); len += sizeof(struct scatterlist) * nfrags; return kmalloc(len, GFP_ATOMIC); } static inline u8 *ah_tmp_auth(void *tmp, unsigned int offset) { return tmp + offset; } static inline u8 *ah_tmp_icv(void *tmp, unsigned int offset) { return tmp + offset; } static inline struct ahash_request *ah_tmp_req(struct crypto_ahash *ahash, u8 *icv) { struct ahash_request *req; req = (void *)PTR_ALIGN(icv + crypto_ahash_digestsize(ahash), crypto_tfm_ctx_alignment()); ahash_request_set_tfm(req, ahash); return req; } static inline struct scatterlist *ah_req_sg(struct crypto_ahash *ahash, struct ahash_request *req) { return (void *)ALIGN((unsigned long)(req + 1) + crypto_ahash_reqsize(ahash), __alignof__(struct scatterlist)); } /* Clear mutable options and find final destination to substitute * into IP header for icv calculation. Options are already checked * for validity, so paranoia is not required. */ static int ip_clear_mutable_options(const struct iphdr *iph, __be32 *daddr) { unsigned char *optptr = (unsigned char *)(iph+1); int l = iph->ihl*4 - sizeof(struct iphdr); int optlen; while (l > 0) { switch (*optptr) { case IPOPT_END: return 0; case IPOPT_NOOP: l--; optptr++; continue; } optlen = optptr[1]; if (optlen<2 || optlen>l) return -EINVAL; switch (*optptr) { case IPOPT_SEC: case 0x85: /* Some "Extended Security" crap. */ case IPOPT_CIPSO: case IPOPT_RA: case 0x80|21: /* RFC1770 */ break; case IPOPT_LSRR: case IPOPT_SSRR: if (optlen < 6) return -EINVAL; memcpy(daddr, optptr+optlen-4, 4); fallthrough; default: memset(optptr, 0, optlen); } l -= optlen; optptr += optlen; } return 0; } static void ah_output_done(void *data, int err) { u8 *icv; struct iphdr *iph; struct sk_buff *skb = data; struct xfrm_state *x = skb_dst(skb)->xfrm; struct ah_data *ahp = x->data; struct iphdr *top_iph = ip_hdr(skb); struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); iph = AH_SKB_CB(skb)->tmp; icv = ah_tmp_icv(iph, ihl); memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); } static int ah_output(struct xfrm_state *x, struct sk_buff *skb) { int err; int nfrags; int ihl; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *top_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; ahp = x->data; ahash = ahp->ahash; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; skb_push(skb, -skb_network_offset(skb)); ah = ip_auth_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } err = -ENOMEM; iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + seqhi_len); if (!iph) goto out; seqhi = (__be32 *)((char *)iph + ihl); icv = ah_tmp_icv(seqhi, seqhi_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memset(ah->auth_data, 0, ahp->icv_trunc_len); top_iph = ip_hdr(skb); iph->tos = top_iph->tos; iph->ttl = top_iph->ttl; iph->frag_off = top_iph->frag_off; if (top_iph->ihl != 5) { iph->daddr = top_iph->daddr; memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); err = ip_clear_mutable_options(top_iph, &top_iph->daddr); if (err) goto out_free; } ah->nexthdr = *skb_mac_header(skb); *skb_mac_header(skb) = IPPROTO_AH; top_iph->tos = 0; top_iph->tot_len = htons(skb->len); top_iph->frag_off = 0; top_iph->ttl = 0; top_iph->check = 0; if (x->props.flags & XFRM_STATE_ALIGN4) ah->hdrlen = (XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; else ah->hdrlen = (XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len) >> 2) - 2; ah->reserved = 0; ah->spi = x->id.spi; ah->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = htonl(XFRM_SKB_CB(skb)->seq.output.hi); sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_output_done, skb); AH_SKB_CB(skb)->tmp = iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; if (err == -ENOSPC) err = NET_XMIT_DROP; goto out_free; } memcpy(ah->auth_data, icv, ahp->icv_trunc_len); top_iph->tos = iph->tos; top_iph->ttl = iph->ttl; top_iph->frag_off = iph->frag_off; if (top_iph->ihl != 5) { top_iph->daddr = iph->daddr; memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); } out_free: kfree(iph); out: return err; } static void ah_input_done(void *data, int err) { u8 *auth_data; u8 *icv; struct iphdr *work_iph; struct sk_buff *skb = data; struct xfrm_state *x = xfrm_input_state(skb); struct ah_data *ahp = x->data; struct ip_auth_hdr *ah = ip_auth_hdr(skb); int ihl = ip_hdrlen(skb); int ah_hlen = (ah->hdrlen + 2) << 2; if (err) goto out; work_iph = AH_SKB_CB(skb)->tmp; auth_data = ah_tmp_auth(work_iph, ihl); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out; err = ah->nexthdr; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); out: kfree(AH_SKB_CB(skb)->tmp); xfrm_input_resume(skb, err); } static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; int ihl; int nexthdr; int nfrags; u8 *auth_data; u8 *icv; struct sk_buff *trailer; struct crypto_ahash *ahash; struct ahash_request *req; struct scatterlist *sg; struct iphdr *iph, *work_iph; struct ip_auth_hdr *ah; struct ah_data *ahp; int err = -ENOMEM; int seqhi_len = 0; __be32 *seqhi; int sglists = 0; struct scatterlist *seqhisg; if (!pskb_may_pull(skb, sizeof(*ah))) goto out; ah = (struct ip_auth_hdr *)skb->data; ahp = x->data; ahash = ahp->ahash; nexthdr = ah->nexthdr; ah_hlen = (ah->hdrlen + 2) << 2; if (x->props.flags & XFRM_STATE_ALIGN4) { if (ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN4(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } else { if (ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_full_len) && ah_hlen != XFRM_ALIGN8(sizeof(*ah) + ahp->icv_trunc_len)) goto out; } if (!pskb_may_pull(skb, ah_hlen)) goto out; /* We are going to _remove_ AH header to keep sockets happy, * so... Later this can change. */ if (skb_unclone(skb, GFP_ATOMIC)) goto out; skb->ip_summed = CHECKSUM_NONE; if ((err = skb_cow_data(skb, 0, &trailer)) < 0) goto out; nfrags = err; ah = (struct ip_auth_hdr *)skb->data; iph = ip_hdr(skb); ihl = ip_hdrlen(skb); if (x->props.flags & XFRM_STATE_ESN) { sglists = 1; seqhi_len = sizeof(*seqhi); } work_iph = ah_alloc_tmp(ahash, nfrags + sglists, ihl + ahp->icv_trunc_len + seqhi_len); if (!work_iph) { err = -ENOMEM; goto out; } seqhi = (__be32 *)((char *)work_iph + ihl); auth_data = ah_tmp_auth(seqhi, seqhi_len); icv = ah_tmp_icv(auth_data, ahp->icv_trunc_len); req = ah_tmp_req(ahash, icv); sg = ah_req_sg(ahash, req); seqhisg = sg + nfrags; memcpy(work_iph, iph, ihl); memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; if (ihl > sizeof(*iph)) { __be32 dummy; err = ip_clear_mutable_options(iph, &dummy); if (err) goto out_free; } skb_push(skb, ihl); sg_init_table(sg, nfrags + sglists); err = skb_to_sgvec_nomark(skb, sg, 0, skb->len); if (unlikely(err < 0)) goto out_free; if (x->props.flags & XFRM_STATE_ESN) { /* Attach seqhi sg right after packet payload */ *seqhi = XFRM_SKB_CB(skb)->seq.input.hi; sg_set_buf(seqhisg, seqhi, seqhi_len); } ahash_request_set_crypt(req, sg, icv, skb->len + seqhi_len); ahash_request_set_callback(req, 0, ah_input_done, skb); AH_SKB_CB(skb)->tmp = work_iph; err = crypto_ahash_digest(req); if (err) { if (err == -EINPROGRESS) goto out; goto out_free; } err = crypto_memneq(icv, auth_data, ahp->icv_trunc_len) ? -EBADMSG : 0; if (err) goto out_free; skb->network_header += ah_hlen; memcpy(skb_network_header(skb), work_iph, ihl); __skb_pull(skb, ah_hlen + ihl); if (x->props.mode == XFRM_MODE_TUNNEL) skb_reset_transport_header(skb); else skb_set_transport_header(skb, -ihl); err = nexthdr; out_free: kfree (work_iph); out: return err; } static int ah4_err(struct sk_buff *skb, u32 info) { struct net *net = dev_net(skb->dev); const struct iphdr *iph = (const struct iphdr *)skb->data; struct ip_auth_hdr *ah = (struct ip_auth_hdr *)(skb->data+(iph->ihl<<2)); struct xfrm_state *x; switch (icmp_hdr(skb)->type) { case ICMP_DEST_UNREACH: if (icmp_hdr(skb)->code != ICMP_FRAG_NEEDED) return 0; break; case ICMP_REDIRECT: break; default: return 0; } x = xfrm_state_lookup(net, skb->mark, (const xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); if (!x) return 0; if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH) ipv4_update_pmtu(skb, net, info, 0, IPPROTO_AH); else ipv4_redirect(skb, net, 0, IPPROTO_AH); xfrm_state_put(x); return 0; } static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; if (!x->aalg) { NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm"); goto error; } if (x->encap) { NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation"); goto error; } ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); if (IS_ERR(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, (x->aalg->alg_key_len + 7) / 8)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } /* * Lookup the algorithm description maintained by xfrm_algo, * verify crypto transform properties, and store information * we need for AH processing. This lookup cannot fail here * after a successful crypto_alloc_ahash(). */ aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); BUG_ON(!aalg_desc); if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; ahp->icv_trunc_len = x->aalg->alg_trunc_len/8; if (x->props.flags & XFRM_STATE_ALIGN4) x->props.header_len = XFRM_ALIGN4(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); else x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); if (x->props.mode == XFRM_MODE_TUNNEL) x->props.header_len += sizeof(struct iphdr); x->data = ahp; return 0; error: if (ahp) { crypto_free_ahash(ahp->ahash); kfree(ahp); } return -EINVAL; } static void ah_destroy(struct xfrm_state *x) { struct ah_data *ahp = x->data; if (!ahp) return; crypto_free_ahash(ahp->ahash); kfree(ahp); } static int ah4_rcv_cb(struct sk_buff *skb, int err) { return 0; } static const struct xfrm_type ah_type = { .owner = THIS_MODULE, .proto = IPPROTO_AH, .flags = XFRM_TYPE_REPLAY_PROT, .init_state = ah_init_state, .destructor = ah_destroy, .input = ah_input, .output = ah_output }; static struct xfrm4_protocol ah4_protocol = { .handler = xfrm4_rcv, .input_handler = xfrm_input, .cb_handler = ah4_rcv_cb, .err_handler = ah4_err, .priority = 0, }; static int __init ah4_init(void) { if (xfrm_register_type(&ah_type, AF_INET) < 0) { pr_info("%s: can't add xfrm type\n", __func__); return -EAGAIN; } if (xfrm4_protocol_register(&ah4_protocol, IPPROTO_AH) < 0) { pr_info("%s: can't add protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); return -EAGAIN; } return 0; } static void __exit ah4_fini(void) { if (xfrm4_protocol_deregister(&ah4_protocol, IPPROTO_AH) < 0) pr_info("%s: can't remove protocol\n", __func__); xfrm_unregister_type(&ah_type, AF_INET); } module_init(ah4_init); module_exit(ah4_fini); MODULE_DESCRIPTION("IPv4 AH transformation library"); MODULE_LICENSE("GPL"); MODULE_ALIAS_XFRM_TYPE(AF_INET, XFRM_PROTO_AH);
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) /* * sisusb - usb kernel driver for SiS315(E) based USB2VGA dongles * * Main part * * Copyright (C) 2005 by Thomas Winischhofer, Vienna, Austria * * If distributed as part of the Linux kernel, this code is licensed under the * terms of the GPL v2. * * Otherwise, the following license terms apply: * * * Redistribution and use in source and binary forms, with or without * * modification, are permitted provided that the following conditions * * are met: * * 1) Redistributions of source code must retain the above copyright * * notice, this list of conditions and the following disclaimer. * * 2) Redistributions in binary form must reproduce the above copyright * * notice, this list of conditions and the following disclaimer in the * * documentation and/or other materials provided with the distribution. * * 3) The name of the author may not be used to endorse or promote products * * derived from this software without specific psisusbr written permission. * * * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESSED OR * * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. * * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, * * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT * * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Author: Thomas Winischhofer <thomas@winischhofer.net> * */ #include <linux/mutex.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/signal.h> #include <linux/errno.h> #include <linux/poll.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/kref.h> #include <linux/usb.h> #include <linux/vmalloc.h> #include "sisusb.h" #define SISUSB_DONTSYNC /* Forward declarations / clean-up routines */ static struct usb_driver sisusb_driver; static void sisusb_free_buffers(struct sisusb_usb_data *sisusb) { int i; for (i = 0; i < NUMOBUFS; i++) { kfree(sisusb->obuf[i]); sisusb->obuf[i] = NULL; } kfree(sisusb->ibuf); sisusb->ibuf = NULL; } static void sisusb_free_urbs(struct sisusb_usb_data *sisusb) { int i; for (i = 0; i < NUMOBUFS; i++) { usb_free_urb(sisusb->sisurbout[i]); sisusb->sisurbout[i] = NULL; } usb_free_urb(sisusb->sisurbin); sisusb->sisurbin = NULL; } /* Level 0: USB transport layer */ /* 1. out-bulks */ /* out-urb management */ /* Return 1 if all free, 0 otherwise */ static int sisusb_all_free(struct sisusb_usb_data *sisusb) { int i; for (i = 0; i < sisusb->numobufs; i++) { if (sisusb->urbstatus[i] & SU_URB_BUSY) return 0; } return 1; } /* Kill all busy URBs */ static void sisusb_kill_all_busy(struct sisusb_usb_data *sisusb) { int i; if (sisusb_all_free(sisusb)) return; for (i = 0; i < sisusb->numobufs; i++) { if (sisusb->urbstatus[i] & SU_URB_BUSY) usb_kill_urb(sisusb->sisurbout[i]); } } /* Return 1 if ok, 0 if error (not all complete within timeout) */ static int sisusb_wait_all_out_complete(struct sisusb_usb_data *sisusb) { int timeout = 5 * HZ, i = 1; wait_event_timeout(sisusb->wait_q, (i = sisusb_all_free(sisusb)), timeout); return i; } static int sisusb_outurb_available(struct sisusb_usb_data *sisusb) { int i; for (i = 0; i < sisusb->numobufs; i++) { if ((sisusb->urbstatus[i] & (SU_URB_BUSY|SU_URB_ALLOC)) == 0) return i; } return -1; } static int sisusb_get_free_outbuf(struct sisusb_usb_data *sisusb) { int i, timeout = 5 * HZ; wait_event_timeout(sisusb->wait_q, ((i = sisusb_outurb_available(sisusb)) >= 0), timeout); return i; } static int sisusb_alloc_outbuf(struct sisusb_usb_data *sisusb) { int i; i = sisusb_outurb_available(sisusb); if (i >= 0) sisusb->urbstatus[i] |= SU_URB_ALLOC; return i; } static void sisusb_free_outbuf(struct sisusb_usb_data *sisusb, int index) { if ((index >= 0) && (index < sisusb->numobufs)) sisusb->urbstatus[index] &= ~SU_URB_ALLOC; } /* completion callback */ static void sisusb_bulk_completeout(struct urb *urb) { struct sisusb_urb_context *context = urb->context; struct sisusb_usb_data *sisusb; if (!context) return; sisusb = context->sisusb; if (!sisusb || !sisusb->sisusb_dev || !sisusb->present) return; #ifndef SISUSB_DONTSYNC if (context->actual_length) *(context->actual_length) += urb->actual_length; #endif sisusb->urbstatus[context->urbindex] &= ~SU_URB_BUSY; wake_up(&sisusb->wait_q); } static int sisusb_bulkout_msg(struct sisusb_usb_data *sisusb, int index, unsigned int pipe, void *data, int len, int *actual_length, int timeout, unsigned int tflags) { struct urb *urb = sisusb->sisurbout[index]; int retval, byteswritten = 0; /* Set up URB */ urb->transfer_flags = 0; usb_fill_bulk_urb(urb, sisusb->sisusb_dev, pipe, data, len, sisusb_bulk_completeout, &sisusb->urbout_context[index]); urb->transfer_flags |= tflags; urb->actual_length = 0; /* Set up context */ sisusb->urbout_context[index].actual_length = (timeout) ? NULL : actual_length; /* Declare this urb/buffer in use */ sisusb->urbstatus[index] |= SU_URB_BUSY; /* Submit URB */ retval = usb_submit_urb(urb, GFP_KERNEL); /* If OK, and if timeout > 0, wait for completion */ if ((retval == 0) && timeout) { wait_event_timeout(sisusb->wait_q, (!(sisusb->urbstatus[index] & SU_URB_BUSY)), timeout); if (sisusb->urbstatus[index] & SU_URB_BUSY) { /* URB timed out... kill it and report error */ usb_kill_urb(urb); retval = -ETIMEDOUT; } else { /* Otherwise, report urb status */ retval = urb->status; byteswritten = urb->actual_length; } } if (actual_length) *actual_length = byteswritten; return retval; } /* 2. in-bulks */ /* completion callback */ static void sisusb_bulk_completein(struct urb *urb) { struct sisusb_usb_data *sisusb = urb->context; if (!sisusb || !sisusb->sisusb_dev || !sisusb->present) return; sisusb->completein = 1; wake_up(&sisusb->wait_q); } static int sisusb_bulkin_msg(struct sisusb_usb_data *sisusb, unsigned int pipe, void *data, int len, int *actual_length, int timeout, unsigned int tflags) { struct urb *urb = sisusb->sisurbin; int retval, readbytes = 0; urb->transfer_flags = 0; usb_fill_bulk_urb(urb, sisusb->sisusb_dev, pipe, data, len, sisusb_bulk_completein, sisusb); urb->transfer_flags |= tflags; urb->actual_length = 0; sisusb->completein = 0; retval = usb_submit_urb(urb, GFP_KERNEL); if (retval == 0) { wait_event_timeout(sisusb->wait_q, sisusb->completein, timeout); if (!sisusb->completein) { /* URB timed out... kill it and report error */ usb_kill_urb(urb); retval = -ETIMEDOUT; } else { /* URB completed within timeout */ retval = urb->status; readbytes = urb->actual_length; } } if (actual_length) *actual_length = readbytes; return retval; } /* Level 1: */ /* Send a bulk message of variable size * * To copy the data from userspace, give pointer to "userbuffer", * to copy from (non-DMA) kernel memory, give "kernbuffer". If * both of these are NULL, it is assumed, that the transfer * buffer "sisusb->obuf[index]" is set up with the data to send. * Index is ignored if either kernbuffer or userbuffer is set. * If async is nonzero, URBs will be sent without waiting for * completion of the previous URB. * * (return 0 on success) */ static int sisusb_send_bulk_msg(struct sisusb_usb_data *sisusb, int ep, int len, char *kernbuffer, const char __user *userbuffer, int index, ssize_t *bytes_written, unsigned int tflags, int async) { int result = 0, retry, count = len; int passsize, thispass, transferred_len = 0; int fromuser = (userbuffer != NULL) ? 1 : 0; int fromkern = (kernbuffer != NULL) ? 1 : 0; unsigned int pipe; char *buffer; (*bytes_written) = 0; /* Sanity check */ if (!sisusb || !sisusb->present || !sisusb->sisusb_dev) return -ENODEV; /* If we copy data from kernel or userspace, force the * allocation of a buffer/urb. If we have the data in * the transfer buffer[index] already, reuse the buffer/URB * if the length is > buffer size. (So, transmitting * large data amounts directly from the transfer buffer * treats the buffer as a ring buffer. However, we need * to sync in this case.) */ if (fromuser || fromkern) index = -1; else if (len > sisusb->obufsize) async = 0; pipe = usb_sndbulkpipe(sisusb->sisusb_dev, ep); do { passsize = thispass = (sisusb->obufsize < count) ? sisusb->obufsize : count; if (index < 0) index = sisusb_get_free_outbuf(sisusb); if (index < 0) return -EIO; buffer = sisusb->obuf[index]; if (fromuser) { if (copy_from_user(buffer, userbuffer, passsize)) return -EFAULT; userbuffer += passsize; } else if (fromkern) { memcpy(buffer, kernbuffer, passsize); kernbuffer += passsize; } retry = 5; while (thispass) { if (!sisusb->sisusb_dev) return -ENODEV; result = sisusb_bulkout_msg(sisusb, index, pipe, buffer, thispass, &transferred_len, async ? 0 : 5 * HZ, tflags); if (result == -ETIMEDOUT) { /* Will not happen if async */ if (!retry--) return -ETIME; continue; } if ((result == 0) && !async && transferred_len) { thispass -= transferred_len; buffer += transferred_len; } else break; } if (result) return result; (*bytes_written) += passsize; count -= passsize; /* Force new allocation in next iteration */ if (fromuser || fromkern) index = -1; } while (count > 0); if (async) { #ifdef SISUSB_DONTSYNC (*bytes_written) = len; /* Some URBs/buffers might be busy */ #else sisusb_wait_all_out_complete(sisusb); (*bytes_written) = transferred_len; /* All URBs and all buffers are available */ #endif } return ((*bytes_written) == len) ? 0 : -EIO; } /* Receive a bulk message of variable size * * To copy the data to userspace, give pointer to "userbuffer", * to copy to kernel memory, give "kernbuffer". One of them * MUST be set. (There is no technique for letting the caller * read directly from the ibuf.) * */ static int sisusb_recv_bulk_msg(struct sisusb_usb_data *sisusb, int ep, int len, void *kernbuffer, char __user *userbuffer, ssize_t *bytes_read, unsigned int tflags) { int result = 0, retry, count = len; int bufsize, thispass, transferred_len; unsigned int pipe; char *buffer; (*bytes_read) = 0; /* Sanity check */ if (!sisusb || !sisusb->present || !sisusb->sisusb_dev) return -ENODEV; pipe = usb_rcvbulkpipe(sisusb->sisusb_dev, ep); buffer = sisusb->ibuf; bufsize = sisusb->ibufsize; retry = 5; #ifdef SISUSB_DONTSYNC if (!(sisusb_wait_all_out_complete(sisusb))) return -EIO; #endif while (count > 0) { if (!sisusb->sisusb_dev) return -ENODEV; thispass = (bufsize < count) ? bufsize : count; result = sisusb_bulkin_msg(sisusb, pipe, buffer, thispass, &transferred_len, 5 * HZ, tflags); if (transferred_len) thispass = transferred_len; else if (result == -ETIMEDOUT) { if (!retry--) return -ETIME; continue; } else return -EIO; if (thispass) { (*bytes_read) += thispass; count -= thispass; if (userbuffer) { if (copy_to_user(userbuffer, buffer, thispass)) return -EFAULT; userbuffer += thispass; } else { memcpy(kernbuffer, buffer, thispass); kernbuffer += thispass; } } } return ((*bytes_read) == len) ? 0 : -EIO; } static int sisusb_send_packet(struct sisusb_usb_data *sisusb, int len, struct sisusb_packet *packet) { int ret; ssize_t bytes_transferred = 0; __le32 tmp; if (len == 6) packet->data = 0; #ifdef SISUSB_DONTSYNC if (!(sisusb_wait_all_out_complete(sisusb))) return 1; #endif /* Eventually correct endianness */ SISUSB_CORRECT_ENDIANNESS_PACKET(packet); /* 1. send the packet */ ret = sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_OUT, len, (char *)packet, NULL, 0, &bytes_transferred, 0, 0); if ((ret == 0) && (len == 6)) { /* 2. if packet len == 6, it means we read, so wait for 32bit * return value and write it to packet->data */ ret = sisusb_recv_bulk_msg(sisusb, SISUSB_EP_GFX_IN, 4, (char *)&tmp, NULL, &bytes_transferred, 0); packet->data = le32_to_cpu(tmp); } return ret; } static int sisusb_send_bridge_packet(struct sisusb_usb_data *sisusb, int len, struct sisusb_packet *packet, unsigned int tflags) { int ret; ssize_t bytes_transferred = 0; __le32 tmp; if (len == 6) packet->data = 0; #ifdef SISUSB_DONTSYNC if (!(sisusb_wait_all_out_complete(sisusb))) return 1; #endif /* Eventually correct endianness */ SISUSB_CORRECT_ENDIANNESS_PACKET(packet); /* 1. send the packet */ ret = sisusb_send_bulk_msg(sisusb, SISUSB_EP_BRIDGE_OUT, len, (char *)packet, NULL, 0, &bytes_transferred, tflags, 0); if ((ret == 0) && (len == 6)) { /* 2. if packet len == 6, it means we read, so wait for 32bit * return value and write it to packet->data */ ret = sisusb_recv_bulk_msg(sisusb, SISUSB_EP_BRIDGE_IN, 4, (char *)&tmp, NULL, &bytes_transferred, 0); packet->data = le32_to_cpu(tmp); } return ret; } /* access video memory and mmio (return 0 on success) */ /* Low level */ /* The following routines assume being used to transfer byte, word, * long etc. * This means that * - the write routines expect "data" in machine endianness format. * The data will be converted to leXX in sisusb_xxx_packet. * - the read routines can expect read data in machine-endianess. */ static int sisusb_write_memio_byte(struct sisusb_usb_data *sisusb, int type, u32 addr, u8 data) { struct sisusb_packet packet; packet.header = (1 << (addr & 3)) | (type << 6); packet.address = addr & ~3; packet.data = data << ((addr & 3) << 3); return sisusb_send_packet(sisusb, 10, &packet); } static int sisusb_write_memio_word(struct sisusb_usb_data *sisusb, int type, u32 addr, u16 data) { struct sisusb_packet packet; int ret = 0; packet.address = addr & ~3; switch (addr & 3) { case 0: packet.header = (type << 6) | 0x0003; packet.data = (u32)data; ret = sisusb_send_packet(sisusb, 10, &packet); break; case 1: packet.header = (type << 6) | 0x0006; packet.data = (u32)data << 8; ret = sisusb_send_packet(sisusb, 10, &packet); break; case 2: packet.header = (type << 6) | 0x000c; packet.data = (u32)data << 16; ret = sisusb_send_packet(sisusb, 10, &packet); break; case 3: packet.header = (type << 6) | 0x0008; packet.data = (u32)data << 24; ret = sisusb_send_packet(sisusb, 10, &packet); packet.header = (type << 6) | 0x0001; packet.address = (addr & ~3) + 4; packet.data = (u32)data >> 8; ret |= sisusb_send_packet(sisusb, 10, &packet); } return ret; } static int sisusb_write_memio_24bit(struct sisusb_usb_data *sisusb, int type, u32 addr, u32 data) { struct sisusb_packet packet; int ret = 0; packet.address = addr & ~3; switch (addr & 3) { case 0: packet.header = (type << 6) | 0x0007; packet.data = data & 0x00ffffff; ret = sisusb_send_packet(sisusb, 10, &packet); break; case 1: packet.header = (type << 6) | 0x000e; packet.data = data << 8; ret = sisusb_send_packet(sisusb, 10, &packet); break; case 2: packet.header = (type << 6) | 0x000c; packet.data = data << 16; ret = sisusb_send_packet(sisusb, 10, &packet); packet.header = (type << 6) | 0x0001; packet.address = (addr & ~3) + 4; packet.data = (data >> 16) & 0x00ff; ret |= sisusb_send_packet(sisusb, 10, &packet); break; case 3: packet.header = (type << 6) | 0x0008; packet.data = data << 24; ret = sisusb_send_packet(sisusb, 10, &packet); packet.header = (type << 6) | 0x0003; packet.address = (addr & ~3) + 4; packet.data = (data >> 8) & 0xffff; ret |= sisusb_send_packet(sisusb, 10, &packet); } return ret; } static int sisusb_write_memio_long(struct sisusb_usb_data *sisusb, int type, u32 addr, u32 data) { struct sisusb_packet packet; int ret = 0; packet.address = addr & ~3; switch (addr & 3) { case 0: packet.header = (type << 6) | 0x000f; packet.data = data; ret = sisusb_send_packet(sisusb, 10, &packet); break; case 1: packet.header = (type << 6) | 0x000e; packet.data = data << 8; ret = sisusb_send_packet(sisusb, 10, &packet); packet.header = (type << 6) | 0x0001; packet.address = (addr & ~3) + 4; packet.data = data >> 24; ret |= sisusb_send_packet(sisusb, 10, &packet); break; case 2: packet.header = (type << 6) | 0x000c; packet.data = data << 16; ret = sisusb_send_packet(sisusb, 10, &packet); packet.header = (type << 6) | 0x0003; packet.address = (addr & ~3) + 4; packet.data = data >> 16; ret |= sisusb_send_packet(sisusb, 10, &packet); break; case 3: packet.header = (type << 6) | 0x0008; packet.data = data << 24; ret = sisusb_send_packet(sisusb, 10, &packet); packet.header = (type << 6) | 0x0007; packet.address = (addr & ~3) + 4; packet.data = data >> 8; ret |= sisusb_send_packet(sisusb, 10, &packet); } return ret; } /* The xxx_bulk routines copy a buffer of variable size. They treat the * buffer as chars, therefore lsb/msb has to be corrected if using the * byte/word/long/etc routines for speed-up * * If data is from userland, set "userbuffer" (and clear "kernbuffer"), * if data is in kernel space, set "kernbuffer" (and clear "userbuffer"); * if neither "kernbuffer" nor "userbuffer" are given, it is assumed * that the data already is in the transfer buffer "sisusb->obuf[index]". */ static int sisusb_write_mem_bulk(struct sisusb_usb_data *sisusb, u32 addr, char *kernbuffer, int length, const char __user *userbuffer, int index, ssize_t *bytes_written) { struct sisusb_packet packet; int ret = 0; static int msgcount; u8 swap8, fromkern = kernbuffer ? 1 : 0; u16 swap16; u32 swap32, flag = (length >> 28) & 1; u8 buf[4]; /* if neither kernbuffer not userbuffer are given, assume * data in obuf */ if (!fromkern && !userbuffer) kernbuffer = sisusb->obuf[index]; (*bytes_written = 0); length &= 0x00ffffff; while (length) { switch (length) { case 1: if (userbuffer) { if (get_user(swap8, (u8 __user *)userbuffer)) return -EFAULT; } else swap8 = kernbuffer[0]; ret = sisusb_write_memio_byte(sisusb, SISUSB_TYPE_MEM, addr, swap8); if (!ret) (*bytes_written)++; return ret; case 2: if (userbuffer) { if (get_user(swap16, (u16 __user *)userbuffer)) return -EFAULT; } else swap16 = *((u16 *)kernbuffer); ret = sisusb_write_memio_word(sisusb, SISUSB_TYPE_MEM, addr, swap16); if (!ret) (*bytes_written) += 2; return ret; case 3: if (userbuffer) { if (copy_from_user(&buf, userbuffer, 3)) return -EFAULT; #ifdef __BIG_ENDIAN swap32 = (buf[0] << 16) | (buf[1] << 8) | buf[2]; #else swap32 = (buf[2] << 16) | (buf[1] << 8) | buf[0]; #endif } else #ifdef __BIG_ENDIAN swap32 = (kernbuffer[0] << 16) | (kernbuffer[1] << 8) | kernbuffer[2]; #else swap32 = (kernbuffer[2] << 16) | (kernbuffer[1] << 8) | kernbuffer[0]; #endif ret = sisusb_write_memio_24bit(sisusb, SISUSB_TYPE_MEM, addr, swap32); if (!ret) (*bytes_written) += 3; return ret; case 4: if (userbuffer) { if (get_user(swap32, (u32 __user *)userbuffer)) return -EFAULT; } else swap32 = *((u32 *)kernbuffer); ret = sisusb_write_memio_long(sisusb, SISUSB_TYPE_MEM, addr, swap32); if (!ret) (*bytes_written) += 4; return ret; default: if ((length & ~3) > 0x10000) { packet.header = 0x001f; packet.address = 0x000001d4; packet.data = addr; ret = sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x001f; packet.address = 0x000001d0; packet.data = (length & ~3); ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x001f; packet.address = 0x000001c0; packet.data = flag | 0x16; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); if (userbuffer) { ret |= sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_LBULK_OUT, (length & ~3), NULL, userbuffer, 0, bytes_written, 0, 1); userbuffer += (*bytes_written); } else if (fromkern) { ret |= sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_LBULK_OUT, (length & ~3), kernbuffer, NULL, 0, bytes_written, 0, 1); kernbuffer += (*bytes_written); } else { ret |= sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_LBULK_OUT, (length & ~3), NULL, NULL, index, bytes_written, 0, 1); kernbuffer += ((*bytes_written) & (sisusb->obufsize-1)); } } else { packet.header = 0x001f; packet.address = 0x00000194; packet.data = addr; ret = sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x001f; packet.address = 0x00000190; packet.data = (length & ~3); ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); if (sisusb->flagb0 != 0x16) { packet.header = 0x001f; packet.address = 0x00000180; packet.data = flag | 0x16; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); sisusb->flagb0 = 0x16; } if (userbuffer) { ret |= sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_BULK_OUT, (length & ~3), NULL, userbuffer, 0, bytes_written, 0, 1); userbuffer += (*bytes_written); } else if (fromkern) { ret |= sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_BULK_OUT, (length & ~3), kernbuffer, NULL, 0, bytes_written, 0, 1); kernbuffer += (*bytes_written); } else { ret |= sisusb_send_bulk_msg(sisusb, SISUSB_EP_GFX_BULK_OUT, (length & ~3), NULL, NULL, index, bytes_written, 0, 1); kernbuffer += ((*bytes_written) & (sisusb->obufsize-1)); } } if (ret) { msgcount++; if (msgcount < 500) dev_err(&sisusb->sisusb_dev->dev, "Wrote %zd of %d bytes, error %d\n", *bytes_written, length, ret); else if (msgcount == 500) dev_err(&sisusb->sisusb_dev->dev, "Too many errors, logging stopped\n"); } addr += (*bytes_written); length -= (*bytes_written); } if (ret) break; } return ret ? -EIO : 0; } /* Remember: Read data in packet is in machine-endianess! So for * byte, word, 24bit, long no endian correction is necessary. */ static int sisusb_read_memio_byte(struct sisusb_usb_data *sisusb, int type, u32 addr, u8 *data) { struct sisusb_packet packet; int ret; CLEARPACKET(&packet); packet.header = (1 << (addr & 3)) | (type << 6); packet.address = addr & ~3; ret = sisusb_send_packet(sisusb, 6, &packet); *data = (u8)(packet.data >> ((addr & 3) << 3)); return ret; } static int sisusb_read_memio_word(struct sisusb_usb_data *sisusb, int type, u32 addr, u16 *data) { struct sisusb_packet packet; int ret = 0; CLEARPACKET(&packet); packet.address = addr & ~3; switch (addr & 3) { case 0: packet.header = (type << 6) | 0x0003; ret = sisusb_send_packet(sisusb, 6, &packet); *data = (u16)(packet.data); break; case 1: packet.header = (type << 6) | 0x0006; ret = sisusb_send_packet(sisusb, 6, &packet); *data = (u16)(packet.data >> 8); break; case 2: packet.header = (type << 6) | 0x000c; ret = sisusb_send_packet(sisusb, 6, &packet); *data = (u16)(packet.data >> 16); break; case 3: packet.header = (type << 6) | 0x0008; ret = sisusb_send_packet(sisusb, 6, &packet); *data = (u16)(packet.data >> 24); packet.header = (type << 6) | 0x0001; packet.address = (addr & ~3) + 4; ret |= sisusb_send_packet(sisusb, 6, &packet); *data |= (u16)(packet.data << 8); } return ret; } static int sisusb_read_memio_24bit(struct sisusb_usb_data *sisusb, int type, u32 addr, u32 *data) { struct sisusb_packet packet; int ret = 0; packet.address = addr & ~3; switch (addr & 3) { case 0: packet.header = (type << 6) | 0x0007; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data & 0x00ffffff; break; case 1: packet.header = (type << 6) | 0x000e; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data >> 8; break; case 2: packet.header = (type << 6) | 0x000c; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data >> 16; packet.header = (type << 6) | 0x0001; packet.address = (addr & ~3) + 4; ret |= sisusb_send_packet(sisusb, 6, &packet); *data |= ((packet.data & 0xff) << 16); break; case 3: packet.header = (type << 6) | 0x0008; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data >> 24; packet.header = (type << 6) | 0x0003; packet.address = (addr & ~3) + 4; ret |= sisusb_send_packet(sisusb, 6, &packet); *data |= ((packet.data & 0xffff) << 8); } return ret; } static int sisusb_read_memio_long(struct sisusb_usb_data *sisusb, int type, u32 addr, u32 *data) { struct sisusb_packet packet; int ret = 0; packet.address = addr & ~3; switch (addr & 3) { case 0: packet.header = (type << 6) | 0x000f; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data; break; case 1: packet.header = (type << 6) | 0x000e; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data >> 8; packet.header = (type << 6) | 0x0001; packet.address = (addr & ~3) + 4; ret |= sisusb_send_packet(sisusb, 6, &packet); *data |= (packet.data << 24); break; case 2: packet.header = (type << 6) | 0x000c; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data >> 16; packet.header = (type << 6) | 0x0003; packet.address = (addr & ~3) + 4; ret |= sisusb_send_packet(sisusb, 6, &packet); *data |= (packet.data << 16); break; case 3: packet.header = (type << 6) | 0x0008; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data >> 24; packet.header = (type << 6) | 0x0007; packet.address = (addr & ~3) + 4; ret |= sisusb_send_packet(sisusb, 6, &packet); *data |= (packet.data << 8); } return ret; } static int sisusb_read_mem_bulk(struct sisusb_usb_data *sisusb, u32 addr, char *kernbuffer, int length, char __user *userbuffer, ssize_t *bytes_read) { int ret = 0; char buf[4]; u16 swap16; u32 swap32; (*bytes_read = 0); length &= 0x00ffffff; while (length) { switch (length) { case 1: ret |= sisusb_read_memio_byte(sisusb, SISUSB_TYPE_MEM, addr, &buf[0]); if (!ret) { (*bytes_read)++; if (userbuffer) { if (put_user(buf[0], (u8 __user *)userbuffer)) return -EFAULT; } else kernbuffer[0] = buf[0]; } return ret; case 2: ret |= sisusb_read_memio_word(sisusb, SISUSB_TYPE_MEM, addr, &swap16); if (!ret) { (*bytes_read) += 2; if (userbuffer) { if (put_user(swap16, (u16 __user *)userbuffer)) return -EFAULT; } else { *((u16 *)kernbuffer) = swap16; } } return ret; case 3: ret |= sisusb_read_memio_24bit(sisusb, SISUSB_TYPE_MEM, addr, &swap32); if (!ret) { (*bytes_read) += 3; #ifdef __BIG_ENDIAN buf[0] = (swap32 >> 16) & 0xff; buf[1] = (swap32 >> 8) & 0xff; buf[2] = swap32 & 0xff; #else buf[2] = (swap32 >> 16) & 0xff; buf[1] = (swap32 >> 8) & 0xff; buf[0] = swap32 & 0xff; #endif if (userbuffer) { if (copy_to_user(userbuffer, &buf[0], 3)) return -EFAULT; } else { kernbuffer[0] = buf[0]; kernbuffer[1] = buf[1]; kernbuffer[2] = buf[2]; } } return ret; default: ret |= sisusb_read_memio_long(sisusb, SISUSB_TYPE_MEM, addr, &swap32); if (!ret) { (*bytes_read) += 4; if (userbuffer) { if (put_user(swap32, (u32 __user *)userbuffer)) return -EFAULT; userbuffer += 4; } else { *((u32 *)kernbuffer) = swap32; kernbuffer += 4; } addr += 4; length -= 4; } } if (ret) break; } return ret; } /* High level: Gfx (indexed) register access */ static int sisusb_setidxreg(struct sisusb_usb_data *sisusb, u32 port, u8 index, u8 data) { int ret; ret = sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port, index); ret |= sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port + 1, data); return ret; } static int sisusb_getidxreg(struct sisusb_usb_data *sisusb, u32 port, u8 index, u8 *data) { int ret; ret = sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port, index); ret |= sisusb_read_memio_byte(sisusb, SISUSB_TYPE_IO, port + 1, data); return ret; } static int sisusb_setidxregandor(struct sisusb_usb_data *sisusb, u32 port, u8 idx, u8 myand, u8 myor) { int ret; u8 tmp; ret = sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port, idx); ret |= sisusb_read_memio_byte(sisusb, SISUSB_TYPE_IO, port + 1, &tmp); tmp &= myand; tmp |= myor; ret |= sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port + 1, tmp); return ret; } static int sisusb_setidxregmask(struct sisusb_usb_data *sisusb, u32 port, u8 idx, u8 data, u8 mask) { int ret; u8 tmp; ret = sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port, idx); ret |= sisusb_read_memio_byte(sisusb, SISUSB_TYPE_IO, port + 1, &tmp); tmp &= ~(mask); tmp |= (data & mask); ret |= sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, port + 1, tmp); return ret; } static int sisusb_setidxregor(struct sisusb_usb_data *sisusb, u32 port, u8 index, u8 myor) { return sisusb_setidxregandor(sisusb, port, index, 0xff, myor); } static int sisusb_setidxregand(struct sisusb_usb_data *sisusb, u32 port, u8 idx, u8 myand) { return sisusb_setidxregandor(sisusb, port, idx, myand, 0x00); } /* Write/read video ram */ #ifdef SISUSBENDIANTEST static void sisusb_testreadwrite(struct sisusb_usb_data *sisusb) { static u8 srcbuffer[] = { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 }; char destbuffer[10]; int i, j; sisusb_copy_memory(sisusb, srcbuffer, sisusb->vrambase, 7); for (i = 1; i <= 7; i++) { dev_dbg(&sisusb->sisusb_dev->dev, "sisusb: rwtest %d bytes\n", i); sisusb_read_memory(sisusb, destbuffer, sisusb->vrambase, i); for (j = 0; j < i; j++) { dev_dbg(&sisusb->sisusb_dev->dev, "rwtest read[%d] = %x\n", j, destbuffer[j]); } } } #endif /* access pci config registers (reg numbers 0, 4, 8, etc) */ static int sisusb_write_pci_config(struct sisusb_usb_data *sisusb, int regnum, u32 data) { struct sisusb_packet packet; packet.header = 0x008f; packet.address = regnum | 0x10000; packet.data = data; return sisusb_send_packet(sisusb, 10, &packet); } static int sisusb_read_pci_config(struct sisusb_usb_data *sisusb, int regnum, u32 *data) { struct sisusb_packet packet; int ret; packet.header = 0x008f; packet.address = (u32)regnum | 0x10000; ret = sisusb_send_packet(sisusb, 6, &packet); *data = packet.data; return ret; } /* Clear video RAM */ static int sisusb_clear_vram(struct sisusb_usb_data *sisusb, u32 address, int length) { int ret, i; ssize_t j; if (address < sisusb->vrambase) return 1; if (address >= sisusb->vrambase + sisusb->vramsize) return 1; if (address + length > sisusb->vrambase + sisusb->vramsize) length = sisusb->vrambase + sisusb->vramsize - address; if (length <= 0) return 0; /* allocate free buffer/urb and clear the buffer */ i = sisusb_alloc_outbuf(sisusb); if (i < 0) return -EBUSY; memset(sisusb->obuf[i], 0, sisusb->obufsize); /* We can write a length > buffer size here. The buffer * data will simply be re-used (like a ring-buffer). */ ret = sisusb_write_mem_bulk(sisusb, address, NULL, length, NULL, i, &j); /* Free the buffer/urb */ sisusb_free_outbuf(sisusb, i); return ret; } /* Initialize the graphics core (return 0 on success) * This resets the graphics hardware and puts it into * a defined mode (640x480@60Hz) */ #define GETREG(r, d) sisusb_read_memio_byte(sisusb, SISUSB_TYPE_IO, r, d) #define SETREG(r, d) sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, r, d) #define SETIREG(r, i, d) sisusb_setidxreg(sisusb, r, i, d) #define GETIREG(r, i, d) sisusb_getidxreg(sisusb, r, i, d) #define SETIREGOR(r, i, o) sisusb_setidxregor(sisusb, r, i, o) #define SETIREGAND(r, i, a) sisusb_setidxregand(sisusb, r, i, a) #define SETIREGANDOR(r, i, a, o) sisusb_setidxregandor(sisusb, r, i, a, o) #define READL(a, d) sisusb_read_memio_long(sisusb, SISUSB_TYPE_MEM, a, d) #define WRITEL(a, d) sisusb_write_memio_long(sisusb, SISUSB_TYPE_MEM, a, d) #define READB(a, d) sisusb_read_memio_byte(sisusb, SISUSB_TYPE_MEM, a, d) #define WRITEB(a, d) sisusb_write_memio_byte(sisusb, SISUSB_TYPE_MEM, a, d) static int sisusb_triggersr16(struct sisusb_usb_data *sisusb, u8 ramtype) { int ret; u8 tmp8; ret = GETIREG(SISSR, 0x16, &tmp8); if (ramtype <= 1) { tmp8 &= 0x3f; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 |= 0x80; ret |= SETIREG(SISSR, 0x16, tmp8); } else { tmp8 |= 0xc0; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 &= 0x0f; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 |= 0x80; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 &= 0x0f; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 |= 0xd0; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 &= 0x0f; ret |= SETIREG(SISSR, 0x16, tmp8); tmp8 |= 0xa0; ret |= SETIREG(SISSR, 0x16, tmp8); } return ret; } static int sisusb_getbuswidth(struct sisusb_usb_data *sisusb, int *bw, int *chab) { int ret; u8 ramtype, done = 0; u32 t0, t1, t2, t3; u32 ramptr = SISUSB_PCI_MEMBASE; ret = GETIREG(SISSR, 0x3a, &ramtype); ramtype &= 3; ret |= SETIREG(SISSR, 0x13, 0x00); if (ramtype <= 1) { ret |= SETIREG(SISSR, 0x14, 0x12); ret |= SETIREGAND(SISSR, 0x15, 0xef); } else { ret |= SETIREG(SISSR, 0x14, 0x02); } ret |= sisusb_triggersr16(sisusb, ramtype); ret |= WRITEL(ramptr + 0, 0x01234567); ret |= WRITEL(ramptr + 4, 0x456789ab); ret |= WRITEL(ramptr + 8, 0x89abcdef); ret |= WRITEL(ramptr + 12, 0xcdef0123); ret |= WRITEL(ramptr + 16, 0x55555555); ret |= WRITEL(ramptr + 20, 0x55555555); ret |= WRITEL(ramptr + 24, 0xffffffff); ret |= WRITEL(ramptr + 28, 0xffffffff); ret |= READL(ramptr + 0, &t0); ret |= READL(ramptr + 4, &t1); ret |= READL(ramptr + 8, &t2); ret |= READL(ramptr + 12, &t3); if (ramtype <= 1) { *chab = 0; *bw = 64; if ((t3 != 0xcdef0123) || (t2 != 0x89abcdef)) { if ((t1 == 0x456789ab) && (t0 == 0x01234567)) { *chab = 0; *bw = 64; ret |= SETIREGAND(SISSR, 0x14, 0xfd); } } if ((t1 != 0x456789ab) || (t0 != 0x01234567)) { *chab = 1; *bw = 64; ret |= SETIREGANDOR(SISSR, 0x14, 0xfc, 0x01); ret |= sisusb_triggersr16(sisusb, ramtype); ret |= WRITEL(ramptr + 0, 0x89abcdef); ret |= WRITEL(ramptr + 4, 0xcdef0123); ret |= WRITEL(ramptr + 8, 0x55555555); ret |= WRITEL(ramptr + 12, 0x55555555); ret |= WRITEL(ramptr + 16, 0xaaaaaaaa); ret |= WRITEL(ramptr + 20, 0xaaaaaaaa); ret |= READL(ramptr + 4, &t1); if (t1 != 0xcdef0123) { *bw = 32; ret |= SETIREGOR(SISSR, 0x15, 0x10); } } } else { *chab = 0; *bw = 64; /* default: cha, bw = 64 */ done = 0; if (t1 == 0x456789ab) { if (t0 == 0x01234567) { *chab = 0; *bw = 64; done = 1; } } else { if (t0 == 0x01234567) { *chab = 0; *bw = 32; ret |= SETIREG(SISSR, 0x14, 0x00); done = 1; } } if (!done) { ret |= SETIREG(SISSR, 0x14, 0x03); ret |= sisusb_triggersr16(sisusb, ramtype); ret |= WRITEL(ramptr + 0, 0x01234567); ret |= WRITEL(ramptr + 4, 0x456789ab); ret |= WRITEL(ramptr + 8, 0x89abcdef); ret |= WRITEL(ramptr + 12, 0xcdef0123); ret |= WRITEL(ramptr + 16, 0x55555555); ret |= WRITEL(ramptr + 20, 0x55555555); ret |= WRITEL(ramptr + 24, 0xffffffff); ret |= WRITEL(ramptr + 28, 0xffffffff); ret |= READL(ramptr + 0, &t0); ret |= READL(ramptr + 4, &t1); if (t1 == 0x456789ab) { if (t0 == 0x01234567) { *chab = 1; *bw = 64; return ret; } /* else error */ } else { if (t0 == 0x01234567) { *chab = 1; *bw = 32; ret |= SETIREG(SISSR, 0x14, 0x01); } /* else error */ } } } return ret; } static int sisusb_verify_mclk(struct sisusb_usb_data *sisusb) { int ret = 0; u32 ramptr = SISUSB_PCI_MEMBASE; u8 tmp1, tmp2, i, j; ret |= WRITEB(ramptr, 0xaa); ret |= WRITEB(ramptr + 16, 0x55); ret |= READB(ramptr, &tmp1); ret |= READB(ramptr + 16, &tmp2); if ((tmp1 != 0xaa) || (tmp2 != 0x55)) { for (i = 0, j = 16; i < 2; i++, j += 16) { ret |= GETIREG(SISSR, 0x21, &tmp1); ret |= SETIREGAND(SISSR, 0x21, (tmp1 & 0xfb)); ret |= SETIREGOR(SISSR, 0x3c, 0x01); /* not on 330 */ ret |= SETIREGAND(SISSR, 0x3c, 0xfe); /* not on 330 */ ret |= SETIREG(SISSR, 0x21, tmp1); ret |= WRITEB(ramptr + 16 + j, j); ret |= READB(ramptr + 16 + j, &tmp1); if (tmp1 == j) { ret |= WRITEB(ramptr + j, j); break; } } } return ret; } static int sisusb_set_rank(struct sisusb_usb_data *sisusb, int *iret, int index, u8 rankno, u8 chab, const u8 dramtype[][5], int bw) { int ret = 0, ranksize; u8 tmp; *iret = 0; if ((rankno == 2) && (dramtype[index][0] == 2)) return ret; ranksize = dramtype[index][3] / 2 * bw / 32; if ((ranksize * rankno) > 128) return ret; tmp = 0; while ((ranksize >>= 1) > 0) tmp += 0x10; tmp |= ((rankno - 1) << 2); tmp |= ((bw / 64) & 0x02); tmp |= (chab & 0x01); ret = SETIREG(SISSR, 0x14, tmp); ret |= sisusb_triggersr16(sisusb, 0); /* sic! */ *iret = 1; return ret; } static int sisusb_check_rbc(struct sisusb_usb_data *sisusb, int *iret, u32 inc, int testn) { int ret = 0, i; u32 j, tmp; *iret = 0; for (i = 0, j = 0; i < testn; i++) { ret |= WRITEL(sisusb->vrambase + j, j); j += inc; } for (i = 0, j = 0; i < testn; i++) { ret |= READL(sisusb->vrambase + j, &tmp); if (tmp != j) return ret; j += inc; } *iret = 1; return ret; } static int sisusb_check_ranks(struct sisusb_usb_data *sisusb, int *iret, int rankno, int idx, int bw, const u8 rtype[][5]) { int ret = 0, i, i2ret; u32 inc; *iret = 0; for (i = rankno; i >= 1; i--) { inc = 1 << (rtype[idx][2] + rtype[idx][1] + rtype[idx][0] + bw / 64 + i); ret |= sisusb_check_rbc(sisusb, &i2ret, inc, 2); if (!i2ret) return ret; } inc = 1 << (rtype[idx][2] + bw / 64 + 2); ret |= sisusb_check_rbc(sisusb, &i2ret, inc, 4); if (!i2ret) return ret; inc = 1 << (10 + bw / 64); ret |= sisusb_check_rbc(sisusb, &i2ret, inc, 2); if (!i2ret) return ret; *iret = 1; return ret; } static int sisusb_get_sdram_size(struct sisusb_usb_data *sisusb, int *iret, int bw, int chab) { int ret = 0, i2ret = 0, i, j; static const u8 sdramtype[13][5] = { { 2, 12, 9, 64, 0x35 }, { 1, 13, 9, 64, 0x44 }, { 2, 12, 8, 32, 0x31 }, { 2, 11, 9, 32, 0x25 }, { 1, 12, 9, 32, 0x34 }, { 1, 13, 8, 32, 0x40 }, { 2, 11, 8, 16, 0x21 }, { 1, 12, 8, 16, 0x30 }, { 1, 11, 9, 16, 0x24 }, { 1, 11, 8, 8, 0x20 }, { 2, 9, 8, 4, 0x01 }, { 1, 10, 8, 4, 0x10 }, { 1, 9, 8, 2, 0x00 } }; *iret = 1; /* error */ for (i = 0; i < 13; i++) { ret |= SETIREGANDOR(SISSR, 0x13, 0x80, sdramtype[i][4]); for (j = 2; j > 0; j--) { ret |= sisusb_set_rank(sisusb, &i2ret, i, j, chab, sdramtype, bw); if (!i2ret) continue; ret |= sisusb_check_ranks(sisusb, &i2ret, j, i, bw, sdramtype); if (i2ret) { *iret = 0; /* ram size found */ return ret; } } } return ret; } static int sisusb_setup_screen(struct sisusb_usb_data *sisusb, int clrall, int drwfr) { int ret = 0; u32 address; int i, length, modex, modey, bpp; modex = 640; modey = 480; bpp = 2; address = sisusb->vrambase; /* Clear video ram */ if (clrall) length = sisusb->vramsize; else length = modex * bpp * modey; ret = sisusb_clear_vram(sisusb, address, length); if (!ret && drwfr) { for (i = 0; i < modex; i++) { address = sisusb->vrambase + (i * bpp); ret |= sisusb_write_memio_word(sisusb, SISUSB_TYPE_MEM, address, 0xf100); address += (modex * (modey-1) * bpp); ret |= sisusb_write_memio_word(sisusb, SISUSB_TYPE_MEM, address, 0xf100); } for (i = 0; i < modey; i++) { address = sisusb->vrambase + ((i * modex) * bpp); ret |= sisusb_write_memio_word(sisusb, SISUSB_TYPE_MEM, address, 0xf100); address += ((modex - 1) * bpp); ret |= sisusb_write_memio_word(sisusb, SISUSB_TYPE_MEM, address, 0xf100); } } return ret; } static void sisusb_set_default_mode(struct sisusb_usb_data *sisusb, int touchengines) { int i, j, modex, bpp, du; u8 sr31, cr63, tmp8; static const char attrdata[] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x01, 0x00, 0x00, 0x00 }; static const char crtcrdata[] = { 0x5f, 0x4f, 0x50, 0x82, 0x54, 0x80, 0x0b, 0x3e, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xea, 0x8c, 0xdf, 0x28, 0x40, 0xe7, 0x04, 0xa3, 0xff }; static const char grcdata[] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, 0x05, 0x0f, 0xff }; static const char crtcdata[] = { 0x5f, 0x4f, 0x4f, 0x83, 0x55, 0x81, 0x0b, 0x3e, 0xe9, 0x8b, 0xdf, 0xe8, 0x0c, 0x00, 0x00, 0x05, 0x00 }; modex = 640; bpp = 2; GETIREG(SISSR, 0x31, &sr31); GETIREG(SISCR, 0x63, &cr63); SETIREGOR(SISSR, 0x01, 0x20); SETIREG(SISCR, 0x63, cr63 & 0xbf); SETIREGOR(SISCR, 0x17, 0x80); SETIREGOR(SISSR, 0x1f, 0x04); SETIREGAND(SISSR, 0x07, 0xfb); SETIREG(SISSR, 0x00, 0x03); /* seq */ SETIREG(SISSR, 0x01, 0x21); SETIREG(SISSR, 0x02, 0x0f); SETIREG(SISSR, 0x03, 0x00); SETIREG(SISSR, 0x04, 0x0e); SETREG(SISMISCW, 0x23); /* misc */ for (i = 0; i <= 0x18; i++) { /* crtc */ SETIREG(SISCR, i, crtcrdata[i]); } for (i = 0; i <= 0x13; i++) { /* att */ GETREG(SISINPSTAT, &tmp8); SETREG(SISAR, i); SETREG(SISAR, attrdata[i]); } GETREG(SISINPSTAT, &tmp8); SETREG(SISAR, 0x14); SETREG(SISAR, 0x00); GETREG(SISINPSTAT, &tmp8); SETREG(SISAR, 0x20); GETREG(SISINPSTAT, &tmp8); for (i = 0; i <= 0x08; i++) { /* grc */ SETIREG(SISGR, i, grcdata[i]); } SETIREGAND(SISGR, 0x05, 0xbf); for (i = 0x0A; i <= 0x0E; i++) { /* clr ext */ SETIREG(SISSR, i, 0x00); } SETIREGAND(SISSR, 0x37, 0xfe); SETREG(SISMISCW, 0xef); /* sync */ SETIREG(SISCR, 0x11, 0x00); /* crtc */ for (j = 0x00, i = 0; i <= 7; i++, j++) SETIREG(SISCR, j, crtcdata[i]); for (j = 0x10; i <= 10; i++, j++) SETIREG(SISCR, j, crtcdata[i]); for (j = 0x15; i <= 12; i++, j++) SETIREG(SISCR, j, crtcdata[i]); for (j = 0x0A; i <= 15; i++, j++) SETIREG(SISSR, j, crtcdata[i]); SETIREG(SISSR, 0x0E, (crtcdata[16] & 0xE0)); SETIREGANDOR(SISCR, 0x09, 0x5f, ((crtcdata[16] & 0x01) << 5)); SETIREG(SISCR, 0x14, 0x4f); du = (modex / 16) * (bpp * 2); /* offset/pitch */ SETIREGANDOR(SISSR, 0x0e, 0xf0, ((du >> 8) & 0x0f)); SETIREG(SISCR, 0x13, (du & 0xff)); du <<= 5; tmp8 = du >> 8; SETIREG(SISSR, 0x10, tmp8); SETIREG(SISSR, 0x31, 0x00); /* VCLK */ SETIREG(SISSR, 0x2b, 0x1b); SETIREG(SISSR, 0x2c, 0xe1); SETIREG(SISSR, 0x2d, 0x01); SETIREGAND(SISSR, 0x3d, 0xfe); /* FIFO */ SETIREG(SISSR, 0x08, 0xae); SETIREGAND(SISSR, 0x09, 0xf0); SETIREG(SISSR, 0x08, 0x34); SETIREGOR(SISSR, 0x3d, 0x01); SETIREGAND(SISSR, 0x1f, 0x3f); /* mode regs */ SETIREGANDOR(SISSR, 0x06, 0xc0, 0x0a); SETIREG(SISCR, 0x19, 0x00); SETIREGAND(SISCR, 0x1a, 0xfc); SETIREGAND(SISSR, 0x0f, 0xb7); SETIREGAND(SISSR, 0x31, 0xfb); SETIREGANDOR(SISSR, 0x21, 0x1f, 0xa0); SETIREGAND(SISSR, 0x32, 0xf3); SETIREGANDOR(SISSR, 0x07, 0xf8, 0x03); SETIREG(SISCR, 0x52, 0x6c); SETIREG(SISCR, 0x0d, 0x00); /* adjust frame */ SETIREG(SISCR, 0x0c, 0x00); SETIREG(SISSR, 0x0d, 0x00); SETIREGAND(SISSR, 0x37, 0xfe); SETIREG(SISCR, 0x32, 0x20); SETIREGAND(SISSR, 0x01, 0xdf); /* enable display */ SETIREG(SISCR, 0x63, (cr63 & 0xbf)); SETIREG(SISSR, 0x31, (sr31 & 0xfb)); if (touchengines) { SETIREG(SISSR, 0x20, 0xa1); /* enable engines */ SETIREGOR(SISSR, 0x1e, 0x5a); SETIREG(SISSR, 0x26, 0x01); /* disable cmdqueue */ SETIREG(SISSR, 0x27, 0x1f); SETIREG(SISSR, 0x26, 0x00); } SETIREG(SISCR, 0x34, 0x44); /* we just set std mode #44 */ } static int sisusb_init_gfxcore(struct sisusb_usb_data *sisusb) { int ret = 0, i, j, bw, chab, iret, retry = 3; u8 tmp8, ramtype; u32 tmp32; static const char mclktable[] = { 0x3b, 0x22, 0x01, 143, 0x3b, 0x22, 0x01, 143, 0x3b, 0x22, 0x01, 143, 0x3b, 0x22, 0x01, 143 }; static const char eclktable[] = { 0x3b, 0x22, 0x01, 143, 0x3b, 0x22, 0x01, 143, 0x3b, 0x22, 0x01, 143, 0x3b, 0x22, 0x01, 143 }; static const char ramtypetable1[] = { 0x00, 0x04, 0x60, 0x60, 0x0f, 0x0f, 0x1f, 0x1f, 0xba, 0xba, 0xba, 0xba, 0xa9, 0xa9, 0xac, 0xac, 0xa0, 0xa0, 0xa0, 0xa8, 0x00, 0x00, 0x02, 0x02, 0x30, 0x30, 0x40, 0x40 }; static const char ramtypetable2[] = { 0x77, 0x77, 0x44, 0x44, 0x77, 0x77, 0x44, 0x44, 0x00, 0x00, 0x00, 0x00, 0x5b, 0x5b, 0xab, 0xab, 0x00, 0x00, 0xf0, 0xf8 }; while (retry--) { /* Enable VGA */ ret = GETREG(SISVGAEN, &tmp8); ret |= SETREG(SISVGAEN, (tmp8 | 0x01)); /* Enable GPU access to VRAM */ ret |= GETREG(SISMISCR, &tmp8); ret |= SETREG(SISMISCW, (tmp8 | 0x01)); if (ret) continue; /* Reset registers */ ret |= SETIREGAND(SISCR, 0x5b, 0xdf); ret |= SETIREG(SISSR, 0x05, 0x86); ret |= SETIREGOR(SISSR, 0x20, 0x01); ret |= SETREG(SISMISCW, 0x67); for (i = 0x06; i <= 0x1f; i++) ret |= SETIREG(SISSR, i, 0x00); for (i = 0x21; i <= 0x27; i++) ret |= SETIREG(SISSR, i, 0x00); for (i = 0x31; i <= 0x3d; i++) ret |= SETIREG(SISSR, i, 0x00); for (i = 0x12; i <= 0x1b; i++) ret |= SETIREG(SISSR, i, 0x00); for (i = 0x79; i <= 0x7c; i++) ret |= SETIREG(SISCR, i, 0x00); if (ret) continue; ret |= SETIREG(SISCR, 0x63, 0x80); ret |= GETIREG(SISSR, 0x3a, &ramtype); ramtype &= 0x03; ret |= SETIREG(SISSR, 0x28, mclktable[ramtype * 4]); ret |= SETIREG(SISSR, 0x29, mclktable[(ramtype * 4) + 1]); ret |= SETIREG(SISSR, 0x2a, mclktable[(ramtype * 4) + 2]); ret |= SETIREG(SISSR, 0x2e, eclktable[ramtype * 4]); ret |= SETIREG(SISSR, 0x2f, eclktable[(ramtype * 4) + 1]); ret |= SETIREG(SISSR, 0x30, eclktable[(ramtype * 4) + 2]); ret |= SETIREG(SISSR, 0x07, 0x18); ret |= SETIREG(SISSR, 0x11, 0x0f); if (ret) continue; for (i = 0x15, j = 0; i <= 0x1b; i++, j++) { ret |= SETIREG(SISSR, i, ramtypetable1[(j*4) + ramtype]); } for (i = 0x40, j = 0; i <= 0x44; i++, j++) { ret |= SETIREG(SISCR, i, ramtypetable2[(j*4) + ramtype]); } ret |= SETIREG(SISCR, 0x49, 0xaa); ret |= SETIREG(SISSR, 0x1f, 0x00); ret |= SETIREG(SISSR, 0x20, 0xa0); ret |= SETIREG(SISSR, 0x23, 0xf6); ret |= SETIREG(SISSR, 0x24, 0x0d); ret |= SETIREG(SISSR, 0x25, 0x33); ret |= SETIREG(SISSR, 0x11, 0x0f); ret |= SETIREGOR(SISPART1, 0x2f, 0x01); ret |= SETIREGAND(SISCAP, 0x3f, 0xef); if (ret) continue; ret |= SETIREG(SISPART1, 0x00, 0x00); ret |= GETIREG(SISSR, 0x13, &tmp8); tmp8 >>= 4; ret |= SETIREG(SISPART1, 0x02, 0x00); ret |= SETIREG(SISPART1, 0x2e, 0x08); ret |= sisusb_read_pci_config(sisusb, 0x50, &tmp32); tmp32 &= 0x00f00000; tmp8 = (tmp32 == 0x100000) ? 0x33 : 0x03; ret |= SETIREG(SISSR, 0x25, tmp8); tmp8 = (tmp32 == 0x100000) ? 0xaa : 0x88; ret |= SETIREG(SISCR, 0x49, tmp8); ret |= SETIREG(SISSR, 0x27, 0x1f); ret |= SETIREG(SISSR, 0x31, 0x00); ret |= SETIREG(SISSR, 0x32, 0x11); ret |= SETIREG(SISSR, 0x33, 0x00); if (ret) continue; ret |= SETIREG(SISCR, 0x83, 0x00); sisusb_set_default_mode(sisusb, 0); ret |= SETIREGAND(SISSR, 0x21, 0xdf); ret |= SETIREGOR(SISSR, 0x01, 0x20); ret |= SETIREGOR(SISSR, 0x16, 0x0f); ret |= sisusb_triggersr16(sisusb, ramtype); /* Disable refresh */ ret |= SETIREGAND(SISSR, 0x17, 0xf8); ret |= SETIREGOR(SISSR, 0x19, 0x03); ret |= sisusb_getbuswidth(sisusb, &bw, &chab); ret |= sisusb_verify_mclk(sisusb); if (ramtype <= 1) { ret |= sisusb_get_sdram_size(sisusb, &iret, bw, chab); if (iret) { dev_err(&sisusb->sisusb_dev->dev, "RAM size detection failed, assuming 8MB video RAM\n"); ret |= SETIREG(SISSR, 0x14, 0x31); /* TODO */ } } else { dev_err(&sisusb->sisusb_dev->dev, "DDR RAM device found, assuming 8MB video RAM\n"); ret |= SETIREG(SISSR, 0x14, 0x31); /* *** TODO *** */ } /* Enable refresh */ ret |= SETIREG(SISSR, 0x16, ramtypetable1[4 + ramtype]); ret |= SETIREG(SISSR, 0x17, ramtypetable1[8 + ramtype]); ret |= SETIREG(SISSR, 0x19, ramtypetable1[16 + ramtype]); ret |= SETIREGOR(SISSR, 0x21, 0x20); ret |= SETIREG(SISSR, 0x22, 0xfb); ret |= SETIREG(SISSR, 0x21, 0xa5); if (ret == 0) break; } return ret; } #undef SETREG #undef GETREG #undef SETIREG #undef GETIREG #undef SETIREGOR #undef SETIREGAND #undef SETIREGANDOR #undef READL #undef WRITEL static void sisusb_get_ramconfig(struct sisusb_usb_data *sisusb) { u8 tmp8, tmp82, ramtype; int bw = 0; char *ramtypetext1 = NULL; static const char ram_datarate[4] = {'S', 'S', 'D', 'D'}; static const char ram_dynamictype[4] = {'D', 'G', 'D', 'G'}; static const int busSDR[4] = {64, 64, 128, 128}; static const int busDDR[4] = {32, 32, 64, 64}; static const int busDDRA[4] = {64+32, 64+32, (64+32)*2, (64+32)*2}; sisusb_getidxreg(sisusb, SISSR, 0x14, &tmp8); sisusb_getidxreg(sisusb, SISSR, 0x15, &tmp82); sisusb_getidxreg(sisusb, SISSR, 0x3a, &ramtype); sisusb->vramsize = (1 << ((tmp8 & 0xf0) >> 4)) * 1024 * 1024; ramtype &= 0x03; switch ((tmp8 >> 2) & 0x03) { case 0: ramtypetext1 = "1 ch/1 r"; if (tmp82 & 0x10) bw = 32; else bw = busSDR[(tmp8 & 0x03)]; break; case 1: ramtypetext1 = "1 ch/2 r"; sisusb->vramsize <<= 1; bw = busSDR[(tmp8 & 0x03)]; break; case 2: ramtypetext1 = "asymmetric"; sisusb->vramsize += sisusb->vramsize/2; bw = busDDRA[(tmp8 & 0x03)]; break; case 3: ramtypetext1 = "2 channel"; sisusb->vramsize <<= 1; bw = busDDR[(tmp8 & 0x03)]; break; } dev_info(&sisusb->sisusb_dev->dev, "%dMB %s %cDR S%cRAM, bus width %d\n", sisusb->vramsize >> 20, ramtypetext1, ram_datarate[ramtype], ram_dynamictype[ramtype], bw); } static int sisusb_do_init_gfxdevice(struct sisusb_usb_data *sisusb) { struct sisusb_packet packet; int ret; u32 tmp32; /* Do some magic */ packet.header = 0x001f; packet.address = 0x00000324; packet.data = 0x00000004; ret = sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x001f; packet.address = 0x00000364; packet.data = 0x00000004; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x001f; packet.address = 0x00000384; packet.data = 0x00000004; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x001f; packet.address = 0x00000100; packet.data = 0x00000700; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); packet.header = 0x000f; packet.address = 0x00000004; ret |= sisusb_send_bridge_packet(sisusb, 6, &packet, 0); packet.data |= 0x17; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); /* Init BAR 0 (VRAM) */ ret |= sisusb_read_pci_config(sisusb, 0x10, &tmp32); ret |= sisusb_write_pci_config(sisusb, 0x10, 0xfffffff0); ret |= sisusb_read_pci_config(sisusb, 0x10, &tmp32); tmp32 &= 0x0f; tmp32 |= SISUSB_PCI_MEMBASE; ret |= sisusb_write_pci_config(sisusb, 0x10, tmp32); /* Init BAR 1 (MMIO) */ ret |= sisusb_read_pci_config(sisusb, 0x14, &tmp32); ret |= sisusb_write_pci_config(sisusb, 0x14, 0xfffffff0); ret |= sisusb_read_pci_config(sisusb, 0x14, &tmp32); tmp32 &= 0x0f; tmp32 |= SISUSB_PCI_MMIOBASE; ret |= sisusb_write_pci_config(sisusb, 0x14, tmp32); /* Init BAR 2 (i/o ports) */ ret |= sisusb_read_pci_config(sisusb, 0x18, &tmp32); ret |= sisusb_write_pci_config(sisusb, 0x18, 0xfffffff0); ret |= sisusb_read_pci_config(sisusb, 0x18, &tmp32); tmp32 &= 0x0f; tmp32 |= SISUSB_PCI_IOPORTBASE; ret |= sisusb_write_pci_config(sisusb, 0x18, tmp32); /* Enable memory and i/o access */ ret |= sisusb_read_pci_config(sisusb, 0x04, &tmp32); tmp32 |= 0x3; ret |= sisusb_write_pci_config(sisusb, 0x04, tmp32); if (ret == 0) { /* Some further magic */ packet.header = 0x001f; packet.address = 0x00000050; packet.data = 0x000000ff; ret |= sisusb_send_bridge_packet(sisusb, 10, &packet, 0); } return ret; } /* Initialize the graphics device (return 0 on success) * This initializes the net2280 as well as the PCI registers * of the graphics board. */ static int sisusb_init_gfxdevice(struct sisusb_usb_data *sisusb, int initscreen) { int ret = 0, test = 0; u32 tmp32; if (sisusb->devinit == 1) { /* Read PCI BARs and see if they have been set up */ ret |= sisusb_read_pci_config(sisusb, 0x10, &tmp32); if (ret) return ret; if ((tmp32 & 0xfffffff0) == SISUSB_PCI_MEMBASE) test++; ret |= sisusb_read_pci_config(sisusb, 0x14, &tmp32); if (ret) return ret; if ((tmp32 & 0xfffffff0) == SISUSB_PCI_MMIOBASE) test++; ret |= sisusb_read_pci_config(sisusb, 0x18, &tmp32); if (ret) return ret; if ((tmp32 & 0xfffffff0) == SISUSB_PCI_IOPORTBASE) test++; } /* No? So reset the device */ if ((sisusb->devinit == 0) || (test != 3)) { ret |= sisusb_do_init_gfxdevice(sisusb); if (ret == 0) sisusb->devinit = 1; } if (sisusb->devinit) { /* Initialize the graphics core */ if (sisusb_init_gfxcore(sisusb) == 0) { sisusb->gfxinit = 1; sisusb_get_ramconfig(sisusb); sisusb_set_default_mode(sisusb, 1); ret |= sisusb_setup_screen(sisusb, 1, initscreen); } } return ret; } /* fops */ static int sisusb_open(struct inode *inode, struct file *file) { struct sisusb_usb_data *sisusb; struct usb_interface *interface; int subminor = iminor(inode); interface = usb_find_interface(&sisusb_driver, subminor); if (!interface) return -ENODEV; sisusb = usb_get_intfdata(interface); if (!sisusb) return -ENODEV; mutex_lock(&sisusb->lock); if (!sisusb->present || !sisusb->ready) { mutex_unlock(&sisusb->lock); return -ENODEV; } if (sisusb->isopen) { mutex_unlock(&sisusb->lock); return -EBUSY; } if (!sisusb->devinit) { if (sisusb->sisusb_dev->speed == USB_SPEED_HIGH || sisusb->sisusb_dev->speed >= USB_SPEED_SUPER) { if (sisusb_init_gfxdevice(sisusb, 0)) { mutex_unlock(&sisusb->lock); dev_err(&sisusb->sisusb_dev->dev, "Failed to initialize device\n"); return -EIO; } } else { mutex_unlock(&sisusb->lock); dev_err(&sisusb->sisusb_dev->dev, "Device not attached to USB 2.0 hub\n"); return -EIO; } } /* Increment usage count for our sisusb */ kref_get(&sisusb->kref); sisusb->isopen = 1; file->private_data = sisusb; mutex_unlock(&sisusb->lock); return 0; } static void sisusb_delete(struct kref *kref) { struct sisusb_usb_data *sisusb = to_sisusb_dev(kref); if (!sisusb) return; usb_put_dev(sisusb->sisusb_dev); sisusb->sisusb_dev = NULL; sisusb_free_buffers(sisusb); sisusb_free_urbs(sisusb); kfree(sisusb); } static int sisusb_release(struct inode *inode, struct file *file) { struct sisusb_usb_data *sisusb; sisusb = file->private_data; if (!sisusb) return -ENODEV; mutex_lock(&sisusb->lock); if (sisusb->present) { /* Wait for all URBs to finish if device still present */ if (!sisusb_wait_all_out_complete(sisusb)) sisusb_kill_all_busy(sisusb); } sisusb->isopen = 0; file->private_data = NULL; mutex_unlock(&sisusb->lock); /* decrement the usage count on our device */ kref_put(&sisusb->kref, sisusb_delete); return 0; } static ssize_t sisusb_read(struct file *file, char __user *buffer, size_t count, loff_t *ppos) { struct sisusb_usb_data *sisusb; ssize_t bytes_read = 0; int errno = 0; u8 buf8; u16 buf16; u32 buf32, address; sisusb = file->private_data; if (!sisusb) return -ENODEV; mutex_lock(&sisusb->lock); /* Sanity check */ if (!sisusb->present || !sisusb->ready || !sisusb->sisusb_dev) { mutex_unlock(&sisusb->lock); return -ENODEV; } if ((*ppos) >= SISUSB_PCI_PSEUDO_IOPORTBASE && (*ppos) < SISUSB_PCI_PSEUDO_IOPORTBASE + 128) { address = (*ppos) - SISUSB_PCI_PSEUDO_IOPORTBASE + SISUSB_PCI_IOPORTBASE; /* Read i/o ports * Byte, word and long(32) can be read. As this * emulates inX instructions, the data returned is * in machine-endianness. */ switch (count) { case 1: if (sisusb_read_memio_byte(sisusb, SISUSB_TYPE_IO, address, &buf8)) errno = -EIO; else if (put_user(buf8, (u8 __user *)buffer)) errno = -EFAULT; else bytes_read = 1; break; case 2: if (sisusb_read_memio_word(sisusb, SISUSB_TYPE_IO, address, &buf16)) errno = -EIO; else if (put_user(buf16, (u16 __user *)buffer)) errno = -EFAULT; else bytes_read = 2; break; case 4: if (sisusb_read_memio_long(sisusb, SISUSB_TYPE_IO, address, &buf32)) errno = -EIO; else if (put_user(buf32, (u32 __user *)buffer)) errno = -EFAULT; else bytes_read = 4; break; default: errno = -EIO; } } else if ((*ppos) >= SISUSB_PCI_PSEUDO_MEMBASE && (*ppos) < SISUSB_PCI_PSEUDO_MEMBASE + sisusb->vramsize) { address = (*ppos) - SISUSB_PCI_PSEUDO_MEMBASE + SISUSB_PCI_MEMBASE; /* Read video ram * Remember: Data delivered is never endian-corrected */ errno = sisusb_read_mem_bulk(sisusb, address, NULL, count, buffer, &bytes_read); if (bytes_read) errno = bytes_read; } else if ((*ppos) >= SISUSB_PCI_PSEUDO_MMIOBASE && (*ppos) < SISUSB_PCI_PSEUDO_MMIOBASE + SISUSB_PCI_MMIOSIZE) { address = (*ppos) - SISUSB_PCI_PSEUDO_MMIOBASE + SISUSB_PCI_MMIOBASE; /* Read MMIO * Remember: Data delivered is never endian-corrected */ errno = sisusb_read_mem_bulk(sisusb, address, NULL, count, buffer, &bytes_read); if (bytes_read) errno = bytes_read; } else if ((*ppos) >= SISUSB_PCI_PSEUDO_PCIBASE && (*ppos) <= SISUSB_PCI_PSEUDO_PCIBASE + 0x5c) { if (count != 4) { mutex_unlock(&sisusb->lock); return -EINVAL; } address = (*ppos) - SISUSB_PCI_PSEUDO_PCIBASE; /* Read PCI config register * Return value delivered in machine endianness. */ if (sisusb_read_pci_config(sisusb, address, &buf32)) errno = -EIO; else if (put_user(buf32, (u32 __user *)buffer)) errno = -EFAULT; else bytes_read = 4; } else { errno = -EBADFD; } (*ppos) += bytes_read; mutex_unlock(&sisusb->lock); return errno ? errno : bytes_read; } static ssize_t sisusb_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { struct sisusb_usb_data *sisusb; int errno = 0; ssize_t bytes_written = 0; u8 buf8; u16 buf16; u32 buf32, address; sisusb = file->private_data; if (!sisusb) return -ENODEV; mutex_lock(&sisusb->lock); /* Sanity check */ if (!sisusb->present || !sisusb->ready || !sisusb->sisusb_dev) { mutex_unlock(&sisusb->lock); return -ENODEV; } if ((*ppos) >= SISUSB_PCI_PSEUDO_IOPORTBASE && (*ppos) < SISUSB_PCI_PSEUDO_IOPORTBASE + 128) { address = (*ppos) - SISUSB_PCI_PSEUDO_IOPORTBASE + SISUSB_PCI_IOPORTBASE; /* Write i/o ports * Byte, word and long(32) can be written. As this * emulates outX instructions, the data is expected * in machine-endianness. */ switch (count) { case 1: if (get_user(buf8, (u8 __user *)buffer)) errno = -EFAULT; else if (sisusb_write_memio_byte(sisusb, SISUSB_TYPE_IO, address, buf8)) errno = -EIO; else bytes_written = 1; break; case 2: if (get_user(buf16, (u16 __user *)buffer)) errno = -EFAULT; else if (sisusb_write_memio_word(sisusb, SISUSB_TYPE_IO, address, buf16)) errno = -EIO; else bytes_written = 2; break; case 4: if (get_user(buf32, (u32 __user *)buffer)) errno = -EFAULT; else if (sisusb_write_memio_long(sisusb, SISUSB_TYPE_IO, address, buf32)) errno = -EIO; else bytes_written = 4; break; default: errno = -EIO; } } else if ((*ppos) >= SISUSB_PCI_PSEUDO_MEMBASE && (*ppos) < SISUSB_PCI_PSEUDO_MEMBASE + sisusb->vramsize) { address = (*ppos) - SISUSB_PCI_PSEUDO_MEMBASE + SISUSB_PCI_MEMBASE; /* Write video ram. * Buffer is copied 1:1, therefore, on big-endian * machines, the data must be swapped by userland * in advance (if applicable; no swapping in 8bpp * mode or if YUV data is being transferred). */ errno = sisusb_write_mem_bulk(sisusb, address, NULL, count, buffer, 0, &bytes_written); if (bytes_written) errno = bytes_written; } else if ((*ppos) >= SISUSB_PCI_PSEUDO_MMIOBASE && (*ppos) < SISUSB_PCI_PSEUDO_MMIOBASE + SISUSB_PCI_MMIOSIZE) { address = (*ppos) - SISUSB_PCI_PSEUDO_MMIOBASE + SISUSB_PCI_MMIOBASE; /* Write MMIO. * Buffer is copied 1:1, therefore, on big-endian * machines, the data must be swapped by userland * in advance. */ errno = sisusb_write_mem_bulk(sisusb, address, NULL, count, buffer, 0, &bytes_written); if (bytes_written) errno = bytes_written; } else if ((*ppos) >= SISUSB_PCI_PSEUDO_PCIBASE && (*ppos) <= SISUSB_PCI_PSEUDO_PCIBASE + SISUSB_PCI_PCONFSIZE) { if (count != 4) { mutex_unlock(&sisusb->lock); return -EINVAL; } address = (*ppos) - SISUSB_PCI_PSEUDO_PCIBASE; /* Write PCI config register. * Given value expected in machine endianness. */ if (get_user(buf32, (u32 __user *)buffer)) errno = -EFAULT; else if (sisusb_write_pci_config(sisusb, address, buf32)) errno = -EIO; else bytes_written = 4; } else { /* Error */ errno = -EBADFD; } (*ppos) += bytes_written; mutex_unlock(&sisusb->lock); return errno ? errno : bytes_written; } static loff_t sisusb_lseek(struct file *file, loff_t offset, int orig) { struct sisusb_usb_data *sisusb; loff_t ret; sisusb = file->private_data; if (!sisusb) return -ENODEV; mutex_lock(&sisusb->lock); /* Sanity check */ if (!sisusb->present || !sisusb->ready || !sisusb->sisusb_dev) { mutex_unlock(&sisusb->lock); return -ENODEV; } ret = no_seek_end_llseek(file, offset, orig); mutex_unlock(&sisusb->lock); return ret; } static int sisusb_handle_command(struct sisusb_usb_data *sisusb, struct sisusb_command *y, unsigned long arg) { int retval, length; u32 port, address; /* All our commands require the device * to be initialized. */ if (!sisusb->devinit) return -ENODEV; port = y->data3 - SISUSB_PCI_PSEUDO_IOPORTBASE + SISUSB_PCI_IOPORTBASE; switch (y->operation) { case SUCMD_GET: retval = sisusb_getidxreg(sisusb, port, y->data0, &y->data1); if (!retval) { if (copy_to_user((void __user *)arg, y, sizeof(*y))) retval = -EFAULT; } break; case SUCMD_SET: retval = sisusb_setidxreg(sisusb, port, y->data0, y->data1); break; case SUCMD_SETOR: retval = sisusb_setidxregor(sisusb, port, y->data0, y->data1); break; case SUCMD_SETAND: retval = sisusb_setidxregand(sisusb, port, y->data0, y->data1); break; case SUCMD_SETANDOR: retval = sisusb_setidxregandor(sisusb, port, y->data0, y->data1, y->data2); break; case SUCMD_SETMASK: retval = sisusb_setidxregmask(sisusb, port, y->data0, y->data1, y->data2); break; case SUCMD_CLRSCR: /* Gfx core must be initialized */ if (!sisusb->gfxinit) return -ENODEV; length = (y->data0 << 16) | (y->data1 << 8) | y->data2; address = y->data3 - SISUSB_PCI_PSEUDO_MEMBASE + SISUSB_PCI_MEMBASE; retval = sisusb_clear_vram(sisusb, address, length); break; case SUCMD_HANDLETEXTMODE: retval = 0; break; default: retval = -EINVAL; } if (retval > 0) retval = -EIO; return retval; } static long sisusb_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct sisusb_usb_data *sisusb; struct sisusb_info x; struct sisusb_command y; long retval = 0; u32 __user *argp = (u32 __user *)arg; sisusb = file->private_data; if (!sisusb) return -ENODEV; mutex_lock(&sisusb->lock); /* Sanity check */ if (!sisusb->present || !sisusb->ready || !sisusb->sisusb_dev) { retval = -ENODEV; goto err_out; } switch (cmd) { case SISUSB_GET_CONFIG_SIZE: if (put_user(sizeof(x), argp)) retval = -EFAULT; break; case SISUSB_GET_CONFIG: x.sisusb_id = SISUSB_ID; x.sisusb_version = SISUSB_VERSION; x.sisusb_revision = SISUSB_REVISION; x.sisusb_patchlevel = SISUSB_PATCHLEVEL; x.sisusb_gfxinit = sisusb->gfxinit; x.sisusb_vrambase = SISUSB_PCI_PSEUDO_MEMBASE; x.sisusb_mmiobase = SISUSB_PCI_PSEUDO_MMIOBASE; x.sisusb_iobase = SISUSB_PCI_PSEUDO_IOPORTBASE; x.sisusb_pcibase = SISUSB_PCI_PSEUDO_PCIBASE; x.sisusb_vramsize = sisusb->vramsize; x.sisusb_minor = sisusb->minor; x.sisusb_fbdevactive = 0; x.sisusb_conactive = 0; memset(x.sisusb_reserved, 0, sizeof(x.sisusb_reserved)); if (copy_to_user((void __user *)arg, &x, sizeof(x))) retval = -EFAULT; break; case SISUSB_COMMAND: if (copy_from_user(&y, (void __user *)arg, sizeof(y))) retval = -EFAULT; else retval = sisusb_handle_command(sisusb, &y, arg); break; default: retval = -ENOTTY; break; } err_out: mutex_unlock(&sisusb->lock); return retval; } #ifdef CONFIG_COMPAT static long sisusb_compat_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { switch (cmd) { case SISUSB_GET_CONFIG_SIZE: case SISUSB_GET_CONFIG: case SISUSB_COMMAND: return sisusb_ioctl(f, cmd, arg); default: return -ENOIOCTLCMD; } } #endif static const struct file_operations usb_sisusb_fops = { .owner = THIS_MODULE, .open = sisusb_open, .release = sisusb_release, .read = sisusb_read, .write = sisusb_write, .llseek = sisusb_lseek, #ifdef CONFIG_COMPAT .compat_ioctl = sisusb_compat_ioctl, #endif .unlocked_ioctl = sisusb_ioctl }; static struct usb_class_driver usb_sisusb_class = { .name = "sisusbvga%d", .fops = &usb_sisusb_fops, .minor_base = SISUSB_MINOR }; static int sisusb_probe(struct usb_interface *intf, const struct usb_device_id *id) { struct usb_device *dev = interface_to_usbdev(intf); struct sisusb_usb_data *sisusb; int retval = 0, i; static const u8 ep_addresses[] = { SISUSB_EP_GFX_IN | USB_DIR_IN, SISUSB_EP_GFX_OUT | USB_DIR_OUT, SISUSB_EP_GFX_BULK_OUT | USB_DIR_OUT, SISUSB_EP_GFX_LBULK_OUT | USB_DIR_OUT, SISUSB_EP_BRIDGE_IN | USB_DIR_IN, SISUSB_EP_BRIDGE_OUT | USB_DIR_OUT, 0}; /* Are the expected endpoints present? */ if (!usb_check_bulk_endpoints(intf, ep_addresses)) { dev_err(&intf->dev, "Invalid USB2VGA device\n"); return -EINVAL; } dev_info(&dev->dev, "USB2VGA dongle found at address %d\n", dev->devnum); /* Allocate memory for our private */ sisusb = kzalloc(sizeof(*sisusb), GFP_KERNEL); if (!sisusb) return -ENOMEM; kref_init(&sisusb->kref); mutex_init(&(sisusb->lock)); sisusb->sisusb_dev = dev; sisusb->vrambase = SISUSB_PCI_MEMBASE; sisusb->mmiobase = SISUSB_PCI_MMIOBASE; sisusb->mmiosize = SISUSB_PCI_MMIOSIZE; sisusb->ioportbase = SISUSB_PCI_IOPORTBASE; /* Everything else is zero */ /* Register device */ retval = usb_register_dev(intf, &usb_sisusb_class); if (retval) { dev_err(&sisusb->sisusb_dev->dev, "Failed to get a minor for device %d\n", dev->devnum); retval = -ENODEV; goto error_1; } sisusb->minor = intf->minor; /* Allocate buffers */ sisusb->ibufsize = SISUSB_IBUF_SIZE; sisusb->ibuf = kmalloc(SISUSB_IBUF_SIZE, GFP_KERNEL); if (!sisusb->ibuf) { retval = -ENOMEM; goto error_2; } sisusb->numobufs = 0; sisusb->obufsize = SISUSB_OBUF_SIZE; for (i = 0; i < NUMOBUFS; i++) { sisusb->obuf[i] = kmalloc(SISUSB_OBUF_SIZE, GFP_KERNEL); if (!sisusb->obuf[i]) { if (i == 0) { retval = -ENOMEM; goto error_3; } break; } sisusb->numobufs++; } /* Allocate URBs */ sisusb->sisurbin = usb_alloc_urb(0, GFP_KERNEL); if (!sisusb->sisurbin) { retval = -ENOMEM; goto error_3; } sisusb->completein = 1; for (i = 0; i < sisusb->numobufs; i++) { sisusb->sisurbout[i] = usb_alloc_urb(0, GFP_KERNEL); if (!sisusb->sisurbout[i]) { retval = -ENOMEM; goto error_4; } sisusb->urbout_context[i].sisusb = (void *)sisusb; sisusb->urbout_context[i].urbindex = i; sisusb->urbstatus[i] = 0; } dev_info(&sisusb->sisusb_dev->dev, "Allocated %d output buffers\n", sisusb->numobufs); /* Do remaining init stuff */ init_waitqueue_head(&sisusb->wait_q); usb_set_intfdata(intf, sisusb); usb_get_dev(sisusb->sisusb_dev); sisusb->present = 1; if (dev->speed == USB_SPEED_HIGH || dev->speed >= USB_SPEED_SUPER) { int initscreen = 1; if (sisusb_init_gfxdevice(sisusb, initscreen)) dev_err(&sisusb->sisusb_dev->dev, "Failed to early initialize device\n"); } else dev_info(&sisusb->sisusb_dev->dev, "Not attached to USB 2.0 hub, deferring init\n"); sisusb->ready = 1; #ifdef SISUSBENDIANTEST dev_dbg(&sisusb->sisusb_dev->dev, "*** RWTEST ***\n"); sisusb_testreadwrite(sisusb); dev_dbg(&sisusb->sisusb_dev->dev, "*** RWTEST END ***\n"); #endif return 0; error_4: sisusb_free_urbs(sisusb); error_3: sisusb_free_buffers(sisusb); error_2: usb_deregister_dev(intf, &usb_sisusb_class); error_1: kfree(sisusb); return retval; } static void sisusb_disconnect(struct usb_interface *intf) { struct sisusb_usb_data *sisusb; /* This should *not* happen */ sisusb = usb_get_intfdata(intf); if (!sisusb) return; usb_deregister_dev(intf, &usb_sisusb_class); mutex_lock(&sisusb->lock); /* Wait for all URBs to complete and kill them in case (MUST do) */ if (!sisusb_wait_all_out_complete(sisusb)) sisusb_kill_all_busy(sisusb); usb_set_intfdata(intf, NULL); sisusb->present = 0; sisusb->ready = 0; mutex_unlock(&sisusb->lock); /* decrement our usage count */ kref_put(&sisusb->kref, sisusb_delete); } static const struct usb_device_id sisusb_table[] = { { USB_DEVICE(0x0711, 0x0550) }, { USB_DEVICE(0x0711, 0x0900) }, { USB_DEVICE(0x0711, 0x0901) }, { USB_DEVICE(0x0711, 0x0902) }, { USB_DEVICE(0x0711, 0x0903) }, { USB_DEVICE(0x0711, 0x0918) }, { USB_DEVICE(0x0711, 0x0920) }, { USB_DEVICE(0x0711, 0x0950) }, { USB_DEVICE(0x0711, 0x5200) }, { USB_DEVICE(0x182d, 0x021c) }, { USB_DEVICE(0x182d, 0x0269) }, { } }; MODULE_DEVICE_TABLE(usb, sisusb_table); static struct usb_driver sisusb_driver = { .name = "sisusb", .probe = sisusb_probe, .disconnect = sisusb_disconnect, .id_table = sisusb_table, }; module_usb_driver(sisusb_driver); MODULE_AUTHOR("Thomas Winischhofer <thomas@winischhofer.net>"); MODULE_DESCRIPTION("sisusbvga - Driver for Net2280/SiS315-based USB2VGA dongles"); MODULE_LICENSE("GPL");
3 10 69 71 76 7 71 76 74 71 5 70 71 1 71 45 76 82 149 64 65 64 64 64 25 41 25 24 13 25 51 51 1 50 7 7 41 50 24 89 91 90 7 9 8 7 86 82 18 5 5 69 87 5 1 5 89 90 3 3 3 3 66 66 2 2 2 57 58 1 1 1 1 52 53 2 2 2 18 18 2 2 16 16 2 2 2 1 4 4 4 1 92 91 10 3 1 2 9 8 9 90 9 9 9 8 92 10 90 4 3 3 58 39 27 26 27 27 27 14 14 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 // SPDX-License-Identifier: GPL-2.0-or-later /* * Asynchronous Cryptographic Hash operations. * * This is the implementation of the ahash (asynchronous hash) API. It differs * from shash (synchronous hash) in that ahash supports asynchronous operations, * and it hashes data from scatterlists instead of virtually addressed buffers. * * The ahash API provides access to both ahash and shash algorithms. The shash * API only provides access to shash algorithms. * * Copyright (c) 2008 Loc Ho <lho@amcc.com> */ #include <crypto/scatterwalk.h> #include <linux/cryptouser.h> #include <linux/err.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/slab.h> #include <linux/seq_file.h> #include <linux/string.h> #include <linux/string_choices.h> #include <net/netlink.h> #include "hash.h" #define CRYPTO_ALG_TYPE_AHASH_MASK 0x0000000e static int ahash_def_finup(struct ahash_request *req); static inline bool crypto_ahash_block_only(struct crypto_ahash *tfm) { return crypto_ahash_alg(tfm)->halg.base.cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY; } static inline bool crypto_ahash_final_nonzero(struct crypto_ahash *tfm) { return crypto_ahash_alg(tfm)->halg.base.cra_flags & CRYPTO_AHASH_ALG_FINAL_NONZERO; } static inline bool crypto_ahash_need_fallback(struct crypto_ahash *tfm) { return crypto_ahash_alg(tfm)->halg.base.cra_flags & CRYPTO_ALG_NEED_FALLBACK; } static inline void ahash_op_done(void *data, int err, int (*finish)(struct ahash_request *, int)) { struct ahash_request *areq = data; crypto_completion_t compl; compl = areq->saved_complete; data = areq->saved_data; if (err == -EINPROGRESS) goto out; areq->base.flags &= ~CRYPTO_TFM_REQ_MAY_SLEEP; err = finish(areq, err); if (err == -EINPROGRESS || err == -EBUSY) return; out: compl(data, err); } static int hash_walk_next(struct crypto_hash_walk *walk) { unsigned int offset = walk->offset; unsigned int nbytes = min(walk->entrylen, ((unsigned int)(PAGE_SIZE)) - offset); walk->data = kmap_local_page(walk->pg); walk->data += offset; walk->entrylen -= nbytes; return nbytes; } static int hash_walk_new_entry(struct crypto_hash_walk *walk) { struct scatterlist *sg; sg = walk->sg; walk->offset = sg->offset; walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT); walk->offset = offset_in_page(walk->offset); walk->entrylen = sg->length; if (walk->entrylen > walk->total) walk->entrylen = walk->total; walk->total -= walk->entrylen; return hash_walk_next(walk); } int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk) { walk->total = req->nbytes; walk->entrylen = 0; if (!walk->total) return 0; walk->flags = req->base.flags; if (ahash_request_isvirt(req)) { walk->data = req->svirt; walk->total = 0; return req->nbytes; } walk->sg = req->src; return hash_walk_new_entry(walk); } EXPORT_SYMBOL_GPL(crypto_hash_walk_first); int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err) { if ((walk->flags & CRYPTO_AHASH_REQ_VIRT)) return err; walk->data -= walk->offset; kunmap_local(walk->data); crypto_yield(walk->flags); if (err) return err; if (walk->entrylen) { walk->offset = 0; walk->pg++; return hash_walk_next(walk); } if (!walk->total) return 0; walk->sg = sg_next(walk->sg); return hash_walk_new_entry(walk); } EXPORT_SYMBOL_GPL(crypto_hash_walk_done); /* * For an ahash tfm that is using an shash algorithm (instead of an ahash * algorithm), this returns the underlying shash tfm. */ static inline struct crypto_shash *ahash_to_shash(struct crypto_ahash *tfm) { return *(struct crypto_shash **)crypto_ahash_ctx(tfm); } static inline struct shash_desc *prepare_shash_desc(struct ahash_request *req, struct crypto_ahash *tfm) { struct shash_desc *desc = ahash_request_ctx(req); desc->tfm = ahash_to_shash(tfm); return desc; } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; for (nbytes = crypto_hash_walk_first(req, &walk); nbytes > 0; nbytes = crypto_hash_walk_done(&walk, nbytes)) nbytes = crypto_shash_update(desc, walk.data, nbytes); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_update); int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc) { struct crypto_hash_walk walk; int nbytes; nbytes = crypto_hash_walk_first(req, &walk); if (!nbytes) return crypto_shash_final(desc, req->result); do { nbytes = crypto_hash_walk_last(&walk) ? crypto_shash_finup(desc, walk.data, nbytes, req->result) : crypto_shash_update(desc, walk.data, nbytes); nbytes = crypto_hash_walk_done(&walk, nbytes); } while (nbytes > 0); return nbytes; } EXPORT_SYMBOL_GPL(shash_ahash_finup); int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) { unsigned int nbytes = req->nbytes; struct scatterlist *sg; unsigned int offset; struct page *page; const u8 *data; int err; data = req->svirt; if (!nbytes || ahash_request_isvirt(req)) return crypto_shash_digest(desc, data, nbytes, req->result); sg = req->src; if (nbytes > sg->length) return crypto_shash_init(desc) ?: shash_ahash_finup(req, desc); page = sg_page(sg); offset = sg->offset; data = lowmem_page_address(page) + offset; if (!IS_ENABLED(CONFIG_HIGHMEM)) return crypto_shash_digest(desc, data, nbytes, req->result); page += offset >> PAGE_SHIFT; offset = offset_in_page(offset); if (nbytes > (unsigned int)PAGE_SIZE - offset) return crypto_shash_init(desc) ?: shash_ahash_finup(req, desc); data = kmap_local_page(page); err = crypto_shash_digest(desc, data + offset, nbytes, req->result); kunmap_local(data); return err; } EXPORT_SYMBOL_GPL(shash_ahash_digest); static void crypto_exit_ahash_using_shash(struct crypto_tfm *tfm) { struct crypto_shash **ctx = crypto_tfm_ctx(tfm); crypto_free_shash(*ctx); } static int crypto_init_ahash_using_shash(struct crypto_tfm *tfm) { struct crypto_alg *calg = tfm->__crt_alg; struct crypto_ahash *crt = __crypto_ahash_cast(tfm); struct crypto_shash **ctx = crypto_tfm_ctx(tfm); struct crypto_shash *shash; if (!crypto_mod_get(calg)) return -EAGAIN; shash = crypto_create_tfm(calg, &crypto_shash_type); if (IS_ERR(shash)) { crypto_mod_put(calg); return PTR_ERR(shash); } crt->using_shash = true; *ctx = shash; tfm->exit = crypto_exit_ahash_using_shash; crypto_ahash_set_flags(crt, crypto_shash_get_flags(shash) & CRYPTO_TFM_NEED_KEY); return 0; } static int ahash_nosetkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { return -ENOSYS; } static void ahash_set_needkey(struct crypto_ahash *tfm, struct ahash_alg *alg) { if (alg->setkey != ahash_nosetkey && !(alg->halg.base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY)) crypto_ahash_set_flags(tfm, CRYPTO_TFM_NEED_KEY); } int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen) { if (likely(tfm->using_shash)) { struct crypto_shash *shash = ahash_to_shash(tfm); int err; err = crypto_shash_setkey(shash, key, keylen); if (unlikely(err)) { crypto_ahash_set_flags(tfm, crypto_shash_get_flags(shash) & CRYPTO_TFM_NEED_KEY); return err; } } else { struct ahash_alg *alg = crypto_ahash_alg(tfm); int err; err = alg->setkey(tfm, key, keylen); if (!err && crypto_ahash_need_fallback(tfm)) err = crypto_ahash_setkey(crypto_ahash_fb(tfm), key, keylen); if (unlikely(err)) { ahash_set_needkey(tfm, alg); return err; } } crypto_ahash_clear_flags(tfm, CRYPTO_TFM_NEED_KEY); return 0; } EXPORT_SYMBOL_GPL(crypto_ahash_setkey); static int ahash_do_req_chain(struct ahash_request *req, int (*const *op)(struct ahash_request *req)) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); int err; if (crypto_ahash_req_virt(tfm) || !ahash_request_isvirt(req)) return (*op)(req); if (crypto_ahash_statesize(tfm) > HASH_MAX_STATESIZE) return -ENOSYS; if (!crypto_ahash_need_fallback(tfm)) return -ENOSYS; if (crypto_hash_no_export_core(tfm)) return -ENOSYS; { u8 state[HASH_MAX_STATESIZE]; if (op == &crypto_ahash_alg(tfm)->digest) { ahash_request_set_tfm(req, crypto_ahash_fb(tfm)); err = crypto_ahash_digest(req); goto out_no_state; } err = crypto_ahash_export(req, state); ahash_request_set_tfm(req, crypto_ahash_fb(tfm)); err = err ?: crypto_ahash_import(req, state); if (op == &crypto_ahash_alg(tfm)->finup) { err = err ?: crypto_ahash_finup(req); goto out_no_state; } err = err ?: crypto_ahash_update(req) ?: crypto_ahash_export(req, state); ahash_request_set_tfm(req, tfm); return err ?: crypto_ahash_import(req, state); out_no_state: ahash_request_set_tfm(req, tfm); return err; } } int crypto_ahash_init(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_init(prepare_shash_desc(req, tfm)); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (ahash_req_on_stack(req) && ahash_is_async(tfm)) return -EAGAIN; if (crypto_ahash_block_only(tfm)) { u8 *buf = ahash_request_ctx(req); buf += crypto_ahash_reqsize(tfm) - 1; *buf = 0; } return crypto_ahash_alg(tfm)->init(req); } EXPORT_SYMBOL_GPL(crypto_ahash_init); static void ahash_save_req(struct ahash_request *req, crypto_completion_t cplt) { req->saved_complete = req->base.complete; req->saved_data = req->base.data; req->base.complete = cplt; req->base.data = req; } static void ahash_restore_req(struct ahash_request *req) { req->base.complete = req->saved_complete; req->base.data = req->saved_data; } static int ahash_update_finish(struct ahash_request *req, int err) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); bool nonzero = crypto_ahash_final_nonzero(tfm); int bs = crypto_ahash_blocksize(tfm); u8 *blenp = ahash_request_ctx(req); int blen; u8 *buf; blenp += crypto_ahash_reqsize(tfm) - 1; blen = *blenp; buf = blenp - bs; if (blen) { req->src = req->sg_head + 1; if (sg_is_chain(req->src)) req->src = sg_chain_ptr(req->src); } req->nbytes += nonzero - blen; blen = err < 0 ? 0 : err + nonzero; if (ahash_request_isvirt(req)) memcpy(buf, req->svirt + req->nbytes - blen, blen); else memcpy_from_sglist(buf, req->src, req->nbytes - blen, blen); *blenp = blen; ahash_restore_req(req); return err; } static void ahash_update_done(void *data, int err) { ahash_op_done(data, err, ahash_update_finish); } int crypto_ahash_update(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); bool nonzero = crypto_ahash_final_nonzero(tfm); int bs = crypto_ahash_blocksize(tfm); u8 *blenp = ahash_request_ctx(req); int blen, err; u8 *buf; if (likely(tfm->using_shash)) return shash_ahash_update(req, ahash_request_ctx(req)); if (ahash_req_on_stack(req) && ahash_is_async(tfm)) return -EAGAIN; if (!crypto_ahash_block_only(tfm)) return ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->update); blenp += crypto_ahash_reqsize(tfm) - 1; blen = *blenp; buf = blenp - bs; if (blen + req->nbytes < bs + nonzero) { if (ahash_request_isvirt(req)) memcpy(buf + blen, req->svirt, req->nbytes); else memcpy_from_sglist(buf + blen, req->src, 0, req->nbytes); *blenp += req->nbytes; return 0; } if (blen) { memset(req->sg_head, 0, sizeof(req->sg_head[0])); sg_set_buf(req->sg_head, buf, blen); if (req->src != req->sg_head + 1) sg_chain(req->sg_head, 2, req->src); req->src = req->sg_head; req->nbytes += blen; } req->nbytes -= nonzero; ahash_save_req(req, ahash_update_done); err = ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->update); if (err == -EINPROGRESS || err == -EBUSY) return err; return ahash_update_finish(req, err); } EXPORT_SYMBOL_GPL(crypto_ahash_update); static int ahash_finup_finish(struct ahash_request *req, int err) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); u8 *blenp = ahash_request_ctx(req); int blen; blenp += crypto_ahash_reqsize(tfm) - 1; blen = *blenp; if (blen) { if (sg_is_last(req->src)) req->src = NULL; else { req->src = req->sg_head + 1; if (sg_is_chain(req->src)) req->src = sg_chain_ptr(req->src); } req->nbytes -= blen; } ahash_restore_req(req); return err; } static void ahash_finup_done(void *data, int err) { ahash_op_done(data, err, ahash_finup_finish); } int crypto_ahash_finup(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); int bs = crypto_ahash_blocksize(tfm); u8 *blenp = ahash_request_ctx(req); int blen, err; u8 *buf; if (likely(tfm->using_shash)) return shash_ahash_finup(req, ahash_request_ctx(req)); if (ahash_req_on_stack(req) && ahash_is_async(tfm)) return -EAGAIN; if (!crypto_ahash_alg(tfm)->finup) return ahash_def_finup(req); if (!crypto_ahash_block_only(tfm)) return ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->finup); blenp += crypto_ahash_reqsize(tfm) - 1; blen = *blenp; buf = blenp - bs; if (blen) { memset(req->sg_head, 0, sizeof(req->sg_head[0])); sg_set_buf(req->sg_head, buf, blen); if (!req->src) sg_mark_end(req->sg_head); else if (req->src != req->sg_head + 1) sg_chain(req->sg_head, 2, req->src); req->src = req->sg_head; req->nbytes += blen; } ahash_save_req(req, ahash_finup_done); err = ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->finup); if (err == -EINPROGRESS || err == -EBUSY) return err; return ahash_finup_finish(req, err); } EXPORT_SYMBOL_GPL(crypto_ahash_finup); int crypto_ahash_digest(struct ahash_request *req) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return shash_ahash_digest(req, prepare_shash_desc(req, tfm)); if (ahash_req_on_stack(req) && ahash_is_async(tfm)) return -EAGAIN; if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return ahash_do_req_chain(req, &crypto_ahash_alg(tfm)->digest); } EXPORT_SYMBOL_GPL(crypto_ahash_digest); static void ahash_def_finup_done2(void *data, int err) { struct ahash_request *areq = data; if (err == -EINPROGRESS) return; ahash_restore_req(areq); ahash_request_complete(areq, err); } static int ahash_def_finup_finish1(struct ahash_request *req, int err) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (err) goto out; req->base.complete = ahash_def_finup_done2; err = crypto_ahash_alg(tfm)->final(req); if (err == -EINPROGRESS || err == -EBUSY) return err; out: ahash_restore_req(req); return err; } static void ahash_def_finup_done1(void *data, int err) { ahash_op_done(data, err, ahash_def_finup_finish1); } static int ahash_def_finup(struct ahash_request *req) { int err; ahash_save_req(req, ahash_def_finup_done1); err = crypto_ahash_update(req); if (err == -EINPROGRESS || err == -EBUSY) return err; return ahash_def_finup_finish1(req, err); } int crypto_ahash_export_core(struct ahash_request *req, void *out) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_export_core(ahash_request_ctx(req), out); return crypto_ahash_alg(tfm)->export_core(req, out); } EXPORT_SYMBOL_GPL(crypto_ahash_export_core); int crypto_ahash_export(struct ahash_request *req, void *out) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_export(ahash_request_ctx(req), out); if (crypto_ahash_block_only(tfm)) { unsigned int plen = crypto_ahash_blocksize(tfm) + 1; unsigned int reqsize = crypto_ahash_reqsize(tfm); unsigned int ss = crypto_ahash_statesize(tfm); u8 *buf = ahash_request_ctx(req); memcpy(out + ss - plen, buf + reqsize - plen, plen); } return crypto_ahash_alg(tfm)->export(req, out); } EXPORT_SYMBOL_GPL(crypto_ahash_export); int crypto_ahash_import_core(struct ahash_request *req, const void *in) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_import_core(prepare_shash_desc(req, tfm), in); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; return crypto_ahash_alg(tfm)->import_core(req, in); } EXPORT_SYMBOL_GPL(crypto_ahash_import_core); int crypto_ahash_import(struct ahash_request *req, const void *in) { struct crypto_ahash *tfm = crypto_ahash_reqtfm(req); if (likely(tfm->using_shash)) return crypto_shash_import(prepare_shash_desc(req, tfm), in); if (crypto_ahash_get_flags(tfm) & CRYPTO_TFM_NEED_KEY) return -ENOKEY; if (crypto_ahash_block_only(tfm)) { unsigned int reqsize = crypto_ahash_reqsize(tfm); u8 *buf = ahash_request_ctx(req); buf[reqsize - 1] = 0; } return crypto_ahash_alg(tfm)->import(req, in); } EXPORT_SYMBOL_GPL(crypto_ahash_import); static void crypto_ahash_exit_tfm(struct crypto_tfm *tfm) { struct crypto_ahash *hash = __crypto_ahash_cast(tfm); struct ahash_alg *alg = crypto_ahash_alg(hash); if (alg->exit_tfm) alg->exit_tfm(hash); else if (tfm->__crt_alg->cra_exit) tfm->__crt_alg->cra_exit(tfm); if (crypto_ahash_need_fallback(hash)) crypto_free_ahash(crypto_ahash_fb(hash)); } static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) { struct crypto_ahash *hash = __crypto_ahash_cast(tfm); struct ahash_alg *alg = crypto_ahash_alg(hash); struct crypto_ahash *fb = NULL; int err; crypto_ahash_set_statesize(hash, alg->halg.statesize); crypto_ahash_set_reqsize(hash, crypto_tfm_alg_reqsize(tfm)); if (tfm->__crt_alg->cra_type == &crypto_shash_type) return crypto_init_ahash_using_shash(tfm); if (crypto_ahash_need_fallback(hash)) { fb = crypto_alloc_ahash(crypto_ahash_alg_name(hash), CRYPTO_ALG_REQ_VIRT, CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_VIRT | CRYPTO_AHASH_ALG_NO_EXPORT_CORE); if (IS_ERR(fb)) return PTR_ERR(fb); tfm->fb = crypto_ahash_tfm(fb); } ahash_set_needkey(hash, alg); tfm->exit = crypto_ahash_exit_tfm; if (alg->init_tfm) err = alg->init_tfm(hash); else if (tfm->__crt_alg->cra_init) err = tfm->__crt_alg->cra_init(tfm); else return 0; if (err) goto out_free_sync_hash; if (!ahash_is_async(hash) && crypto_ahash_reqsize(hash) > MAX_SYNC_HASH_REQSIZE) goto out_exit_tfm; BUILD_BUG_ON(HASH_MAX_DESCSIZE > MAX_SYNC_HASH_REQSIZE); if (crypto_ahash_reqsize(hash) < HASH_MAX_DESCSIZE) crypto_ahash_set_reqsize(hash, HASH_MAX_DESCSIZE); return 0; out_exit_tfm: if (alg->exit_tfm) alg->exit_tfm(hash); else if (tfm->__crt_alg->cra_exit) tfm->__crt_alg->cra_exit(tfm); err = -EINVAL; out_free_sync_hash: crypto_free_ahash(fb); return err; } static unsigned int crypto_ahash_extsize(struct crypto_alg *alg) { if (alg->cra_type == &crypto_shash_type) return sizeof(struct crypto_shash *); return crypto_alg_extsize(alg); } static void crypto_ahash_free_instance(struct crypto_instance *inst) { struct ahash_instance *ahash = ahash_instance(inst); ahash->free(ahash); } static int __maybe_unused crypto_ahash_report( struct sk_buff *skb, struct crypto_alg *alg) { struct crypto_report_hash rhash; memset(&rhash, 0, sizeof(rhash)); strscpy(rhash.type, "ahash", sizeof(rhash.type)); rhash.blocksize = alg->cra_blocksize; rhash.digestsize = __crypto_hash_alg_common(alg)->digestsize; return nla_put(skb, CRYPTOCFGA_REPORT_HASH, sizeof(rhash), &rhash); } static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) __maybe_unused; static void crypto_ahash_show(struct seq_file *m, struct crypto_alg *alg) { seq_printf(m, "type : ahash\n"); seq_printf(m, "async : %s\n", str_yes_no(alg->cra_flags & CRYPTO_ALG_ASYNC)); seq_printf(m, "blocksize : %u\n", alg->cra_blocksize); seq_printf(m, "digestsize : %u\n", __crypto_hash_alg_common(alg)->digestsize); } static const struct crypto_type crypto_ahash_type = { .extsize = crypto_ahash_extsize, .init_tfm = crypto_ahash_init_tfm, .free = crypto_ahash_free_instance, #ifdef CONFIG_PROC_FS .show = crypto_ahash_show, #endif #if IS_ENABLED(CONFIG_CRYPTO_USER) .report = crypto_ahash_report, #endif .maskclear = ~CRYPTO_ALG_TYPE_MASK, .maskset = CRYPTO_ALG_TYPE_AHASH_MASK, .type = CRYPTO_ALG_TYPE_AHASH, .tfmsize = offsetof(struct crypto_ahash, base), .algsize = offsetof(struct ahash_alg, halg.base), }; int crypto_grab_ahash(struct crypto_ahash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask) { spawn->base.frontend = &crypto_ahash_type; return crypto_grab_spawn(&spawn->base, inst, name, type, mask); } EXPORT_SYMBOL_GPL(crypto_grab_ahash); struct crypto_ahash *crypto_alloc_ahash(const char *alg_name, u32 type, u32 mask) { return crypto_alloc_tfm(alg_name, &crypto_ahash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_alloc_ahash); int crypto_has_ahash(const char *alg_name, u32 type, u32 mask) { return crypto_type_has_alg(alg_name, &crypto_ahash_type, type, mask); } EXPORT_SYMBOL_GPL(crypto_has_ahash); bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg) { struct crypto_alg *alg = &halg->base; if (alg->cra_type == &crypto_shash_type) return crypto_shash_alg_has_setkey(__crypto_shash_alg(alg)); return __crypto_ahash_alg(alg)->setkey != ahash_nosetkey; } EXPORT_SYMBOL_GPL(crypto_hash_alg_has_setkey); struct crypto_ahash *crypto_clone_ahash(struct crypto_ahash *hash) { struct hash_alg_common *halg = crypto_hash_alg_common(hash); struct crypto_tfm *tfm = crypto_ahash_tfm(hash); struct crypto_ahash *fb = NULL; struct crypto_ahash *nhash; struct ahash_alg *alg; int err; if (!crypto_hash_alg_has_setkey(halg)) { tfm = crypto_tfm_get(tfm); if (IS_ERR(tfm)) return ERR_CAST(tfm); return hash; } nhash = crypto_clone_tfm(&crypto_ahash_type, tfm); if (IS_ERR(nhash)) return nhash; nhash->reqsize = hash->reqsize; nhash->statesize = hash->statesize; if (likely(hash->using_shash)) { struct crypto_shash **nctx = crypto_ahash_ctx(nhash); struct crypto_shash *shash; shash = crypto_clone_shash(ahash_to_shash(hash)); if (IS_ERR(shash)) { err = PTR_ERR(shash); goto out_free_nhash; } crypto_ahash_tfm(nhash)->exit = crypto_exit_ahash_using_shash; nhash->using_shash = true; *nctx = shash; return nhash; } if (crypto_ahash_need_fallback(hash)) { fb = crypto_clone_ahash(crypto_ahash_fb(hash)); err = PTR_ERR(fb); if (IS_ERR(fb)) goto out_free_nhash; crypto_ahash_tfm(nhash)->fb = crypto_ahash_tfm(fb); } err = -ENOSYS; alg = crypto_ahash_alg(hash); if (!alg->clone_tfm) goto out_free_fb; err = alg->clone_tfm(nhash, hash); if (err) goto out_free_fb; crypto_ahash_tfm(nhash)->exit = crypto_ahash_exit_tfm; return nhash; out_free_fb: crypto_free_ahash(fb); out_free_nhash: crypto_free_ahash(nhash); return ERR_PTR(err); } EXPORT_SYMBOL_GPL(crypto_clone_ahash); static int ahash_default_export_core(struct ahash_request *req, void *out) { return -ENOSYS; } static int ahash_default_import_core(struct ahash_request *req, const void *in) { return -ENOSYS; } static int ahash_prepare_alg(struct ahash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; if (alg->halg.statesize == 0) return -EINVAL; if (base->cra_reqsize && base->cra_reqsize < alg->halg.statesize) return -EINVAL; if (!(base->cra_flags & CRYPTO_ALG_ASYNC) && base->cra_reqsize > MAX_SYNC_HASH_REQSIZE) return -EINVAL; if (base->cra_flags & CRYPTO_ALG_NEED_FALLBACK && base->cra_flags & CRYPTO_ALG_NO_FALLBACK) return -EINVAL; err = hash_prepare_alg(&alg->halg); if (err) return err; base->cra_type = &crypto_ahash_type; base->cra_flags |= CRYPTO_ALG_TYPE_AHASH; if ((base->cra_flags ^ CRYPTO_ALG_REQ_VIRT) & (CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_VIRT) && !(base->cra_flags & CRYPTO_ALG_NO_FALLBACK)) base->cra_flags |= CRYPTO_ALG_NEED_FALLBACK; if (!alg->setkey) alg->setkey = ahash_nosetkey; if (base->cra_flags & CRYPTO_AHASH_ALG_BLOCK_ONLY) { BUILD_BUG_ON(MAX_ALGAPI_BLOCKSIZE >= 256); if (!alg->finup) return -EINVAL; base->cra_reqsize += base->cra_blocksize + 1; alg->halg.statesize += base->cra_blocksize + 1; alg->export_core = alg->export; alg->import_core = alg->import; } else if (!alg->export_core || !alg->import_core) { alg->export_core = ahash_default_export_core; alg->import_core = ahash_default_import_core; base->cra_flags |= CRYPTO_AHASH_ALG_NO_EXPORT_CORE; } return 0; } int crypto_register_ahash(struct ahash_alg *alg) { struct crypto_alg *base = &alg->halg.base; int err; err = ahash_prepare_alg(alg); if (err) return err; return crypto_register_alg(base); } EXPORT_SYMBOL_GPL(crypto_register_ahash); void crypto_unregister_ahash(struct ahash_alg *alg) { crypto_unregister_alg(&alg->halg.base); } EXPORT_SYMBOL_GPL(crypto_unregister_ahash); int crypto_register_ahashes(struct ahash_alg *algs, int count) { int i, ret; for (i = 0; i < count; i++) { ret = crypto_register_ahash(&algs[i]); if (ret) goto err; } return 0; err: for (--i; i >= 0; --i) crypto_unregister_ahash(&algs[i]); return ret; } EXPORT_SYMBOL_GPL(crypto_register_ahashes); void crypto_unregister_ahashes(struct ahash_alg *algs, int count) { int i; for (i = count - 1; i >= 0; --i) crypto_unregister_ahash(&algs[i]); } EXPORT_SYMBOL_GPL(crypto_unregister_ahashes); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst) { int err; if (WARN_ON(!inst->free)) return -EINVAL; err = ahash_prepare_alg(&inst->alg); if (err) return err; return crypto_register_instance(tmpl, ahash_crypto_instance(inst)); } EXPORT_SYMBOL_GPL(ahash_register_instance); void ahash_request_free(struct ahash_request *req) { if (unlikely(!req)) return; if (!ahash_req_on_stack(req)) { kfree(req); return; } ahash_request_zero(req); } EXPORT_SYMBOL_GPL(ahash_request_free); int crypto_hash_digest(struct crypto_ahash *tfm, const u8 *data, unsigned int len, u8 *out) { HASH_REQUEST_ON_STACK(req, crypto_ahash_fb(tfm)); int err; ahash_request_set_callback(req, 0, NULL, NULL); ahash_request_set_virt(req, data, out, len); err = crypto_ahash_digest(req); ahash_request_zero(req); return err; } EXPORT_SYMBOL_GPL(crypto_hash_digest); void ahash_free_singlespawn_instance(struct ahash_instance *inst) { crypto_drop_spawn(ahash_instance_ctx(inst)); kfree(inst); } EXPORT_SYMBOL_GPL(ahash_free_singlespawn_instance); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Asynchronous cryptographic hash type");
21 13 19 19 9 9 9 19 19 11 11 11 20 2 20 13 13 13 8 8 8 1 7 8 2 2 7 1 7 2 2 1 1 1 6 8 8 8 7 7 24 24 22 20 20 14 10 13 1 1 10 10 1 9 4 9 9 9 9 6 9 9 9 13 8 11 8 7 6 1 6 4 6 8 11 2 3 2 2 3 11 10 9 4 2 9 10 11 3 2 2 1 2 3 15 13 5 4 4 4 4 3 3 4 15 2 2 1 5 5 4 3 2 9 8 6 4 4 1 1 3 2 5 9 5 3 4 1 1 3 1 3 1 2 2 1 2 2 4 20 20 20 9 6 8 10 7 3 19 19 19 8 8 8 6 8 20 1 1 20 20 18 18 9 11 11 15 1 2 10 10 8 8 1 1 11 10 22 21 18 22 13 13 13 12 9 10 6 5 10 20 23 10 10 10 10 10 10 10 10 10 10 9 9 9 9 9 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2010-2011 EIA Electronics, // Pieter Beyens <pieter.beyens@eia.be> // Copyright (c) 2010-2011 EIA Electronics, // Kurt Van Dijck <kurt.van.dijck@eia.be> // Copyright (c) 2018 Protonic, // Robin van der Gracht <robin@protonic.nl> // Copyright (c) 2017-2019 Pengutronix, // Marc Kleine-Budde <kernel@pengutronix.de> // Copyright (c) 2017-2019 Pengutronix, // Oleksij Rempel <kernel@pengutronix.de> #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/can/can-ml.h> #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> #include "j1939-priv.h" #define J1939_MIN_NAMELEN CAN_REQUIRED_SIZE(struct sockaddr_can, can_addr.j1939) /* conversion function between struct sock::sk_priority from linux and * j1939 priority field */ static inline priority_t j1939_prio(u32 sk_priority) { sk_priority = min(sk_priority, 7U); return 7 - sk_priority; } static inline u32 j1939_to_sk_priority(priority_t prio) { return 7 - prio; } /* function to see if pgn is to be evaluated */ static inline bool j1939_pgn_is_valid(pgn_t pgn) { return pgn <= J1939_PGN_MAX; } /* test function to avoid non-zero DA placeholder for pdu1 pgn's */ static inline bool j1939_pgn_is_clean_pdu(pgn_t pgn) { if (j1939_pgn_is_pdu1(pgn)) return !(pgn & 0xff); else return true; } static inline void j1939_sock_pending_add(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); atomic_inc(&jsk->skb_pending); } static int j1939_sock_pending_get(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); return atomic_read(&jsk->skb_pending); } void j1939_sock_pending_del(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* atomic_dec_return returns the new value */ if (!atomic_dec_return(&jsk->skb_pending)) wake_up(&jsk->waitq); /* no pending SKB's */ } static void j1939_jsk_add(struct j1939_priv *priv, struct j1939_sock *jsk) { jsk->state |= J1939_SOCK_BOUND; j1939_priv_get(priv); write_lock_bh(&priv->j1939_socks_lock); list_add_tail(&jsk->list, &priv->j1939_socks); write_unlock_bh(&priv->j1939_socks_lock); } static void j1939_jsk_del(struct j1939_priv *priv, struct j1939_sock *jsk) { write_lock_bh(&priv->j1939_socks_lock); list_del_init(&jsk->list); write_unlock_bh(&priv->j1939_socks_lock); j1939_priv_put(priv); jsk->state &= ~J1939_SOCK_BOUND; } static bool j1939_sk_queue_session(struct j1939_session *session) { struct j1939_sock *jsk = j1939_sk(session->sk); bool empty; spin_lock_bh(&jsk->sk_session_queue_lock); empty = list_empty(&jsk->sk_session_queue); j1939_session_get(session); list_add_tail(&session->sk_session_queue_entry, &jsk->sk_session_queue); spin_unlock_bh(&jsk->sk_session_queue_lock); j1939_sock_pending_add(&jsk->sk); return empty; } static struct j1939_session *j1939_sk_get_incomplete_session(struct j1939_sock *jsk) { struct j1939_session *session = NULL; spin_lock_bh(&jsk->sk_session_queue_lock); if (!list_empty(&jsk->sk_session_queue)) { session = list_last_entry(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (session->total_queued_size == session->total_message_size) session = NULL; else j1939_session_get(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); return session; } static void j1939_sk_queue_drop_all(struct j1939_priv *priv, struct j1939_sock *jsk, int err) { struct j1939_session *session, *tmp; netdev_dbg(priv->ndev, "%s: err: %i\n", __func__, err); spin_lock_bh(&jsk->sk_session_queue_lock); list_for_each_entry_safe(session, tmp, &jsk->sk_session_queue, sk_session_queue_entry) { list_del_init(&session->sk_session_queue_entry); session->err = err; j1939_session_put(session); } spin_unlock_bh(&jsk->sk_session_queue_lock); } static void j1939_sk_queue_activate_next_locked(struct j1939_session *session) { struct j1939_sock *jsk; struct j1939_session *first; int err; /* RX-Session don't have a socket (yet) */ if (!session->sk) return; jsk = j1939_sk(session->sk); lockdep_assert_held(&jsk->sk_session_queue_lock); err = session->err; first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); /* Some else has already activated the next session */ if (first != session) return; activate_next: list_del_init(&first->sk_session_queue_entry); j1939_session_put(first); first = list_first_entry_or_null(&jsk->sk_session_queue, struct j1939_session, sk_session_queue_entry); if (!first) return; if (j1939_session_activate(first)) { netdev_warn_once(first->priv->ndev, "%s: 0x%p: Identical session is already activated.\n", __func__, first); first->err = -EBUSY; goto activate_next; } else { /* Give receiver some time (arbitrary chosen) to recover */ int time_ms = 0; if (err) time_ms = 10 + get_random_u32_below(16); j1939_tp_schedule_txtimer(first, time_ms); } } void j1939_sk_queue_activate_next(struct j1939_session *session) { struct j1939_sock *jsk; if (!session->sk) return; jsk = j1939_sk(session->sk); spin_lock_bh(&jsk->sk_session_queue_lock); j1939_sk_queue_activate_next_locked(session); spin_unlock_bh(&jsk->sk_session_queue_lock); } static bool j1939_sk_match_dst(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if ((jsk->state & J1939_SOCK_PROMISC)) return true; /* Destination address filter */ if (jsk->addr.src_name && skcb->addr.dst_name) { if (jsk->addr.src_name != skcb->addr.dst_name) return false; } else { /* receive (all sockets) if * - all packages that match our bind() address * - all broadcast on a socket if SO_BROADCAST * is set */ if (j1939_address_is_unicast(skcb->addr.da)) { if (jsk->addr.sa != skcb->addr.da) return false; } else if (!sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* receiving broadcast without SO_BROADCAST * flag is not allowed */ return false; } } /* Source address filter */ if (jsk->state & J1939_SOCK_CONNECTED) { /* receive (all sockets) if * - all packages that match our connect() name or address */ if (jsk->addr.dst_name && skcb->addr.src_name) { if (jsk->addr.dst_name != skcb->addr.src_name) return false; } else { if (jsk->addr.da != skcb->addr.sa) return false; } } /* PGN filter */ if (j1939_pgn_is_valid(jsk->pgn_rx_filter) && jsk->pgn_rx_filter != skcb->addr.pgn) return false; return true; } /* matches skb control buffer (addr) with a j1939 filter */ static bool j1939_sk_match_filter(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { const struct j1939_filter *f; int nfilter; spin_lock_bh(&jsk->filters_lock); f = jsk->filters; nfilter = jsk->nfilters; if (!nfilter) /* receive all when no filters are assigned */ goto filter_match_found; for (; nfilter; ++f, --nfilter) { if ((skcb->addr.pgn & f->pgn_mask) != f->pgn) continue; if ((skcb->addr.sa & f->addr_mask) != f->addr) continue; if ((skcb->addr.src_name & f->name_mask) != f->name) continue; goto filter_match_found; } spin_unlock_bh(&jsk->filters_lock); return false; filter_match_found: spin_unlock_bh(&jsk->filters_lock); return true; } static bool j1939_sk_recv_match_one(struct j1939_sock *jsk, const struct j1939_sk_buff_cb *skcb) { if (!(jsk->state & J1939_SOCK_BOUND)) return false; if (!j1939_sk_match_dst(jsk, skcb)) return false; if (!j1939_sk_match_filter(jsk, skcb)) return false; return true; } static void j1939_sk_recv_one(struct j1939_sock *jsk, struct sk_buff *oskb) { const struct j1939_sk_buff_cb *oskcb = j1939_skb_to_cb(oskb); struct j1939_sk_buff_cb *skcb; enum skb_drop_reason reason; struct sk_buff *skb; if (oskb->sk == &jsk->sk) return; if (!j1939_sk_recv_match_one(jsk, oskcb)) return; skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) { pr_warn("skb clone failed\n"); return; } can_skb_set_owner(skb, oskb->sk); skcb = j1939_skb_to_cb(skb); skcb->msg_flags &= ~(MSG_DONTROUTE); if (skb->sk) skcb->msg_flags |= MSG_DONTROUTE; if (sock_queue_rcv_skb_reason(&jsk->sk, skb, &reason) < 0) sk_skb_reason_drop(&jsk->sk, skb, reason); } bool j1939_sk_recv_match(struct j1939_priv *priv, struct j1939_sk_buff_cb *skcb) { struct j1939_sock *jsk; bool match = false; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { match = j1939_sk_recv_match_one(jsk, skcb); if (match) break; } read_unlock_bh(&priv->j1939_socks_lock); return match; } void j1939_sk_recv(struct j1939_priv *priv, struct sk_buff *skb) { struct j1939_sock *jsk; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { j1939_sk_recv_one(jsk, skb); } read_unlock_bh(&priv->j1939_socks_lock); } static void j1939_sk_sock_destruct(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* This function will be called by the generic networking code, when * the socket is ultimately closed (sk->sk_destruct). * * The race between * - processing a received CAN frame * (can_receive -> j1939_can_recv) * and accessing j1939_priv * ... and ... * - closing a socket * (j1939_can_rx_unregister -> can_rx_unregister) * and calling the final j1939_priv_put() * * is avoided by calling the final j1939_priv_put() from this * RCU deferred cleanup call. */ if (jsk->priv) { j1939_priv_put(jsk->priv); jsk->priv = NULL; } /* call generic CAN sock destruct */ can_sock_destruct(sk); } static int j1939_sk_init(struct sock *sk) { struct j1939_sock *jsk = j1939_sk(sk); /* Ensure that "sk" is first member in "struct j1939_sock", so that we * can skip it during memset(). */ BUILD_BUG_ON(offsetof(struct j1939_sock, sk) != 0); memset((void *)jsk + sizeof(jsk->sk), 0x0, sizeof(*jsk) - sizeof(jsk->sk)); INIT_LIST_HEAD(&jsk->list); init_waitqueue_head(&jsk->waitq); jsk->sk.sk_priority = j1939_to_sk_priority(6); jsk->sk.sk_reuse = 1; /* per default */ jsk->addr.sa = J1939_NO_ADDR; jsk->addr.da = J1939_NO_ADDR; jsk->addr.pgn = J1939_NO_PGN; jsk->pgn_rx_filter = J1939_NO_PGN; atomic_set(&jsk->skb_pending, 0); spin_lock_init(&jsk->sk_session_queue_lock); INIT_LIST_HEAD(&jsk->sk_session_queue); spin_lock_init(&jsk->filters_lock); /* j1939_sk_sock_destruct() depends on SOCK_RCU_FREE flag */ sock_set_flag(sk, SOCK_RCU_FREE); sk->sk_destruct = j1939_sk_sock_destruct; sk->sk_protocol = CAN_J1939; return 0; } static int j1939_sk_sanity_check(struct sockaddr_can *addr, int len) { if (!addr) return -EDESTADDRREQ; if (len < J1939_MIN_NAMELEN) return -EINVAL; if (addr->can_family != AF_CAN) return -EINVAL; if (!addr->can_ifindex) return -ENODEV; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) return -EINVAL; return 0; } static int j1939_sk_bind(struct socket *sock, struct sockaddr *uaddr, int len) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); struct j1939_priv *priv; struct sock *sk; struct net *net; int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); priv = jsk->priv; sk = sock->sk; net = sock_net(sk); /* Already bound to an interface? */ if (jsk->state & J1939_SOCK_BOUND) { /* A re-bind() to a different interface is not * supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } /* drop old references */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); } else { struct can_ml_priv *can_ml; struct net_device *ndev; ndev = dev_get_by_index(net, addr->can_ifindex); if (!ndev) { ret = -ENODEV; goto out_release_sock; } can_ml = can_get_ml_priv(ndev); if (!can_ml) { dev_put(ndev); ret = -ENODEV; goto out_release_sock; } if (!(ndev->flags & IFF_UP)) { dev_put(ndev); ret = -ENETDOWN; goto out_release_sock; } priv = j1939_netdev_start(ndev); dev_put(ndev); if (IS_ERR(priv)) { ret = PTR_ERR(priv); goto out_release_sock; } jsk->ifindex = addr->can_ifindex; /* the corresponding j1939_priv_put() is called via * sk->sk_destruct, which points to j1939_sk_sock_destruct() */ j1939_priv_get(priv); jsk->priv = priv; } /* set default transmit pgn */ if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->pgn_rx_filter = addr->can_addr.j1939.pgn; jsk->addr.src_name = addr->can_addr.j1939.name; jsk->addr.sa = addr->can_addr.j1939.addr; /* get new references */ ret = j1939_local_ecu_get(priv, jsk->addr.src_name, jsk->addr.sa); if (ret) { j1939_netdev_stop(priv); jsk->priv = NULL; synchronize_rcu(); j1939_priv_put(priv); goto out_release_sock; } j1939_jsk_add(priv, jsk); out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static int j1939_sk_connect(struct socket *sock, struct sockaddr *uaddr, int len, int flags) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct j1939_sock *jsk = j1939_sk(sock->sk); int ret = 0; ret = j1939_sk_sanity_check(addr, len); if (ret) return ret; lock_sock(sock->sk); /* bind() before connect() is mandatory */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EINVAL; goto out_release_sock; } /* A connect() to a different interface is not supported. */ if (jsk->ifindex != addr->can_ifindex) { ret = -EINVAL; goto out_release_sock; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(&jsk->sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto out_release_sock; } jsk->addr.dst_name = addr->can_addr.j1939.name; jsk->addr.da = addr->can_addr.j1939.addr; if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) jsk->addr.pgn = addr->can_addr.j1939.pgn; jsk->state |= J1939_SOCK_CONNECTED; out_release_sock: /* fall through */ release_sock(sock->sk); return ret; } static void j1939_sk_sock2sockaddr_can(struct sockaddr_can *addr, const struct j1939_sock *jsk, int peer) { /* There are two holes (2 bytes and 3 bytes) to clear to avoid * leaking kernel information to user space. */ memset(addr, 0, J1939_MIN_NAMELEN); addr->can_family = AF_CAN; addr->can_ifindex = jsk->ifindex; addr->can_addr.j1939.pgn = jsk->addr.pgn; if (peer) { addr->can_addr.j1939.name = jsk->addr.dst_name; addr->can_addr.j1939.addr = jsk->addr.da; } else { addr->can_addr.j1939.name = jsk->addr.src_name; addr->can_addr.j1939.addr = jsk->addr.sa; } } static int j1939_sk_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret = 0; lock_sock(sk); if (peer && !(jsk->state & J1939_SOCK_CONNECTED)) { ret = -EADDRNOTAVAIL; goto failure; } j1939_sk_sock2sockaddr_can(addr, jsk, peer); ret = J1939_MIN_NAMELEN; failure: release_sock(sk); return ret; } static int j1939_sk_release(struct socket *sock) { struct sock *sk = sock->sk; struct j1939_sock *jsk; if (!sk) return 0; lock_sock(sk); jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_BOUND) { struct j1939_priv *priv = jsk->priv; if (wait_event_interruptible(jsk->waitq, !j1939_sock_pending_get(&jsk->sk))) { j1939_cancel_active_session(priv, sk); j1939_sk_queue_drop_all(priv, jsk, ESHUTDOWN); } j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); } kfree(jsk->filters); sock_orphan(sk); sock->sk = NULL; release_sock(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); sock_put(sk); return 0; } static int j1939_sk_setsockopt_flag(struct j1939_sock *jsk, sockptr_t optval, unsigned int optlen, int flag) { int tmp; if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; lock_sock(&jsk->sk); if (tmp) jsk->state |= flag; else jsk->state &= ~flag; release_sock(&jsk->sk); return tmp; } static int j1939_sk_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int tmp, count = 0, ret = 0; struct j1939_filter *filters = NULL, *ofilters; if (level != SOL_CAN_J1939) return -EINVAL; switch (optname) { case SO_J1939_FILTER: if (!sockptr_is_null(optval) && optlen != 0) { struct j1939_filter *f; int c; if (optlen % sizeof(*filters) != 0) return -EINVAL; if (optlen > J1939_FILTER_MAX * sizeof(struct j1939_filter)) return -EINVAL; count = optlen / sizeof(*filters); filters = memdup_sockptr(optval, optlen); if (IS_ERR(filters)) return PTR_ERR(filters); for (f = filters, c = count; c; f++, c--) { f->name &= f->name_mask; f->pgn &= f->pgn_mask; f->addr &= f->addr_mask; } } lock_sock(&jsk->sk); spin_lock_bh(&jsk->filters_lock); ofilters = jsk->filters; jsk->filters = filters; jsk->nfilters = count; spin_unlock_bh(&jsk->filters_lock); release_sock(&jsk->sk); kfree(ofilters); return 0; case SO_J1939_PROMISC: return j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_PROMISC); case SO_J1939_ERRQUEUE: ret = j1939_sk_setsockopt_flag(jsk, optval, optlen, J1939_SOCK_ERRQUEUE); if (ret < 0) return ret; if (!(jsk->state & J1939_SOCK_ERRQUEUE)) skb_queue_purge(&sk->sk_error_queue); return ret; case SO_J1939_SEND_PRIO: if (optlen != sizeof(tmp)) return -EINVAL; if (copy_from_sockptr(&tmp, optval, optlen)) return -EFAULT; if (tmp < 0 || tmp > 7) return -EDOM; if (tmp < 2 && !capable(CAP_NET_ADMIN)) return -EPERM; lock_sock(&jsk->sk); jsk->sk.sk_priority = j1939_to_sk_priority(tmp); release_sock(&jsk->sk); return 0; default: return -ENOPROTOOPT; } } static int j1939_sk_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); int ret, ulen; /* set defaults for using 'int' properties */ int tmp = 0; int len = sizeof(tmp); void *val = &tmp; if (level != SOL_CAN_J1939) return -EINVAL; if (get_user(ulen, optlen)) return -EFAULT; if (ulen < 0) return -EINVAL; lock_sock(&jsk->sk); switch (optname) { case SO_J1939_PROMISC: tmp = (jsk->state & J1939_SOCK_PROMISC) ? 1 : 0; break; case SO_J1939_ERRQUEUE: tmp = (jsk->state & J1939_SOCK_ERRQUEUE) ? 1 : 0; break; case SO_J1939_SEND_PRIO: tmp = j1939_prio(jsk->sk.sk_priority); break; default: ret = -ENOPROTOOPT; goto no_copy; } /* copy to user, based on 'len' & 'val' * but most sockopt's are 'int' properties, and have 'len' & 'val' * left unchanged, but instead modified 'tmp' */ if (len > ulen) ret = -EFAULT; else if (put_user(len, optlen)) ret = -EFAULT; else if (copy_to_user(optval, val, len)) ret = -EFAULT; else ret = 0; no_copy: release_sock(&jsk->sk); return ret; } static int j1939_sk_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; struct j1939_sk_buff_cb *skcb; int ret = 0; if (flags & ~(MSG_DONTWAIT | MSG_ERRQUEUE | MSG_CMSG_COMPAT)) return -EINVAL; if (flags & MSG_ERRQUEUE) return sock_recv_errqueue(sock->sk, msg, size, SOL_CAN_J1939, SCM_J1939_ERRQUEUE); skb = skb_recv_datagram(sk, flags, &ret); if (!skb) return ret; if (size < skb->len) msg->msg_flags |= MSG_TRUNC; else size = skb->len; ret = memcpy_to_msg(msg, skb->data, size); if (ret < 0) { skb_free_datagram(sk, skb); return ret; } skcb = j1939_skb_to_cb(skb); if (j1939_address_is_valid(skcb->addr.da)) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_ADDR, sizeof(skcb->addr.da), &skcb->addr.da); if (skcb->addr.dst_name) put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_DEST_NAME, sizeof(skcb->addr.dst_name), &skcb->addr.dst_name); put_cmsg(msg, SOL_CAN_J1939, SCM_J1939_PRIO, sizeof(skcb->priority), &skcb->priority); if (msg->msg_name) { struct sockaddr_can *paddr = msg->msg_name; msg->msg_namelen = J1939_MIN_NAMELEN; memset(msg->msg_name, 0, msg->msg_namelen); paddr->can_family = AF_CAN; paddr->can_ifindex = skb->skb_iif; paddr->can_addr.j1939.name = skcb->addr.src_name; paddr->can_addr.j1939.addr = skcb->addr.sa; paddr->can_addr.j1939.pgn = skcb->addr.pgn; } sock_recv_cmsgs(msg, sk, skb); msg->msg_flags |= skcb->msg_flags; skb_free_datagram(sk, skb); return size; } static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct sock *sk, struct msghdr *msg, size_t size, int *errcode) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - sizeof(((struct can_frame *)NULL)->data) + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = ndev->ifindex; can_skb_prv(skb)->skbcnt = 0; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); if (ret < 0) goto free_skb; skb->dev = ndev; skcb = j1939_skb_to_cb(skb); memset(skcb, 0, sizeof(*skcb)); skcb->addr = jsk->addr; skcb->priority = j1939_prio(READ_ONCE(sk->sk_priority)); if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (addr->can_addr.j1939.name || addr->can_addr.j1939.addr != J1939_NO_ADDR) { skcb->addr.dst_name = addr->can_addr.j1939.name; skcb->addr.da = addr->can_addr.j1939.addr; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn)) skcb->addr.pgn = addr->can_addr.j1939.pgn; } *errcode = ret; return skb; free_skb: kfree_skb(skb); failure: *errcode = ret; return NULL; } static size_t j1939_sk_opt_stats_get_size(enum j1939_sk_errqueue_type type) { switch (type) { case J1939_ERRQUEUE_RX_RTS: return nla_total_size(sizeof(u32)) + /* J1939_NLA_TOTAL_SIZE */ nla_total_size(sizeof(u32)) + /* J1939_NLA_PGN */ nla_total_size(sizeof(u64)) + /* J1939_NLA_SRC_NAME */ nla_total_size(sizeof(u64)) + /* J1939_NLA_DEST_NAME */ nla_total_size(sizeof(u8)) + /* J1939_NLA_SRC_ADDR */ nla_total_size(sizeof(u8)) + /* J1939_NLA_DEST_ADDR */ 0; default: return nla_total_size(sizeof(u32)) + /* J1939_NLA_BYTES_ACKED */ 0; } } static struct sk_buff * j1939_sk_get_timestamping_opt_stats(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct sk_buff *stats; u32 size; stats = alloc_skb(j1939_sk_opt_stats_get_size(type), GFP_ATOMIC); if (!stats) return NULL; if (session->skcb.addr.type == J1939_SIMPLE) size = session->total_message_size; else size = min(session->pkt.tx_acked * 7, session->total_message_size); switch (type) { case J1939_ERRQUEUE_RX_RTS: nla_put_u32(stats, J1939_NLA_TOTAL_SIZE, session->total_message_size); nla_put_u32(stats, J1939_NLA_PGN, session->skcb.addr.pgn); nla_put_u64_64bit(stats, J1939_NLA_SRC_NAME, session->skcb.addr.src_name, J1939_NLA_PAD); nla_put_u64_64bit(stats, J1939_NLA_DEST_NAME, session->skcb.addr.dst_name, J1939_NLA_PAD); nla_put_u8(stats, J1939_NLA_SRC_ADDR, session->skcb.addr.sa); nla_put_u8(stats, J1939_NLA_DEST_ADDR, session->skcb.addr.da); break; default: nla_put_u32(stats, J1939_NLA_BYTES_ACKED, size); } return stats; } static void __j1939_sk_errqueue(struct j1939_session *session, struct sock *sk, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; struct sock_exterr_skb *serr; struct sk_buff *skb; char *state = "UNK"; u32 tsflags; int err; jsk = j1939_sk(sk); if (!(jsk->state & J1939_SOCK_ERRQUEUE)) return; tsflags = READ_ONCE(sk->sk_tsflags); switch (type) { case J1939_ERRQUEUE_TX_ACK: if (!(tsflags & SOF_TIMESTAMPING_TX_ACK)) return; break; case J1939_ERRQUEUE_TX_SCHED: if (!(tsflags & SOF_TIMESTAMPING_TX_SCHED)) return; break; case J1939_ERRQUEUE_TX_ABORT: break; case J1939_ERRQUEUE_RX_RTS: fallthrough; case J1939_ERRQUEUE_RX_DPO: fallthrough; case J1939_ERRQUEUE_RX_ABORT: if (!(tsflags & SOF_TIMESTAMPING_RX_SOFTWARE)) return; break; default: netdev_err(priv->ndev, "Unknown errqueue type %i\n", type); } skb = j1939_sk_get_timestamping_opt_stats(session, type); if (!skb) return; skb->tstamp = ktime_get_real(); BUILD_BUG_ON(sizeof(struct sock_exterr_skb) > sizeof(skb->cb)); serr = SKB_EXT_ERR(skb); memset(serr, 0, sizeof(*serr)); switch (type) { case J1939_ERRQUEUE_TX_ACK: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_ACK; state = "TX ACK"; break; case J1939_ERRQUEUE_TX_SCHED: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = SCM_TSTAMP_SCHED; state = "TX SCH"; break; case J1939_ERRQUEUE_TX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_TX_ABORT; state = "TX ABT"; break; case J1939_ERRQUEUE_RX_RTS: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_RTS; state = "RX RTS"; break; case J1939_ERRQUEUE_RX_DPO: serr->ee.ee_errno = ENOMSG; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_DPO; state = "RX DPO"; break; case J1939_ERRQUEUE_RX_ABORT: serr->ee.ee_errno = session->err; serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; serr->ee.ee_info = J1939_EE_INFO_RX_ABORT; state = "RX ABT"; break; } serr->opt_stats = true; if (tsflags & SOF_TIMESTAMPING_OPT_ID) serr->ee.ee_data = session->tskey; netdev_dbg(session->priv->ndev, "%s: 0x%p tskey: %i, state: %s\n", __func__, session, session->tskey, state); err = sock_queue_err_skb(sk, skb); if (err) kfree_skb(skb); }; void j1939_sk_errqueue(struct j1939_session *session, enum j1939_sk_errqueue_type type) { struct j1939_priv *priv = session->priv; struct j1939_sock *jsk; if (session->sk) { /* send TX notifications to the socket of origin */ __j1939_sk_errqueue(session, session->sk, type); return; } /* spread RX notifications to all sockets subscribed to this session */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { if (j1939_sk_recv_match_one(jsk, &session->skcb)) __j1939_sk_errqueue(session, &jsk->sk, type); } read_unlock_bh(&priv->j1939_socks_lock); }; void j1939_sk_send_loop_abort(struct sock *sk, int err) { struct j1939_sock *jsk = j1939_sk(sk); if (jsk->state & J1939_SOCK_ERRQUEUE) return; sk->sk_err = err; sk_error_report(sk); } static int j1939_sk_send_loop(struct j1939_priv *priv, struct sock *sk, struct msghdr *msg, size_t size) { struct j1939_sock *jsk = j1939_sk(sk); struct j1939_session *session = j1939_sk_get_incomplete_session(jsk); struct sk_buff *skb; size_t segment_size, todo_size; int ret = 0; if (session && session->total_message_size != session->total_queued_size + size) { j1939_session_put(session); return -EIO; } todo_size = size; do { struct j1939_sk_buff_cb *skcb; segment_size = min_t(size_t, J1939_MAX_TP_PACKET_SIZE, todo_size); /* Allocate skb for one segment */ skb = j1939_sk_alloc_skb(priv->ndev, sk, msg, segment_size, &ret); if (ret) break; skcb = j1939_skb_to_cb(skb); if (!session) { /* at this point the size should be full size * of the session */ skcb->offset = 0; session = j1939_tp_send(priv, skb, size); if (IS_ERR(session)) { ret = PTR_ERR(session); goto kfree_skb; } if (j1939_sk_queue_session(session)) { /* try to activate session if we a * fist in the queue */ if (!j1939_session_activate(session)) { j1939_tp_schedule_txtimer(session, 0); } else { ret = -EBUSY; session->err = ret; j1939_sk_queue_drop_all(priv, jsk, EBUSY); break; } } } else { skcb->offset = session->total_queued_size; j1939_session_skb_queue(session, skb); } todo_size -= segment_size; session->total_queued_size += segment_size; } while (todo_size); switch (ret) { case 0: /* OK */ if (todo_size) netdev_warn(priv->ndev, "no error found and not completely queued?! %zu\n", todo_size); ret = size; break; case -ERESTARTSYS: ret = -EINTR; fallthrough; case -EAGAIN: /* OK */ if (todo_size != size) ret = size - todo_size; break; default: /* ERROR */ break; } if (session) j1939_session_put(session); return ret; kfree_skb: kfree_skb(skb); return ret; } static int j1939_sk_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; struct j1939_sock *jsk = j1939_sk(sk); struct j1939_priv *priv; int ifindex; int ret; lock_sock(sock->sk); /* various socket state tests */ if (!(jsk->state & J1939_SOCK_BOUND)) { ret = -EBADFD; goto sendmsg_done; } priv = jsk->priv; ifindex = jsk->ifindex; if (!jsk->addr.src_name && jsk->addr.sa == J1939_NO_ADDR) { /* no source address assigned yet */ ret = -EBADFD; goto sendmsg_done; } /* deal with provided destination address info */ if (msg->msg_name) { struct sockaddr_can *addr = msg->msg_name; if (msg->msg_namelen < J1939_MIN_NAMELEN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_family != AF_CAN) { ret = -EINVAL; goto sendmsg_done; } if (addr->can_ifindex && addr->can_ifindex != ifindex) { ret = -EBADFD; goto sendmsg_done; } if (j1939_pgn_is_valid(addr->can_addr.j1939.pgn) && !j1939_pgn_is_clean_pdu(addr->can_addr.j1939.pgn)) { ret = -EINVAL; goto sendmsg_done; } if (!addr->can_addr.j1939.name && addr->can_addr.j1939.addr == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } else { if (!jsk->addr.dst_name && jsk->addr.da == J1939_NO_ADDR && !sock_flag(sk, SOCK_BROADCAST)) { /* broadcast, but SO_BROADCAST not set */ ret = -EACCES; goto sendmsg_done; } } ret = j1939_sk_send_loop(priv, sk, msg, size); sendmsg_done: release_sock(sock->sk); return ret; } void j1939_sk_netdev_event_netdown(struct j1939_priv *priv) { struct j1939_sock *jsk; int error_code = ENETDOWN; read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { jsk->sk.sk_err = error_code; if (!sock_flag(&jsk->sk, SOCK_DEAD)) sk_error_report(&jsk->sk); j1939_sk_queue_drop_all(priv, jsk, error_code); } read_unlock_bh(&priv->j1939_socks_lock); } void j1939_sk_netdev_event_unregister(struct j1939_priv *priv) { struct sock *sk; struct j1939_sock *jsk; bool wait_rcu = false; rescan: /* The caller is holding a ref on this "priv" via j1939_priv_get_by_ndev(). */ read_lock_bh(&priv->j1939_socks_lock); list_for_each_entry(jsk, &priv->j1939_socks, list) { /* Skip if j1939_jsk_add() is not called on this socket. */ if (!(jsk->state & J1939_SOCK_BOUND)) continue; sk = &jsk->sk; sock_hold(sk); read_unlock_bh(&priv->j1939_socks_lock); /* Check if j1939_jsk_del() is not yet called on this socket after holding * socket's lock, for both j1939_sk_bind() and j1939_sk_release() call * j1939_jsk_del() with socket's lock held. */ lock_sock(sk); if (jsk->state & J1939_SOCK_BOUND) { /* Neither j1939_sk_bind() nor j1939_sk_release() called j1939_jsk_del(). * Make this socket no longer bound, by pretending as if j1939_sk_bind() * dropped old references but did not get new references. */ j1939_jsk_del(priv, jsk); j1939_local_ecu_put(priv, jsk->addr.src_name, jsk->addr.sa); j1939_netdev_stop(priv); /* Call j1939_priv_put() now and prevent j1939_sk_sock_destruct() from * calling the corresponding j1939_priv_put(). * * j1939_sk_sock_destruct() is supposed to call j1939_priv_put() after * an RCU grace period. But since the caller is holding a ref on this * "priv", we can defer synchronize_rcu() until immediately before * the caller calls j1939_priv_put(). */ j1939_priv_put(priv); jsk->priv = NULL; wait_rcu = true; } release_sock(sk); sock_put(sk); goto rescan; } read_unlock_bh(&priv->j1939_socks_lock); if (wait_rcu) synchronize_rcu(); } static int j1939_sk_no_ioctlcmd(struct socket *sock, unsigned int cmd, unsigned long arg) { /* no ioctls for socket layer -> hand it down to NIC layer */ return -ENOIOCTLCMD; } static const struct proto_ops j1939_ops = { .family = PF_CAN, .release = j1939_sk_release, .bind = j1939_sk_bind, .connect = j1939_sk_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = j1939_sk_getname, .poll = datagram_poll, .ioctl = j1939_sk_no_ioctlcmd, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .setsockopt = j1939_sk_setsockopt, .getsockopt = j1939_sk_getsockopt, .sendmsg = j1939_sk_sendmsg, .recvmsg = j1939_sk_recvmsg, .mmap = sock_no_mmap, }; static struct proto j1939_proto __read_mostly = { .name = "CAN_J1939", .owner = THIS_MODULE, .obj_size = sizeof(struct j1939_sock), .init = j1939_sk_init, }; const struct can_proto j1939_can_proto = { .type = SOCK_DGRAM, .protocol = CAN_J1939, .ops = &j1939_ops, .prot = &j1939_proto, };
5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 // SPDX-License-Identifier: GPL-2.0 #include <linux/fs.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/mmzone.h> #include <linux/memblock.h> #include <linux/proc_fs.h> #include <linux/percpu.h> #include <linux/seq_file.h> #include <linux/swap.h> #include <linux/vmstat.h> #include <linux/atomic.h> #include <linux/vmalloc.h> #ifdef CONFIG_CMA #include <linux/cma.h> #endif #include <linux/zswap.h> #include <asm/page.h> #include "internal.h" void __attribute__((weak)) arch_report_meminfo(struct seq_file *m) { } static void show_val_kb(struct seq_file *m, const char *s, unsigned long num) { seq_put_decimal_ull_width(m, s, num << (PAGE_SHIFT - 10), 8); seq_write(m, " kB\n", 4); } static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; long cached; long available; unsigned long pages[NR_LRU_LISTS]; unsigned long sreclaimable, sunreclaim; int lru; si_meminfo(&i); si_swapinfo(&i); committed = vm_memory_committed(); cached = global_node_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; if (cached < 0) cached = 0; for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); available = si_mem_available(); sreclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B); sunreclaim = global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); show_val_kb(m, "MemTotal: ", i.totalram); show_val_kb(m, "MemFree: ", i.freeram); show_val_kb(m, "MemAvailable: ", available); show_val_kb(m, "Buffers: ", i.bufferram); show_val_kb(m, "Cached: ", cached); show_val_kb(m, "SwapCached: ", total_swapcache_pages()); show_val_kb(m, "Active: ", pages[LRU_ACTIVE_ANON] + pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive: ", pages[LRU_INACTIVE_ANON] + pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Active(anon): ", pages[LRU_ACTIVE_ANON]); show_val_kb(m, "Inactive(anon): ", pages[LRU_INACTIVE_ANON]); show_val_kb(m, "Active(file): ", pages[LRU_ACTIVE_FILE]); show_val_kb(m, "Inactive(file): ", pages[LRU_INACTIVE_FILE]); show_val_kb(m, "Unevictable: ", pages[LRU_UNEVICTABLE]); show_val_kb(m, "Mlocked: ", global_zone_page_state(NR_MLOCK)); #ifdef CONFIG_HIGHMEM show_val_kb(m, "HighTotal: ", i.totalhigh); show_val_kb(m, "HighFree: ", i.freehigh); show_val_kb(m, "LowTotal: ", i.totalram - i.totalhigh); show_val_kb(m, "LowFree: ", i.freeram - i.freehigh); #endif #ifndef CONFIG_MMU show_val_kb(m, "MmapCopy: ", (unsigned long)atomic_long_read(&mmap_pages_allocated)); #endif show_val_kb(m, "SwapTotal: ", i.totalswap); show_val_kb(m, "SwapFree: ", i.freeswap); #ifdef CONFIG_ZSWAP show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", (unsigned long)atomic_long_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", global_node_page_state(NR_FILE_DIRTY)); show_val_kb(m, "Writeback: ", global_node_page_state(NR_WRITEBACK)); show_val_kb(m, "AnonPages: ", global_node_page_state(NR_ANON_MAPPED)); show_val_kb(m, "Mapped: ", global_node_page_state(NR_FILE_MAPPED)); show_val_kb(m, "Shmem: ", i.sharedram); show_val_kb(m, "KReclaimable: ", sreclaimable + global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE)); show_val_kb(m, "Slab: ", sreclaimable + sunreclaim); show_val_kb(m, "SReclaimable: ", sreclaimable); show_val_kb(m, "SUnreclaim: ", sunreclaim); seq_printf(m, "KernelStack: %8lu kB\n", global_node_page_state(NR_KERNEL_STACK_KB)); #ifdef CONFIG_SHADOW_CALL_STACK seq_printf(m, "ShadowCallStack:%8lu kB\n", global_node_page_state(NR_KERNEL_SCS_KB)); #endif show_val_kb(m, "PageTables: ", global_node_page_state(NR_PAGETABLE)); show_val_kb(m, "SecPageTables: ", global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", 0); show_val_kb(m, "WritebackTmp: ", 0); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); show_val_kb(m, "Committed_AS: ", committed); seq_printf(m, "VmallocTotal: %8lu kB\n", (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", vmalloc_nr_pages()); show_val_kb(m, "VmallocChunk: ", 0ul); show_val_kb(m, "Percpu: ", pcpu_nr_pages()); memtest_report_meminfo(m); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10)); #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE show_val_kb(m, "AnonHugePages: ", global_node_page_state(NR_ANON_THPS)); show_val_kb(m, "ShmemHugePages: ", global_node_page_state(NR_SHMEM_THPS)); show_val_kb(m, "ShmemPmdMapped: ", global_node_page_state(NR_SHMEM_PMDMAPPED)); show_val_kb(m, "FileHugePages: ", global_node_page_state(NR_FILE_THPS)); show_val_kb(m, "FilePmdMapped: ", global_node_page_state(NR_FILE_PMDMAPPED)); #endif #ifdef CONFIG_CMA show_val_kb(m, "CmaTotal: ", totalcma_pages); show_val_kb(m, "CmaFree: ", global_zone_page_state(NR_FREE_CMA_PAGES)); #endif #ifdef CONFIG_UNACCEPTED_MEMORY show_val_kb(m, "Unaccepted: ", global_zone_page_state(NR_UNACCEPTED)); #endif show_val_kb(m, "Balloon: ", global_node_page_state(NR_BALLOON_PAGES)); hugetlb_report_meminfo(m); arch_report_meminfo(m); return 0; } static int __init proc_meminfo_init(void) { struct proc_dir_entry *pde; pde = proc_create_single("meminfo", 0, NULL, meminfo_proc_show); pde_make_permanent(pde); return 0; } fs_initcall(proc_meminfo_init);
8 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2000,2002-2005 Silicon Graphics, Inc. * All Rights Reserved. */ #ifndef __XFS_BMAP_BTREE_H__ #define __XFS_BMAP_BTREE_H__ struct xfs_btree_cur; struct xfs_btree_block; struct xfs_mount; struct xfs_inode; struct xfs_trans; struct xbtree_ifakeroot; /* * Maximum number of bmap btree levels. */ #define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)]) /* * Prototypes for xfs_bmap.c to call. */ extern void xfs_bmdr_to_bmbt(struct xfs_inode *, xfs_bmdr_block_t *, int, struct xfs_btree_block *, int); void xfs_bmbt_disk_set_all(struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(const struct xfs_bmbt_rec *r); extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(const struct xfs_bmbt_rec *r); void xfs_bmbt_disk_get_all(const struct xfs_bmbt_rec *r, struct xfs_bmbt_irec *s); extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int, xfs_bmdr_block_t *, int); extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level); extern int xfs_bmdr_maxrecs(int blocklen, int leaf); unsigned int xfs_bmbt_maxrecs(struct xfs_mount *mp, unsigned int blocklen, bool leaf); extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, int whichfork, xfs_ino_t new_owner, struct list_head *buffer_list); extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur, struct xfs_trans *tp, int whichfork); extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp, unsigned long long len); unsigned int xfs_bmbt_maxlevels_ondisk(void); int __init xfs_bmbt_init_cur_cache(void); void xfs_bmbt_destroy_cur_cache(void); void xfs_bmbt_init_block(struct xfs_inode *ip, struct xfs_btree_block *buf, struct xfs_buf *bp, __u16 level, __u16 numrecs); /* * Btree block header size depends on a superblock flag. */ static inline size_t xfs_bmbt_block_len(struct xfs_mount *mp) { return xfs_has_crc(mp) ? XFS_BTREE_LBLOCK_CRC_LEN : XFS_BTREE_LBLOCK_LEN; } /* Addresses of key, pointers, and records within an incore bmbt block. */ static inline struct xfs_bmbt_rec * xfs_bmbt_rec_addr( struct xfs_mount *mp, struct xfs_btree_block *block, unsigned int index) { return (struct xfs_bmbt_rec *) ((char *)block + xfs_bmbt_block_len(mp) + (index - 1) * sizeof(struct xfs_bmbt_rec)); } static inline struct xfs_bmbt_key * xfs_bmbt_key_addr( struct xfs_mount *mp, struct xfs_btree_block *block, unsigned int index) { return (struct xfs_bmbt_key *) ((char *)block + xfs_bmbt_block_len(mp) + (index - 1) * sizeof(struct xfs_bmbt_key *)); } static inline xfs_bmbt_ptr_t * xfs_bmbt_ptr_addr( struct xfs_mount *mp, struct xfs_btree_block *block, unsigned int index, unsigned int maxrecs) { return (xfs_bmbt_ptr_t *) ((char *)block + xfs_bmbt_block_len(mp) + maxrecs * sizeof(struct xfs_bmbt_key) + (index - 1) * sizeof(xfs_bmbt_ptr_t)); } /* Addresses of key, pointers, and records within an ondisk bmbt block. */ static inline struct xfs_bmbt_rec * xfs_bmdr_rec_addr( struct xfs_bmdr_block *block, unsigned int index) { return (struct xfs_bmbt_rec *) ((char *)(block + 1) + (index - 1) * sizeof(struct xfs_bmbt_rec)); } static inline struct xfs_bmbt_key * xfs_bmdr_key_addr( struct xfs_bmdr_block *block, unsigned int index) { return (struct xfs_bmbt_key *) ((char *)(block + 1) + (index - 1) * sizeof(struct xfs_bmbt_key)); } static inline xfs_bmbt_ptr_t * xfs_bmdr_ptr_addr( struct xfs_bmdr_block *block, unsigned int index, unsigned int maxrecs) { return (xfs_bmbt_ptr_t *) ((char *)(block + 1) + maxrecs * sizeof(struct xfs_bmbt_key) + (index - 1) * sizeof(xfs_bmbt_ptr_t)); } /* * Address of pointers within the incore btree root. * * These are to be used when we know the size of the block and * we don't have a cursor. */ static inline xfs_bmbt_ptr_t * xfs_bmap_broot_ptr_addr( struct xfs_mount *mp, struct xfs_btree_block *bb, unsigned int i, unsigned int sz) { return xfs_bmbt_ptr_addr(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, false)); } /* * Compute the space required for the incore btree root containing the given * number of records. */ static inline size_t xfs_bmap_broot_space_calc( struct xfs_mount *mp, unsigned int nrecs) { return xfs_bmbt_block_len(mp) + (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t))); } /* * Compute the space required for the incore btree root given the ondisk * btree root block. */ static inline size_t xfs_bmap_broot_space( struct xfs_mount *mp, struct xfs_bmdr_block *bb) { return xfs_bmap_broot_space_calc(mp, be16_to_cpu(bb->bb_numrecs)); } /* Compute the space required for the ondisk root block. */ static inline size_t xfs_bmdr_space_calc(unsigned int nrecs) { return sizeof(struct xfs_bmdr_block) + (nrecs * (sizeof(struct xfs_bmbt_key) + sizeof(xfs_bmbt_ptr_t))); } /* * Compute the space required for the ondisk root block given an incore root * block. */ static inline size_t xfs_bmap_bmdr_space(struct xfs_btree_block *bb) { return xfs_bmdr_space_calc(be16_to_cpu(bb->bb_numrecs)); } struct xfs_btree_block *xfs_bmap_broot_realloc(struct xfs_inode *ip, int whichfork, unsigned int new_numrecs); #endif /* __XFS_BMAP_BTREE_H__ */
110 109 110 133 124 147 138 518 245 206 207 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __SOCK_DIAG_H__ #define __SOCK_DIAG_H__ #include <linux/netlink.h> #include <linux/user_namespace.h> #include <net/net_namespace.h> #include <net/sock.h> #include <uapi/linux/sock_diag.h> struct sk_buff; struct nlmsghdr; struct sock; struct sock_diag_handler { struct module *owner; __u8 family; int (*dump)(struct sk_buff *skb, struct nlmsghdr *nlh); int (*get_info)(struct sk_buff *skb, struct sock *sk); int (*destroy)(struct sk_buff *skb, struct nlmsghdr *nlh); }; int sock_diag_register(const struct sock_diag_handler *h); void sock_diag_unregister(const struct sock_diag_handler *h); struct sock_diag_inet_compat { struct module *owner; int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh); }; void sock_diag_register_inet_compat(const struct sock_diag_inet_compat *ptr); void sock_diag_unregister_inet_compat(const struct sock_diag_inet_compat *ptr); u64 __sock_gen_cookie(struct sock *sk); static inline u64 sock_gen_cookie(struct sock *sk) { u64 cookie; preempt_disable(); cookie = __sock_gen_cookie(sk); preempt_enable(); return cookie; } int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attr); int sock_diag_put_filterinfo(bool may_report_filterinfo, struct sock *sk, struct sk_buff *skb, int attrtype); static inline enum sknetlink_groups sock_diag_destroy_group(const struct sock *sk) { switch (sk->sk_family) { case AF_INET: if (sk->sk_type == SOCK_RAW) return SKNLGRP_NONE; switch (sk->sk_protocol) { case IPPROTO_TCP: return SKNLGRP_INET_TCP_DESTROY; case IPPROTO_UDP: return SKNLGRP_INET_UDP_DESTROY; default: return SKNLGRP_NONE; } case AF_INET6: if (sk->sk_type == SOCK_RAW) return SKNLGRP_NONE; switch (sk->sk_protocol) { case IPPROTO_TCP: return SKNLGRP_INET6_TCP_DESTROY; case IPPROTO_UDP: return SKNLGRP_INET6_UDP_DESTROY; default: return SKNLGRP_NONE; } default: return SKNLGRP_NONE; } } static inline bool sock_diag_has_destroy_listeners(const struct sock *sk) { const struct net *n = sock_net(sk); const enum sknetlink_groups group = sock_diag_destroy_group(sk); return group != SKNLGRP_NONE && n->diag_nlsk && netlink_has_listeners(n->diag_nlsk, group); } void sock_diag_broadcast_destroy(struct sock *sk); int sock_diag_destroy(struct sock *sk, int err); #endif
5 5 5 5 5 5 12 5 5 5 5 5 5 5 5 5 5 5 5 5 5 112 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 63 63 58 58 63 63 63 63 63 63 5 5 5 5 5 4 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner, Russell King * * This file contains the interrupt descriptor management code. Detailed * information is available in Documentation/core-api/genericirq.rst * */ #include <linux/irq.h> #include <linux/slab.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/kernel_stat.h> #include <linux/maple_tree.h> #include <linux/irqdomain.h> #include <linux/sysfs.h> #include <linux/string_choices.h> #include "internals.h" /* * lockdep: we want to handle all irq_desc locks as a single lock-class: */ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) static int __init irq_affinity_setup(char *str) { alloc_bootmem_cpumask_var(&irq_default_affinity); cpulist_parse(str, irq_default_affinity); /* * Set at least the boot cpu. We don't want to end up with * bugreports caused by random commandline masks */ cpumask_set_cpu(smp_processor_id(), irq_default_affinity); return 1; } __setup("irqaffinity=", irq_affinity_setup); static void __init init_irq_default_affinity(void) { if (!cpumask_available(irq_default_affinity)) zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); if (cpumask_empty(irq_default_affinity)) cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) { } #endif #ifdef CONFIG_SMP static int alloc_masks(struct irq_desc *desc, int node) { if (!zalloc_cpumask_var_node(&desc->irq_common_data.affinity, GFP_KERNEL, node)) return -ENOMEM; #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK if (!zalloc_cpumask_var_node(&desc->irq_common_data.effective_affinity, GFP_KERNEL, node)) { free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif #ifdef CONFIG_GENERIC_PENDING_IRQ if (!zalloc_cpumask_var_node(&desc->pending_mask, GFP_KERNEL, node)) { #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif free_cpumask_var(desc->irq_common_data.affinity); return -ENOMEM; } #endif return 0; } static void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { if (!affinity) affinity = irq_default_affinity; cpumask_copy(desc->irq_common_data.affinity, affinity); #ifdef CONFIG_GENERIC_PENDING_IRQ cpumask_clear(desc->pending_mask); #endif #ifdef CONFIG_NUMA desc->irq_common_data.node = node; #endif } static void free_masks(struct irq_desc *desc) { #ifdef CONFIG_GENERIC_PENDING_IRQ free_cpumask_var(desc->pending_mask); #endif free_cpumask_var(desc->irq_common_data.affinity); #ifdef CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK free_cpumask_var(desc->irq_common_data.effective_affinity); #endif } #else static inline int alloc_masks(struct irq_desc *desc, int node) { return 0; } static inline void desc_smp_init(struct irq_desc *desc, int node, const struct cpumask *affinity) { } static inline void free_masks(struct irq_desc *desc) { } #endif static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, const struct cpumask *affinity, struct module *owner) { int cpu; desc->irq_common_data.handler_data = NULL; desc->irq_common_data.msi_desc = NULL; desc->irq_data.common = &desc->irq_common_data; desc->irq_data.irq = irq; desc->irq_data.chip = &no_irq_chip; desc->irq_data.chip_data = NULL; irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); desc->handle_irq = handle_bad_irq; desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) *per_cpu_ptr(desc->kstat_irqs, cpu) = (struct irqstat) { }; desc_smp_init(desc, node, affinity); } static unsigned int nr_irqs = NR_IRQS; /** * irq_get_nr_irqs() - Number of interrupts supported by the system. */ unsigned int irq_get_nr_irqs(void) { return nr_irqs; } EXPORT_SYMBOL_GPL(irq_get_nr_irqs); /** * irq_set_nr_irqs() - Set the number of interrupts supported by the system. * @nr: New number of interrupts. * * Return: @nr. */ unsigned int irq_set_nr_irqs(unsigned int nr) { nr_irqs = nr; return nr; } EXPORT_SYMBOL_GPL(irq_set_nr_irqs); static DEFINE_MUTEX(sparse_irq_lock); static struct maple_tree sparse_irqs = MTREE_INIT_EXT(sparse_irqs, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU, sparse_irq_lock); static int irq_find_free_area(unsigned int from, unsigned int cnt) { MA_STATE(mas, &sparse_irqs, 0, 0); if (mas_empty_area(&mas, from, MAX_SPARSE_IRQS, cnt)) return -ENOSPC; return mas.index; } static unsigned int irq_find_at_or_after(unsigned int offset) { unsigned long index = offset; struct irq_desc *desc; guard(rcu)(); desc = mt_find(&sparse_irqs, &index, nr_irqs); return desc ? irq_desc_get_irq(desc) : nr_irqs; } static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) { MA_STATE(mas, &sparse_irqs, irq, irq); WARN_ON(mas_store_gfp(&mas, desc, GFP_KERNEL) != 0); } static void delete_irq_desc(unsigned int irq) { MA_STATE(mas, &sparse_irqs, irq, irq); mas_erase(&mas); } #ifdef CONFIG_SPARSE_IRQ static const struct kobj_type irq_kobj_type; #endif static int init_desc(struct irq_desc *desc, int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { desc->kstat_irqs = alloc_percpu(struct irqstat); if (!desc->kstat_irqs) return -ENOMEM; if (alloc_masks(desc, node)) { free_percpu(desc->kstat_irqs); return -ENOMEM; } raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); mutex_init(&desc->request_mutex); init_waitqueue_head(&desc->wait_for_threads); desc_set_defaults(irq, desc, node, affinity, owner); irqd_set(&desc->irq_data, flags); irq_resend_init(desc); #ifdef CONFIG_SPARSE_IRQ kobject_init(&desc->kobj, &irq_kobj_type); init_rcu_head(&desc->rcu); #endif return 0; } #ifdef CONFIG_SPARSE_IRQ static void irq_kobj_release(struct kobject *kobj); #ifdef CONFIG_SYSFS static struct kobject *irq_kobj_base; #define IRQ_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); ssize_t ret = 0; char *p = ""; int cpu; for_each_possible_cpu(cpu) { unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += sysfs_emit_at(buf, ret, "%s%u", p, c); p = ","; } ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(per_cpu_count); static ssize_t chip_name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.chip && desc->irq_data.chip->name) return sysfs_emit(buf, "%s\n", desc->irq_data.chip->name); return 0; } IRQ_ATTR_RO(chip_name); static ssize_t hwirq_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->irq_data.domain) return sysfs_emit(buf, "%lu\n", desc->irq_data.hwirq); return 0; } IRQ_ATTR_RO(hwirq); static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", irqd_is_level_type(&desc->irq_data) ? "level" : "edge"); } IRQ_ATTR_RO(type); static ssize_t wakeup_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); return sysfs_emit(buf, "%s\n", str_enabled_disabled(irqd_is_wakeup_set(&desc->irq_data))); } IRQ_ATTR_RO(wakeup); static ssize_t name_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); guard(raw_spinlock_irq)(&desc->lock); if (desc->name) return sysfs_emit(buf, "%s\n", desc->name); return 0; } IRQ_ATTR_RO(name); static ssize_t actions_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); struct irqaction *action; ssize_t ret = 0; char *p = ""; scoped_guard(raw_spinlock_irq, &desc->lock) { for_each_action_of_desc(desc, action) { ret += sysfs_emit_at(buf, ret, "%s%s", p, action->name); p = ","; } } if (ret) ret += sysfs_emit_at(buf, ret, "\n"); return ret; } IRQ_ATTR_RO(actions); static struct attribute *irq_attrs[] = { &per_cpu_count_attr.attr, &chip_name_attr.attr, &hwirq_attr.attr, &type_attr.attr, &wakeup_attr.attr, &name_attr.attr, &actions_attr.attr, NULL }; ATTRIBUTE_GROUPS(irq); static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, .sysfs_ops = &kobj_sysfs_ops, .default_groups = irq_groups, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) { if (irq_kobj_base) { /* * Continue even in case of failure as this is nothing * crucial and failures in the late irq_sysfs_init() * cannot be rolled back. */ if (kobject_add(&desc->kobj, irq_kobj_base, "%d", irq)) pr_warn("Failed to add kobject for irq %d\n", irq); else desc->istate |= IRQS_SYSFS; } } static void irq_sysfs_del(struct irq_desc *desc) { /* * Only invoke kobject_del() when kobject_add() was successfully * invoked for the descriptor. This covers both early boot, where * sysfs is not initialized yet, and the case of a failed * kobject_add() invocation. */ if (desc->istate & IRQS_SYSFS) kobject_del(&desc->kobj); } static int __init irq_sysfs_init(void) { struct irq_desc *desc; int irq; /* Prevent concurrent irq alloc/free */ guard(mutex)(&sparse_irq_lock); irq_kobj_base = kobject_create_and_add("irq", kernel_kobj); if (!irq_kobj_base) return -ENOMEM; /* Add the already allocated interrupts */ for_each_irq_desc(irq, desc) irq_sysfs_add(irq, desc); return 0; } postcore_initcall(irq_sysfs_init); #else /* !CONFIG_SYSFS */ static const struct kobj_type irq_kobj_type = { .release = irq_kobj_release, }; static void irq_sysfs_add(int irq, struct irq_desc *desc) {} static void irq_sysfs_del(struct irq_desc *desc) {} #endif /* CONFIG_SYSFS */ struct irq_desc *irq_to_desc(unsigned int irq) { return mtree_load(&sparse_irqs, irq); } #ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif void irq_lock_sparse(void) { mutex_lock(&sparse_irq_lock); } void irq_unlock_sparse(void) { mutex_unlock(&sparse_irq_lock); } static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags, const struct cpumask *affinity, struct module *owner) { struct irq_desc *desc; int ret; desc = kzalloc_node(sizeof(*desc), GFP_KERNEL, node); if (!desc) return NULL; ret = init_desc(desc, irq, node, flags, affinity, owner); if (unlikely(ret)) { kfree(desc); return NULL; } return desc; } static void irq_kobj_release(struct kobject *kobj) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); free_masks(desc); free_percpu(desc->kstat_irqs); kfree(desc); } static void delayed_free_desc(struct rcu_head *rhp) { struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); kobject_put(&desc->kobj); } static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); irq_remove_debugfs_entry(desc); unregister_irq_proc(irq, desc); /* * sparse_irq_lock protects also show_interrupts() and * kstat_irq_usr(). Once we deleted the descriptor from the * sparse tree we can free it. Access in proc will fail to * lookup the descriptor. * * The sysfs entry must be serialized against a concurrent * irq_sysfs_init() as well. */ irq_sysfs_del(desc); delete_irq_desc(irq); /* * We free the descriptor, masks and stat fields via RCU. That * allows demultiplex interrupts to do rcu based management of * the child interrupts. * This also allows us to use rcu in kstat_irqs_usr(). */ call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { struct irq_desc *desc; int i; /* Validate affinity mask(s) */ if (affinity) { for (i = 0; i < cnt; i++) { if (cpumask_empty(&affinity[i].mask)) return -EINVAL; } } for (i = 0; i < cnt; i++) { const struct cpumask *mask = NULL; unsigned int flags = 0; if (affinity) { if (affinity->is_managed) { flags = IRQD_AFFINITY_MANAGED | IRQD_MANAGED_SHUTDOWN; } flags |= IRQD_AFFINITY_SET; mask = &affinity->mask; node = cpu_to_node(cpumask_first(mask)); affinity++; } desc = alloc_desc(start + i, node, flags, mask, owner); if (!desc) goto err; irq_insert_desc(start + i, desc); irq_sysfs_add(start + i, desc); irq_add_debugfs_entry(start + i, desc); } return start; err: for (i--; i >= 0; i--) free_desc(start + i); return -ENOMEM; } static bool irq_expand_nr_irqs(unsigned int nr) { if (nr > MAX_SPARSE_IRQS) return false; nr_irqs = nr; return true; } int __init early_irq_init(void) { int i, initcnt, node = first_online_node; struct irq_desc *desc; init_irq_default_affinity(); /* Let arch update nr_irqs and return the nr of preallocated irqs */ initcnt = arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS: %d, nr_irqs: %d, preallocated irqs: %d\n", NR_IRQS, nr_irqs, initcnt); if (WARN_ON(nr_irqs > MAX_SPARSE_IRQS)) nr_irqs = MAX_SPARSE_IRQS; if (WARN_ON(initcnt > MAX_SPARSE_IRQS)) initcnt = MAX_SPARSE_IRQS; if (initcnt > nr_irqs) nr_irqs = initcnt; for (i = 0; i < initcnt; i++) { desc = alloc_desc(i, node, 0, NULL, NULL); irq_insert_desc(i, desc); } return arch_early_irq_init(); } #else /* !CONFIG_SPARSE_IRQ */ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { [0 ... NR_IRQS-1] = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), } }; int __init early_irq_init(void) { int count, i, node = first_online_node; int ret; init_irq_default_affinity(); printk(KERN_INFO "NR_IRQS: %d\n", NR_IRQS); count = ARRAY_SIZE(irq_desc); for (i = 0; i < count; i++) { ret = init_desc(irq_desc + i, i, node, 0, NULL, NULL); if (unlikely(ret)) goto __free_desc_res; } return arch_early_irq_init(); __free_desc_res: while (--i >= 0) { free_masks(irq_desc + i); free_percpu(irq_desc[i].kstat_irqs); } return ret; } struct irq_desc *irq_to_desc(unsigned int irq) { return (irq < NR_IRQS) ? irq_desc + irq : NULL; } EXPORT_SYMBOL(irq_to_desc); static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); scoped_guard(raw_spinlock_irqsave, &desc->lock) desc_set_defaults(irq, desc, irq_desc_get_node(desc), NULL, NULL); delete_irq_desc(irq); } static inline int alloc_descs(unsigned int start, unsigned int cnt, int node, const struct irq_affinity_desc *affinity, struct module *owner) { u32 i; for (i = 0; i < cnt; i++) { struct irq_desc *desc = irq_to_desc(start + i); desc->owner = owner; irq_insert_desc(start + i, desc); } return start; } static inline bool irq_expand_nr_irqs(unsigned int nr) { return false; } void irq_mark_irq(unsigned int irq) { guard(mutex)(&sparse_irq_lock); irq_insert_desc(irq, irq_desc + irq); } #endif /* !CONFIG_SPARSE_IRQ */ int handle_irq_desc(struct irq_desc *desc) { struct irq_data *data; if (!desc) return -EINVAL; data = irq_desc_get_irq_data(desc); if (WARN_ON_ONCE(!in_hardirq() && irqd_is_handle_enforce_irqctx(data))) return -EPERM; generic_handle_irq_desc(desc); return 0; } /** * generic_handle_irq - Invoke the handler for a particular irq * @irq: The irq number to handle * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_irq(unsigned int irq) { return handle_irq_desc(irq_to_desc(irq)); } EXPORT_SYMBOL_GPL(generic_handle_irq); /** * generic_handle_irq_safe - Invoke the handler for a particular irq from any * context. * @irq: The irq number to handle * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process context). It * will report an error if not invoked from IRQ context and the irq has been * marked to enforce IRQ-context only. */ int generic_handle_irq_safe(unsigned int irq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_to_desc(irq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_irq_safe); #ifdef CONFIG_IRQ_DOMAIN /** * generic_handle_domain_irq - Invoke the handler for a HW irq belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an IRQ context with irq regs * initialized. */ int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq) { return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } EXPORT_SYMBOL_GPL(generic_handle_domain_irq); /** * generic_handle_irq_safe - Invoke the handler for a HW irq belonging * to a domain from any context. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, a negative value on error. * * This function can be called from any context (IRQ or process * context). If the interrupt is marked as 'enforce IRQ-context only' then * the function must be invoked from hard interrupt context. */ int generic_handle_domain_irq_safe(struct irq_domain *domain, unsigned int hwirq) { unsigned long flags; int ret; local_irq_save(flags); ret = handle_irq_desc(irq_resolve_mapping(domain, hwirq)); local_irq_restore(flags); return ret; } EXPORT_SYMBOL_GPL(generic_handle_domain_irq_safe); /** * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging * to a domain. * @domain: The domain where to perform the lookup * @hwirq: The HW irq number to convert to a logical one * * Returns: 0 on success, or -EINVAL if conversion has failed * * This function must be called from an NMI context with irq regs * initialized. **/ int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq) { WARN_ON_ONCE(!in_nmi()); return handle_irq_desc(irq_resolve_mapping(domain, hwirq)); } #endif /* Dynamic interrupt handling */ /** * irq_free_descs - free irq descriptors * @from: Start of descriptor range * @cnt: Number of consecutive irqs to free */ void irq_free_descs(unsigned int from, unsigned int cnt) { int i; if (from >= nr_irqs || (from + cnt) > nr_irqs) return; guard(mutex)(&sparse_irq_lock); for (i = 0; i < cnt; i++) free_desc(from + i); } EXPORT_SYMBOL_GPL(irq_free_descs); /** * __irq_alloc_descs - allocate and initialize a range of irq descriptors * @irq: Allocate for specific irq number if irq >= 0 * @from: Start the search from this irq number * @cnt: Number of consecutive irqs to allocate. * @node: Preferred node on which the irq descriptor should be allocated * @owner: Owning module (can be NULL) * @affinity: Optional pointer to an affinity mask array of size @cnt which * hints where the irq descriptors should be allocated and which * default affinities to use * * Returns the first irq number or error code */ int __ref __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, struct module *owner, const struct irq_affinity_desc *affinity) { int start; if (!cnt) return -EINVAL; if (irq >= 0) { if (from > irq) return -EINVAL; from = irq; } else { /* * For interrupts which are freely allocated the * architecture can force a lower bound to the @from * argument. x86 uses this to exclude the GSI space. */ from = arch_dynirq_lower_bound(from); } guard(mutex)(&sparse_irq_lock); start = irq_find_free_area(from, cnt); if (irq >=0 && start != irq) return -EEXIST; if (start + cnt > nr_irqs) { if (!irq_expand_nr_irqs(start + cnt)) return -ENOMEM; } return alloc_descs(start, cnt, node, affinity, owner); } EXPORT_SYMBOL_GPL(__irq_alloc_descs); /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search * * Returns next irq number after offset or nr_irqs if none is found. */ unsigned int irq_get_next_irq(unsigned int offset) { return irq_find_at_or_after(offset); } struct irq_desc *__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check) { struct irq_desc *desc; desc = irq_to_desc(irq); if (!desc) return NULL; if (check & _IRQ_DESC_CHECK) { if ((check & _IRQ_DESC_PERCPU) && !irq_settings_is_per_cpu_devid(desc)) return NULL; if (!(check & _IRQ_DESC_PERCPU) && irq_settings_is_per_cpu_devid(desc)) return NULL; } if (bus) chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, *flags); return desc; } void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) __releases(&desc->lock) { raw_spin_unlock_irqrestore(&desc->lock, flags); if (bus) chip_bus_sync_unlock(desc); } int irq_set_percpu_devid_partition(unsigned int irq, const struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || desc->percpu_enabled) return -EINVAL; desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL); if (!desc->percpu_enabled) return -ENOMEM; desc->percpu_affinity = affinity ? : cpu_possible_mask; irq_set_percpu_devid_flags(irq); return 0; } int irq_set_percpu_devid(unsigned int irq) { return irq_set_percpu_devid_partition(irq, NULL); } int irq_get_percpu_devid_partition(unsigned int irq, struct cpumask *affinity) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->percpu_enabled) return -EINVAL; if (affinity) cpumask_copy(affinity, desc->percpu_affinity); return 0; } EXPORT_SYMBOL_GPL(irq_get_percpu_devid_partition); void kstat_incr_irq_this_cpu(unsigned int irq) { kstat_incr_irqs_this_cpu(irq_to_desc(irq)); } /** * kstat_irqs_cpu - Get the statistics for an interrupt on a cpu * @irq: The interrupt number * @cpu: The cpu number * * Returns the sum of interrupt counts on @cpu since boot for * @irq. The caller must ensure that the interrupt is not removed * concurrently. */ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { struct irq_desc *desc = irq_to_desc(irq); return desc && desc->kstat_irqs ? per_cpu(desc->kstat_irqs->cnt, cpu) : 0; } static unsigned int kstat_irqs_desc(struct irq_desc *desc, const struct cpumask *cpumask) { unsigned int sum = 0; int cpu; if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) return data_race(desc->tot_count); for_each_cpu(cpu, cpumask) sum += data_race(per_cpu(desc->kstat_irqs->cnt, cpu)); return sum; } static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return kstat_irqs_desc(desc, cpu_possible_mask); } #ifdef CONFIG_GENERIC_IRQ_STAT_SNAPSHOT void kstat_snapshot_irqs(void) { struct irq_desc *desc; unsigned int irq; for_each_irq_desc(irq, desc) { if (!desc->kstat_irqs) continue; this_cpu_write(desc->kstat_irqs->ref, this_cpu_read(desc->kstat_irqs->cnt)); } } unsigned int kstat_get_irq_since_snapshot(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); if (!desc || !desc->kstat_irqs) return 0; return this_cpu_read(desc->kstat_irqs->cnt) - this_cpu_read(desc->kstat_irqs->ref); } #endif /** * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. * * It uses rcu to protect the access since a concurrent removal of an * interrupt descriptor is observing an rcu grace period before * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { unsigned int sum; rcu_read_lock(); sum = kstat_irqs(irq); rcu_read_unlock(); return sum; } #ifdef CONFIG_LOCKDEP void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { struct irq_desc *desc = irq_to_desc(irq); if (desc) { lockdep_set_class(&desc->lock, lock_class); lockdep_set_class(&desc->request_mutex, request_class); } } EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); #endif
142 344 408 257 391 391 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. */ #ifndef BTRFS_LOCKING_H #define BTRFS_LOCKING_H #include <linux/atomic.h> #include <linux/wait.h> #include <linux/lockdep.h> #include <linux/percpu_counter.h> #include "extent_io.h" struct extent_buffer; struct btrfs_path; struct btrfs_root; #define BTRFS_WRITE_LOCK 1 #define BTRFS_READ_LOCK 2 /* * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at * the time of this patch is 8, which is how many we use. Keep this in mind if * you decide you want to add another subclass. */ enum btrfs_lock_nesting { BTRFS_NESTING_NORMAL, /* * When we COW a block we are holding the lock on the original block, * and since our lockdep maps are rootid+level, this confuses lockdep * when we lock the newly allocated COW'd block. Handle this by having * a subclass for COW'ed blocks so that lockdep doesn't complain. */ BTRFS_NESTING_COW, /* * Oftentimes we need to lock adjacent nodes on the same level while * still holding the lock on the original node we searched to, such as * for searching forward or for split/balance. * * Because of this we need to indicate to lockdep that this is * acceptable by having a different subclass for each of these * operations. */ BTRFS_NESTING_LEFT, BTRFS_NESTING_RIGHT, /* * When splitting we will be holding a lock on the left/right node when * we need to cow that node, thus we need a new set of subclasses for * these two operations. */ BTRFS_NESTING_LEFT_COW, BTRFS_NESTING_RIGHT_COW, /* * When splitting we may push nodes to the left or right, but still use * the subsequent nodes in our path, keeping our locks on those adjacent * blocks. Thus when we go to allocate a new split block we've already * used up all of our available subclasses, so this subclass exists to * handle this case where we need to allocate a new split block. */ BTRFS_NESTING_SPLIT, /* * When promoting a new block to a root we need to have a special * subclass so we don't confuse lockdep, as it will appear that we are * locking a higher level node before a lower level one. Copying also * has this problem as it appears we're locking the same block again * when we make a snapshot of an existing root. */ BTRFS_NESTING_NEW_ROOT, /* * We are limited to MAX_LOCKDEP_SUBCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over * the limit. As of this writing we're limited to 8, and we're * definitely using 8, hence this check to keep us from messing up in * the future. */ BTRFS_NESTING_MAX, }; enum btrfs_lockdep_trans_states { BTRFS_LOCKDEP_TRANS_COMMIT_PREP, BTRFS_LOCKDEP_TRANS_UNBLOCKED, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, BTRFS_LOCKDEP_TRANS_COMPLETED, }; /* * Lockdep annotation for wait events. * * @owner: The struct where the lockdep map is defined * @lock: The lockdep map corresponding to a wait event * * This macro is used to annotate a wait event. In this case a thread acquires * the lockdep map as writer (exclusive lock) because it has to block until all * the threads that hold the lock as readers signal the condition for the wait * event and release their locks. */ #define btrfs_might_wait_for_event(owner, lock) \ do { \ rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ rwsem_release(&owner->lock##_map, _THIS_IP_); \ } while (0) /* * Protection for the resource/condition of a wait event. * * @owner: The struct where the lockdep map is defined * @lock: The lockdep map corresponding to a wait event * * Many threads can modify the condition for the wait event at the same time * and signal the threads that block on the wait event. The threads that modify * the condition and do the signaling acquire the lock as readers (shared * lock). */ #define btrfs_lockdep_acquire(owner, lock) \ rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) /* * Used after signaling the condition for a wait event to release the lockdep * map held by a reader thread. */ #define btrfs_lockdep_release(owner, lock) \ rwsem_release(&owner->lock##_map, _THIS_IP_) /* * Used to account for the fact that when doing io_uring encoded I/O, we can * return to userspace with the inode lock still held. */ #define btrfs_lockdep_inode_acquire(owner, lock) \ rwsem_acquire_read(&owner->vfs_inode.lock.dep_map, 0, 0, _THIS_IP_) #define btrfs_lockdep_inode_release(owner, lock) \ rwsem_release(&owner->vfs_inode.lock.dep_map, _THIS_IP_) /* * Macros for the transaction states wait events, similar to the generic wait * event macros. */ #define btrfs_might_wait_for_state(owner, i) \ do { \ rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ } while (0) #define btrfs_trans_state_lockdep_acquire(owner, i) \ rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) #define btrfs_trans_state_lockdep_release(owner, i) \ rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) /* Initialization of the lockdep map */ #define btrfs_lockdep_init_map(owner, lock) \ do { \ static struct lock_class_key lock##_key; \ lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ } while (0) /* Initialization of the transaction states lockdep maps. */ #define btrfs_state_lockdep_init_map(owner, lock, state) \ do { \ static struct lock_class_key lock##_key; \ lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ &lock##_key, 0); \ } while (0) static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); static inline void btrfs_tree_lock(struct extent_buffer *eb) { btrfs_tree_lock_nested(eb, BTRFS_NESTING_NORMAL); } void btrfs_tree_unlock(struct extent_buffer *eb); void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest); static inline void btrfs_tree_read_lock(struct extent_buffer *eb) { btrfs_tree_read_lock_nested(eb, BTRFS_NESTING_NORMAL); } void btrfs_tree_read_unlock(struct extent_buffer *eb); bool btrfs_try_tree_read_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { lockdep_assert_held_write(&eb->lock); } static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { lockdep_assert_held_read(&eb->lock); } #else static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb) { } static inline void btrfs_assert_tree_read_locked(struct extent_buffer *eb) { } #endif void btrfs_unlock_up_safe(struct btrfs_path *path, int level); static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw) { if (rw == BTRFS_WRITE_LOCK) btrfs_tree_unlock(eb); else if (rw == BTRFS_READ_LOCK) btrfs_tree_read_unlock(eb); else BUG(); } struct btrfs_drew_lock { atomic_t readers; atomic_t writers; wait_queue_head_t pending_writers; wait_queue_head_t pending_readers; }; void btrfs_drew_lock_init(struct btrfs_drew_lock *lock); void btrfs_drew_write_lock(struct btrfs_drew_lock *lock); bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock); void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock); void btrfs_drew_read_lock(struct btrfs_drew_lock *lock); void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level); void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb); #else static inline void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) { } static inline void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) { } #endif #endif
149 149 150 137 137 137 137 137 136 137 137 17 17 17 17 137 137 137 137 136 136 27 17 27 17 153 22 150 149 150 149 147 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 // SPDX-License-Identifier: GPL-2.0+ /* * linux/fs/jbd2/revoke.c * * Written by Stephen C. Tweedie <sct@redhat.com>, 2000 * * Copyright 2000 Red Hat corp --- All Rights Reserved * * Journal revoke routines for the generic filesystem journaling code; * part of the ext2fs journaling system. * * Revoke is the mechanism used to prevent old log records for deleted * metadata from being replayed on top of newer data using the same * blocks. The revoke mechanism is used in two separate places: * * + Commit: during commit we write the entire list of the current * transaction's revoked blocks to the journal * * + Recovery: during recovery we record the transaction ID of all * revoked blocks. If there are multiple revoke records in the log * for a single block, only the last one counts, and if there is a log * entry for a block beyond the last revoke, then that log entry still * gets replayed. * * We can get interactions between revokes and new log data within a * single transaction: * * Block is revoked and then journaled: * The desired end result is the journaling of the new block, so we * cancel the revoke before the transaction commits. * * Block is journaled and then revoked: * The revoke must take precedence over the write of the block, so we * need either to cancel the journal entry or to write the revoke * later in the log than the log block. In this case, we choose the * latter: journaling a block cancels any revoke record for that block * in the current transaction, so any revoke for that block in the * transaction must have happened after the block was journaled and so * the revoke must take precedence. * * Block is revoked and then written as data: * The data write is allowed to succeed, but the revoke is _not_ * cancelled. We still need to prevent old log records from * overwriting the new data. We don't even need to clear the revoke * bit here. * * We cache revoke status of a buffer in the current transaction in b_states * bits. As the name says, revokevalid flag indicates that the cached revoke * status of a buffer is valid and we can rely on the cached status. * * Revoke information on buffers is a tri-state value: * * RevokeValid clear: no cached revoke status, need to look it up * RevokeValid set, Revoked clear: * buffer has not been revoked, and cancel_revoke * need do nothing. * RevokeValid set, Revoked set: * buffer has been revoked. * * Locking rules: * We keep two hash tables of revoke records. One hashtable belongs to the * running transaction (is pointed to by journal->j_revoke), the other one * belongs to the committing transaction. Accesses to the second hash table * happen only from the kjournald and no other thread touches this table. Also * journal_switch_revoke_table() which switches which hashtable belongs to the * running and which to the committing transaction is called only from * kjournald. Therefore we need no locks when accessing the hashtable belonging * to the committing transaction. * * All users operating on the hash table belonging to the running transaction * have a handle to the transaction. Therefore they are safe from kjournald * switching hash tables under them. For operations on the lists of entries in * the hash table j_revoke_lock is used. * * Finally, also replay code uses the hash tables but at this moment no one else * can touch them (filesystem isn't mounted yet) and hence no locking is * needed. */ #ifndef __KERNEL__ #include "jfs_user.h" #else #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/list.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the last transaction to revoke this block. */ struct jbd2_revoke_record_s { struct list_head hash; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; /* The revoke table is just a simple hash table of revoke records. */ struct jbd2_revoke_table_s { /* It is conceivable that we might want a larger hash table * for recovery. Must be a power of two. */ int hash_size; int hash_shift; struct list_head *hash_table; }; #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, struct buffer_head **, int *, struct jbd2_revoke_record_s *); static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif /* Utility functions to maintain the revoke table */ static inline int hash(journal_t *journal, unsigned long long block) { return hash_64(block, journal->j_revoke->hash_shift); } static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) gfp_mask |= __GFP_NOFAIL; record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; } /* Find a revoke record in the journal's hash table. */ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { struct list_head *hash_list; struct jbd2_revoke_record_s *record; hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; spin_lock(&journal->j_revoke_lock); record = (struct jbd2_revoke_record_s *) hash_list->next; while (&(record->hash) != hash_list) { if (record->blocknr == blocknr) { spin_unlock(&journal->j_revoke_lock); return record; } record = (struct jbd2_revoke_record_s *) record->hash.next; } spin_unlock(&journal->j_revoke_lock); return NULL; } void jbd2_journal_destroy_revoke_record_cache(void) { kmem_cache_destroy(jbd2_revoke_record_cache); jbd2_revoke_record_cache = NULL; } void jbd2_journal_destroy_revoke_table_cache(void) { kmem_cache_destroy(jbd2_revoke_table_cache); jbd2_revoke_table_cache = NULL; } int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY); if (!jbd2_revoke_record_cache) { pr_emerg("JBD2: failed to create revoke_record cache\n"); return -ENOMEM; } return 0; } int __init jbd2_journal_init_revoke_table_cache(void) { J_ASSERT(!jbd2_revoke_table_cache); jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, SLAB_TEMPORARY); if (!jbd2_revoke_table_cache) { pr_emerg("JBD2: failed to create revoke_table cache\n"); return -ENOMEM; } return 0; } struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; struct jbd2_revoke_table_s *table; table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); if (!table) goto out; while((tmp >>= 1UL) != 0UL) shift++; table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; goto out; } for (tmp = 0; tmp < hash_size; tmp++) INIT_LIST_HEAD(&table->hash_table[tmp]); out: return table; } void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; for (i = 0; i < table->hash_size; i++) { hash_list = &table->hash_table[i]; J_ASSERT(list_empty(hash_list)); } kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } /* Initialise the revoke table for a given journal to a given size. */ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) { J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[0]) goto fail0; journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); if (!journal->j_revoke_table[1]) goto fail1; journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); journal->j_revoke_table[0] = NULL; fail0: return -ENOMEM; } /* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; if (journal->j_revoke_table[0]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); if (journal->j_revoke_table[1]) jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } #ifdef __KERNEL__ /* * jbd2_journal_revoke: revoke a given buffer_head from the journal. This * prevents the block from being replayed during recovery if we take a * crash after this current transaction commits. Any subsequent * metadata writes of the buffer in this transaction cancel the * revoke. * * Note that this call may block --- it is up to the caller to make * sure that there are no further calls to journal_write_metadata * before the revoke is complete. In ext3, this implies calling the * revoke before clearing the block bitmap when we are deleting * metadata. * * Revoke performs a jbd2_journal_forget on any buffer_head passed in as a * parameter, but does _not_ forget the buffer_head if the bh was only * found implicitly. * * bh_in may not be a journalled buffer - it may have come off * the hash tables without an attached journal_head. * * If bh_in is non-zero, jbd2_journal_revoke() will decrement its b_count * by one. */ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, struct buffer_head *bh_in) { struct buffer_head *bh = NULL; journal_t *journal; struct block_device *bdev; int err; might_sleep(); if (bh_in) BUFFER_TRACE(bh_in, "enter"); journal = handle->h_transaction->t_journal; if (!jbd2_journal_set_features(journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)){ J_ASSERT (!"Cannot set revoke feature!"); return -EINVAL; } bdev = journal->j_fs_dev; bh = bh_in; if (!bh) { bh = __find_get_block_nonatomic(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } #ifdef JBD2_EXPENSIVE_CHECKING else { struct buffer_head *bh2; /* If there is a different buffer_head lying around in * memory anywhere... */ bh2 = __find_get_block_nonatomic(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) /* ...then it better be revoked too, * since it's illegal to create a revoke * record against a buffer_head which is * not marked revoked --- that would * risk missing a subsequent revoke * cancel. */ J_ASSERT_BH(bh2, buffer_revoked(bh2)); put_bh(bh2); } } #endif if (WARN_ON_ONCE(handle->h_revoke_credits <= 0)) { if (!bh_in) brelse(bh); return -EIO; } /* We really ought not ever to revoke twice in a row without first having the revoke cancelled: it's illegal to free a block twice without allocating it in between! */ if (bh) { if (!J_EXPECT_BH(bh, !buffer_revoked(bh), "inconsistent data on disk")) { if (!bh_in) brelse(bh); return -EIO; } set_buffer_revoked(bh); set_buffer_revokevalid(bh); if (bh_in) { BUFFER_TRACE(bh_in, "call jbd2_journal_forget"); jbd2_journal_forget(handle, bh_in); } else { BUFFER_TRACE(bh, "call brelse"); __brelse(bh); } } handle->h_revoke_credits--; jbd2_debug(2, "insert revoke for block %llu, bh_in=%p\n",blocknr, bh_in); err = insert_revoke_hash(journal, blocknr, handle->h_transaction->t_tid); BUFFER_TRACE(bh_in, "exit"); return err; } /* * Cancel an outstanding revoke. For use only internally by the * journaling code (called from jbd2_journal_get_write_access). * * We trust buffer_revoked() on the buffer if the buffer is already * being journaled: if there is no revoke pending on the buffer, then we * don't do anything here. * * This would break if it were possible for a buffer to be revoked and * discarded, and then reallocated within the same transaction. In such * a case we would have lost the revoked bit, but when we arrived here * the second time we would still have a pending revoke to cancel. So, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. */ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); /* Is the existing Revoke bit valid? If so, we trust it, and * only perform the full cancel if the revoke bit is set. If * not, we can't trust the revoke bit, and we need to do the * full search for a revoke record. */ if (test_set_buffer_revokevalid(bh)) { need_cancel = test_clear_buffer_revoked(bh); } else { need_cancel = 1; clear_buffer_revoked(bh); } if (need_cancel) { record = find_revoke_record(journal, bh->b_blocknr); if (record) { jbd2_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); spin_lock(&journal->j_revoke_lock); list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); } } #ifdef JBD2_EXPENSIVE_CHECKING /* There better not be one left behind by now! */ record = find_revoke_record(journal, bh->b_blocknr); J_ASSERT_JH(jh, record == NULL); #endif /* Finally, have we just cleared revoke on an unhashed * buffer_head? If so, we'd better make sure we clear the * revoked status on any hashed alias too, otherwise the revoke * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } } /* * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. */ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { struct jbd2_revoke_table_s *revoke = journal->j_revoke; int i = 0; for (i = 0; i < revoke->hash_size; i++) { struct list_head *hash_list; struct list_head *list_entry; hash_list = &revoke->hash_table[i]; list_for_each(list_entry, hash_list) { struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; bh = __find_get_block_nonatomic(journal->j_fs_dev, record->blocknr, journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); } } } } /* jbd2_journal_switch_revoke_table table select j_revoke for next * transaction we do not want to suspend any processing until all * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { int i; if (journal->j_revoke == journal->j_revoke_table[0]) journal->j_revoke = journal->j_revoke_table[1]; else journal->j_revoke = journal->j_revoke_table[0]; for (i = 0; i < journal->j_revoke->hash_size; i++) INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); } /* * Write revoke records to the journal for all entries in the current * revoke hash, deleting the entries as we go. */ void jbd2_journal_write_revoke_records(transaction_t *transaction, struct list_head *log_bufs) { journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; struct list_head *hash_list; int i, offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ revoke = journal->j_revoke == journal->j_revoke_table[0] ? journal->j_revoke_table[1] : journal->j_revoke_table[0]; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s *) hash_list->next; write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); count++; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } if (descriptor) flush_descriptor(journal, descriptor, offset); jbd2_debug(1, "Wrote %d revoke records\n", count); } /* * Write out one revoke record. We need to create a new descriptor * block if the old one is full or if we have not already created one. */ static void write_one_revoke_record(transaction_t *transaction, struct list_head *log_bufs, struct buffer_head **descriptorp, int *offsetp, struct jbd2_revoke_record_s *record) { journal_t *journal = transaction->t_journal; int csum_size = 0; struct buffer_head *descriptor; int sz, offset; /* If we are already aborting, this all becomes a noop. We still need to go round the loop in jbd2_journal_write_revoke_records in order to free all of the revoke records: only the IO to the journal is omitted. */ if (is_journal_aborted(journal)) return; descriptor = *descriptorp; offset = *offsetp; /* Do we need to leave space at the end for a checksum? */ if (jbd2_journal_has_csum_v2or3(journal)) csum_size = sizeof(struct jbd2_journal_block_tail); if (jbd2_has_feature_64bit(journal)) sz = 8; else sz = 4; /* Make sure we have a descriptor with space left for the record */ if (descriptor) { if (offset + sz > journal->j_blocksize - csum_size) { flush_descriptor(journal, descriptor, offset); descriptor = NULL; } } if (!descriptor) { descriptor = jbd2_journal_get_descriptor_buffer(transaction, JBD2_REVOKE_BLOCK); if (!descriptor) return; /* Record it so that we can wait for IO completion later */ BUFFER_TRACE(descriptor, "file in log_bufs"); jbd2_file_log_bh(log_bufs, descriptor); offset = sizeof(jbd2_journal_revoke_header_t); *descriptorp = descriptor; } if (jbd2_has_feature_64bit(journal)) * ((__be64 *)(&descriptor->b_data[offset])) = cpu_to_be64(record->blocknr); else * ((__be32 *)(&descriptor->b_data[offset])) = cpu_to_be32(record->blocknr); offset += sz; *offsetp = offset; } /* * Flush a revoke descriptor out to the journal. If we are aborting, * this is a noop; otherwise we are generating a buffer which needs to * be waited for during commit, so it has to go onto the appropriate * journal buffer list. */ static void flush_descriptor(journal_t *journal, struct buffer_head *descriptor, int offset) { jbd2_journal_revoke_header_t *header; if (is_journal_aborted(journal)) return; header = (jbd2_journal_revoke_header_t *)descriptor->b_data; header->r_count = cpu_to_be32(offset); jbd2_descriptor_block_csum_set(journal, descriptor); set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); write_dirty_buffer(descriptor, JBD2_JOURNAL_REQ_FLAGS); } #endif /* * Revoke support for recovery. * * Recovery needs to be able to: * * record all revoke records, including the tid of the latest instance * of each revoke in the journal * * check whether a given block in a given transaction should be replayed * (ie. has not been revoked by a revoke record in that or a subsequent * transaction) * * empty the revoke table after recovery. */ /* * First, setting revoke records. We create a new revoke record for * every block ever revoked in the log as we scan it for recovery, and * we update the existing records if we find multiple revokes for a * single block. */ int jbd2_journal_set_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (record) { /* If we have multiple occurrences, only record the * latest sequence number in the hashed record */ if (tid_gt(sequence, record->sequence)) record->sequence = sequence; return 0; } return insert_revoke_hash(journal, blocknr, sequence); } /* * Test revoke records. For a given block referenced in the log, has * that block been revoked? A revoke record with a given transaction * sequence number revokes all blocks in that transaction and earlier * ones, but later transactions still need replayed. */ int jbd2_journal_test_revoke(journal_t *journal, unsigned long long blocknr, tid_t sequence) { struct jbd2_revoke_record_s *record; record = find_revoke_record(journal, blocknr); if (!record) return 0; if (tid_gt(sequence, record->sequence)) return 0; return 1; } /* * Finally, once recovery is over, we need to clear the revoke table so * that it can be reused by the running filesystem. */ void jbd2_journal_clear_revoke(journal_t *journal) { int i; struct list_head *hash_list; struct jbd2_revoke_record_s *record; struct jbd2_revoke_table_s *revoke; revoke = journal->j_revoke; for (i = 0; i < revoke->hash_size; i++) { hash_list = &revoke->hash_table[i]; while (!list_empty(hash_list)) { record = (struct jbd2_revoke_record_s*) hash_list->next; list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } }
62 62 3 13 23 25 25 23 23 2 2 2 2 1 1 1 1 1 1 2 21 19 21 20 15 15 15 15 2 13 2 2 12 13 1 11 4 11 11 11 15 12 13 7 4 3 3 4 4 4 17 17 17 17 3 2 2 1 1 1 3 2 43 43 43 43 3 39 39 39 39 36 6 3 3 3 1 2 1 1 39 39 39 40 40 40 16 16 4 18 18 18 7 11 11 18 13 13 9 4 3 9 18 18 18 18 3 17 17 17 1 1 16 16 1 16 16 4 12 12 16 4 4 16 17 17 6 20 6 6 6 1 5 3 2 1 2 2 2 6 6 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/hfsplus/xattr.c * * Vyacheslav Dubeyko <slava@dubeyko.com> * * Logic of processing extended attributes */ #include "hfsplus_fs.h" #include <linux/nls.h> #include "xattr.h" static int hfsplus_removexattr(struct inode *inode, const char *name); const struct xattr_handler * const hfsplus_xattr_handlers[] = { &hfsplus_xattr_osx_handler, &hfsplus_xattr_user_handler, &hfsplus_xattr_trusted_handler, &hfsplus_xattr_security_handler, NULL }; static int strcmp_xattr_finder_info(const char *name) { if (name) { return strncmp(name, HFSPLUS_XATTR_FINDER_INFO_NAME, sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME)); } return -1; } static int strcmp_xattr_acl(const char *name) { if (name) { return strncmp(name, HFSPLUS_XATTR_ACL_NAME, sizeof(HFSPLUS_XATTR_ACL_NAME)); } return -1; } static bool is_known_namespace(const char *name) { if (strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) && strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) && strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) return false; return true; } static void hfsplus_init_header_node(struct inode *attr_file, u32 clump_size, char *buf, u16 node_size) { struct hfs_bnode_desc *desc; struct hfs_btree_header_rec *head; u16 offset; __be16 *rec_offsets; u32 hdr_node_map_rec_bits; char *bmp; u32 used_nodes; u32 used_bmp_bytes; u64 tmp; hfs_dbg("clump %u, node_size %u\n", clump_size, node_size); /* The end of the node contains list of record offsets */ rec_offsets = (__be16 *)(buf + node_size); desc = (struct hfs_bnode_desc *)buf; desc->type = HFS_NODE_HEADER; desc->num_recs = cpu_to_be16(HFSPLUS_BTREE_HDR_NODE_RECS_COUNT); offset = sizeof(struct hfs_bnode_desc); *--rec_offsets = cpu_to_be16(offset); head = (struct hfs_btree_header_rec *)(buf + offset); head->node_size = cpu_to_be16(node_size); tmp = i_size_read(attr_file); do_div(tmp, node_size); head->node_count = cpu_to_be32(tmp); head->free_nodes = cpu_to_be32(be32_to_cpu(head->node_count) - 1); head->clump_size = cpu_to_be32(clump_size); head->attributes |= cpu_to_be32(HFS_TREE_BIGKEYS | HFS_TREE_VARIDXKEYS); head->max_key_len = cpu_to_be16(HFSPLUS_ATTR_KEYLEN - sizeof(u16)); offset += sizeof(struct hfs_btree_header_rec); *--rec_offsets = cpu_to_be16(offset); offset += HFSPLUS_BTREE_HDR_USER_BYTES; *--rec_offsets = cpu_to_be16(offset); hdr_node_map_rec_bits = 8 * (node_size - offset - (4 * sizeof(u16))); if (be32_to_cpu(head->node_count) > hdr_node_map_rec_bits) { u32 map_node_bits; u32 map_nodes; desc->next = cpu_to_be32(be32_to_cpu(head->leaf_tail) + 1); map_node_bits = 8 * (node_size - sizeof(struct hfs_bnode_desc) - (2 * sizeof(u16)) - 2); map_nodes = (be32_to_cpu(head->node_count) - hdr_node_map_rec_bits + (map_node_bits - 1)) / map_node_bits; be32_add_cpu(&head->free_nodes, 0 - map_nodes); } bmp = buf + offset; used_nodes = be32_to_cpu(head->node_count) - be32_to_cpu(head->free_nodes); used_bmp_bytes = used_nodes / 8; if (used_bmp_bytes) { memset(bmp, 0xFF, used_bmp_bytes); bmp += used_bmp_bytes; used_nodes %= 8; } *bmp = ~(0xFF >> used_nodes); offset += hdr_node_map_rec_bits / 8; *--rec_offsets = cpu_to_be16(offset); } static int hfsplus_create_attributes_file(struct super_block *sb) { int err = 0; struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb); struct inode *attr_file; struct hfsplus_inode_info *hip; u32 clump_size; u16 node_size = HFSPLUS_ATTR_TREE_NODE_SIZE; char *buf; int index, written; struct address_space *mapping; struct page *page; int old_state = HFSPLUS_EMPTY_ATTR_TREE; hfs_dbg("ino %d\n", HFSPLUS_ATTR_CNID); check_attr_tree_state_again: switch (atomic_read(&sbi->attr_tree_state)) { case HFSPLUS_EMPTY_ATTR_TREE: if (old_state != atomic_cmpxchg(&sbi->attr_tree_state, old_state, HFSPLUS_CREATING_ATTR_TREE)) goto check_attr_tree_state_again; break; case HFSPLUS_CREATING_ATTR_TREE: /* * This state means that another thread is in process * of AttributesFile creation. Theoretically, it is * possible to be here. But really __setxattr() method * first of all calls hfs_find_init() for lookup in * B-tree of CatalogFile. This method locks mutex of * CatalogFile's B-tree. As a result, if some thread * is inside AttributedFile creation operation then * another threads will be waiting unlocking of * CatalogFile's B-tree's mutex. However, if code will * change then we will return error code (-EAGAIN) from * here. Really, it means that first try to set of xattr * fails with error but second attempt will have success. */ return -EAGAIN; case HFSPLUS_VALID_ATTR_TREE: return 0; case HFSPLUS_FAILED_ATTR_TREE: return -EOPNOTSUPP; default: BUG(); } attr_file = hfsplus_iget(sb, HFSPLUS_ATTR_CNID); if (IS_ERR(attr_file)) { pr_err("failed to load attributes file\n"); return PTR_ERR(attr_file); } if (i_size_read(attr_file) != 0) { err = -EIO; pr_err("detected inconsistent attributes file, running fsck.hfsplus is recommended.\n"); goto end_attr_file_creation; } hip = HFSPLUS_I(attr_file); clump_size = hfsplus_calc_btree_clump_size(sb->s_blocksize, node_size, sbi->sect_count, HFSPLUS_ATTR_CNID); mutex_lock(&hip->extents_lock); hip->clump_blocks = clump_size >> sbi->alloc_blksz_shift; mutex_unlock(&hip->extents_lock); if (sbi->free_blocks <= (hip->clump_blocks << 1)) { err = -ENOSPC; goto end_attr_file_creation; } while (hip->alloc_blocks < hip->clump_blocks) { err = hfsplus_file_extend(attr_file, false); if (unlikely(err)) { pr_err("failed to extend attributes file\n"); goto end_attr_file_creation; } hip->phys_size = attr_file->i_size = (loff_t)hip->alloc_blocks << sbi->alloc_blksz_shift; hip->fs_blocks = hip->alloc_blocks << sbi->fs_shift; inode_set_bytes(attr_file, attr_file->i_size); } buf = kzalloc(node_size, GFP_NOFS); if (!buf) { err = -ENOMEM; goto end_attr_file_creation; } hfsplus_init_header_node(attr_file, clump_size, buf, node_size); mapping = attr_file->i_mapping; index = 0; written = 0; for (; written < node_size; index++, written += PAGE_SIZE) { void *kaddr; page = read_mapping_page(mapping, index, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto failed_header_node_init; } kaddr = kmap_atomic(page); memcpy(kaddr, buf + written, min_t(size_t, PAGE_SIZE, node_size - written)); kunmap_atomic(kaddr); set_page_dirty(page); put_page(page); } hfsplus_mark_inode_dirty(attr_file, HFSPLUS_I_ATTR_DIRTY); sbi->attr_tree = hfs_btree_open(sb, HFSPLUS_ATTR_CNID); if (!sbi->attr_tree) pr_err("failed to load attributes file\n"); failed_header_node_init: kfree(buf); end_attr_file_creation: iput(attr_file); if (!err) atomic_set(&sbi->attr_tree_state, HFSPLUS_VALID_ATTR_TREE); else if (err == -ENOSPC) atomic_set(&sbi->attr_tree_state, HFSPLUS_EMPTY_ATTR_TREE); else atomic_set(&sbi->attr_tree_state, HFSPLUS_FAILED_ATTR_TREE); return err; } int __hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags) { int err; struct hfs_find_data cat_fd; hfsplus_cat_entry entry; u16 cat_entry_flags, cat_entry_type; u16 folder_finderinfo_len = sizeof(struct DInfo) + sizeof(struct DXInfo); u16 file_finderinfo_len = sizeof(struct FInfo) + sizeof(struct FXInfo); if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; if (value == NULL) return hfsplus_removexattr(inode, name); err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { pr_err("catalog searching failed\n"); goto end_setxattr; } if (!strcmp_xattr_finder_info(name)) { if (flags & XATTR_CREATE) { pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } hfs_bnode_read(cat_fd.bnode, &entry, cat_fd.entryoffset, sizeof(hfsplus_cat_entry)); if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { if (size == folder_finderinfo_len) { memcpy(&entry.folder.info, value, folder_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, sizeof(struct hfsplus_cat_folder)); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { err = -ERANGE; goto end_setxattr; } } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { if (size == file_finderinfo_len) { memcpy(&entry.file.info, value, file_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, sizeof(struct hfsplus_cat_file)); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { err = -ERANGE; goto end_setxattr; } } else { err = -EOPNOTSUPP; goto end_setxattr; } goto end_setxattr; } if (!HFSPLUS_SB(inode->i_sb)->attr_tree) { err = hfsplus_create_attributes_file(inode->i_sb); if (unlikely(err)) goto end_setxattr; } if (hfsplus_attr_exists(inode, name)) { if (flags & XATTR_CREATE) { pr_err("xattr exists yet\n"); err = -EOPNOTSUPP; goto end_setxattr; } err = hfsplus_delete_attr(inode, name); if (err) goto end_setxattr; err = hfsplus_create_attr(inode, name, value, size); if (err) goto end_setxattr; } else { if (flags & XATTR_REPLACE) { pr_err("cannot replace xattr\n"); err = -EOPNOTSUPP; goto end_setxattr; } err = hfsplus_create_attr(inode, name, value, size); if (err) goto end_setxattr; } cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); if (cat_entry_type == HFSPLUS_FOLDER) { cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_folder, flags)); cat_entry_flags |= HFSPLUS_XATTR_EXISTS; if (!strcmp_xattr_acl(name)) cat_entry_flags |= HFSPLUS_ACL_EXISTS; hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_folder, flags), cat_entry_flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else if (cat_entry_type == HFSPLUS_FILE) { cat_entry_flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_file, flags)); cat_entry_flags |= HFSPLUS_XATTR_EXISTS; if (!strcmp_xattr_acl(name)) cat_entry_flags |= HFSPLUS_ACL_EXISTS; hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_file, flags), cat_entry_flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { pr_err("invalid catalog entry type\n"); err = -EIO; goto end_setxattr; } end_setxattr: hfs_find_exit(&cat_fd); return err; } static int name_len(const char *xattr_name, int xattr_name_len) { int len = xattr_name_len + 1; if (!is_known_namespace(xattr_name)) len += XATTR_MAC_OSX_PREFIX_LEN; return len; } static ssize_t copy_name(char *buffer, const char *xattr_name, int name_len) { ssize_t len; if (!is_known_namespace(xattr_name)) len = scnprintf(buffer, name_len + XATTR_MAC_OSX_PREFIX_LEN, "%s%s", XATTR_MAC_OSX_PREFIX, xattr_name); else len = strscpy(buffer, xattr_name, name_len + 1); /* include NUL-byte in length for non-empty name */ if (len >= 0) len++; return len; } int hfsplus_setxattr(struct inode *inode, const char *name, const void *value, size_t size, int flags, const char *prefix, size_t prefixlen) { char *xattr_name; int res; xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); res = __hfsplus_setxattr(inode, xattr_name, value, size, flags); kfree(xattr_name); return res; } static ssize_t hfsplus_getxattr_finder_info(struct inode *inode, void *value, size_t size) { ssize_t res = 0; struct hfs_find_data fd; u16 entry_type; u16 folder_rec_len = sizeof(struct DInfo) + sizeof(struct DXInfo); u16 file_rec_len = sizeof(struct FInfo) + sizeof(struct FXInfo); u16 record_len = max(folder_rec_len, file_rec_len); u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; if (size >= record_len) { res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { pr_err("can't init xattr find struct\n"); return res; } res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); if (res) goto end_getxattr_finder_info; entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); if (entry_type == HFSPLUS_FOLDER) { hfs_bnode_read(fd.bnode, folder_finder_info, fd.entryoffset + offsetof(struct hfsplus_cat_folder, user_info), folder_rec_len); memcpy(value, folder_finder_info, folder_rec_len); res = folder_rec_len; } else if (entry_type == HFSPLUS_FILE) { hfs_bnode_read(fd.bnode, file_finder_info, fd.entryoffset + offsetof(struct hfsplus_cat_file, user_info), file_rec_len); memcpy(value, file_finder_info, file_rec_len); res = file_rec_len; } else { res = -EOPNOTSUPP; goto end_getxattr_finder_info; } } else res = size ? -ERANGE : record_len; end_getxattr_finder_info: if (size >= record_len) hfs_find_exit(&fd); return res; } ssize_t __hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size) { struct hfs_find_data fd; hfsplus_attr_entry *entry; __be32 xattr_record_type; u32 record_type; u16 record_length = 0; ssize_t res; if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; if (!strcmp_xattr_finder_info(name)) return hfsplus_getxattr_finder_info(inode, value, size); if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; entry = hfsplus_alloc_attr_entry(); if (!entry) { pr_err("can't allocate xattr entry\n"); return -ENOMEM; } res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (res) { pr_err("can't init xattr find struct\n"); goto failed_getxattr_init; } res = hfsplus_find_attr(inode->i_sb, inode->i_ino, name, &fd); if (res) { if (res == -ENOENT) res = -ENODATA; else pr_err("xattr searching failed\n"); goto out; } hfs_bnode_read(fd.bnode, &xattr_record_type, fd.entryoffset, sizeof(xattr_record_type)); record_type = be32_to_cpu(xattr_record_type); if (record_type == HFSPLUS_ATTR_INLINE_DATA) { record_length = hfs_bnode_read_u16(fd.bnode, fd.entryoffset + offsetof(struct hfsplus_attr_inline_data, length)); if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) { pr_err("invalid xattr record size\n"); res = -EIO; goto out; } } else if (record_type == HFSPLUS_ATTR_FORK_DATA || record_type == HFSPLUS_ATTR_EXTENTS) { pr_err("only inline data xattr are supported\n"); res = -EOPNOTSUPP; goto out; } else { pr_err("invalid xattr record\n"); res = -EIO; goto out; } if (size) { hfs_bnode_read(fd.bnode, entry, fd.entryoffset, offsetof(struct hfsplus_attr_inline_data, raw_bytes) + record_length); } if (size >= record_length) { memcpy(value, entry->inline_data.raw_bytes, record_length); res = record_length; } else res = size ? -ERANGE : record_length; out: hfs_find_exit(&fd); failed_getxattr_init: hfsplus_destroy_attr_entry(entry); return res; } ssize_t hfsplus_getxattr(struct inode *inode, const char *name, void *value, size_t size, const char *prefix, size_t prefixlen) { int res; char *xattr_name; xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) return -ENOMEM; strcpy(xattr_name, prefix); strcpy(xattr_name + prefixlen, name); res = __hfsplus_getxattr(inode, xattr_name, value, size); kfree(xattr_name); return res; } static inline int can_list(const char *xattr_name) { if (!xattr_name) return 0; return strncmp(xattr_name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || capable(CAP_SYS_ADMIN); } static ssize_t hfsplus_listxattr_finder_info(struct dentry *dentry, char *buffer, size_t size) { ssize_t res; struct inode *inode = d_inode(dentry); struct hfs_find_data fd; u16 entry_type; u8 folder_finder_info[sizeof(struct DInfo) + sizeof(struct DXInfo)]; u8 file_finder_info[sizeof(struct FInfo) + sizeof(struct FXInfo)]; unsigned long len, found_bit; int xattr_name_len, symbols_count; res = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &fd); if (res) { pr_err("can't init xattr find struct\n"); return res; } res = hfsplus_find_cat(inode->i_sb, inode->i_ino, &fd); if (res) goto end_listxattr_finder_info; entry_type = hfs_bnode_read_u16(fd.bnode, fd.entryoffset); if (entry_type == HFSPLUS_FOLDER) { len = sizeof(struct DInfo) + sizeof(struct DXInfo); hfs_bnode_read(fd.bnode, folder_finder_info, fd.entryoffset + offsetof(struct hfsplus_cat_folder, user_info), len); found_bit = find_first_bit((void *)folder_finder_info, len*8); } else if (entry_type == HFSPLUS_FILE) { len = sizeof(struct FInfo) + sizeof(struct FXInfo); hfs_bnode_read(fd.bnode, file_finder_info, fd.entryoffset + offsetof(struct hfsplus_cat_file, user_info), len); found_bit = find_first_bit((void *)file_finder_info, len*8); } else { res = -EOPNOTSUPP; goto end_listxattr_finder_info; } if (found_bit >= (len*8)) res = 0; else { symbols_count = sizeof(HFSPLUS_XATTR_FINDER_INFO_NAME) - 1; xattr_name_len = name_len(HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count); if (!buffer || !size) { if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) res = xattr_name_len; } else if (can_list(HFSPLUS_XATTR_FINDER_INFO_NAME)) { if (size < xattr_name_len) res = -ERANGE; else { res = copy_name(buffer, HFSPLUS_XATTR_FINDER_INFO_NAME, symbols_count); } } } end_listxattr_finder_info: hfs_find_exit(&fd); return res; } ssize_t hfsplus_listxattr(struct dentry *dentry, char *buffer, size_t size) { ssize_t err; ssize_t res; struct inode *inode = d_inode(dentry); struct hfs_find_data fd; struct hfsplus_attr_key attr_key; char *strbuf; int xattr_name_len; if ((!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) || HFSPLUS_IS_RSRC(inode)) return -EOPNOTSUPP; res = hfsplus_listxattr_finder_info(dentry, buffer, size); if (res < 0) return res; else if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return (res == 0) ? -EOPNOTSUPP : res; err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->attr_tree, &fd); if (err) { pr_err("can't init xattr find struct\n"); return err; } strbuf = kzalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + XATTR_MAC_OSX_PREFIX_LEN + 1, GFP_KERNEL); if (!strbuf) { res = -ENOMEM; goto out; } err = hfsplus_find_attr(inode->i_sb, inode->i_ino, NULL, &fd); if (err) { if (err == -ENOENT) { if (res == 0) res = -ENODATA; goto end_listxattr; } else { res = err; goto end_listxattr; } } for (;;) { u16 key_len = hfs_bnode_read_u16(fd.bnode, fd.keyoffset); if (key_len == 0 || key_len > fd.tree->max_key_len) { pr_err("invalid xattr key length: %d\n", key_len); res = -EIO; goto end_listxattr; } hfs_bnode_read(fd.bnode, &attr_key, fd.keyoffset, key_len + sizeof(key_len)); if (be32_to_cpu(attr_key.cnid) != inode->i_ino) goto end_listxattr; xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; if (hfsplus_uni2asc_xattr_str(inode->i_sb, &fd.key->attr.key_name, strbuf, &xattr_name_len)) { pr_err("unicode conversion failed\n"); res = -EIO; goto end_listxattr; } if (!buffer || !size) { if (can_list(strbuf)) res += name_len(strbuf, xattr_name_len); } else if (can_list(strbuf)) { if (size < (res + name_len(strbuf, xattr_name_len))) { res = -ERANGE; goto end_listxattr; } else res += copy_name(buffer + res, strbuf, xattr_name_len); } if (hfs_brec_goto(&fd, 1)) goto end_listxattr; } end_listxattr: kfree(strbuf); out: hfs_find_exit(&fd); return res; } static int hfsplus_removexattr(struct inode *inode, const char *name) { int err; struct hfs_find_data cat_fd; u16 flags; u16 cat_entry_type; int is_xattr_acl_deleted; int is_all_xattrs_deleted; if (!HFSPLUS_SB(inode->i_sb)->attr_tree) return -EOPNOTSUPP; if (!strcmp_xattr_finder_info(name)) return -EOPNOTSUPP; err = hfs_find_init(HFSPLUS_SB(inode->i_sb)->cat_tree, &cat_fd); if (err) { pr_err("can't init xattr find struct\n"); return err; } err = hfsplus_find_cat(inode->i_sb, inode->i_ino, &cat_fd); if (err) { pr_err("catalog searching failed\n"); goto end_removexattr; } err = hfsplus_delete_attr(inode, name); if (err) goto end_removexattr; is_xattr_acl_deleted = !strcmp_xattr_acl(name); is_all_xattrs_deleted = !hfsplus_attr_exists(inode, NULL); if (!is_xattr_acl_deleted && !is_all_xattrs_deleted) goto end_removexattr; cat_entry_type = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset); if (cat_entry_type == HFSPLUS_FOLDER) { flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_folder, flags)); if (is_xattr_acl_deleted) flags &= ~HFSPLUS_ACL_EXISTS; if (is_all_xattrs_deleted) flags &= ~HFSPLUS_XATTR_EXISTS; hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_folder, flags), flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else if (cat_entry_type == HFSPLUS_FILE) { flags = hfs_bnode_read_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_file, flags)); if (is_xattr_acl_deleted) flags &= ~HFSPLUS_ACL_EXISTS; if (is_all_xattrs_deleted) flags &= ~HFSPLUS_XATTR_EXISTS; hfs_bnode_write_u16(cat_fd.bnode, cat_fd.entryoffset + offsetof(struct hfsplus_cat_file, flags), flags); hfsplus_mark_inode_dirty(inode, HFSPLUS_I_CAT_DIRTY); } else { pr_err("invalid catalog entry type\n"); err = -EIO; goto end_removexattr; } end_removexattr: hfs_find_exit(&cat_fd); return err; } static int hfsplus_osx_getxattr(const struct xattr_handler *handler, struct dentry *unused, struct inode *inode, const char *name, void *buffer, size_t size) { /* * Don't allow retrieving properly prefixed attributes * by prepending them with "osx." */ if (is_known_namespace(name)) return -EOPNOTSUPP; /* * osx is the namespace we use to indicate an unprefixed * attribute on the filesystem (like the ones that OS X * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ return __hfsplus_getxattr(inode, name, buffer, size); } static int hfsplus_osx_setxattr(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { /* * Don't allow setting properly prefixed attributes * by prepending them with "osx." */ if (is_known_namespace(name)) return -EOPNOTSUPP; /* * osx is the namespace we use to indicate an unprefixed * attribute on the filesystem (like the ones that OS X * creates), so we pass the name through unmodified (after * ensuring it doesn't conflict with another namespace). */ return __hfsplus_setxattr(inode, name, buffer, size, flags); } const struct xattr_handler hfsplus_xattr_osx_handler = { .prefix = XATTR_MAC_OSX_PREFIX, .get = hfsplus_osx_getxattr, .set = hfsplus_osx_setxattr, };
11 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HW_BREAKPOINT_H #define _LINUX_HW_BREAKPOINT_H #include <linux/perf_event.h> #include <uapi/linux/hw_breakpoint.h> #ifdef CONFIG_HAVE_HW_BREAKPOINT enum bp_type_idx { TYPE_INST = 0, #if defined(CONFIG_HAVE_MIXED_BREAKPOINTS_REGS) TYPE_DATA = 0, #else TYPE_DATA = 1, #endif TYPE_MAX }; extern int __init init_hw_breakpoint(void); static inline void hw_breakpoint_init(struct perf_event_attr *attr) { memset(attr, 0, sizeof(*attr)); attr->type = PERF_TYPE_BREAKPOINT; attr->size = sizeof(*attr); /* * As it's for in-kernel or ptrace use, we want it to be pinned * and to call its callback every hits. */ attr->pinned = 1; attr->sample_period = 1; } static inline void ptrace_breakpoint_init(struct perf_event_attr *attr) { hw_breakpoint_init(attr); attr->exclude_kernel = 1; } static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) { return bp->attr.bp_addr; } static inline int hw_breakpoint_type(struct perf_event *bp) { return bp->attr.bp_type; } static inline unsigned long hw_breakpoint_len(struct perf_event *bp) { return bp->attr.bp_len; } extern struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ extern int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); extern int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check); /* * Kernel breakpoints are not associated with any particular thread. */ extern struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, int cpu); extern struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); extern bool hw_breakpoint_is_used(void); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); extern int reserve_bp_slot(struct perf_event *bp); extern void release_bp_slot(struct perf_event *bp); extern void flush_ptrace_hw_breakpoint(struct task_struct *tsk); static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) { return &bp->hw.info; } #else /* !CONFIG_HAVE_HW_BREAKPOINT */ static inline int __init init_hw_breakpoint(void) { return 0; } static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, struct task_struct *tsk) { return NULL; } static inline int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { return -ENOSYS; } static inline int modify_user_hw_breakpoint_check(struct perf_event *bp, struct perf_event_attr *attr, bool check) { return -ENOSYS; } static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context, int cpu) { return NULL; } static inline struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, void *context) { return NULL; } static inline int register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } static inline bool hw_breakpoint_is_used(void) { return false; } static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } static inline void flush_ptrace_hw_breakpoint(struct task_struct *tsk) { } static inline struct arch_hw_breakpoint *counter_arch_bp(struct perf_event *bp) { return NULL; } #endif /* CONFIG_HAVE_HW_BREAKPOINT */ #endif /* _LINUX_HW_BREAKPOINT_H */
14 14 14 2 2 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/seq_file.h> #include <net/ip.h> #include <net/mptcp.h> #include <net/snmp.h> #include <net/net_namespace.h> #include "mib.h" static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("MPCapableSYNRX", MPTCP_MIB_MPCAPABLEPASSIVE), SNMP_MIB_ITEM("MPCapableSYNTX", MPTCP_MIB_MPCAPABLEACTIVE), SNMP_MIB_ITEM("MPCapableSYNACKRX", MPTCP_MIB_MPCAPABLEACTIVEACK), SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK), SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK), SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK), SNMP_MIB_ITEM("MPCapableSYNTXDrop", MPTCP_MIB_MPCAPABLEACTIVEDROP), SNMP_MIB_ITEM("MPCapableSYNTXDisabled", MPTCP_MIB_MPCAPABLEACTIVEDISABLED), SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT), SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT), SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS), SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN), SNMP_MIB_ITEM("MPJoinSynRx", MPTCP_MIB_JOINSYNRX), SNMP_MIB_ITEM("MPJoinSynBackupRx", MPTCP_MIB_JOINSYNBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckRx", MPTCP_MIB_JOINSYNACKRX), SNMP_MIB_ITEM("MPJoinSynAckBackupRx", MPTCP_MIB_JOINSYNACKBACKUPRX), SNMP_MIB_ITEM("MPJoinSynAckHMacFailure", MPTCP_MIB_JOINSYNACKMAC), SNMP_MIB_ITEM("MPJoinAckRx", MPTCP_MIB_JOINACKRX), SNMP_MIB_ITEM("MPJoinAckHMacFailure", MPTCP_MIB_JOINACKMAC), SNMP_MIB_ITEM("MPJoinRejected", MPTCP_MIB_JOINREJECTED), SNMP_MIB_ITEM("MPJoinSynTx", MPTCP_MIB_JOINSYNTX), SNMP_MIB_ITEM("MPJoinSynTxCreatSkErr", MPTCP_MIB_JOINSYNTXCREATSKERR), SNMP_MIB_ITEM("MPJoinSynTxBindErr", MPTCP_MIB_JOINSYNTXBINDERR), SNMP_MIB_ITEM("MPJoinSynTxConnectErr", MPTCP_MIB_JOINSYNTXCONNECTERR), SNMP_MIB_ITEM("DSSNotMatching", MPTCP_MIB_DSSNOMATCH), SNMP_MIB_ITEM("DSSCorruptionFallback", MPTCP_MIB_DSSCORRUPTIONFALLBACK), SNMP_MIB_ITEM("DSSCorruptionReset", MPTCP_MIB_DSSCORRUPTIONRESET), SNMP_MIB_ITEM("InfiniteMapTx", MPTCP_MIB_INFINITEMAPTX), SNMP_MIB_ITEM("InfiniteMapRx", MPTCP_MIB_INFINITEMAPRX), SNMP_MIB_ITEM("DSSNoMatchTCP", MPTCP_MIB_DSSTCPMISMATCH), SNMP_MIB_ITEM("DataCsumErr", MPTCP_MIB_DATACSUMERR), SNMP_MIB_ITEM("OFOQueueTail", MPTCP_MIB_OFOQUEUETAIL), SNMP_MIB_ITEM("OFOQueue", MPTCP_MIB_OFOQUEUE), SNMP_MIB_ITEM("OFOMerge", MPTCP_MIB_OFOMERGE), SNMP_MIB_ITEM("NoDSSInWindow", MPTCP_MIB_NODSSWINDOW), SNMP_MIB_ITEM("DuplicateData", MPTCP_MIB_DUPDATA), SNMP_MIB_ITEM("AddAddr", MPTCP_MIB_ADDADDR), SNMP_MIB_ITEM("AddAddrTx", MPTCP_MIB_ADDADDRTX), SNMP_MIB_ITEM("AddAddrTxDrop", MPTCP_MIB_ADDADDRTXDROP), SNMP_MIB_ITEM("EchoAdd", MPTCP_MIB_ECHOADD), SNMP_MIB_ITEM("EchoAddTx", MPTCP_MIB_ECHOADDTX), SNMP_MIB_ITEM("EchoAddTxDrop", MPTCP_MIB_ECHOADDTXDROP), SNMP_MIB_ITEM("PortAdd", MPTCP_MIB_PORTADD), SNMP_MIB_ITEM("AddAddrDrop", MPTCP_MIB_ADDADDRDROP), SNMP_MIB_ITEM("MPJoinPortSynRx", MPTCP_MIB_JOINPORTSYNRX), SNMP_MIB_ITEM("MPJoinPortSynAckRx", MPTCP_MIB_JOINPORTSYNACKRX), SNMP_MIB_ITEM("MPJoinPortAckRx", MPTCP_MIB_JOINPORTACKRX), SNMP_MIB_ITEM("MismatchPortSynRx", MPTCP_MIB_MISMATCHPORTSYNRX), SNMP_MIB_ITEM("MismatchPortAckRx", MPTCP_MIB_MISMATCHPORTACKRX), SNMP_MIB_ITEM("RmAddr", MPTCP_MIB_RMADDR), SNMP_MIB_ITEM("RmAddrDrop", MPTCP_MIB_RMADDRDROP), SNMP_MIB_ITEM("RmAddrTx", MPTCP_MIB_RMADDRTX), SNMP_MIB_ITEM("RmAddrTxDrop", MPTCP_MIB_RMADDRTXDROP), SNMP_MIB_ITEM("RmSubflow", MPTCP_MIB_RMSUBFLOW), SNMP_MIB_ITEM("MPPrioTx", MPTCP_MIB_MPPRIOTX), SNMP_MIB_ITEM("MPPrioRx", MPTCP_MIB_MPPRIORX), SNMP_MIB_ITEM("MPFailTx", MPTCP_MIB_MPFAILTX), SNMP_MIB_ITEM("MPFailRx", MPTCP_MIB_MPFAILRX), SNMP_MIB_ITEM("MPFastcloseTx", MPTCP_MIB_MPFASTCLOSETX), SNMP_MIB_ITEM("MPFastcloseRx", MPTCP_MIB_MPFASTCLOSERX), SNMP_MIB_ITEM("MPRstTx", MPTCP_MIB_MPRSTTX), SNMP_MIB_ITEM("MPRstRx", MPTCP_MIB_MPRSTRX), SNMP_MIB_ITEM("RcvPruned", MPTCP_MIB_RCVPRUNED), SNMP_MIB_ITEM("SubflowStale", MPTCP_MIB_SUBFLOWSTALE), SNMP_MIB_ITEM("SubflowRecover", MPTCP_MIB_SUBFLOWRECOVER), SNMP_MIB_ITEM("SndWndShared", MPTCP_MIB_SNDWNDSHARED), SNMP_MIB_ITEM("RcvWndShared", MPTCP_MIB_RCVWNDSHARED), SNMP_MIB_ITEM("RcvWndConflictUpdate", MPTCP_MIB_RCVWNDCONFLICTUPDATE), SNMP_MIB_ITEM("RcvWndConflict", MPTCP_MIB_RCVWNDCONFLICT), SNMP_MIB_ITEM("MPCurrEstab", MPTCP_MIB_CURRESTAB), SNMP_MIB_ITEM("Blackhole", MPTCP_MIB_BLACKHOLE), SNMP_MIB_ITEM("MPCapableDataFallback", MPTCP_MIB_MPCAPABLEDATAFALLBACK), SNMP_MIB_ITEM("MD5SigFallback", MPTCP_MIB_MD5SIGFALLBACK), SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE), }; /* mptcp_mib_alloc - allocate percpu mib counters * * These are allocated when the first mptcp socket is created so * we do not waste percpu memory if mptcp isn't in use. */ bool mptcp_mib_alloc(struct net *net) { struct mptcp_mib __percpu *mib = alloc_percpu(struct mptcp_mib); if (!mib) return false; if (cmpxchg(&net->mib.mptcp_statistics, NULL, mib)) free_percpu(mib); return true; } void mptcp_seq_show(struct seq_file *seq) { unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)]; const int cnt = ARRAY_SIZE(mptcp_snmp_list); struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); for (i = 0; i < cnt; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt, net->mib.mptcp_statistics); for (i = 0; i < cnt; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); }
10 5 5 5 10 5 5 5 376 490 488 5 5 1 3 2 1 1 1 1 1 1 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10 5 5 5 5 5 5 5 5 5 5 847 491 376 848 849 6 6 3 3 3 3 3 5 3 876 847 844 847 881 944 944 927 518 515 877 385 384 173 881 91 92 92 88 67 91 92 91 92 92 135 129 91 12903 144 144 144 100 98 7 90 769 765 400 767 397 72 329 741 740 741 267 265 48 218 11753 11757 1930 781 11751 386 18 367 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 // SPDX-License-Identifier: GPL-2.0 /* * security/tomoyo/network.c * * Copyright (C) 2005-2011 NTT DATA CORPORATION */ #include "common.h" #include <linux/slab.h> /* Structure for holding inet domain socket's address. */ struct tomoyo_inet_addr_info { __be16 port; /* In network byte order. */ const __be32 *address; /* In network byte order. */ bool is_ipv6; }; /* Structure for holding unix domain socket's address. */ struct tomoyo_unix_addr_info { u8 *addr; /* This may not be '\0' terminated string. */ unsigned int addr_len; }; /* Structure for holding socket address. */ struct tomoyo_addr_info { u8 protocol; u8 operation; struct tomoyo_inet_addr_info inet; struct tomoyo_unix_addr_info unix0; }; /* String table for socket's protocols. */ const char * const tomoyo_proto_keyword[TOMOYO_SOCK_MAX] = { [SOCK_STREAM] = "stream", [SOCK_DGRAM] = "dgram", [SOCK_RAW] = "raw", [SOCK_SEQPACKET] = "seqpacket", [0] = " ", /* Dummy for avoiding NULL pointer dereference. */ [4] = " ", /* Dummy for avoiding NULL pointer dereference. */ }; /** * tomoyo_parse_ipaddr_union - Parse an IP address. * * @param: Pointer to "struct tomoyo_acl_param". * @ptr: Pointer to "struct tomoyo_ipaddr_union". * * Returns true on success, false otherwise. */ bool tomoyo_parse_ipaddr_union(struct tomoyo_acl_param *param, struct tomoyo_ipaddr_union *ptr) { u8 * const min = ptr->ip[0].in6_u.u6_addr8; u8 * const max = ptr->ip[1].in6_u.u6_addr8; char *address = tomoyo_read_token(param); const char *end; if (!strchr(address, ':') && in4_pton(address, -1, min, '-', &end) > 0) { ptr->is_ipv6 = false; if (!*end) ptr->ip[1].s6_addr32[0] = ptr->ip[0].s6_addr32[0]; else if (*end++ != '-' || in4_pton(end, -1, max, '\0', &end) <= 0 || *end) return false; return true; } if (in6_pton(address, -1, min, '-', &end) > 0) { ptr->is_ipv6 = true; if (!*end) memmove(max, min, sizeof(u16) * 8); else if (*end++ != '-' || in6_pton(end, -1, max, '\0', &end) <= 0 || *end) return false; return true; } return false; } /** * tomoyo_print_ipv4 - Print an IPv4 address. * * @buffer: Buffer to write to. * @buffer_len: Size of @buffer. * @min_ip: Pointer to __be32. * @max_ip: Pointer to __be32. * * Returns nothing. */ static void tomoyo_print_ipv4(char *buffer, const unsigned int buffer_len, const __be32 *min_ip, const __be32 *max_ip) { snprintf(buffer, buffer_len, "%pI4%c%pI4", min_ip, *min_ip == *max_ip ? '\0' : '-', max_ip); } /** * tomoyo_print_ipv6 - Print an IPv6 address. * * @buffer: Buffer to write to. * @buffer_len: Size of @buffer. * @min_ip: Pointer to "struct in6_addr". * @max_ip: Pointer to "struct in6_addr". * * Returns nothing. */ static void tomoyo_print_ipv6(char *buffer, const unsigned int buffer_len, const struct in6_addr *min_ip, const struct in6_addr *max_ip) { snprintf(buffer, buffer_len, "%pI6c%c%pI6c", min_ip, !memcmp(min_ip, max_ip, 16) ? '\0' : '-', max_ip); } /** * tomoyo_print_ip - Print an IP address. * * @buf: Buffer to write to. * @size: Size of @buf. * @ptr: Pointer to "struct ipaddr_union". * * Returns nothing. */ void tomoyo_print_ip(char *buf, const unsigned int size, const struct tomoyo_ipaddr_union *ptr) { if (ptr->is_ipv6) tomoyo_print_ipv6(buf, size, &ptr->ip[0], &ptr->ip[1]); else tomoyo_print_ipv4(buf, size, &ptr->ip[0].s6_addr32[0], &ptr->ip[1].s6_addr32[0]); } /* * Mapping table from "enum tomoyo_network_acl_index" to * "enum tomoyo_mac_index" for inet domain socket. */ static const u8 tomoyo_inet2mac [TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = { [SOCK_STREAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_STREAM_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_INET_STREAM_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_INET_STREAM_CONNECT, }, [SOCK_DGRAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_DGRAM_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_INET_DGRAM_SEND, }, [SOCK_RAW] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_INET_RAW_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_INET_RAW_SEND, }, }; /* * Mapping table from "enum tomoyo_network_acl_index" to * "enum tomoyo_mac_index" for unix domain socket. */ static const u8 tomoyo_unix2mac [TOMOYO_SOCK_MAX][TOMOYO_MAX_NETWORK_OPERATION] = { [SOCK_STREAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_STREAM_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_UNIX_STREAM_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_UNIX_STREAM_CONNECT, }, [SOCK_DGRAM] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_DGRAM_BIND, [TOMOYO_NETWORK_SEND] = TOMOYO_MAC_NETWORK_UNIX_DGRAM_SEND, }, [SOCK_SEQPACKET] = { [TOMOYO_NETWORK_BIND] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_BIND, [TOMOYO_NETWORK_LISTEN] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_LISTEN, [TOMOYO_NETWORK_CONNECT] = TOMOYO_MAC_NETWORK_UNIX_SEQPACKET_CONNECT, }, }; /** * tomoyo_same_inet_acl - Check for duplicated "struct tomoyo_inet_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_inet_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_inet_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_inet_acl *p2 = container_of(b, typeof(*p2), head); return p1->protocol == p2->protocol && tomoyo_same_ipaddr_union(&p1->address, &p2->address) && tomoyo_same_number_union(&p1->port, &p2->port); } /** * tomoyo_same_unix_acl - Check for duplicated "struct tomoyo_unix_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * * Returns true if @a == @b except permission bits, false otherwise. */ static bool tomoyo_same_unix_acl(const struct tomoyo_acl_info *a, const struct tomoyo_acl_info *b) { const struct tomoyo_unix_acl *p1 = container_of(a, typeof(*p1), head); const struct tomoyo_unix_acl *p2 = container_of(b, typeof(*p2), head); return p1->protocol == p2->protocol && tomoyo_same_name_union(&p1->name, &p2->name); } /** * tomoyo_merge_inet_acl - Merge duplicated "struct tomoyo_inet_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_inet_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_inet_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_inet_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_merge_unix_acl - Merge duplicated "struct tomoyo_unix_acl" entry. * * @a: Pointer to "struct tomoyo_acl_info". * @b: Pointer to "struct tomoyo_acl_info". * @is_delete: True for @a &= ~@b, false for @a |= @b. * * Returns true if @a is empty, false otherwise. */ static bool tomoyo_merge_unix_acl(struct tomoyo_acl_info *a, struct tomoyo_acl_info *b, const bool is_delete) { u8 * const a_perm = &container_of(a, struct tomoyo_unix_acl, head)->perm; u8 perm = READ_ONCE(*a_perm); const u8 b_perm = container_of(b, struct tomoyo_unix_acl, head)->perm; if (is_delete) perm &= ~b_perm; else perm |= b_perm; WRITE_ONCE(*a_perm, perm); return !perm; } /** * tomoyo_write_inet_network - Write "struct tomoyo_inet_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. * * Caller holds tomoyo_read_lock(). */ int tomoyo_write_inet_network(struct tomoyo_acl_param *param) { struct tomoyo_inet_acl e = { .head.type = TOMOYO_TYPE_INET_ACL }; int error = -EINVAL; u8 type; const char *protocol = tomoyo_read_token(param); const char *operation = tomoyo_read_token(param); for (e.protocol = 0; e.protocol < TOMOYO_SOCK_MAX; e.protocol++) if (!strcmp(protocol, tomoyo_proto_keyword[e.protocol])) break; for (type = 0; type < TOMOYO_MAX_NETWORK_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_socket_keyword[type])) e.perm |= 1 << type; if (e.protocol == TOMOYO_SOCK_MAX || !e.perm) return -EINVAL; if (param->data[0] == '@') { param->data++; e.address.group = tomoyo_get_group(param, TOMOYO_ADDRESS_GROUP); if (!e.address.group) return -ENOMEM; } else { if (!tomoyo_parse_ipaddr_union(param, &e.address)) goto out; } if (!tomoyo_parse_number_union(param, &e.port) || e.port.values[1] > 65535) goto out; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_inet_acl, tomoyo_merge_inet_acl); out: tomoyo_put_group(e.address.group); tomoyo_put_number_union(&e.port); return error; } /** * tomoyo_write_unix_network - Write "struct tomoyo_unix_acl" list. * * @param: Pointer to "struct tomoyo_acl_param". * * Returns 0 on success, negative value otherwise. */ int tomoyo_write_unix_network(struct tomoyo_acl_param *param) { struct tomoyo_unix_acl e = { .head.type = TOMOYO_TYPE_UNIX_ACL }; int error; u8 type; const char *protocol = tomoyo_read_token(param); const char *operation = tomoyo_read_token(param); for (e.protocol = 0; e.protocol < TOMOYO_SOCK_MAX; e.protocol++) if (!strcmp(protocol, tomoyo_proto_keyword[e.protocol])) break; for (type = 0; type < TOMOYO_MAX_NETWORK_OPERATION; type++) if (tomoyo_permstr(operation, tomoyo_socket_keyword[type])) e.perm |= 1 << type; if (e.protocol == TOMOYO_SOCK_MAX || !e.perm) return -EINVAL; if (!tomoyo_parse_name_union(param, &e.name)) return -EINVAL; error = tomoyo_update_domain(&e.head, sizeof(e), param, tomoyo_same_unix_acl, tomoyo_merge_unix_acl); tomoyo_put_name_union(&e.name); return error; } /** * tomoyo_audit_net_log - Audit network log. * * @r: Pointer to "struct tomoyo_request_info". * @family: Name of socket family ("inet" or "unix"). * @protocol: Name of protocol in @family. * @operation: Name of socket operation. * @address: Name of address. * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_net_log(struct tomoyo_request_info *r, const char *family, const u8 protocol, const u8 operation, const char *address) { return tomoyo_supervisor(r, "network %s %s %s %s\n", family, tomoyo_proto_keyword[protocol], tomoyo_socket_keyword[operation], address); } /** * tomoyo_audit_inet_log - Audit INET network log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_inet_log(struct tomoyo_request_info *r) { char buf[128]; int len; const __be32 *address = r->param.inet_network.address; if (r->param.inet_network.is_ipv6) tomoyo_print_ipv6(buf, sizeof(buf), (const struct in6_addr *) address, (const struct in6_addr *) address); else tomoyo_print_ipv4(buf, sizeof(buf), address, address); len = strlen(buf); snprintf(buf + len, sizeof(buf) - len, " %u", r->param.inet_network.port); return tomoyo_audit_net_log(r, "inet", r->param.inet_network.protocol, r->param.inet_network.operation, buf); } /** * tomoyo_audit_unix_log - Audit UNIX network log. * * @r: Pointer to "struct tomoyo_request_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_audit_unix_log(struct tomoyo_request_info *r) { return tomoyo_audit_net_log(r, "unix", r->param.unix_network.protocol, r->param.unix_network.operation, r->param.unix_network.address->name); } /** * tomoyo_check_inet_acl - Check permission for inet domain socket operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_inet_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_inet_acl *acl = container_of(ptr, typeof(*acl), head); const u8 size = r->param.inet_network.is_ipv6 ? 16 : 4; if (!(acl->perm & (1 << r->param.inet_network.operation)) || !tomoyo_compare_number_union(r->param.inet_network.port, &acl->port)) return false; if (acl->address.group) return tomoyo_address_matches_group (r->param.inet_network.is_ipv6, r->param.inet_network.address, acl->address.group); return acl->address.is_ipv6 == r->param.inet_network.is_ipv6 && memcmp(&acl->address.ip[0], r->param.inet_network.address, size) <= 0 && memcmp(r->param.inet_network.address, &acl->address.ip[1], size) <= 0; } /** * tomoyo_check_unix_acl - Check permission for unix domain socket operation. * * @r: Pointer to "struct tomoyo_request_info". * @ptr: Pointer to "struct tomoyo_acl_info". * * Returns true if granted, false otherwise. */ static bool tomoyo_check_unix_acl(struct tomoyo_request_info *r, const struct tomoyo_acl_info *ptr) { const struct tomoyo_unix_acl *acl = container_of(ptr, typeof(*acl), head); return (acl->perm & (1 << r->param.unix_network.operation)) && tomoyo_compare_name_union(r->param.unix_network.address, &acl->name); } /** * tomoyo_inet_entry - Check permission for INET network operation. * * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_inet_entry(const struct tomoyo_addr_info *address) { const int idx = tomoyo_read_lock(); struct tomoyo_request_info r; int error = 0; const u8 type = tomoyo_inet2mac[address->protocol][address->operation]; if (type && tomoyo_init_request_info(&r, NULL, type) != TOMOYO_CONFIG_DISABLED) { r.param_type = TOMOYO_TYPE_INET_ACL; r.param.inet_network.protocol = address->protocol; r.param.inet_network.operation = address->operation; r.param.inet_network.is_ipv6 = address->inet.is_ipv6; r.param.inet_network.address = address->inet.address; r.param.inet_network.port = ntohs(address->inet.port); do { tomoyo_check_acl(&r, tomoyo_check_inet_acl); error = tomoyo_audit_inet_log(&r); } while (error == TOMOYO_RETRY_REQUEST); } tomoyo_read_unlock(idx); return error; } /** * tomoyo_check_inet_address - Check permission for inet domain socket's operation. * * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * @port: Port number. * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_check_inet_address(const struct sockaddr *addr, const unsigned int addr_len, const u16 port, struct tomoyo_addr_info *address) { struct tomoyo_inet_addr_info *i = &address->inet; if (addr_len < offsetofend(struct sockaddr, sa_family)) return 0; switch (addr->sa_family) { case AF_INET6: if (addr_len < SIN6_LEN_RFC2133) goto skip; i->is_ipv6 = true; i->address = (__be32 *) ((struct sockaddr_in6 *) addr)->sin6_addr.s6_addr; i->port = ((struct sockaddr_in6 *) addr)->sin6_port; break; case AF_INET: if (addr_len < sizeof(struct sockaddr_in)) goto skip; i->is_ipv6 = false; i->address = (__be32 *) &((struct sockaddr_in *) addr)->sin_addr; i->port = ((struct sockaddr_in *) addr)->sin_port; break; default: goto skip; } if (address->protocol == SOCK_RAW) i->port = htons(port); return tomoyo_inet_entry(address); skip: return 0; } /** * tomoyo_unix_entry - Check permission for UNIX network operation. * * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_unix_entry(const struct tomoyo_addr_info *address) { const int idx = tomoyo_read_lock(); struct tomoyo_request_info r; int error = 0; const u8 type = tomoyo_unix2mac[address->protocol][address->operation]; if (type && tomoyo_init_request_info(&r, NULL, type) != TOMOYO_CONFIG_DISABLED) { char *buf = address->unix0.addr; int len = address->unix0.addr_len - sizeof(sa_family_t); if (len <= 0) { buf = "anonymous"; len = 9; } else if (buf[0]) { len = strnlen(buf, len); } buf = tomoyo_encode2(buf, len); if (buf) { struct tomoyo_path_info addr; addr.name = buf; tomoyo_fill_path_info(&addr); r.param_type = TOMOYO_TYPE_UNIX_ACL; r.param.unix_network.protocol = address->protocol; r.param.unix_network.operation = address->operation; r.param.unix_network.address = &addr; do { tomoyo_check_acl(&r, tomoyo_check_unix_acl); error = tomoyo_audit_unix_log(&r); } while (error == TOMOYO_RETRY_REQUEST); kfree(buf); } else error = -ENOMEM; } tomoyo_read_unlock(idx); return error; } /** * tomoyo_check_unix_address - Check permission for unix domain socket's operation. * * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * @address: Pointer to "struct tomoyo_addr_info". * * Returns 0 on success, negative value otherwise. */ static int tomoyo_check_unix_address(struct sockaddr *addr, const unsigned int addr_len, struct tomoyo_addr_info *address) { struct tomoyo_unix_addr_info *u = &address->unix0; if (addr_len < offsetofend(struct sockaddr, sa_family)) return 0; if (addr->sa_family != AF_UNIX) return 0; u->addr = ((struct sockaddr_un *) addr)->sun_path; u->addr_len = addr_len; return tomoyo_unix_entry(address); } /** * tomoyo_kernel_service - Check whether I'm kernel service or not. * * Returns true if I'm kernel service, false otherwise. */ static bool tomoyo_kernel_service(void) { /* Nothing to do if I am a kernel service. */ return current->flags & PF_KTHREAD; } /** * tomoyo_sock_family - Get socket's family. * * @sk: Pointer to "struct sock". * * Returns one of PF_INET, PF_INET6, PF_UNIX or 0. */ static u8 tomoyo_sock_family(struct sock *sk) { u8 family; if (tomoyo_kernel_service()) return 0; family = sk->sk_family; switch (family) { case PF_INET: case PF_INET6: case PF_UNIX: return family; default: return 0; } } /** * tomoyo_socket_listen_permission - Check permission for listening a socket. * * @sock: Pointer to "struct socket". * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_listen_permission(struct socket *sock) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; struct sockaddr_storage addr; int addr_len; if (!family || (type != SOCK_STREAM && type != SOCK_SEQPACKET)) return 0; { const int error = sock->ops->getname(sock, (struct sockaddr *) &addr, 0); if (error < 0) return error; addr_len = error; } address.protocol = type; address.operation = TOMOYO_NETWORK_LISTEN; if (family == PF_UNIX) return tomoyo_check_unix_address((struct sockaddr *) &addr, addr_len, &address); return tomoyo_check_inet_address((struct sockaddr *) &addr, addr_len, 0, &address); } /** * tomoyo_socket_connect_permission - Check permission for setting the remote address of a socket. * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_connect_permission(struct socket *sock, struct sockaddr *addr, int addr_len) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!family) return 0; address.protocol = type; switch (type) { case SOCK_DGRAM: case SOCK_RAW: address.operation = TOMOYO_NETWORK_SEND; break; case SOCK_STREAM: case SOCK_SEQPACKET: address.operation = TOMOYO_NETWORK_CONNECT; break; default: return 0; } if (family == PF_UNIX) return tomoyo_check_unix_address(addr, addr_len, &address); return tomoyo_check_inet_address(addr, addr_len, sock->sk->sk_protocol, &address); } /** * tomoyo_socket_bind_permission - Check permission for setting the local address of a socket. * * @sock: Pointer to "struct socket". * @addr: Pointer to "struct sockaddr". * @addr_len: Size of @addr. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_bind_permission(struct socket *sock, struct sockaddr *addr, int addr_len) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!family) return 0; switch (type) { case SOCK_STREAM: case SOCK_DGRAM: case SOCK_RAW: case SOCK_SEQPACKET: address.protocol = type; address.operation = TOMOYO_NETWORK_BIND; break; default: return 0; } if (family == PF_UNIX) return tomoyo_check_unix_address(addr, addr_len, &address); return tomoyo_check_inet_address(addr, addr_len, sock->sk->sk_protocol, &address); } /** * tomoyo_socket_sendmsg_permission - Check permission for sending a datagram. * * @sock: Pointer to "struct socket". * @msg: Pointer to "struct msghdr". * @size: Unused. * * Returns 0 on success, negative value otherwise. */ int tomoyo_socket_sendmsg_permission(struct socket *sock, struct msghdr *msg, int size) { struct tomoyo_addr_info address; const u8 family = tomoyo_sock_family(sock->sk); const unsigned int type = sock->type; if (!msg->msg_name || !family || (type != SOCK_DGRAM && type != SOCK_RAW)) return 0; address.protocol = type; address.operation = TOMOYO_NETWORK_SEND; if (family == PF_UNIX) return tomoyo_check_unix_address((struct sockaddr *) msg->msg_name, msg->msg_namelen, &address); return tomoyo_check_inet_address((struct sockaddr *) msg->msg_name, msg->msg_namelen, sock->sk->sk_protocol, &address); }
22 18 18 16 2 22 22 22 18 18 18 18 18 18 18 18 18 18 18 18 18 22 91 91 10399 128 10408 10410 10371 10390 10382 10408 10447 10415 10391 10397 5320 423 5307 5311 5280 5269 5266 5315 5338 5313 5272 5307 10267 4388 18 1019 1 1021 9374 4116 18 353 7 355 902 528 1 1 2 2 2 3 3 3 3 3 3 8 8 2 2 3 3 3 3 8 2 2 2 2 1 1 2 2 4 3 3 2 2 2 1 4 1 23 23 23 109 111 3 2 2 2 2 2 3 3 1 8 1 7 7 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 // SPDX-License-Identifier: GPL-2.0-only #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/user_namespace.h> #include <linux/proc_ns.h> #include <linux/highuid.h> #include <linux/cred.h> #include <linux/securebits.h> #include <linux/security.h> #include <linux/keyctl.h> #include <linux/key-type.h> #include <keys/user-type.h> #include <linux/seq_file.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> #include <linux/bsearch.h> #include <linux/sort.h> #include <linux/nstree.h> static struct kmem_cache *user_ns_cachep __ro_after_init; static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void free_user_ns(struct work_struct *work); static struct ucounts *inc_user_namespaces(struct user_namespace *ns, kuid_t uid) { return inc_ucount(ns, uid, UCOUNT_USER_NAMESPACES); } static void dec_user_namespaces(struct ucounts *ucounts) { return dec_ucount(ucounts, UCOUNT_USER_NAMESPACES); } static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) { /* Start with the same capabilities as init but useless for doing * anything as the capabilities are bound to the new user namespace. */ cred->securebits = SECUREBITS_DEFAULT; cred->cap_inheritable = CAP_EMPTY_SET; cred->cap_permitted = CAP_FULL_SET; cred->cap_effective = CAP_FULL_SET; cred->cap_ambient = CAP_EMPTY_SET; cred->cap_bset = CAP_FULL_SET; #ifdef CONFIG_KEYS key_put(cred->request_key_auth); cred->request_key_auth = NULL; #endif /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ cred->user_ns = user_ns; } static unsigned long enforced_nproc_rlimit(void) { unsigned long limit = RLIM_INFINITY; /* Is RLIMIT_NPROC currently enforced? */ if (!uid_eq(current_uid(), GLOBAL_ROOT_UID) || (current_user_ns() != &init_user_ns)) limit = rlimit(RLIMIT_NPROC); return limit; } /* * Create a new user namespace, deriving the creator from the user in the * passed credentials, and replacing that user with the new root user for the * new namespace. * * This is called by copy_creds(), which will finish setting the target task's * credentials. */ int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; kuid_t owner = new->euid; kgid_t group = new->egid; struct ucounts *ucounts; int ret, i; ret = -ENOSPC; if (parent_ns->level > 32) goto fail; ucounts = inc_user_namespaces(parent_ns, owner); if (!ucounts) goto fail; /* * Verify that we can not violate the policy of which files * may be accessed that is specified by the root directory, * by verifying that the root directory is at the root of the * mount namespace which allows all files to be accessed. */ ret = -EPERM; if (current_chrooted()) goto fail_dec; /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. */ ret = -EPERM; if (!kuid_has_mapping(parent_ns, owner) || !kgid_has_mapping(parent_ns, group)) goto fail_dec; ret = security_create_user_ns(new); if (ret < 0) goto fail_dec; ret = -ENOMEM; ns = kmem_cache_zalloc(user_ns_cachep, GFP_KERNEL); if (!ns) goto fail_dec; ns->parent_could_setfcap = cap_raised(new->cap_effective, CAP_SETFCAP); ret = ns_common_init(ns); if (ret) goto fail_free; /* Leave the new->user_ns reference with the new user namespace. */ ns->parent = parent_ns; ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; INIT_WORK(&ns->work, free_user_ns); for (i = 0; i < UCOUNT_COUNTS; i++) { ns->ucount_max[i] = INT_MAX; } set_userns_rlimit_max(ns, UCOUNT_RLIMIT_NPROC, enforced_nproc_rlimit()); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING)); set_userns_rlimit_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK)); ns->ucounts = ucounts; /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); ns->flags = parent_ns->flags; mutex_unlock(&userns_state_mutex); #ifdef CONFIG_KEYS INIT_LIST_HEAD(&ns->keyring_name_list); init_rwsem(&ns->keyring_sem); #endif ret = -ENOMEM; if (!setup_userns_sysctls(ns)) goto fail_keyring; set_cred_user_ns(new, ns); ns_tree_add(ns); return 0; fail_keyring: #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif ns_common_free(ns); fail_free: kmem_cache_free(user_ns_cachep, ns); fail_dec: dec_user_namespaces(ucounts); fail: return ret; } int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { struct cred *cred; int err = -ENOMEM; if (!(unshare_flags & CLONE_NEWUSER)) return 0; cred = prepare_creds(); if (cred) { err = create_user_ns(cred); if (err) put_cred(cred); else *new_cred = cred; } return err; } static void free_user_ns(struct work_struct *work) { struct user_namespace *parent, *ns = container_of(work, struct user_namespace, work); do { struct ucounts *ucounts = ns->ucounts; parent = ns->parent; ns_tree_remove(ns); if (ns->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->gid_map.forward); kfree(ns->gid_map.reverse); } if (ns->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->uid_map.forward); kfree(ns->uid_map.reverse); } if (ns->projid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(ns->projid_map.forward); kfree(ns->projid_map.reverse); } #if IS_ENABLED(CONFIG_BINFMT_MISC) kfree(ns->binfmt_misc); #endif retire_userns_sysctls(ns); key_free_user_ns(ns); ns_common_free(ns); /* Concurrent nstree traversal depends on a grace period. */ kfree_rcu(ns, ns.ns_rcu); dec_user_namespaces(ucounts); ns = parent; } while (ns_ref_put(parent)); } void __put_user_ns(struct user_namespace *ns) { schedule_work(&ns->work); } EXPORT_SYMBOL(__put_user_ns); /* * struct idmap_key - holds the information necessary to find an idmapping in a * sorted idmap array. It is passed to cmp_map_id() as first argument. */ struct idmap_key { bool map_up; /* true -> id from kid; false -> kid from id */ u32 id; /* id to find */ u32 count; }; /* * cmp_map_id - Function to be passed to bsearch() to find the requested * idmapping. Expects struct idmap_key to be passed via @k. */ static int cmp_map_id(const void *k, const void *e) { u32 first, last, id2; const struct idmap_key *key = k; const struct uid_gid_extent *el = e; id2 = key->id + key->count - 1; /* handle map_id_{down,up}() */ if (key->map_up) first = el->lower_first; else first = el->first; last = first + el->count - 1; if (key->id >= first && key->id <= last && (id2 >= first && id2 <= last)) return 0; if (key->id < first || id2 < first) return -1; return 1; } /* * map_id_range_down_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = false; key.count = count; key.id = id; return bsearch(&key, map->forward, extents, sizeof(struct uid_gid_extent), cmp_map_id); } /* * map_id_range_down_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_down_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_down_base(extents, map, id, count); else extent = map_id_range_down_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->first) + extent->lower_first; else id = (u32) -1; return id; } u32 map_id_down(struct uid_gid_map *map, u32 id) { return map_id_range_down(map, id, 1); } /* * map_id_up_base - Find idmap via binary search in static extent array. * Can only be called if number of mappings is equal or less than * UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_base(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { unsigned idx; u32 first, last, id2; id2 = id + count - 1; /* Find the matching extent */ for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; if (id >= first && id <= last && (id2 >= first && id2 <= last)) return &map->extent[idx]; } return NULL; } /* * map_id_up_max - Find idmap via binary search in ordered idmap array. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static struct uid_gid_extent * map_id_range_up_max(unsigned extents, struct uid_gid_map *map, u32 id, u32 count) { struct idmap_key key; key.map_up = true; key.count = count; key.id = id; return bsearch(&key, map->reverse, extents, sizeof(struct uid_gid_extent), cmp_map_id); } u32 map_id_range_up(struct uid_gid_map *map, u32 id, u32 count) { struct uid_gid_extent *extent; unsigned extents = map->nr_extents; smp_rmb(); if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent = map_id_range_up_base(extents, map, id, count); else extent = map_id_range_up_max(extents, map, id, count); /* Map the id or note failure */ if (extent) id = (id - extent->lower_first) + extent->first; else id = (u32) -1; return id; } u32 map_id_up(struct uid_gid_map *map, u32 id) { return map_id_range_up(map, id, 1); } /** * make_kuid - Map a user-namespace uid pair into a kuid. * @ns: User namespace that the uid is in * @uid: User identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace uid * pair INVALID_UID is returned. Callers are expected to test * for and handle INVALID_UID being returned. INVALID_UID * may be tested for using uid_valid(). */ kuid_t make_kuid(struct user_namespace *ns, uid_t uid) { /* Map the uid to a global kernel uid */ return KUIDT_INIT(map_id_down(&ns->uid_map, uid)); } EXPORT_SYMBOL(make_kuid); /** * from_kuid - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * If @kuid has no mapping in @targ (uid_t)-1 is returned. */ uid_t from_kuid(struct user_namespace *targ, kuid_t kuid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->uid_map, __kuid_val(kuid)); } EXPORT_SYMBOL(from_kuid); /** * from_kuid_munged - Create a uid from a kuid user-namespace pair. * @targ: The user namespace we want a uid in. * @kuid: The kernel internal uid to start with. * * Map @kuid into the user-namespace specified by @targ and * return the resulting uid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kuid from_kuid_munged never fails and always * returns a valid uid. This makes from_kuid_munged appropriate * for use in syscalls like stat and getuid where failing the * system call and failing to provide a valid uid are not an * options. * * If @kuid has no mapping in @targ overflowuid is returned. */ uid_t from_kuid_munged(struct user_namespace *targ, kuid_t kuid) { uid_t uid; uid = from_kuid(targ, kuid); if (uid == (uid_t) -1) uid = overflowuid; return uid; } EXPORT_SYMBOL(from_kuid_munged); /** * make_kgid - Map a user-namespace gid pair into a kgid. * @ns: User namespace that the gid is in * @gid: group identifier * * Maps a user-namespace gid pair into a kernel internal kgid, * and returns that kgid. * * When there is no mapping defined for the user-namespace gid * pair INVALID_GID is returned. Callers are expected to test * for and handle INVALID_GID being returned. INVALID_GID may be * tested for using gid_valid(). */ kgid_t make_kgid(struct user_namespace *ns, gid_t gid) { /* Map the gid to a global kernel gid */ return KGIDT_INIT(map_id_down(&ns->gid_map, gid)); } EXPORT_SYMBOL(make_kgid); /** * from_kgid - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * If @kgid has no mapping in @targ (gid_t)-1 is returned. */ gid_t from_kgid(struct user_namespace *targ, kgid_t kgid) { /* Map the gid from a global kernel gid */ return map_id_up(&targ->gid_map, __kgid_val(kgid)); } EXPORT_SYMBOL(from_kgid); /** * from_kgid_munged - Create a gid from a kgid user-namespace pair. * @targ: The user namespace we want a gid in. * @kgid: The kernel internal gid to start with. * * Map @kgid into the user-namespace specified by @targ and * return the resulting gid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kgid from_kgid_munged never fails and always * returns a valid gid. This makes from_kgid_munged appropriate * for use in syscalls like stat and getgid where failing the * system call and failing to provide a valid gid are not options. * * If @kgid has no mapping in @targ overflowgid is returned. */ gid_t from_kgid_munged(struct user_namespace *targ, kgid_t kgid) { gid_t gid; gid = from_kgid(targ, kgid); if (gid == (gid_t) -1) gid = overflowgid; return gid; } EXPORT_SYMBOL(from_kgid_munged); /** * make_kprojid - Map a user-namespace projid pair into a kprojid. * @ns: User namespace that the projid is in * @projid: Project identifier * * Maps a user-namespace uid pair into a kernel internal kuid, * and returns that kuid. * * When there is no mapping defined for the user-namespace projid * pair INVALID_PROJID is returned. Callers are expected to test * for and handle INVALID_PROJID being returned. INVALID_PROJID * may be tested for using projid_valid(). */ kprojid_t make_kprojid(struct user_namespace *ns, projid_t projid) { /* Map the uid to a global kernel uid */ return KPROJIDT_INIT(map_id_down(&ns->projid_map, projid)); } EXPORT_SYMBOL(make_kprojid); /** * from_kprojid - Create a projid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal project identifier to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * If @kprojid has no mapping in @targ (projid_t)-1 is returned. */ projid_t from_kprojid(struct user_namespace *targ, kprojid_t kprojid) { /* Map the uid from a global kernel uid */ return map_id_up(&targ->projid_map, __kprojid_val(kprojid)); } EXPORT_SYMBOL(from_kprojid); /** * from_kprojid_munged - Create a projiid from a kprojid user-namespace pair. * @targ: The user namespace we want a projid in. * @kprojid: The kernel internal projid to start with. * * Map @kprojid into the user-namespace specified by @targ and * return the resulting projid. * * There is always a mapping into the initial user_namespace. * * Unlike from_kprojid from_kprojid_munged never fails and always * returns a valid projid. This makes from_kprojid_munged * appropriate for use in syscalls like stat and where * failing the system call and failing to provide a valid projid are * not an options. * * If @kprojid has no mapping in @targ OVERFLOW_PROJID is returned. */ projid_t from_kprojid_munged(struct user_namespace *targ, kprojid_t kprojid) { projid_t projid; projid = from_kprojid(targ, kprojid); if (projid == (projid_t) -1) projid = OVERFLOW_PROJID; return projid; } EXPORT_SYMBOL(from_kprojid_munged); static int uid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; uid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kuid(lower_ns, KUIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int gid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; gid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kgid(lower_ns, KGIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static int projid_m_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; struct uid_gid_extent *extent = v; struct user_namespace *lower_ns; projid_t lower; lower_ns = seq_user_ns(seq); if ((lower_ns == ns) && lower_ns->parent) lower_ns = lower_ns->parent; lower = from_kprojid(lower_ns, KPROJIDT_INIT(extent->lower_first)); seq_printf(seq, "%10u %10u %10u\n", extent->first, lower, extent->count); return 0; } static void *m_start(struct seq_file *seq, loff_t *ppos, struct uid_gid_map *map) { loff_t pos = *ppos; unsigned extents = map->nr_extents; smp_rmb(); if (pos >= extents) return NULL; if (extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return &map->extent[pos]; return &map->forward[pos]; } static void *uid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->uid_map); } static void *gid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->gid_map); } static void *projid_m_start(struct seq_file *seq, loff_t *ppos) { struct user_namespace *ns = seq->private; return m_start(seq, ppos, &ns->projid_map); } static void *m_next(struct seq_file *seq, void *v, loff_t *pos) { (*pos)++; return seq->op->start(seq, pos); } static void m_stop(struct seq_file *seq, void *v) { return; } const struct seq_operations proc_uid_seq_operations = { .start = uid_m_start, .stop = m_stop, .next = m_next, .show = uid_m_show, }; const struct seq_operations proc_gid_seq_operations = { .start = gid_m_start, .stop = m_stop, .next = m_next, .show = gid_m_show, }; const struct seq_operations proc_projid_seq_operations = { .start = projid_m_start, .stop = m_stop, .next = m_next, .show = projid_m_show, }; static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent *extent) { u32 upper_first, lower_first, upper_last, lower_last; unsigned idx; upper_first = extent->first; lower_first = extent->lower_first; upper_last = upper_first + extent->count - 1; lower_last = lower_first + extent->count - 1; for (idx = 0; idx < new_map->nr_extents; idx++) { u32 prev_upper_first, prev_lower_first; u32 prev_upper_last, prev_lower_last; struct uid_gid_extent *prev; if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) prev = &new_map->extent[idx]; else prev = &new_map->forward[idx]; prev_upper_first = prev->first; prev_lower_first = prev->lower_first; prev_upper_last = prev_upper_first + prev->count - 1; prev_lower_last = prev_lower_first + prev->count - 1; /* Does the upper range intersect a previous extent? */ if ((prev_upper_first <= upper_last) && (prev_upper_last >= upper_first)) return true; /* Does the lower range intersect a previous extent? */ if ((prev_lower_first <= lower_last) && (prev_lower_last >= lower_first)) return true; } return false; } /* * insert_extent - Safely insert a new idmap extent into struct uid_gid_map. * Takes care to allocate a 4K block of memory if the number of mappings exceeds * UID_GID_MAP_MAX_BASE_EXTENTS. */ static int insert_extent(struct uid_gid_map *map, struct uid_gid_extent *extent) { struct uid_gid_extent *dest; if (map->nr_extents == UID_GID_MAP_MAX_BASE_EXTENTS) { struct uid_gid_extent *forward; /* Allocate memory for 340 mappings. */ forward = kmalloc_array(UID_GID_MAP_MAX_EXTENTS, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!forward) return -ENOMEM; /* Copy over memory. Only set up memory for the forward pointer. * Defer the memory setup for the reverse pointer. */ memcpy(forward, map->extent, map->nr_extents * sizeof(map->extent[0])); map->forward = forward; map->reverse = NULL; } if (map->nr_extents < UID_GID_MAP_MAX_BASE_EXTENTS) dest = &map->extent[map->nr_extents]; else dest = &map->forward[map->nr_extents]; *dest = *extent; map->nr_extents++; return 0; } /* cmp function to sort() forward mappings */ static int cmp_extents_forward(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->first < e2->first) return -1; if (e1->first > e2->first) return 1; return 0; } /* cmp function to sort() reverse mappings */ static int cmp_extents_reverse(const void *a, const void *b) { const struct uid_gid_extent *e1 = a; const struct uid_gid_extent *e2 = b; if (e1->lower_first < e2->lower_first) return -1; if (e1->lower_first > e2->lower_first) return 1; return 0; } /* * sort_idmaps - Sorts an array of idmap entries. * Can only be called if number of mappings exceeds UID_GID_MAP_MAX_BASE_EXTENTS. */ static int sort_idmaps(struct uid_gid_map *map) { if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) return 0; /* Sort forward array. */ sort(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_forward, NULL); /* Only copy the memory from forward we actually need. */ map->reverse = kmemdup_array(map->forward, map->nr_extents, sizeof(struct uid_gid_extent), GFP_KERNEL); if (!map->reverse) return -ENOMEM; /* Sort reverse array. */ sort(map->reverse, map->nr_extents, sizeof(struct uid_gid_extent), cmp_extents_reverse, NULL); return 0; } /** * verify_root_map() - check the uid 0 mapping * @file: idmapping file * @map_ns: user namespace of the target process * @new_map: requested idmap * * If a process requests mapping parent uid 0 into the new ns, verify that the * process writing the map had the CAP_SETFCAP capability as the target process * will be able to write fscaps that are valid in ancestor user namespaces. * * Return: true if the mapping is allowed, false if not. */ static bool verify_root_map(const struct file *file, struct user_namespace *map_ns, struct uid_gid_map *new_map) { int idx; const struct user_namespace *file_ns = file->f_cred->user_ns; struct uid_gid_extent *extent0 = NULL; for (idx = 0; idx < new_map->nr_extents; idx++) { if (new_map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) extent0 = &new_map->extent[idx]; else extent0 = &new_map->forward[idx]; if (extent0->lower_first == 0) break; extent0 = NULL; } if (!extent0) return true; if (map_ns == file_ns) { /* The process unshared its ns and is writing to its own * /proc/self/uid_map. User already has full capabilites in * the new namespace. Verify that the parent had CAP_SETFCAP * when it unshared. * */ if (!file_ns->parent_could_setfcap) return false; } else { /* Process p1 is writing to uid_map of p2, who is in a child * user namespace to p1's. Verify that the opener of the map * file has CAP_SETFCAP against the parent of the new map * namespace */ if (!file_ns_capable(file, map_ns->parent, CAP_SETFCAP)) return false; } return true; } static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, struct uid_gid_map *map, struct uid_gid_map *parent_map) { struct seq_file *seq = file->private_data; struct user_namespace *map_ns = seq->private; struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent extent; char *kbuf, *pos, *next_line; ssize_t ret; /* Only allow < page size writes at the beginning of the file */ if ((*ppos != 0) || (count >= PAGE_SIZE)) return -EINVAL; /* Slurp in the user data */ kbuf = memdup_user_nul(buf, count); if (IS_ERR(kbuf)) return PTR_ERR(kbuf); /* * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * * An id map fits within 1 cache line on most architectures. * * On read nothing needs to be done unless you are on an * architecture with a crazy cache coherency model like alpha. * * There is a one time data dependency between reading the * count of the extents and the values of the extents. The * desired behavior is to see the values of the extents that * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ mutex_lock(&userns_state_mutex); memset(&new_map, 0, sizeof(struct uid_gid_map)); ret = -EPERM; /* Only allow one successful write to the map */ if (map->nr_extents != 0) goto out; /* * Adjusting namespace settings requires capabilities on the target. */ if (cap_valid(cap_setid) && !file_ns_capable(file, map_ns, CAP_SYS_ADMIN)) goto out; /* Parse the user data */ ret = -EINVAL; pos = kbuf; for (; pos; pos = next_line) { /* Find the end of line and ensure I don't look past it */ next_line = strchr(pos, '\n'); if (next_line) { *next_line = '\0'; next_line++; if (*next_line == '\0') next_line = NULL; } pos = skip_spaces(pos); extent.first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.lower_first = simple_strtoul(pos, &pos, 10); if (!isspace(*pos)) goto out; pos = skip_spaces(pos); extent.count = simple_strtoul(pos, &pos, 10); if (*pos && !isspace(*pos)) goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; /* Verify we have been given valid starting values */ if ((extent.first == (u32) -1) || (extent.lower_first == (u32) -1)) goto out; /* Verify count is not zero and does not cause the * extent to wrap */ if ((extent.first + extent.count) <= extent.first) goto out; if ((extent.lower_first + extent.count) <= extent.lower_first) goto out; /* Do the ranges in extent overlap any previous extents? */ if (mappings_overlap(&new_map, &extent)) goto out; if ((new_map.nr_extents + 1) == UID_GID_MAP_MAX_EXTENTS && (next_line != NULL)) goto out; ret = insert_extent(&new_map, &extent); if (ret < 0) goto out; ret = -EINVAL; } /* Be very certain the new map actually exists */ if (new_map.nr_extents == 0) goto out; ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ if (!new_idmap_permitted(file, map_ns, cap_setid, &new_map)) goto out; ret = -EPERM; /* Map the lower ids from the parent user namespace to the * kernel global id space. */ for (idx = 0; idx < new_map.nr_extents; idx++) { struct uid_gid_extent *e; u32 lower_first; if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) e = &new_map.extent[idx]; else e = &new_map.forward[idx]; lower_first = map_id_range_down(parent_map, e->lower_first, e->count); /* Fail if we can not map the specified extent to * the kernel global id space. */ if (lower_first == (u32) -1) goto out; e->lower_first = lower_first; } /* * If we want to use binary search for lookup, this clones the extent * array and sorts both copies. */ ret = sort_idmaps(&new_map); if (ret < 0) goto out; /* Install the map */ if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { memcpy(map->extent, new_map.extent, new_map.nr_extents * sizeof(new_map.extent[0])); } else { map->forward = new_map.forward; map->reverse = new_map.reverse; } smp_wmb(); map->nr_extents = new_map.nr_extents; *ppos = count; ret = count; out: if (ret < 0 && new_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) { kfree(new_map.forward); kfree(new_map.reverse); map->forward = NULL; map->reverse = NULL; map->nr_extents = 0; } mutex_unlock(&userns_state_mutex); kfree(kbuf); return ret; } ssize_t proc_uid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETUID, &ns->uid_map, &ns->parent->uid_map); } ssize_t proc_gid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; return map_write(file, buf, size, ppos, CAP_SETGID, &ns->gid_map, &ns->parent->gid_map); } ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t size, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; struct user_namespace *seq_ns = seq_user_ns(seq); if (!ns->parent) return -EPERM; if ((seq_ns != ns) && (seq_ns != ns->parent)) return -EPERM; /* Anyone can set any valid project id no capability needed */ return map_write(file, buf, size, ppos, -1, &ns->projid_map, &ns->parent->projid_map); } static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { const struct cred *cred = file->f_cred; if (cap_setid == CAP_SETUID && !verify_root_map(file, ns, new_map)) return false; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && gid_eq(gid, cred->egid)) return true; } } /* Allow anyone to set a mapping that doesn't require privilege */ if (!cap_valid(cap_setid)) return true; /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. * And the opener of the id file also has the appropriate capability. */ if (ns_capable(ns->parent, cap_setid) && file_ns_capable(file, ns->parent, cap_setid)) return true; return false; } int proc_setgroups_show(struct seq_file *seq, void *v) { struct user_namespace *ns = seq->private; unsigned long userns_flags = READ_ONCE(ns->flags); seq_printf(seq, "%s\n", (userns_flags & USERNS_SETGROUPS_ALLOWED) ? "allow" : "deny"); return 0; } ssize_t proc_setgroups_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct seq_file *seq = file->private_data; struct user_namespace *ns = seq->private; char kbuf[8], *pos; bool setgroups_allowed; ssize_t ret; /* Only allow a very narrow range of strings to be written */ ret = -EINVAL; if ((*ppos != 0) || (count >= sizeof(kbuf))) goto out; /* What was written? */ ret = -EFAULT; if (copy_from_user(kbuf, buf, count)) goto out; kbuf[count] = '\0'; pos = kbuf; /* What is being requested? */ ret = -EINVAL; if (strncmp(pos, "allow", 5) == 0) { pos += 5; setgroups_allowed = true; } else if (strncmp(pos, "deny", 4) == 0) { pos += 4; setgroups_allowed = false; } else goto out; /* Verify there is not trailing junk on the line */ pos = skip_spaces(pos); if (*pos != '\0') goto out; ret = -EPERM; mutex_lock(&userns_state_mutex); if (setgroups_allowed) { /* Enabling setgroups after setgroups has been disabled * is not allowed. */ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) goto out_unlock; } else { /* Permanently disabling setgroups after setgroups has * been enabled by writing the gid_map is not allowed. */ if (ns->gid_map.nr_extents != 0) goto out_unlock; ns->flags &= ~USERNS_SETGROUPS_ALLOWED; } mutex_unlock(&userns_state_mutex); /* Report a successful write */ *ppos = count; ret = count; out: return ret; out_unlock: mutex_unlock(&userns_state_mutex); goto out; } bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; /* Is setgroups allowed? */ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; } /* * Returns true if @child is the same namespace or a descendant of * @ancestor. */ bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { const struct user_namespace *ns; for (ns = child; ns->level > ancestor->level; ns = ns->parent) ; return (ns == ancestor); } bool current_in_userns(const struct user_namespace *target_ns) { return in_userns(target_ns, current_user_ns()); } EXPORT_SYMBOL(current_in_userns); static struct ns_common *userns_get(struct task_struct *task) { struct user_namespace *user_ns; rcu_read_lock(); user_ns = get_user_ns(__task_cred(task)->user_ns); rcu_read_unlock(); return user_ns ? &user_ns->ns : NULL; } static void userns_put(struct ns_common *ns) { put_user_ns(to_user_ns(ns)); } static int userns_install(struct nsset *nsset, struct ns_common *ns) { struct user_namespace *user_ns = to_user_ns(ns); struct cred *cred; /* Don't allow gaining capabilities by reentering * the same user namespace. */ if (user_ns == current_user_ns()) return -EINVAL; /* Tasks that share a thread group must share a user namespace */ if (!thread_group_empty(current)) return -EINVAL; if (current->fs->users != 1) return -EINVAL; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; cred = nsset_cred(nsset); if (!cred) return -EINVAL; put_user_ns(cred->user_ns); set_cred_user_ns(cred, get_user_ns(user_ns)); if (set_cred_ucounts(cred) < 0) return -EINVAL; return 0; } struct ns_common *ns_get_owner(struct ns_common *ns) { struct user_namespace *my_user_ns = current_user_ns(); struct user_namespace *owner, *p; /* See if the owner is in the current user namespace */ owner = p = ns->ops->owner(ns); for (;;) { if (!p) return ERR_PTR(-EPERM); if (p == my_user_ns) break; p = p->parent; } return &get_user_ns(owner)->ns; } static struct user_namespace *userns_owner(struct ns_common *ns) { return to_user_ns(ns)->parent; } const struct proc_ns_operations userns_operations = { .name = "user", .get = userns_get, .put = userns_put, .install = userns_install, .owner = userns_owner, .get_parent = ns_get_owner, }; static __init int user_namespaces_init(void) { user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC | SLAB_ACCOUNT); ns_tree_add(&init_user_ns); return 0; } subsys_initcall(user_namespaces_init);
14 14 3 14 14 2 1 12 12 12 12 12 12 14 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 1 1 9 5 5 9 5 5 5 1 4 5 5 5 5 9 3 9 9 7 2 2 1 1 2 2 2 2 7 4 4 7 7 2 7 7 5 7 2 6 7 1 6 6 7 7 1 1 7 7 2 2 2 2 7 5 4 5 5 5 5 4 4 4 4 4 4 4 1 1 1 1 1 1 5 5 5 1 5 5 5 5 1 1 2 2 2 2 2 2 1 1 4 4 8 8 6 1 1 1 1 1 1 1 1 1 1 8 12 11 12 12 11 6 6 3 3 3 3 3 3 3 3 3 3 1 3 3 5 5 4 4 4 4 4 1 3 1 3 3 4 4 12 2 1 1 1 2 1 1 2 2 1 1 3 3 2 1 1 1 1 1 4 3 4 3 3 3 3 3 2 1 2 2 2 1 2 3 3 3 2 1 5 4 6 5 5 2 3 2 2 2 2 1 2 7 2 2 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 // SPDX-License-Identifier: GPL-2.0-only /* * vivid-vid-out.c - video output support functions. * * Copyright 2014 Cisco Systems, Inc. and/or its affiliates. All rights reserved. */ #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/videodev2.h> #include <linux/v4l2-dv-timings.h> #include <media/v4l2-common.h> #include <media/v4l2-event.h> #include <media/v4l2-dv-timings.h> #include <media/v4l2-rect.h> #include "vivid-core.h" #include "vivid-osd.h" #include "vivid-vid-common.h" #include "vivid-kthread-out.h" #include "vivid-vid-out.h" static int vid_out_queue_setup(struct vb2_queue *vq, unsigned *nbuffers, unsigned *nplanes, unsigned sizes[], struct device *alloc_devs[]) { struct vivid_dev *dev = vb2_get_drv_priv(vq); const struct vivid_fmt *vfmt = dev->fmt_out; unsigned planes = vfmt->buffers; unsigned h = dev->fmt_out_rect.height; unsigned int size = dev->bytesperline_out[0] * h + vfmt->data_offset[0]; unsigned p; for (p = vfmt->buffers; p < vfmt->planes; p++) size += dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p]; if (dev->field_out == V4L2_FIELD_ALTERNATE) { /* * You cannot use write() with FIELD_ALTERNATE since the field * information (TOP/BOTTOM) cannot be passed to the kernel. */ if (vb2_fileio_is_active(vq)) return -EINVAL; } if (dev->queue_setup_error) { /* * Error injection: test what happens if queue_setup() returns * an error. */ dev->queue_setup_error = false; return -EINVAL; } if (*nplanes) { /* * Check if the number of requested planes match * the number of planes in the current format. You can't mix that. */ if (*nplanes != planes) return -EINVAL; if (sizes[0] < size) return -EINVAL; for (p = 1; p < planes; p++) { if (sizes[p] < dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p]) return -EINVAL; } } else { for (p = 0; p < planes; p++) sizes[p] = p ? dev->bytesperline_out[p] * h / vfmt->vdownsampling[p] + vfmt->data_offset[p] : size; } *nplanes = planes; dprintk(dev, 1, "%s: count=%u\n", __func__, *nbuffers); for (p = 0; p < planes; p++) dprintk(dev, 1, "%s: size[%u]=%u\n", __func__, p, sizes[p]); return 0; } static int vid_out_buf_out_validate(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); dprintk(dev, 1, "%s\n", __func__); if (dev->field_out != V4L2_FIELD_ALTERNATE) vbuf->field = dev->field_out; else if (vbuf->field != V4L2_FIELD_TOP && vbuf->field != V4L2_FIELD_BOTTOM) return -EINVAL; return 0; } static int vid_out_buf_prepare(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); const struct vivid_fmt *vfmt = dev->fmt_out; unsigned int planes = vfmt->buffers; unsigned int h = dev->fmt_out_rect.height; unsigned int size = dev->bytesperline_out[0] * h; unsigned p; for (p = vfmt->buffers; p < vfmt->planes; p++) size += dev->bytesperline_out[p] * h / vfmt->vdownsampling[p]; dprintk(dev, 1, "%s\n", __func__); if (WARN_ON(NULL == dev->fmt_out)) return -EINVAL; if (dev->buf_prepare_error) { /* * Error injection: test what happens if buf_prepare() returns * an error. */ dev->buf_prepare_error = false; return -EINVAL; } for (p = 0; p < planes; p++) { if (p) size = dev->bytesperline_out[p] * h / vfmt->vdownsampling[p]; size += vb->planes[p].data_offset; if (vb2_get_plane_payload(vb, p) < size) { dprintk(dev, 1, "%s the payload is too small for plane %u (%lu < %u)\n", __func__, p, vb2_get_plane_payload(vb, p), size); return -EINVAL; } } return 0; } static void vid_out_buf_queue(struct vb2_buffer *vb) { struct vb2_v4l2_buffer *vbuf = to_vb2_v4l2_buffer(vb); struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); struct vivid_buffer *buf = container_of(vbuf, struct vivid_buffer, vb); dprintk(dev, 1, "%s\n", __func__); spin_lock(&dev->slock); list_add_tail(&buf->list, &dev->vid_out_active); spin_unlock(&dev->slock); } static int vid_out_start_streaming(struct vb2_queue *vq, unsigned count) { struct vivid_dev *dev = vb2_get_drv_priv(vq); int err; dev->vid_out_seq_count = 0; dprintk(dev, 1, "%s\n", __func__); if (dev->start_streaming_error) { dev->start_streaming_error = false; err = -EINVAL; } else { err = vivid_start_generating_vid_out(dev, &dev->vid_out_streaming); } if (err) { struct vivid_buffer *buf, *tmp; list_for_each_entry_safe(buf, tmp, &dev->vid_out_active, list) { list_del(&buf->list); vb2_buffer_done(&buf->vb.vb2_buf, VB2_BUF_STATE_QUEUED); } } return err; } /* abort streaming and wait for last buffer */ static void vid_out_stop_streaming(struct vb2_queue *vq) { struct vivid_dev *dev = vb2_get_drv_priv(vq); dprintk(dev, 1, "%s\n", __func__); vivid_stop_generating_vid_out(dev, &dev->vid_out_streaming); } static void vid_out_buf_request_complete(struct vb2_buffer *vb) { struct vivid_dev *dev = vb2_get_drv_priv(vb->vb2_queue); v4l2_ctrl_request_complete(vb->req_obj.req, &dev->ctrl_hdl_vid_out); } const struct vb2_ops vivid_vid_out_qops = { .queue_setup = vid_out_queue_setup, .buf_out_validate = vid_out_buf_out_validate, .buf_prepare = vid_out_buf_prepare, .buf_queue = vid_out_buf_queue, .start_streaming = vid_out_start_streaming, .stop_streaming = vid_out_stop_streaming, .buf_request_complete = vid_out_buf_request_complete, }; /* * Called whenever the format has to be reset which can occur when * changing outputs, standard, timings, etc. */ void vivid_update_format_out(struct vivid_dev *dev) { struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; unsigned size, p; u64 pixelclock; switch (dev->output_type[dev->output]) { case SVID: default: dev->field_out = dev->tv_field_out; dev->sink_rect.width = 720; if (dev->std_out & V4L2_STD_525_60) { dev->sink_rect.height = 480; dev->timeperframe_vid_out = (struct v4l2_fract) { 1001, 30000 }; dev->service_set_out = V4L2_SLICED_CAPTION_525; } else { dev->sink_rect.height = 576; dev->timeperframe_vid_out = (struct v4l2_fract) { 1000, 25000 }; dev->service_set_out = V4L2_SLICED_WSS_625 | V4L2_SLICED_TELETEXT_B; } dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; break; case HDMI: dev->sink_rect.width = bt->width; dev->sink_rect.height = bt->height; size = V4L2_DV_BT_FRAME_WIDTH(bt) * V4L2_DV_BT_FRAME_HEIGHT(bt); if (can_reduce_fps(bt) && (bt->flags & V4L2_DV_FL_REDUCED_FPS)) pixelclock = div_u64(bt->pixelclock * 1000, 1001); else pixelclock = bt->pixelclock; dev->timeperframe_vid_out = (struct v4l2_fract) { size / 100, (u32)pixelclock / 100 }; if (bt->interlaced) dev->field_out = V4L2_FIELD_ALTERNATE; else dev->field_out = V4L2_FIELD_NONE; if (!dev->dvi_d_out && (bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { if (bt->width == 720 && bt->height <= 576) dev->colorspace_out = V4L2_COLORSPACE_SMPTE170M; else dev->colorspace_out = V4L2_COLORSPACE_REC709; } else { dev->colorspace_out = V4L2_COLORSPACE_SRGB; } break; } dev->xfer_func_out = V4L2_XFER_FUNC_DEFAULT; dev->ycbcr_enc_out = V4L2_YCBCR_ENC_DEFAULT; dev->hsv_enc_out = V4L2_HSV_ENC_180; dev->quantization_out = V4L2_QUANTIZATION_DEFAULT; dev->compose_out = dev->sink_rect; dev->compose_bounds_out = dev->sink_rect; dev->crop_out = dev->compose_out; if (V4L2_FIELD_HAS_T_OR_B(dev->field_out)) dev->crop_out.height /= 2; dev->fmt_out_rect = dev->crop_out; for (p = 0; p < dev->fmt_out->planes; p++) dev->bytesperline_out[p] = (dev->sink_rect.width * dev->fmt_out->bit_depth[p]) / 8; } /* Map the field to something that is valid for the current output */ static enum v4l2_field vivid_field_out(struct vivid_dev *dev, enum v4l2_field field) { if (vivid_is_svid_out(dev)) { switch (field) { case V4L2_FIELD_INTERLACED_TB: case V4L2_FIELD_INTERLACED_BT: case V4L2_FIELD_SEQ_TB: case V4L2_FIELD_SEQ_BT: case V4L2_FIELD_ALTERNATE: return field; case V4L2_FIELD_INTERLACED: default: return V4L2_FIELD_INTERLACED; } } if (vivid_is_hdmi_out(dev)) return dev->dv_timings_out.bt.interlaced ? V4L2_FIELD_ALTERNATE : V4L2_FIELD_NONE; return V4L2_FIELD_NONE; } static enum tpg_pixel_aspect vivid_get_pixel_aspect(const struct vivid_dev *dev) { if (vivid_is_svid_out(dev)) return (dev->std_out & V4L2_STD_525_60) ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; if (vivid_is_hdmi_out(dev) && dev->sink_rect.width == 720 && dev->sink_rect.height <= 576) return dev->sink_rect.height == 480 ? TPG_PIXEL_ASPECT_NTSC : TPG_PIXEL_ASPECT_PAL; return TPG_PIXEL_ASPECT_SQUARE; } int vivid_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; const struct vivid_fmt *fmt = dev->fmt_out; unsigned p; mp->width = dev->fmt_out_rect.width; mp->height = dev->fmt_out_rect.height; mp->field = dev->field_out; mp->pixelformat = fmt->fourcc; mp->colorspace = dev->colorspace_out; mp->xfer_func = dev->xfer_func_out; mp->ycbcr_enc = dev->ycbcr_enc_out; mp->quantization = dev->quantization_out; mp->num_planes = fmt->buffers; for (p = 0; p < mp->num_planes; p++) { mp->plane_fmt[p].bytesperline = dev->bytesperline_out[p]; mp->plane_fmt[p].sizeimage = mp->plane_fmt[p].bytesperline * mp->height / fmt->vdownsampling[p] + fmt->data_offset[p]; } for (p = fmt->buffers; p < fmt->planes; p++) { unsigned stride = dev->bytesperline_out[p]; mp->plane_fmt[0].sizeimage += (stride * mp->height) / fmt->vdownsampling[p]; } return 0; } int vivid_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_bt_timings *bt = &dev->dv_timings_out.bt; struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct v4l2_plane_pix_format *pfmt = mp->plane_fmt; const struct vivid_fmt *fmt; unsigned bytesperline, max_bpl; unsigned factor = 1; unsigned w, h; unsigned p; fmt = vivid_get_format(dev, mp->pixelformat); if (!fmt) { dprintk(dev, 1, "Fourcc format (0x%08x) unknown.\n", mp->pixelformat); mp->pixelformat = V4L2_PIX_FMT_YUYV; fmt = vivid_get_format(dev, mp->pixelformat); } mp->field = vivid_field_out(dev, mp->field); if (vivid_is_svid_out(dev)) { w = 720; h = (dev->std_out & V4L2_STD_525_60) ? 480 : 576; } else { w = dev->sink_rect.width; h = dev->sink_rect.height; } if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (!dev->has_scaler_out && !dev->has_crop_out && !dev->has_compose_out) { mp->width = w; mp->height = h / factor; } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height * factor }; v4l2_rect_set_min_size(&r, &vivid_min_rect); v4l2_rect_set_max_size(&r, &vivid_max_rect); if (dev->has_scaler_out && !dev->has_crop_out) { struct v4l2_rect max_r = { 0, 0, MAX_ZOOM * w, MAX_ZOOM * h }; v4l2_rect_set_max_size(&r, &max_r); } else if (!dev->has_scaler_out && dev->has_compose_out && !dev->has_crop_out) { v4l2_rect_set_max_size(&r, &dev->sink_rect); } else if (!dev->has_scaler_out && !dev->has_compose_out) { v4l2_rect_set_min_size(&r, &dev->sink_rect); } mp->width = r.width; mp->height = r.height / factor; } /* This driver supports custom bytesperline values */ mp->num_planes = fmt->buffers; for (p = 0; p < fmt->buffers; p++) { /* Calculate the minimum supported bytesperline value */ bytesperline = (mp->width * fmt->bit_depth[p]) >> 3; /* Calculate the maximum supported bytesperline value */ max_bpl = (MAX_ZOOM * MAX_WIDTH * fmt->bit_depth[p]) >> 3; if (pfmt[p].bytesperline > max_bpl) pfmt[p].bytesperline = max_bpl; if (pfmt[p].bytesperline < bytesperline) pfmt[p].bytesperline = bytesperline; pfmt[p].sizeimage = (pfmt[p].bytesperline * mp->height) / fmt->vdownsampling[p] + fmt->data_offset[p]; memset(pfmt[p].reserved, 0, sizeof(pfmt[p].reserved)); } for (p = fmt->buffers; p < fmt->planes; p++) pfmt[0].sizeimage += (pfmt[0].bytesperline * mp->height * (fmt->bit_depth[p] / fmt->vdownsampling[p])) / (fmt->bit_depth[0] / fmt->vdownsampling[0]); mp->xfer_func = V4L2_XFER_FUNC_DEFAULT; mp->ycbcr_enc = V4L2_YCBCR_ENC_DEFAULT; mp->quantization = V4L2_QUANTIZATION_DEFAULT; if (vivid_is_svid_out(dev)) { mp->colorspace = V4L2_COLORSPACE_SMPTE170M; } else if (dev->dvi_d_out || !(bt->flags & V4L2_DV_FL_IS_CE_VIDEO)) { mp->colorspace = V4L2_COLORSPACE_SRGB; if (dev->dvi_d_out) mp->quantization = V4L2_QUANTIZATION_LIM_RANGE; } else if (bt->width == 720 && bt->height <= 576) { mp->colorspace = V4L2_COLORSPACE_SMPTE170M; } else if (mp->colorspace != V4L2_COLORSPACE_SMPTE170M && mp->colorspace != V4L2_COLORSPACE_REC709 && mp->colorspace != V4L2_COLORSPACE_OPRGB && mp->colorspace != V4L2_COLORSPACE_BT2020 && mp->colorspace != V4L2_COLORSPACE_SRGB) { mp->colorspace = V4L2_COLORSPACE_REC709; } memset(mp->reserved, 0, sizeof(mp->reserved)); return 0; } int vivid_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct v4l2_pix_format_mplane *mp = &f->fmt.pix_mp; struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_out; struct v4l2_rect *compose = &dev->compose_out; struct vb2_queue *q = &dev->vb_vid_out_q; int ret = vivid_try_fmt_vid_out(file, priv, f); unsigned factor = 1; unsigned p; if (ret < 0) return ret; if (vb2_is_busy(q) && (vivid_is_svid_out(dev) || mp->width != dev->fmt_out_rect.width || mp->height != dev->fmt_out_rect.height || mp->pixelformat != dev->fmt_out->fourcc || mp->field != dev->field_out)) { dprintk(dev, 1, "%s device busy\n", __func__); return -EBUSY; } /* * Allow for changing the colorspace on the fly. Useful for testing * purposes, and it is something that HDMI transmitters are able * to do. */ if (vb2_is_busy(q)) goto set_colorspace; dev->fmt_out = vivid_get_format(dev, mp->pixelformat); if (V4L2_FIELD_HAS_T_OR_B(mp->field)) factor = 2; if (dev->has_scaler_out || dev->has_crop_out || dev->has_compose_out) { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; if (dev->has_scaler_out) { if (dev->has_crop_out) v4l2_rect_map_inside(crop, &r); else *crop = r; if (dev->has_compose_out && !dev->has_crop_out) { struct v4l2_rect min_r = { 0, 0, r.width / MAX_ZOOM, factor * r.height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, r.width * MAX_ZOOM, factor * r.height * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_r); v4l2_rect_set_max_size(compose, &max_r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } else if (dev->has_compose_out) { struct v4l2_rect min_r = { 0, 0, crop->width / MAX_ZOOM, factor * crop->height / MAX_ZOOM }; struct v4l2_rect max_r = { 0, 0, crop->width * MAX_ZOOM, factor * crop->height * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_r); v4l2_rect_set_max_size(compose, &max_r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } } else if (dev->has_compose_out && !dev->has_crop_out) { v4l2_rect_set_size_to(crop, &r); r.height *= factor; v4l2_rect_set_size_to(compose, &r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } else if (!dev->has_compose_out) { v4l2_rect_map_inside(crop, &r); r.height /= factor; v4l2_rect_set_size_to(compose, &r); } else { r.height *= factor; v4l2_rect_set_max_size(compose, &r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); crop->top *= factor; crop->height *= factor; v4l2_rect_set_size_to(crop, compose); v4l2_rect_map_inside(crop, &r); crop->top /= factor; crop->height /= factor; } } else { struct v4l2_rect r = { 0, 0, mp->width, mp->height }; v4l2_rect_set_size_to(crop, &r); r.height /= factor; v4l2_rect_set_size_to(compose, &r); } dev->fmt_out_rect.width = mp->width; dev->fmt_out_rect.height = mp->height; for (p = 0; p < mp->num_planes; p++) dev->bytesperline_out[p] = mp->plane_fmt[p].bytesperline; for (p = dev->fmt_out->buffers; p < dev->fmt_out->planes; p++) dev->bytesperline_out[p] = (dev->bytesperline_out[0] * dev->fmt_out->bit_depth[p]) / dev->fmt_out->bit_depth[0]; dev->field_out = mp->field; if (vivid_is_svid_out(dev)) dev->tv_field_out = mp->field; set_colorspace: dev->colorspace_out = mp->colorspace; dev->xfer_func_out = mp->xfer_func; dev->ycbcr_enc_out = mp->ycbcr_enc; dev->quantization_out = mp->quantization; struct vivid_dev *in_dev = vivid_output_is_connected_to(dev); if (in_dev) { vivid_send_source_change(in_dev, SVID); vivid_send_source_change(in_dev, HDMI); } return 0; } int vidioc_g_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_g_fmt_vid_out(file, priv, f); } int vidioc_try_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_try_fmt_vid_out(file, priv, f); } int vidioc_s_fmt_vid_out_mplane(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (!dev->multiplanar) return -ENOTTY; return vivid_s_fmt_vid_out(file, priv, f); } int vidioc_g_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_g_fmt_vid_out); } int vidioc_try_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_try_fmt_vid_out); } int vidioc_s_fmt_vid_out(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); if (dev->multiplanar) return -ENOTTY; return fmt_sp2mp_func(file, priv, f, vivid_s_fmt_vid_out); } int vivid_vid_out_g_selection(struct file *file, void *priv, struct v4l2_selection *sel) { struct vivid_dev *dev = video_drvdata(file); if (!dev->has_crop_out && !dev->has_compose_out) return -ENOTTY; if (sel->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; sel->r.left = sel->r.top = 0; switch (sel->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_out) return -EINVAL; sel->r = dev->crop_out; break; case V4L2_SEL_TGT_CROP_DEFAULT: if (!dev->has_crop_out) return -EINVAL; sel->r = dev->fmt_out_rect; break; case V4L2_SEL_TGT_CROP_BOUNDS: if (!dev->has_crop_out) return -EINVAL; sel->r = vivid_max_rect; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_out) return -EINVAL; sel->r = dev->compose_out; break; case V4L2_SEL_TGT_COMPOSE_DEFAULT: case V4L2_SEL_TGT_COMPOSE_BOUNDS: if (!dev->has_compose_out) return -EINVAL; sel->r = dev->sink_rect; break; default: return -EINVAL; } return 0; } int vivid_vid_out_s_selection(struct file *file, void *priv, struct v4l2_selection *s) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_rect *crop = &dev->crop_out; struct v4l2_rect *compose = &dev->compose_out; unsigned factor = V4L2_FIELD_HAS_T_OR_B(dev->field_out) ? 2 : 1; int ret; if (!dev->has_crop_out && !dev->has_compose_out) return -ENOTTY; if (s->type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; switch (s->target) { case V4L2_SEL_TGT_CROP: if (!dev->has_crop_out) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->fmt_out_rect); if (dev->has_scaler_out) { struct v4l2_rect max_rect = { 0, 0, dev->sink_rect.width * MAX_ZOOM, (dev->sink_rect.height / factor) * MAX_ZOOM }; v4l2_rect_set_max_size(&s->r, &max_rect); if (dev->has_compose_out) { struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, (s->r.height * factor) / MAX_ZOOM }; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, (s->r.height * factor) * MAX_ZOOM }; v4l2_rect_set_min_size(compose, &min_rect); v4l2_rect_set_max_size(compose, &max_rect); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); } } else if (dev->has_compose_out) { s->r.top *= factor; s->r.height *= factor; v4l2_rect_set_max_size(&s->r, &dev->sink_rect); v4l2_rect_set_size_to(compose, &s->r); v4l2_rect_map_inside(compose, &dev->compose_bounds_out); s->r.top /= factor; s->r.height /= factor; } else { v4l2_rect_set_size_to(&s->r, &dev->sink_rect); s->r.height /= factor; } v4l2_rect_map_inside(&s->r, &dev->fmt_out_rect); *crop = s->r; break; case V4L2_SEL_TGT_COMPOSE: if (!dev->has_compose_out) return -EINVAL; ret = vivid_vid_adjust_sel(s->flags, &s->r); if (ret) return ret; v4l2_rect_set_min_size(&s->r, &vivid_min_rect); v4l2_rect_set_max_size(&s->r, &dev->sink_rect); v4l2_rect_map_inside(&s->r, &dev->compose_bounds_out); s->r.top /= factor; s->r.height /= factor; if (dev->has_scaler_out) { struct v4l2_rect fmt = dev->fmt_out_rect; struct v4l2_rect max_rect = { 0, 0, s->r.width * MAX_ZOOM, s->r.height * MAX_ZOOM }; struct v4l2_rect min_rect = { 0, 0, s->r.width / MAX_ZOOM, s->r.height / MAX_ZOOM }; v4l2_rect_set_min_size(&fmt, &min_rect); if (!dev->has_crop_out) v4l2_rect_set_max_size(&fmt, &max_rect); if (!v4l2_rect_same_size(&dev->fmt_out_rect, &fmt) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; if (dev->has_crop_out) { v4l2_rect_set_min_size(crop, &min_rect); v4l2_rect_set_max_size(crop, &max_rect); } dev->fmt_out_rect = fmt; } else if (dev->has_crop_out) { struct v4l2_rect fmt = dev->fmt_out_rect; v4l2_rect_set_min_size(&fmt, &s->r); if (!v4l2_rect_same_size(&dev->fmt_out_rect, &fmt) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; dev->fmt_out_rect = fmt; v4l2_rect_set_size_to(crop, &s->r); v4l2_rect_map_inside(crop, &dev->fmt_out_rect); } else { if (!v4l2_rect_same_size(&s->r, &dev->fmt_out_rect) && vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; v4l2_rect_set_size_to(&dev->fmt_out_rect, &s->r); v4l2_rect_set_size_to(crop, &s->r); crop->height /= factor; v4l2_rect_map_inside(crop, &dev->fmt_out_rect); } s->r.top *= factor; s->r.height *= factor; *compose = s->r; break; default: return -EINVAL; } return 0; } int vivid_vid_out_g_pixelaspect(struct file *file, void *priv, int type, struct v4l2_fract *f) { struct vivid_dev *dev = video_drvdata(file); if (type != V4L2_BUF_TYPE_VIDEO_OUTPUT) return -EINVAL; switch (vivid_get_pixel_aspect(dev)) { case TPG_PIXEL_ASPECT_NTSC: f->numerator = 11; f->denominator = 10; break; case TPG_PIXEL_ASPECT_PAL: f->numerator = 54; f->denominator = 59; break; default: break; } return 0; } int vidioc_g_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_out; struct v4l2_window *win = &f->fmt.win; if (!dev->has_fb) return -EINVAL; win->w.top = dev->overlay_out_top; win->w.left = dev->overlay_out_left; win->w.width = compose->width; win->w.height = compose->height; win->field = V4L2_FIELD_ANY; win->chromakey = dev->chromakey_out; win->global_alpha = dev->global_alpha_out; return 0; } int vidioc_try_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); const struct v4l2_rect *compose = &dev->compose_out; struct v4l2_window *win = &f->fmt.win; if (!dev->has_fb) return -EINVAL; win->w.left = clamp_t(int, win->w.left, -dev->display_width, dev->display_width); win->w.top = clamp_t(int, win->w.top, -dev->display_height, dev->display_height); win->w.width = compose->width; win->w.height = compose->height; /* * It makes no sense for an OSD to overlay only top or bottom fields, * so always set this to ANY. */ win->field = V4L2_FIELD_ANY; return 0; } int vidioc_s_fmt_vid_out_overlay(struct file *file, void *priv, struct v4l2_format *f) { struct vivid_dev *dev = video_drvdata(file); struct v4l2_window *win = &f->fmt.win; int ret = vidioc_try_fmt_vid_out_overlay(file, priv, f); if (ret) return ret; dev->overlay_out_top = win->w.top; dev->overlay_out_left = win->w.left; dev->chromakey_out = win->chromakey; dev->global_alpha_out = win->global_alpha; return ret; } int vivid_vid_out_overlay(struct file *file, void *priv, unsigned i) { struct vivid_dev *dev = video_drvdata(file); if (i && !dev->fmt_out->can_do_overlay) { dprintk(dev, 1, "unsupported output format for output overlay\n"); return -EINVAL; } dev->overlay_out_enabled = i; return 0; } int vivid_vid_out_g_fbuf(struct file *file, void *priv, struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); a->capability = V4L2_FBUF_CAP_EXTERNOVERLAY | V4L2_FBUF_CAP_CHROMAKEY | V4L2_FBUF_CAP_SRC_CHROMAKEY | V4L2_FBUF_CAP_GLOBAL_ALPHA | V4L2_FBUF_CAP_LOCAL_ALPHA | V4L2_FBUF_CAP_LOCAL_INV_ALPHA; a->flags = V4L2_FBUF_FLAG_OVERLAY | dev->fbuf_out_flags; a->base = (void *)dev->video_pbase; a->fmt.width = dev->display_width; a->fmt.height = dev->display_height; if (vivid_fb_green_bits(dev) == 5) a->fmt.pixelformat = V4L2_PIX_FMT_ARGB555; else a->fmt.pixelformat = V4L2_PIX_FMT_RGB565; a->fmt.bytesperline = dev->display_byte_stride; a->fmt.sizeimage = a->fmt.height * a->fmt.bytesperline; a->fmt.field = V4L2_FIELD_NONE; a->fmt.colorspace = V4L2_COLORSPACE_SRGB; a->fmt.priv = 0; return 0; } int vivid_vid_out_s_fbuf(struct file *file, void *priv, const struct v4l2_framebuffer *a) { struct vivid_dev *dev = video_drvdata(file); const unsigned chroma_flags = V4L2_FBUF_FLAG_CHROMAKEY | V4L2_FBUF_FLAG_SRC_CHROMAKEY; const unsigned alpha_flags = V4L2_FBUF_FLAG_GLOBAL_ALPHA | V4L2_FBUF_FLAG_LOCAL_ALPHA | V4L2_FBUF_FLAG_LOCAL_INV_ALPHA; if ((a->flags & chroma_flags) == chroma_flags) return -EINVAL; switch (a->flags & alpha_flags) { case 0: case V4L2_FBUF_FLAG_GLOBAL_ALPHA: case V4L2_FBUF_FLAG_LOCAL_ALPHA: case V4L2_FBUF_FLAG_LOCAL_INV_ALPHA: break; default: return -EINVAL; } dev->fbuf_out_flags &= ~(chroma_flags | alpha_flags); dev->fbuf_out_flags |= a->flags & (chroma_flags | alpha_flags); return 0; } static const struct v4l2_audioout vivid_audio_outputs[] = { { 0, "Line-Out 1" }, { 1, "Line-Out 2" }, }; int vidioc_enum_output(struct file *file, void *priv, struct v4l2_output *out) { struct vivid_dev *dev = video_drvdata(file); if (out->index >= dev->num_outputs) return -EINVAL; out->type = V4L2_OUTPUT_TYPE_ANALOG; switch (dev->output_type[out->index]) { case SVID: snprintf(out->name, sizeof(out->name), "S-Video %03u-%u", dev->inst, dev->output_name_counter[out->index]); out->std = V4L2_STD_ALL; if (dev->has_audio_outputs) out->audioset = (1 << ARRAY_SIZE(vivid_audio_outputs)) - 1; out->capabilities = V4L2_OUT_CAP_STD; break; case HDMI: snprintf(out->name, sizeof(out->name), "HDMI %03u-%u", dev->inst, dev->output_name_counter[out->index]); out->capabilities = V4L2_OUT_CAP_DV_TIMINGS; break; } return 0; } int vidioc_g_output(struct file *file, void *priv, unsigned *o) { struct vivid_dev *dev = video_drvdata(file); *o = dev->output; return 0; } int vidioc_s_output(struct file *file, void *priv, unsigned o) { struct vivid_dev *dev = video_drvdata(file); if (o >= dev->num_outputs) return -EINVAL; if (o == dev->output) return 0; if (vb2_is_busy(&dev->vb_vid_out_q) || vb2_is_busy(&dev->vb_vbi_out_q) || vb2_is_busy(&dev->vb_meta_out_q)) return -EBUSY; dev->output = o; dev->tv_audio_output = 0; if (dev->output_type[o] == SVID) dev->vid_out_dev.tvnorms = V4L2_STD_ALL; else dev->vid_out_dev.tvnorms = 0; dev->vbi_out_dev.tvnorms = dev->vid_out_dev.tvnorms; dev->meta_out_dev.tvnorms = dev->vid_out_dev.tvnorms; vivid_update_format_out(dev); return 0; } int vidioc_enumaudout(struct file *file, void *priv, struct v4l2_audioout *vout) { if (vout->index >= ARRAY_SIZE(vivid_audio_outputs)) return -EINVAL; *vout = vivid_audio_outputs[vout->index]; return 0; } int vidioc_g_audout(struct file *file, void *priv, struct v4l2_audioout *vout) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -EINVAL; *vout = vivid_audio_outputs[dev->tv_audio_output]; return 0; } int vidioc_s_audout(struct file *file, void *priv, const struct v4l2_audioout *vout) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -EINVAL; if (vout->index >= ARRAY_SIZE(vivid_audio_outputs)) return -EINVAL; dev->tv_audio_output = vout->index; return 0; } int vivid_vid_out_s_std(struct file *file, void *priv, v4l2_std_id id) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_svid_out(dev)) return -ENODATA; if (dev->std_out == id) return 0; if (vb2_is_busy(&dev->vb_vid_out_q) || vb2_is_busy(&dev->vb_vbi_out_q)) return -EBUSY; dev->std_out = id; vivid_update_format_out(dev); return 0; } static bool valid_cvt_gtf_timings(struct v4l2_dv_timings *timings) { struct v4l2_bt_timings *bt = &timings->bt; if ((bt->standards & (V4L2_DV_BT_STD_CVT | V4L2_DV_BT_STD_GTF)) && v4l2_valid_dv_timings(timings, &vivid_dv_timings_cap, NULL, NULL)) return true; return false; } int vivid_vid_out_s_dv_timings(struct file *file, void *priv, struct v4l2_dv_timings *timings) { struct vivid_dev *dev = video_drvdata(file); if (!vivid_is_hdmi_out(dev)) return -ENODATA; if (!v4l2_find_dv_timings_cap(timings, &vivid_dv_timings_cap, 0, NULL, NULL) && !valid_cvt_gtf_timings(timings)) return -EINVAL; if (v4l2_match_dv_timings(timings, &dev->dv_timings_out, 0, true)) return 0; if (vb2_is_busy(&dev->vb_vid_out_q)) return -EBUSY; dev->dv_timings_out = *timings; vivid_update_format_out(dev); return 0; } int vivid_vid_out_g_parm(struct file *file, void *priv, struct v4l2_streamparm *parm) { struct vivid_dev *dev = video_drvdata(file); if (parm->type != (dev->multiplanar ? V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE : V4L2_BUF_TYPE_VIDEO_OUTPUT)) return -EINVAL; parm->parm.output.capability = V4L2_CAP_TIMEPERFRAME; parm->parm.output.timeperframe = dev->timeperframe_vid_out; parm->parm.output.writebuffers = 1; return 0; } int vidioc_subscribe_event(struct v4l2_fh *fh, const struct v4l2_event_subscription *sub) { switch (sub->type) { case V4L2_EVENT_SOURCE_CHANGE: if (fh->vdev->vfl_dir == VFL_DIR_RX) return v4l2_src_change_event_subscribe(fh, sub); break; default: return v4l2_ctrl_subscribe_event(fh, sub); } return -EINVAL; }
4 2 2 1 5 4 4 3 3 2 3 7 6 5 5 3 2 7 2 1 1 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 // SPDX-License-Identifier: GPL-2.0 #include <linux/kernel.h> #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> #include <linux/eventfd.h> #include <linux/eventpoll.h> #include <linux/io_uring.h> #include <linux/io_uring_types.h> #include "io-wq.h" #include "eventfd.h" struct io_ev_fd { struct eventfd_ctx *cq_ev_fd; unsigned int eventfd_async; /* protected by ->completion_lock */ unsigned last_cq_tail; refcount_t refs; atomic_t ops; struct rcu_head rcu; }; enum { IO_EVENTFD_OP_SIGNAL_BIT, }; static void io_eventfd_free(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); eventfd_ctx_put(ev_fd->cq_ev_fd); kfree(ev_fd); } static void io_eventfd_put(struct io_ev_fd *ev_fd) { if (refcount_dec_and_test(&ev_fd->refs)) call_rcu(&ev_fd->rcu, io_eventfd_free); } static void io_eventfd_do_signal(struct rcu_head *rcu) { struct io_ev_fd *ev_fd = container_of(rcu, struct io_ev_fd, rcu); eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); io_eventfd_put(ev_fd); } /* * Returns true if the caller should put the ev_fd reference, false if not. */ static bool __io_eventfd_signal(struct io_ev_fd *ev_fd) { if (eventfd_signal_allowed()) { eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE); return true; } if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops)) { call_rcu_hurry(&ev_fd->rcu, io_eventfd_do_signal); return false; } return true; } /* * Trigger if eventfd_async isn't set, or if it's set and the caller is * an async worker. */ static bool io_eventfd_trigger(struct io_ev_fd *ev_fd) { return !ev_fd->eventfd_async || io_wq_current_is_worker(); } void io_eventfd_signal(struct io_ring_ctx *ctx, bool cqe_event) { bool skip = false; struct io_ev_fd *ev_fd; if (READ_ONCE(ctx->rings->cq_flags) & IORING_CQ_EVENTFD_DISABLED) return; guard(rcu)(); ev_fd = rcu_dereference(ctx->io_ev_fd); /* * Check again if ev_fd exists in case an io_eventfd_unregister call * completed between the NULL check of ctx->io_ev_fd at the start of * the function and rcu_read_lock. */ if (!ev_fd) return; if (!io_eventfd_trigger(ev_fd) || !refcount_inc_not_zero(&ev_fd->refs)) return; if (cqe_event) { /* * Eventfd should only get triggered when at least one event * has been posted. Some applications rely on the eventfd * notification count only changing IFF a new CQE has been * added to the CQ ring. There's no dependency on 1:1 * relationship between how many times this function is called * (and hence the eventfd count) and number of CQEs posted to * the CQ ring. */ spin_lock(&ctx->completion_lock); skip = ctx->cached_cq_tail == ev_fd->last_cq_tail; ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); } if (skip || __io_eventfd_signal(ev_fd)) io_eventfd_put(ev_fd); } int io_eventfd_register(struct io_ring_ctx *ctx, void __user *arg, unsigned int eventfd_async) { struct io_ev_fd *ev_fd; __s32 __user *fds = arg; int fd; ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) return -EBUSY; if (copy_from_user(&fd, fds, sizeof(*fds))) return -EFAULT; ev_fd = kmalloc(sizeof(*ev_fd), GFP_KERNEL); if (!ev_fd) return -ENOMEM; ev_fd->cq_ev_fd = eventfd_ctx_fdget(fd); if (IS_ERR(ev_fd->cq_ev_fd)) { int ret = PTR_ERR(ev_fd->cq_ev_fd); kfree(ev_fd); return ret; } spin_lock(&ctx->completion_lock); ev_fd->last_cq_tail = ctx->cached_cq_tail; spin_unlock(&ctx->completion_lock); ev_fd->eventfd_async = eventfd_async; ctx->has_evfd = true; refcount_set(&ev_fd->refs, 1); atomic_set(&ev_fd->ops, 0); rcu_assign_pointer(ctx->io_ev_fd, ev_fd); return 0; } int io_eventfd_unregister(struct io_ring_ctx *ctx) { struct io_ev_fd *ev_fd; ev_fd = rcu_dereference_protected(ctx->io_ev_fd, lockdep_is_held(&ctx->uring_lock)); if (ev_fd) { ctx->has_evfd = false; rcu_assign_pointer(ctx->io_ev_fd, NULL); io_eventfd_put(ev_fd); return 0; } return -ENXIO; }
178 176 179 179 179 102 102 102 1 102 24 24 82 101 102 102 102 1 141 141 141 16 16 141 141 141 73 73 73 13 73 84 78 77 197 90 15 15 25 1 24 31 31 29 1 28 8 36 25 50 30 16 26 44 40 40 79 36 19 197 11 3 2 11 11 2 9 1 1 8 11 129 129 129 1 268 269 269 28 69 23 69 4 69 3 69 3 69 44 69 73 1 1 1 5 4 4 1 4 1 4 1 4 4 1 4 2 4 4 4 1 4 2 4 2 2 4 4 1 4 9 9 9 9 7 9 6 1 7 6 9 9 3 2 3 3 1 4 176 187 185 2 1 189 14 189 188 1 188 187 1 187 185 1 185 186 184 2 184 123 61 60 1 182 183 182 2 180 1 180 179 28 27 1 178 2 177 1 176 176 176 94 176 176 4 176 1 175 175 175 14 14 14 14 14 190 165 18 190 1 189 188 189 189 175 2 172 171 169 172 172 172 1 172 4 171 1 171 3 168 6 3 160 6 157 5 151 153 150 2 148 149 1 148 1 147 1 146 1 145 145 145 145 1 144 144 144 144 1 143 142 142 141 134 140 134 141 143 2 141 141 141 141 141 139 10 129 13 116 12 104 13 131 2 129 129 129 129 8 61 190 71 71 71 71 71 26 8 26 25 5 3 4 21 190 206 7 206 23 206 206 197 197 197 206 206 66 62 66 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 // SPDX-License-Identifier: GPL-2.0 /* * * Copyright (C) 2019-2021 Paragon Software GmbH, All rights reserved. * * * terminology * * cluster - allocation unit - 512,1K,2K,4K,...,2M * vcn - virtual cluster number - Offset inside the file in clusters. * vbo - virtual byte offset - Offset inside the file in bytes. * lcn - logical cluster number - 0 based cluster in clusters heap. * lbo - logical byte offset - Absolute position inside volume. * run - maps VCN to LCN - Stored in attributes in packed form. * attr - attribute segment - std/name/data etc records inside MFT. * mi - MFT inode - One MFT record(usually 1024 bytes or 4K), consists of attributes. * ni - NTFS inode - Extends linux inode. consists of one or more mft inodes. * index - unit inside directory - 2K, 4K, <=page size, does not depend on cluster size. * * WSL - Windows Subsystem for Linux * https://docs.microsoft.com/en-us/windows/wsl/file-permissions * It stores uid/gid/mode/dev in xattr * * ntfs allows up to 2^64 clusters per volume. * It means you should use 64 bits lcn to operate with ntfs. * Implementation of ntfs.sys uses only 32 bits lcn. * Default ntfs3 uses 32 bits lcn too. * ntfs3 built with CONFIG_NTFS3_64BIT_CLUSTER (ntfs3_64) uses 64 bits per lcn. * * * ntfs limits, cluster size is 4K (2^12) * ----------------------------------------------------------------------------- * | Volume size | Clusters | ntfs.sys | ntfs3 | ntfs3_64 | mkntfs | chkdsk | * ----------------------------------------------------------------------------- * | < 16T, 2^44 | < 2^32 | yes | yes | yes | yes | yes | * | > 16T, 2^44 | > 2^32 | no | no | yes | yes | yes | * ----------------------------------------------------------|------------------ * * To mount large volumes as ntfs one should use large cluster size (up to 2M) * The maximum volume size in this case is 2^32 * 2^21 = 2^53 = 8P * * ntfs limits, cluster size is 2M (2^21) * ----------------------------------------------------------------------------- * | < 8P, 2^53 | < 2^32 | yes | yes | yes | yes | yes | * | > 8P, 2^53 | > 2^32 | no | no | yes | yes | yes | * ----------------------------------------------------------|------------------ * */ #include <linux/blkdev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/log2.h> #include <linux/minmax.h> #include <linux/module.h> #include <linux/nls.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/statfs.h> #include "debug.h" #include "ntfs.h" #include "ntfs_fs.h" #ifdef CONFIG_NTFS3_LZX_XPRESS #include "lib/lib.h" #endif #ifdef CONFIG_PRINTK /* * ntfs_printk - Trace warnings/notices/errors. * * Thanks Joe Perches <joe@perches.com> for implementation */ void ntfs_printk(const struct super_block *sb, const char *fmt, ...) { struct va_format vaf; va_list args; int level; struct ntfs_sb_info *sbi = sb->s_fs_info; /* Should we use different ratelimits for warnings/notices/errors? */ if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3")) return; va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; printk("%c%cntfs3(%s): %pV\n", KERN_SOH_ASCII, level, sb->s_id, &vaf); va_end(args); } static char s_name_buf[512]; static atomic_t s_name_buf_cnt = ATOMIC_INIT(1); // 1 means 'free s_name_buf'. /* * ntfs_inode_printk * * Print warnings/notices/errors about inode using name or inode number. */ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...) { struct super_block *sb = inode->i_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; char *name; va_list args; struct va_format vaf; int level; if (!___ratelimit(&sbi->msg_ratelimit, "ntfs3")) return; /* Use static allocated buffer, if possible. */ name = atomic_dec_and_test(&s_name_buf_cnt) ? s_name_buf : kmalloc(sizeof(s_name_buf), GFP_NOFS); if (name) { struct dentry *de = d_find_alias(inode); if (de) { int len; spin_lock(&de->d_lock); len = snprintf(name, sizeof(s_name_buf), " \"%s\"", de->d_name.name); spin_unlock(&de->d_lock); if (len <= 0) name[0] = 0; else if (len >= sizeof(s_name_buf)) name[sizeof(s_name_buf) - 1] = 0; } else { name[0] = 0; } dput(de); /* Cocci warns if placed in branch "if (de)" */ } va_start(args, fmt); level = printk_get_level(fmt); vaf.fmt = printk_skip_level(fmt); vaf.va = &args; printk("%c%cntfs3(%s): ino=%lx,%s %pV\n", KERN_SOH_ASCII, level, sb->s_id, inode->i_ino, name ? name : "", &vaf); va_end(args); atomic_inc(&s_name_buf_cnt); if (name != s_name_buf) kfree(name); } #endif /* * Shared memory struct. * * On-disk ntfs's upcase table is created by ntfs formatter. * 'upcase' table is 128K bytes of memory. * We should read it into memory when mounting. * Several ntfs volumes likely use the same 'upcase' table. * It is good idea to share in-memory 'upcase' table between different volumes. * Unfortunately winxp/vista/win7 use different upcase tables. */ static DEFINE_SPINLOCK(s_shared_lock); static struct { void *ptr; u32 len; int cnt; } s_shared[8]; /* * ntfs_set_shared * * Return: * * @ptr - If pointer was saved in shared memory. * * NULL - If pointer was not shared. */ void *ntfs_set_shared(void *ptr, u32 bytes) { void *ret = NULL; int i, j = -1; spin_lock(&s_shared_lock); for (i = 0; i < ARRAY_SIZE(s_shared); i++) { if (!s_shared[i].cnt) { j = i; } else if (bytes == s_shared[i].len && !memcmp(s_shared[i].ptr, ptr, bytes)) { s_shared[i].cnt += 1; ret = s_shared[i].ptr; break; } } if (!ret && j != -1) { s_shared[j].ptr = ptr; s_shared[j].len = bytes; s_shared[j].cnt = 1; ret = ptr; } spin_unlock(&s_shared_lock); return ret; } /* * ntfs_put_shared * * Return: * * @ptr - If pointer is not shared anymore. * * NULL - If pointer is still shared. */ void *ntfs_put_shared(void *ptr) { void *ret = ptr; int i; spin_lock(&s_shared_lock); for (i = 0; i < ARRAY_SIZE(s_shared); i++) { if (s_shared[i].cnt && s_shared[i].ptr == ptr) { if (--s_shared[i].cnt) ret = NULL; break; } } spin_unlock(&s_shared_lock); return ret; } static inline void put_mount_options(struct ntfs_mount_options *options) { kfree(options->nls_name); unload_nls(options->nls); kfree(options); } enum Opt { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_immutable, Opt_discard, Opt_force, Opt_sparse, Opt_nohidden, Opt_hide_dot_files, Opt_windows_names, Opt_showmeta, Opt_acl, Opt_iocharset, Opt_prealloc, Opt_nocase, Opt_err, }; // clang-format off static const struct fs_parameter_spec ntfs_fs_parameters[] = { fsparam_uid("uid", Opt_uid), fsparam_gid("gid", Opt_gid), fsparam_u32oct("umask", Opt_umask), fsparam_u32oct("dmask", Opt_dmask), fsparam_u32oct("fmask", Opt_fmask), fsparam_flag("sys_immutable", Opt_immutable), fsparam_flag("discard", Opt_discard), fsparam_flag("force", Opt_force), fsparam_flag("sparse", Opt_sparse), fsparam_flag("nohidden", Opt_nohidden), fsparam_flag("hide_dot_files", Opt_hide_dot_files), fsparam_flag("windows_names", Opt_windows_names), fsparam_flag("showmeta", Opt_showmeta), fsparam_flag("acl", Opt_acl), fsparam_string("iocharset", Opt_iocharset), fsparam_flag("prealloc", Opt_prealloc), fsparam_flag("nocase", Opt_nocase), {} }; // clang-format on /* * Load nls table or if @nls is utf8 then return NULL. * * It is good idea to use here "const char *nls". * But load_nls accepts "char*". */ static struct nls_table *ntfs_load_nls(char *nls) { struct nls_table *ret; if (!nls) nls = CONFIG_NLS_DEFAULT; if (strcmp(nls, "utf8") == 0) return NULL; if (strcmp(nls, CONFIG_NLS_DEFAULT) == 0) return load_nls_default(); ret = load_nls(nls); if (ret) return ret; return ERR_PTR(-EINVAL); } static int ntfs_fs_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct ntfs_mount_options *opts = fc->fs_private; struct fs_parse_result result; int opt; opt = fs_parse(fc, ntfs_fs_parameters, param, &result); if (opt < 0) return opt; switch (opt) { case Opt_uid: opts->fs_uid = result.uid; break; case Opt_gid: opts->fs_gid = result.gid; break; case Opt_umask: if (result.uint_32 & ~07777) return invalf(fc, "ntfs3: Invalid value for umask."); opts->fs_fmask_inv = ~result.uint_32; opts->fs_dmask_inv = ~result.uint_32; opts->fmask = 1; opts->dmask = 1; break; case Opt_dmask: if (result.uint_32 & ~07777) return invalf(fc, "ntfs3: Invalid value for dmask."); opts->fs_dmask_inv = ~result.uint_32; opts->dmask = 1; break; case Opt_fmask: if (result.uint_32 & ~07777) return invalf(fc, "ntfs3: Invalid value for fmask."); opts->fs_fmask_inv = ~result.uint_32; opts->fmask = 1; break; case Opt_immutable: opts->sys_immutable = 1; break; case Opt_discard: opts->discard = 1; break; case Opt_force: opts->force = 1; break; case Opt_sparse: opts->sparse = 1; break; case Opt_nohidden: opts->nohidden = 1; break; case Opt_hide_dot_files: opts->hide_dot_files = 1; break; case Opt_windows_names: opts->windows_names = 1; break; case Opt_showmeta: opts->showmeta = 1; break; case Opt_acl: if (!result.negated) #ifdef CONFIG_NTFS3_FS_POSIX_ACL fc->sb_flags |= SB_POSIXACL; #else return invalf( fc, "ntfs3: Support for ACL not compiled in!"); #endif else fc->sb_flags &= ~SB_POSIXACL; break; case Opt_iocharset: kfree(opts->nls_name); opts->nls_name = param->string; param->string = NULL; break; case Opt_prealloc: opts->prealloc = 1; break; case Opt_nocase: opts->nocase = 1; break; default: /* Should not be here unless we forget add case. */ return -EINVAL; } return 0; } static int ntfs_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_mount_options *new_opts = fc->fs_private; int ro_rw; /* If ntfs3 is used as legacy ntfs enforce read-only mode. */ if (is_legacy_ntfs(sb)) { fc->sb_flags |= SB_RDONLY; goto out; } ro_rw = sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY); if (ro_rw && (sbi->flags & NTFS_FLAGS_NEED_REPLAY)) { errorf(fc, "ntfs3: Couldn't remount rw because journal is not replayed. Please umount/remount instead\n"); return -EINVAL; } new_opts->nls = ntfs_load_nls(new_opts->nls_name); if (IS_ERR(new_opts->nls)) { new_opts->nls = NULL; errorf(fc, "ntfs3: Cannot load iocharset %s", new_opts->nls_name); return -EINVAL; } if (new_opts->nls != sbi->options->nls) return invalf( fc, "ntfs3: Cannot use different iocharset when remounting!"); if (ro_rw && (sbi->volume.flags & VOLUME_FLAG_DIRTY) && !new_opts->force) { errorf(fc, "ntfs3: Volume is dirty and \"force\" flag is not set!"); return -EINVAL; } out: sync_filesystem(sb); swap(sbi->options, fc->fs_private); return 0; } #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_info_root; /* * ntfs3_volinfo: * * The content of /proc/fs/ntfs3/<dev>/volinfo * * ntfs3.1 * cluster size * number of clusters * total number of mft records * number of used mft records ~= number of files + folders * real state of ntfs "dirty"/"clean" * current state of ntfs "dirty"/"clean" */ static int ntfs3_volinfo(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; seq_printf(m, "ntfs%d.%d\n%u\n%zu\n%zu\n%zu\n%s\n%s\n", sbi->volume.major_ver, sbi->volume.minor_ver, sbi->cluster_size, sbi->used.bitmap.nbits, sbi->mft.bitmap.nbits, sbi->mft.bitmap.nbits - wnd_zeroes(&sbi->mft.bitmap), sbi->volume.real_dirty ? "dirty" : "clean", (sbi->volume.flags & VOLUME_FLAG_DIRTY) ? "dirty" : "clean"); return 0; } static int ntfs3_volinfo_open(struct inode *inode, struct file *file) { return single_open(file, ntfs3_volinfo, pde_data(inode)); } /* read /proc/fs/ntfs3/<dev>/label */ static int ntfs3_label_show(struct seq_file *m, void *o) { struct super_block *sb = m->private; struct ntfs_sb_info *sbi = sb->s_fs_info; seq_printf(m, "%s\n", sbi->volume.label); return 0; } /* write /proc/fs/ntfs3/<dev>/label */ static ssize_t ntfs3_label_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { int err; struct super_block *sb = pde_data(file_inode(file)); ssize_t ret = count; u8 *label; if (sb_rdonly(sb)) return -EROFS; label = kmalloc(count, GFP_NOFS); if (!label) return -ENOMEM; if (copy_from_user(label, buffer, ret)) { ret = -EFAULT; goto out; } while (ret > 0 && label[ret - 1] == '\n') ret -= 1; err = ntfs_set_label(sb->s_fs_info, label, ret); if (err < 0) { ntfs_err(sb, "failed (%d) to write label", err); ret = err; goto out; } *ppos += count; ret = count; out: kfree(label); return ret; } static int ntfs3_label_open(struct inode *inode, struct file *file) { return single_open(file, ntfs3_label_show, pde_data(inode)); } static const struct proc_ops ntfs3_volinfo_fops = { .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_open = ntfs3_volinfo_open, }; static const struct proc_ops ntfs3_label_fops = { .proc_read = seq_read, .proc_lseek = seq_lseek, .proc_release = single_release, .proc_open = ntfs3_label_open, .proc_write = ntfs3_label_write, }; static void ntfs_create_procdir(struct super_block *sb) { struct proc_dir_entry *e; if (!proc_info_root) return; e = proc_mkdir(sb->s_id, proc_info_root); if (e) { struct ntfs_sb_info *sbi = sb->s_fs_info; proc_create_data("volinfo", 0444, e, &ntfs3_volinfo_fops, sb); proc_create_data("label", 0644, e, &ntfs3_label_fops, sb); sbi->procdir = e; } } static void ntfs_remove_procdir(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; if (!sbi->procdir) return; remove_proc_entry("label", sbi->procdir); remove_proc_entry("volinfo", sbi->procdir); remove_proc_entry(sb->s_id, proc_info_root); sbi->procdir = NULL; } static void ntfs_create_proc_root(void) { proc_info_root = proc_mkdir("fs/ntfs3", NULL); } static void ntfs_remove_proc_root(void) { if (proc_info_root) { remove_proc_entry("fs/ntfs3", NULL); proc_info_root = NULL; } } #else static void ntfs_create_procdir(struct super_block *sb) {} static void ntfs_remove_procdir(struct super_block *sb) {} static void ntfs_create_proc_root(void) {} static void ntfs_remove_proc_root(void) {} #endif static struct kmem_cache *ntfs_inode_cachep; static struct inode *ntfs_alloc_inode(struct super_block *sb) { struct ntfs_inode *ni = alloc_inode_sb(sb, ntfs_inode_cachep, GFP_NOFS); if (!ni) return NULL; memset(ni, 0, offsetof(struct ntfs_inode, vfs_inode)); mutex_init(&ni->ni_lock); return &ni->vfs_inode; } static void ntfs_free_inode(struct inode *inode) { struct ntfs_inode *ni = ntfs_i(inode); mutex_destroy(&ni->ni_lock); kmem_cache_free(ntfs_inode_cachep, ni); } static void init_once(void *foo) { struct ntfs_inode *ni = foo; inode_init_once(&ni->vfs_inode); } /* * Noinline to reduce binary size. */ static noinline void ntfs3_put_sbi(struct ntfs_sb_info *sbi) { wnd_close(&sbi->mft.bitmap); wnd_close(&sbi->used.bitmap); if (sbi->mft.ni) { iput(&sbi->mft.ni->vfs_inode); sbi->mft.ni = NULL; } if (sbi->security.ni) { iput(&sbi->security.ni->vfs_inode); sbi->security.ni = NULL; } if (sbi->reparse.ni) { iput(&sbi->reparse.ni->vfs_inode); sbi->reparse.ni = NULL; } if (sbi->objid.ni) { iput(&sbi->objid.ni->vfs_inode); sbi->objid.ni = NULL; } if (sbi->volume.ni) { iput(&sbi->volume.ni->vfs_inode); sbi->volume.ni = NULL; } ntfs_update_mftmirr(sbi, 0); indx_clear(&sbi->security.index_sii); indx_clear(&sbi->security.index_sdh); indx_clear(&sbi->reparse.index_r); indx_clear(&sbi->objid.index_o); } static void ntfs3_free_sbi(struct ntfs_sb_info *sbi) { kfree(sbi->new_rec); kvfree(ntfs_put_shared(sbi->upcase)); kvfree(sbi->def_table); kfree(sbi->compress.lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS xpress_free_decompressor(sbi->compress.xpress); lzx_free_decompressor(sbi->compress.lzx); #endif kfree(sbi); } static void ntfs_put_super(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; ntfs_remove_procdir(sb); /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); ntfs3_put_sbi(sbi); } static int ntfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct wnd_bitmap *wnd = &sbi->used.bitmap; buf->f_type = sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = wnd->nbits; buf->f_bfree = buf->f_bavail = wnd_zeroes(wnd); buf->f_fsid.val[0] = sbi->volume.ser_num; buf->f_fsid.val[1] = (sbi->volume.ser_num >> 32); buf->f_namelen = NTFS_NAME_LEN; return 0; } static int ntfs_show_options(struct seq_file *m, struct dentry *root) { struct super_block *sb = root->d_sb; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_mount_options *opts = sbi->options; struct user_namespace *user_ns = seq_user_ns(m); seq_printf(m, ",uid=%u", from_kuid_munged(user_ns, opts->fs_uid)); seq_printf(m, ",gid=%u", from_kgid_munged(user_ns, opts->fs_gid)); if (opts->dmask) seq_printf(m, ",dmask=%04o", opts->fs_dmask_inv ^ 0xffff); if (opts->fmask) seq_printf(m, ",fmask=%04o", opts->fs_fmask_inv ^ 0xffff); if (opts->sys_immutable) seq_puts(m, ",sys_immutable"); if (opts->discard) seq_puts(m, ",discard"); if (opts->force) seq_puts(m, ",force"); if (opts->sparse) seq_puts(m, ",sparse"); if (opts->nohidden) seq_puts(m, ",nohidden"); if (opts->hide_dot_files) seq_puts(m, ",hide_dot_files"); if (opts->windows_names) seq_puts(m, ",windows_names"); if (opts->showmeta) seq_puts(m, ",showmeta"); if (sb->s_flags & SB_POSIXACL) seq_puts(m, ",acl"); if (opts->nls) seq_printf(m, ",iocharset=%s", opts->nls->charset); else seq_puts(m, ",iocharset=utf8"); if (opts->prealloc) seq_puts(m, ",prealloc"); if (opts->nocase) seq_puts(m, ",nocase"); return 0; } /* * ntfs_shutdown - super_operations::shutdown */ static void ntfs_shutdown(struct super_block *sb) { set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags); } /* * ntfs_sync_fs - super_operations::sync_fs */ static int ntfs_sync_fs(struct super_block *sb, int wait) { int err = 0, err2; struct ntfs_sb_info *sbi = sb->s_fs_info; struct ntfs_inode *ni; struct inode *inode; if (unlikely(ntfs3_forced_shutdown(sb))) return -EIO; ni = sbi->security.ni; if (ni) { inode = &ni->vfs_inode; err2 = _ni_write_inode(inode, wait); if (err2 && !err) err = err2; } ni = sbi->objid.ni; if (ni) { inode = &ni->vfs_inode; err2 = _ni_write_inode(inode, wait); if (err2 && !err) err = err2; } ni = sbi->reparse.ni; if (ni) { inode = &ni->vfs_inode; err2 = _ni_write_inode(inode, wait); if (err2 && !err) err = err2; } if (!err) ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); ntfs_update_mftmirr(sbi, wait); return err; } static const struct super_operations ntfs_sops = { .alloc_inode = ntfs_alloc_inode, .free_inode = ntfs_free_inode, .evict_inode = ntfs_evict_inode, .put_super = ntfs_put_super, .statfs = ntfs_statfs, .show_options = ntfs_show_options, .shutdown = ntfs_shutdown, .sync_fs = ntfs_sync_fs, .write_inode = ntfs3_write_inode, }; static struct inode *ntfs_export_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct MFT_REF ref; struct inode *inode; ref.low = cpu_to_le32(ino); #ifdef CONFIG_NTFS3_64BIT_CLUSTER ref.high = cpu_to_le16(ino >> 32); #else ref.high = 0; #endif ref.seq = cpu_to_le16(generation); inode = ntfs_iget5(sb, &ref, NULL); if (!IS_ERR(inode) && is_bad_inode(inode)) { iput(inode); inode = ERR_PTR(-ESTALE); } return inode; } static struct dentry *ntfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ntfs_export_get_inode); } static struct dentry *ntfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { return generic_fh_to_parent(sb, fid, fh_len, fh_type, ntfs_export_get_inode); } /* TODO: == ntfs_sync_inode */ static int ntfs_nfs_commit_metadata(struct inode *inode) { return _ni_write_inode(inode, 1); } static const struct export_operations ntfs_export_ops = { .encode_fh = generic_encode_ino32_fh, .fh_to_dentry = ntfs_fh_to_dentry, .fh_to_parent = ntfs_fh_to_parent, .get_parent = ntfs3_get_parent, .commit_metadata = ntfs_nfs_commit_metadata, }; /* * format_size_gb - Return Gb,Mb to print with "%u.%02u Gb". */ static u32 format_size_gb(const u64 bytes, u32 *mb) { /* Do simple right 30 bit shift of 64 bit value. */ u64 kbytes = bytes >> 10; u32 kbytes32 = kbytes; *mb = (100 * (kbytes32 & 0xfffff) + 0x7ffff) >> 20; if (*mb >= 100) *mb = 99; return (kbytes32 >> 20) | (((u32)(kbytes >> 32)) << 12); } static u32 true_sectors_per_clst(const struct NTFS_BOOT *boot) { if (boot->sectors_per_clusters <= 0x80) return boot->sectors_per_clusters; if (boot->sectors_per_clusters >= 0xf4) /* limit shift to 2MB max */ return 1U << (-(s8)boot->sectors_per_clusters); return -EINVAL; } /* * ntfs_init_from_boot - Init internal info from on-disk boot sector. * * NTFS mount begins from boot - special formatted 512 bytes. * There are two boots: the first and the last 512 bytes of volume. * The content of boot is not changed during ntfs life. * * NOTE: ntfs.sys checks only first (primary) boot. * chkdsk checks both boots. */ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size, u64 dev_size, struct NTFS_BOOT **boot2) { struct ntfs_sb_info *sbi = sb->s_fs_info; int err; u32 mb, gb, boot_sector_size, sct_per_clst, record_size; u64 sectors, clusters, mlcn, mlcn2, dev_size0; struct NTFS_BOOT *boot; struct buffer_head *bh; struct MFT_REC *rec; u16 fn, ao; u8 cluster_bits; u32 boot_off = 0; sector_t boot_block = 0; const char *hint = "Primary boot"; /* Save original dev_size. Used with alternative boot. */ dev_size0 = dev_size; sbi->volume.blocks = dev_size >> PAGE_SHIFT; read_boot: bh = ntfs_bread(sb, boot_block); if (!bh) return boot_block ? -EINVAL : -EIO; err = -EINVAL; /* Corrupted image; do not read OOB */ if (bh->b_size - sizeof(*boot) < boot_off) goto out; boot = (struct NTFS_BOOT *)Add2Ptr(bh->b_data, boot_off); if (memcmp(boot->system_id, "NTFS ", sizeof("NTFS ") - 1)) { ntfs_err(sb, "%s signature is not NTFS.", hint); goto out; } /* 0x55AA is not mandaroty. Thanks Maxim Suhanov*/ /*if (0x55 != boot->boot_magic[0] || 0xAA != boot->boot_magic[1]) * goto out; */ boot_sector_size = ((u32)boot->bytes_per_sector[1] << 8) | boot->bytes_per_sector[0]; if (boot_sector_size < SECTOR_SIZE || !is_power_of_2(boot_sector_size)) { ntfs_err(sb, "%s: invalid bytes per sector %u.", hint, boot_sector_size); goto out; } /* cluster size: 512, 1K, 2K, 4K, ... 2M */ sct_per_clst = true_sectors_per_clst(boot); if ((int)sct_per_clst < 0 || !is_power_of_2(sct_per_clst)) { ntfs_err(sb, "%s: invalid sectors per cluster %u.", hint, sct_per_clst); goto out; } sbi->cluster_size = boot_sector_size * sct_per_clst; sbi->cluster_bits = cluster_bits = blksize_bits(sbi->cluster_size); sbi->cluster_mask = sbi->cluster_size - 1; sbi->cluster_mask_inv = ~(u64)sbi->cluster_mask; mlcn = le64_to_cpu(boot->mft_clst); mlcn2 = le64_to_cpu(boot->mft2_clst); sectors = le64_to_cpu(boot->sectors_per_volume); if (mlcn * sct_per_clst >= sectors || mlcn2 * sct_per_clst >= sectors) { ntfs_err( sb, "%s: start of MFT 0x%llx (0x%llx) is out of volume 0x%llx.", hint, mlcn, mlcn2, sectors); goto out; } if (boot->record_size >= 0) { record_size = (u32)boot->record_size << cluster_bits; } else if (-boot->record_size <= MAXIMUM_SHIFT_BYTES_PER_MFT) { record_size = 1u << (-boot->record_size); } else { ntfs_err(sb, "%s: invalid record size %d.", hint, boot->record_size); goto out; } sbi->record_size = record_size; sbi->record_bits = blksize_bits(record_size); sbi->attr_size_tr = (5 * record_size >> 4); // ~320 bytes /* Check MFT record size. */ if (record_size < SECTOR_SIZE || !is_power_of_2(record_size)) { ntfs_err(sb, "%s: invalid bytes per MFT record %u (%d).", hint, record_size, boot->record_size); goto out; } if (record_size > MAXIMUM_BYTES_PER_MFT) { ntfs_err(sb, "Unsupported bytes per MFT record %u.", record_size); goto out; } if (boot->index_size >= 0) { sbi->index_size = (u32)boot->index_size << cluster_bits; } else if (-boot->index_size <= MAXIMUM_SHIFT_BYTES_PER_INDEX) { sbi->index_size = 1u << (-boot->index_size); } else { ntfs_err(sb, "%s: invalid index size %d.", hint, boot->index_size); goto out; } /* Check index record size. */ if (sbi->index_size < SECTOR_SIZE || !is_power_of_2(sbi->index_size)) { ntfs_err(sb, "%s: invalid bytes per index %u(%d).", hint, sbi->index_size, boot->index_size); goto out; } if (sbi->index_size > MAXIMUM_BYTES_PER_INDEX) { ntfs_err(sb, "%s: unsupported bytes per index %u.", hint, sbi->index_size); goto out; } sbi->volume.size = sectors * boot_sector_size; gb = format_size_gb(sbi->volume.size + boot_sector_size, &mb); /* * - Volume formatted and mounted with the same sector size. * - Volume formatted 4K and mounted as 512. * - Volume formatted 512 and mounted as 4K. */ if (boot_sector_size != sector_size) { ntfs_warn( sb, "Different NTFS sector size (%u) and media sector size (%u).", boot_sector_size, sector_size); dev_size += sector_size - 1; } sbi->mft.lbo = mlcn << cluster_bits; sbi->mft.lbo2 = mlcn2 << cluster_bits; /* Compare boot's cluster and sector. */ if (sbi->cluster_size < boot_sector_size) { ntfs_err(sb, "%s: invalid bytes per cluster (%u).", hint, sbi->cluster_size); goto out; } /* Compare boot's cluster and media sector. */ if (sbi->cluster_size < sector_size) { /* No way to use ntfs_get_block in this case. */ ntfs_err( sb, "Failed to mount 'cause NTFS's cluster size (%u) is less than media sector size (%u).", sbi->cluster_size, sector_size); goto out; } sbi->max_bytes_per_attr = record_size - ALIGN(MFTRECORD_FIXUP_OFFSET, 8) - ALIGN(((record_size >> SECTOR_SHIFT) * sizeof(short)), 8) - ALIGN(sizeof(enum ATTR_TYPE), 8); sbi->volume.ser_num = le64_to_cpu(boot->serial_num); /* Warning if RAW volume. */ if (dev_size < sbi->volume.size + boot_sector_size) { u32 mb0, gb0; gb0 = format_size_gb(dev_size, &mb0); ntfs_warn( sb, "RAW NTFS volume: Filesystem size %u.%02u Gb > volume size %u.%02u Gb. Mount in read-only.", gb, mb, gb0, mb0); sb->s_flags |= SB_RDONLY; } clusters = sbi->volume.size >> cluster_bits; #ifndef CONFIG_NTFS3_64BIT_CLUSTER /* 32 bits per cluster. */ if (clusters >> 32) { ntfs_notice( sb, "NTFS %u.%02u Gb is too big to use 32 bits per cluster.", gb, mb); goto out; } #elif BITS_PER_LONG < 64 #error "CONFIG_NTFS3_64BIT_CLUSTER incompatible in 32 bit OS" #endif sbi->used.bitmap.nbits = clusters; rec = kzalloc(record_size, GFP_NOFS); if (!rec) { err = -ENOMEM; goto out; } sbi->new_rec = rec; rec->rhdr.sign = NTFS_FILE_SIGNATURE; rec->rhdr.fix_off = cpu_to_le16(MFTRECORD_FIXUP_OFFSET); fn = (sbi->record_size >> SECTOR_SHIFT) + 1; rec->rhdr.fix_num = cpu_to_le16(fn); ao = ALIGN(MFTRECORD_FIXUP_OFFSET + sizeof(short) * fn, 8); rec->attr_off = cpu_to_le16(ao); rec->used = cpu_to_le32(ao + ALIGN(sizeof(enum ATTR_TYPE), 8)); rec->total = cpu_to_le32(sbi->record_size); ((struct ATTRIB *)Add2Ptr(rec, ao))->type = ATTR_END; sb_set_blocksize(sb, min_t(u32, sbi->cluster_size, PAGE_SIZE)); sbi->block_mask = sb->s_blocksize - 1; sbi->blocks_per_cluster = sbi->cluster_size >> sb->s_blocksize_bits; sbi->volume.blocks = sbi->volume.size >> sb->s_blocksize_bits; /* Maximum size for normal files. */ sbi->maxbytes = (clusters << cluster_bits) - 1; #ifdef CONFIG_NTFS3_64BIT_CLUSTER if (clusters >= (1ull << (64 - cluster_bits))) sbi->maxbytes = -1; sbi->maxbytes_sparse = -1; sb->s_maxbytes = MAX_LFS_FILESIZE; #else /* Maximum size for sparse file. */ sbi->maxbytes_sparse = (1ull << (cluster_bits + 32)) - 1; sb->s_maxbytes = 0xFFFFFFFFull << cluster_bits; #endif /* * Compute the MFT zone at two steps. * It would be nice if we are able to allocate 1/8 of * total clusters for MFT but not more then 512 MB. */ sbi->zone_max = min_t(CLST, 0x20000000 >> cluster_bits, clusters >> 3); err = 0; if (bh->b_blocknr && !sb_rdonly(sb)) { /* * Alternative boot is ok but primary is not ok. * Do not update primary boot here 'cause it may be faked boot. * Let ntfs to be mounted and update boot later. */ *boot2 = kmemdup(boot, sizeof(*boot), GFP_NOFS | __GFP_NOWARN); } out: brelse(bh); if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) { u32 block_size = min_t(u32, sector_size, PAGE_SIZE); u64 lbo = dev_size0 - sizeof(*boot); boot_block = lbo >> blksize_bits(block_size); boot_off = lbo & (block_size - 1); if (boot_block && block_size >= boot_off + sizeof(*boot)) { /* * Try alternative boot (last sector) */ sb_set_blocksize(sb, block_size); hint = "Alternative boot"; dev_size = dev_size0; /* restore original size. */ goto read_boot; } } return err; } /* * ntfs_fill_super - Try to mount. */ static int ntfs_fill_super(struct super_block *sb, struct fs_context *fc) { int err; struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; struct ntfs_mount_options *options; struct inode *inode; struct ntfs_inode *ni; size_t i, tt, bad_len, bad_frags; CLST vcn, lcn, len; struct ATTRIB *attr; const struct VOLUME_INFO *info; u32 done, bytes; struct ATTR_DEF_ENTRY *t; u16 *shared; struct MFT_REF ref; bool ro = sb_rdonly(sb); struct NTFS_BOOT *boot2 = NULL; ref.high = 0; sbi->sb = sb; sbi->options = options = fc->fs_private; fc->fs_private = NULL; sb->s_flags |= SB_NODIRATIME; sb->s_magic = 0x7366746e; // "ntfs" sb->s_op = &ntfs_sops; sb->s_export_op = &ntfs_export_ops; sb->s_time_gran = NTFS_TIME_GRAN; // 100 nsec sb->s_xattr = ntfs_xattr_handlers; if (options->nocase) set_default_d_op(sb, &ntfs_dentry_ops); options->nls = ntfs_load_nls(options->nls_name); if (IS_ERR(options->nls)) { options->nls = NULL; errorf(fc, "Cannot load nls %s", options->nls_name); err = -EINVAL; goto out; } if (bdev_max_discard_sectors(bdev) && bdev_discard_granularity(bdev)) { sbi->discard_granularity = bdev_discard_granularity(bdev); sbi->discard_granularity_mask_inv = ~(u64)(sbi->discard_granularity - 1); } /* Parse boot. */ err = ntfs_init_from_boot(sb, bdev_logical_block_size(bdev), bdev_nr_bytes(bdev), &boot2); if (err) goto out; /* * Load $Volume. This should be done before $LogFile * 'cause 'sbi->volume.ni' is used in 'ntfs_set_state'. */ ref.low = cpu_to_le32(MFT_REC_VOL); ref.seq = cpu_to_le16(MFT_REC_VOL); inode = ntfs_iget5(sb, &ref, &NAME_VOLUME); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Volume (%d).", err); goto out; } ni = ntfs_i(inode); /* Load and save label (not necessary). */ attr = ni_find_attr(ni, NULL, NULL, ATTR_LABEL, NULL, 0, NULL, NULL); if (!attr) { /* It is ok if no ATTR_LABEL */ } else if (!attr->non_res && !is_attr_ext(attr)) { /* $AttrDef allows labels to be up to 128 symbols. */ err = utf16s_to_utf8s(resident_data(attr), le32_to_cpu(attr->res.data_size) >> 1, UTF16_LITTLE_ENDIAN, sbi->volume.label, sizeof(sbi->volume.label)); if (err < 0) sbi->volume.label[0] = 0; } else { /* Should we break mounting here? */ //err = -EINVAL; //goto put_inode_out; } attr = ni_find_attr(ni, attr, NULL, ATTR_VOL_INFO, NULL, 0, NULL, NULL); if (!attr || is_attr_ext(attr) || !(info = resident_data_ex(attr, SIZEOF_ATTRIBUTE_VOLUME_INFO))) { ntfs_err(sb, "$Volume is corrupted."); err = -EINVAL; goto put_inode_out; } sbi->volume.major_ver = info->major_ver; sbi->volume.minor_ver = info->minor_ver; sbi->volume.flags = info->flags; sbi->volume.ni = ni; if (info->flags & VOLUME_FLAG_DIRTY) { sbi->volume.real_dirty = true; ntfs_info(sb, "It is recommened to use chkdsk."); } /* Load $MFTMirr to estimate recs_mirr. */ ref.low = cpu_to_le32(MFT_REC_MIRR); ref.seq = cpu_to_le16(MFT_REC_MIRR); inode = ntfs_iget5(sb, &ref, &NAME_MIRROR); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFTMirr (%d).", err); goto out; } sbi->mft.recs_mirr = ntfs_up_cluster(sbi, inode->i_size) >> sbi->record_bits; iput(inode); /* Load LogFile to replay. */ ref.low = cpu_to_le32(MFT_REC_LOG); ref.seq = cpu_to_le16(MFT_REC_LOG); inode = ntfs_iget5(sb, &ref, &NAME_LOGFILE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load \x24LogFile (%d).", err); goto out; } ni = ntfs_i(inode); err = ntfs_loadlog_and_replay(ni, sbi); if (err) goto put_inode_out; iput(inode); if ((sbi->flags & NTFS_FLAGS_NEED_REPLAY) && !ro) { ntfs_warn(sb, "failed to replay log file. Can't mount rw!"); err = -EINVAL; goto out; } if ((sbi->volume.flags & VOLUME_FLAG_DIRTY) && !ro && !options->force) { ntfs_warn(sb, "volume is dirty and \"force\" flag is not set!"); err = -EINVAL; goto out; } /* Load $MFT. */ ref.low = cpu_to_le32(MFT_REC_MFT); ref.seq = cpu_to_le16(1); inode = ntfs_iget5(sb, &ref, &NAME_MFT); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $MFT (%d).", err); goto out; } ni = ntfs_i(inode); sbi->mft.used = ni->i_valid >> sbi->record_bits; tt = inode->i_size >> sbi->record_bits; sbi->mft.next_free = MFT_REC_USER; err = wnd_init(&sbi->mft.bitmap, sb, tt); if (err) goto put_inode_out; err = ni_load_all_mi(ni); if (err) { ntfs_err(sb, "Failed to load $MFT's subrecords (%d).", err); goto put_inode_out; } sbi->mft.ni = ni; /* Load $Bitmap. */ ref.low = cpu_to_le32(MFT_REC_BITMAP); ref.seq = cpu_to_le16(MFT_REC_BITMAP); inode = ntfs_iget5(sb, &ref, &NAME_BITMAP); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $Bitmap (%d).", err); goto out; } #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (inode->i_size >> 32) { err = -EINVAL; goto put_inode_out; } #endif /* Check bitmap boundary. */ tt = sbi->used.bitmap.nbits; if (inode->i_size < ntfs3_bitmap_size(tt)) { ntfs_err(sb, "$Bitmap is corrupted."); err = -EINVAL; goto put_inode_out; } err = wnd_init(&sbi->used.bitmap, sb, tt); if (err) { ntfs_err(sb, "Failed to initialize $Bitmap (%d).", err); goto put_inode_out; } iput(inode); /* Compute the MFT zone. */ err = ntfs_refresh_zone(sbi); if (err) { ntfs_err(sb, "Failed to initialize MFT zone (%d).", err); goto out; } /* Load $BadClus. */ ref.low = cpu_to_le32(MFT_REC_BADCLUST); ref.seq = cpu_to_le16(MFT_REC_BADCLUST); inode = ntfs_iget5(sb, &ref, &NAME_BADCLUS); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $BadClus (%d).", err); goto out; } ni = ntfs_i(inode); bad_len = bad_frags = 0; for (i = 0; run_get_entry(&ni->file.run, i, &vcn, &lcn, &len); i++) { if (lcn == SPARSE_LCN) continue; bad_len += len; bad_frags += 1; if (ro) continue; if (wnd_set_used_safe(&sbi->used.bitmap, lcn, len, &tt) || tt) { /* Bad blocks marked as free in bitmap. */ ntfs_set_state(sbi, NTFS_DIRTY_ERROR); } } if (bad_len) { /* * Notice about bad blocks. * In normal cases these blocks are marked as used in bitmap. * And we never allocate space in it. */ ntfs_notice(sb, "Volume contains %zu bad blocks in %zu fragments.", bad_len, bad_frags); } iput(inode); /* Load $AttrDef. */ ref.low = cpu_to_le32(MFT_REC_ATTR); ref.seq = cpu_to_le16(MFT_REC_ATTR); inode = ntfs_iget5(sb, &ref, &NAME_ATTRDEF); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $AttrDef (%d)", err); goto out; } /* * Typical $AttrDef contains up to 20 entries. * Check for extremely large/small size. */ if (inode->i_size < sizeof(struct ATTR_DEF_ENTRY) || inode->i_size > 100 * sizeof(struct ATTR_DEF_ENTRY)) { ntfs_err(sb, "Looks like $AttrDef is corrupted (size=%llu).", inode->i_size); err = -EINVAL; goto put_inode_out; } bytes = inode->i_size; sbi->def_table = t = kvmalloc(bytes, GFP_KERNEL); if (!t) { err = -ENOMEM; goto put_inode_out; } /* Read the entire file. */ err = inode_read_data(inode, sbi->def_table, bytes); if (err) { ntfs_err(sb, "Failed to read $AttrDef (%d).", err); goto put_inode_out; } if (ATTR_STD != t->type) { ntfs_err(sb, "$AttrDef is corrupted."); err = -EINVAL; goto put_inode_out; } t += 1; sbi->def_entries = 1; done = sizeof(struct ATTR_DEF_ENTRY); while (done + sizeof(struct ATTR_DEF_ENTRY) <= bytes) { u32 t32 = le32_to_cpu(t->type); u64 sz = le64_to_cpu(t->max_sz); if ((t32 & 0xF) || le32_to_cpu(t[-1].type) >= t32) break; if (t->type == ATTR_REPARSE) sbi->reparse.max_size = sz; else if (t->type == ATTR_EA) sbi->ea_max_size = sz; done += sizeof(struct ATTR_DEF_ENTRY); t += 1; sbi->def_entries += 1; } iput(inode); /* Load $UpCase. */ ref.low = cpu_to_le32(MFT_REC_UPCASE); ref.seq = cpu_to_le16(MFT_REC_UPCASE); inode = ntfs_iget5(sb, &ref, &NAME_UPCASE); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load $UpCase (%d).", err); goto out; } if (inode->i_size != 0x10000 * sizeof(short)) { err = -EINVAL; ntfs_err(sb, "$UpCase is corrupted."); goto put_inode_out; } /* Read the entire file. */ err = inode_read_data(inode, sbi->upcase, 0x10000 * sizeof(short)); if (err) { ntfs_err(sb, "Failed to read $UpCase (%d).", err); goto put_inode_out; } #ifdef __BIG_ENDIAN { u16 *dst = sbi->upcase; for (i = 0; i < 0x10000; i++) __swab16s(dst++); } #endif shared = ntfs_set_shared(sbi->upcase, 0x10000 * sizeof(short)); if (shared && sbi->upcase != shared) { kvfree(sbi->upcase); sbi->upcase = shared; } iput(inode); if (is_ntfs3(sbi)) { /* Load $Secure. */ err = ntfs_security_init(sbi); if (err) { ntfs_err(sb, "Failed to initialize $Secure (%d).", err); goto out; } /* Load $Extend. */ err = ntfs_extend_init(sbi); if (err) { ntfs_warn(sb, "Failed to initialize $Extend."); goto load_root; } /* Load $Extend/$Reparse. */ err = ntfs_reparse_init(sbi); if (err) { ntfs_warn(sb, "Failed to initialize $Extend/$Reparse."); goto load_root; } /* Load $Extend/$ObjId. */ err = ntfs_objid_init(sbi); if (err) { ntfs_warn(sb, "Failed to initialize $Extend/$ObjId."); goto load_root; } } load_root: /* Load root. */ ref.low = cpu_to_le32(MFT_REC_ROOT); ref.seq = cpu_to_le16(MFT_REC_ROOT); inode = ntfs_iget5(sb, &ref, &NAME_ROOT); if (IS_ERR(inode)) { err = PTR_ERR(inode); ntfs_err(sb, "Failed to load root (%d).", err); goto out; } /* * Final check. Looks like this case should never occurs. */ if (!inode->i_op) { err = -EINVAL; ntfs_err(sb, "Failed to load root (%d).", err); goto put_inode_out; } sb->s_root = d_make_root(inode); if (!sb->s_root) { err = -ENOMEM; goto put_inode_out; } if (boot2) { /* * Alternative boot is ok but primary is not ok. * Volume is recognized as NTFS. Update primary boot. */ struct buffer_head *bh0 = sb_getblk(sb, 0); if (bh0) { if (buffer_locked(bh0)) __wait_on_buffer(bh0); lock_buffer(bh0); memcpy(bh0->b_data, boot2, sizeof(*boot2)); set_buffer_uptodate(bh0); mark_buffer_dirty(bh0); unlock_buffer(bh0); if (!sync_dirty_buffer(bh0)) ntfs_warn(sb, "primary boot is updated"); put_bh(bh0); } kfree(boot2); } ntfs_create_procdir(sb); if (is_legacy_ntfs(sb)) sb->s_flags |= SB_RDONLY; return 0; put_inode_out: iput(inode); out: ntfs3_put_sbi(sbi); kfree(boot2); ntfs3_put_sbi(sbi); return err; } void ntfs_unmap_meta(struct super_block *sb, CLST lcn, CLST len) { struct ntfs_sb_info *sbi = sb->s_fs_info; struct block_device *bdev = sb->s_bdev; sector_t devblock = (u64)lcn * sbi->blocks_per_cluster; unsigned long blocks = (u64)len * sbi->blocks_per_cluster; unsigned long cnt = 0; unsigned long limit = global_zone_page_state(NR_FREE_PAGES) << (PAGE_SHIFT - sb->s_blocksize_bits); if (limit >= 0x2000) limit -= 0x1000; else if (limit < 32) limit = 32; else limit >>= 1; while (blocks--) { clean_bdev_aliases(bdev, devblock++, 1); if (cnt++ >= limit) { sync_blockdev(bdev); cnt = 0; } } } /* * ntfs_discard - Issue a discard request (trim for SSD). */ int ntfs_discard(struct ntfs_sb_info *sbi, CLST lcn, CLST len) { int err; u64 lbo, bytes, start, end; struct super_block *sb; if (sbi->used.next_free_lcn == lcn + len) sbi->used.next_free_lcn = lcn; if (sbi->flags & NTFS_FLAGS_NODISCARD) return -EOPNOTSUPP; if (!sbi->options->discard) return -EOPNOTSUPP; lbo = (u64)lcn << sbi->cluster_bits; bytes = (u64)len << sbi->cluster_bits; /* Align up 'start' on discard_granularity. */ start = (lbo + sbi->discard_granularity - 1) & sbi->discard_granularity_mask_inv; /* Align down 'end' on discard_granularity. */ end = (lbo + bytes) & sbi->discard_granularity_mask_inv; sb = sbi->sb; if (start >= end) return 0; err = blkdev_issue_discard(sb->s_bdev, start >> 9, (end - start) >> 9, GFP_NOFS); if (err == -EOPNOTSUPP) sbi->flags |= NTFS_FLAGS_NODISCARD; return err; } static int ntfs_fs_get_tree(struct fs_context *fc) { return get_tree_bdev(fc, ntfs_fill_super); } /* * ntfs_fs_free - Free fs_context. * * Note that this will be called after fill_super and reconfigure * even when they pass. So they have to take pointers if they pass. */ static void ntfs_fs_free(struct fs_context *fc) { struct ntfs_mount_options *opts = fc->fs_private; struct ntfs_sb_info *sbi = fc->s_fs_info; if (sbi) { ntfs3_put_sbi(sbi); ntfs3_free_sbi(sbi); } if (opts) put_mount_options(opts); } // clang-format off static const struct fs_context_operations ntfs_context_ops = { .parse_param = ntfs_fs_parse_param, .get_tree = ntfs_fs_get_tree, .reconfigure = ntfs_fs_reconfigure, .free = ntfs_fs_free, }; // clang-format on /* * ntfs_init_fs_context - Initialize sbi and opts * * This will called when mount/remount. We will first initialize * options so that if remount we can use just that. */ static int __ntfs_init_fs_context(struct fs_context *fc) { struct ntfs_mount_options *opts; struct ntfs_sb_info *sbi; opts = kzalloc(sizeof(struct ntfs_mount_options), GFP_NOFS); if (!opts) return -ENOMEM; /* Default options. */ opts->fs_uid = current_uid(); opts->fs_gid = current_gid(); opts->fs_fmask_inv = ~current_umask(); opts->fs_dmask_inv = ~current_umask(); if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) goto ok; sbi = kzalloc(sizeof(struct ntfs_sb_info), GFP_NOFS); if (!sbi) goto free_opts; sbi->upcase = kvmalloc(0x10000 * sizeof(short), GFP_KERNEL); if (!sbi->upcase) goto free_sbi; ratelimit_state_init(&sbi->msg_ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); mutex_init(&sbi->compress.mtx_lznt); #ifdef CONFIG_NTFS3_LZX_XPRESS mutex_init(&sbi->compress.mtx_xpress); mutex_init(&sbi->compress.mtx_lzx); #endif fc->s_fs_info = sbi; ok: fc->fs_private = opts; fc->ops = &ntfs_context_ops; return 0; free_sbi: kfree(sbi); free_opts: kfree(opts); return -ENOMEM; } static int ntfs_init_fs_context(struct fs_context *fc) { return __ntfs_init_fs_context(fc); } static void ntfs3_kill_sb(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; kill_block_super(sb); if (sbi->options) put_mount_options(sbi->options); ntfs3_free_sbi(sbi); } // clang-format off static struct file_system_type ntfs_fs_type = { .owner = THIS_MODULE, .name = "ntfs3", .init_fs_context = ntfs_init_fs_context, .parameters = ntfs_fs_parameters, .kill_sb = ntfs3_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; #if IS_ENABLED(CONFIG_NTFS_FS) static int ntfs_legacy_init_fs_context(struct fs_context *fc) { int ret; ret = __ntfs_init_fs_context(fc); /* If ntfs3 is used as legacy ntfs enforce read-only mode. */ fc->sb_flags |= SB_RDONLY; return ret; } static struct file_system_type ntfs_legacy_fs_type = { .owner = THIS_MODULE, .name = "ntfs", .init_fs_context = ntfs_legacy_init_fs_context, .parameters = ntfs_fs_parameters, .kill_sb = ntfs3_kill_sb, .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, }; MODULE_ALIAS_FS("ntfs"); static inline void register_as_ntfs_legacy(void) { int err = register_filesystem(&ntfs_legacy_fs_type); if (err) pr_warn("ntfs3: Failed to register legacy ntfs filesystem driver: %d\n", err); } static inline void unregister_as_ntfs_legacy(void) { unregister_filesystem(&ntfs_legacy_fs_type); } bool is_legacy_ntfs(struct super_block *sb) { return sb->s_type == &ntfs_legacy_fs_type; } #else static inline void register_as_ntfs_legacy(void) {} static inline void unregister_as_ntfs_legacy(void) {} #endif // clang-format on static int __init init_ntfs_fs(void) { int err; if (IS_ENABLED(CONFIG_NTFS3_FS_POSIX_ACL)) pr_info("ntfs3: Enabled Linux POSIX ACLs support\n"); if (IS_ENABLED(CONFIG_NTFS3_64BIT_CLUSTER)) pr_notice( "ntfs3: Warning: Activated 64 bits per cluster. Windows does not support this\n"); if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); ntfs_create_proc_root(); err = ntfs3_init_bitmap(); if (err) goto out2; ntfs_inode_cachep = kmem_cache_create( "ntfs_inode_cache", sizeof(struct ntfs_inode), 0, (SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT), init_once); if (!ntfs_inode_cachep) { err = -ENOMEM; goto out1; } register_as_ntfs_legacy(); err = register_filesystem(&ntfs_fs_type); if (err) goto out; return 0; out: kmem_cache_destroy(ntfs_inode_cachep); out1: ntfs3_exit_bitmap(); out2: ntfs_remove_proc_root(); return err; } static void __exit exit_ntfs_fs(void) { rcu_barrier(); kmem_cache_destroy(ntfs_inode_cachep); unregister_filesystem(&ntfs_fs_type); unregister_as_ntfs_legacy(); ntfs3_exit_bitmap(); ntfs_remove_proc_root(); } MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("ntfs3 read/write filesystem"); #ifdef CONFIG_NTFS3_FS_POSIX_ACL MODULE_INFO(behaviour, "Enabled Linux POSIX ACLs support"); #endif #ifdef CONFIG_NTFS3_64BIT_CLUSTER MODULE_INFO( cluster, "Warning: Activated 64 bits per cluster. Windows does not support this"); #endif #ifdef CONFIG_NTFS3_LZX_XPRESS MODULE_INFO(compression, "Read-only lzx/xpress compression included"); #endif MODULE_AUTHOR("Konstantin Komarov"); MODULE_ALIAS_FS("ntfs3"); module_init(init_ntfs_fs); module_exit(exit_ntfs_fs);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 // SPDX-License-Identifier: GPL-2.0-or-later /* Paravirtualization interfaces Copyright (C) 2006 Rusty Russell IBM Corporation 2007 - x86_64 support added by Glauber de Oliveira Costa, Red Hat Inc */ #include <linux/errno.h> #include <linux/init.h> #include <linux/export.h> #include <linux/efi.h> #include <linux/bcd.h> #include <linux/highmem.h> #include <linux/kprobes.h> #include <linux/pgtable.h> #include <linux/static_call.h> #include <asm/bug.h> #include <asm/paravirt.h> #include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> #include <asm/time.h> #include <asm/pgalloc.h> #include <asm/irq.h> #include <asm/delay.h> #include <asm/fixmap.h> #include <asm/apic.h> #include <asm/tlbflush.h> #include <asm/timer.h> #include <asm/special_insns.h> #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> #include <asm/msr.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); void __init default_banner(void) { printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); } #ifdef CONFIG_PARAVIRT_XXL DEFINE_ASM_FUNC(_paravirt_ident_64, "mov %rdi, %rax", .text); DEFINE_ASM_FUNC(pv_native_save_fl, "pushf; pop %rax", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_disable, "cli", .noinstr.text); DEFINE_ASM_FUNC(pv_native_irq_enable, "sti", .noinstr.text); DEFINE_ASM_FUNC(pv_native_read_cr2, "mov %cr2, %rax", .noinstr.text); #endif DEFINE_STATIC_KEY_FALSE(virt_spin_lock_key); void __init native_pv_lock_init(void) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) static_branch_enable(&virt_spin_lock_key); } struct static_key paravirt_steal_enabled; struct static_key paravirt_steal_rq_enabled; static u64 native_steal_clock(int cpu) { return 0; } DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock); DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock); void paravirt_set_sched_clock(u64 (*func)(void)) { static_call_update(pv_sched_clock, func); } static noinstr void pv_native_safe_halt(void) { native_safe_halt(); } #ifdef CONFIG_PARAVIRT_XXL static noinstr void pv_native_write_cr2(unsigned long val) { native_write_cr2(val); } static noinstr unsigned long pv_native_read_cr3(void) { return __native_read_cr3(); } static noinstr void pv_native_write_cr3(unsigned long cr3) { native_write_cr3(cr3); } static noinstr unsigned long pv_native_get_debugreg(int regno) { return native_get_debugreg(regno); } static noinstr void pv_native_set_debugreg(int regno, unsigned long val) { native_set_debugreg(regno, val); } #endif struct pv_info pv_info = { .name = "bare hardware", #ifdef CONFIG_PARAVIRT_XXL .extra_user_64bit_cs = __USER_CS, #endif }; /* 64-bit pagetable entries */ #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) struct paravirt_patch_template pv_ops = { /* Cpu ops. */ .cpu.io_delay = native_io_delay, #ifdef CONFIG_PARAVIRT_XXL .cpu.cpuid = native_cpuid, .cpu.get_debugreg = pv_native_get_debugreg, .cpu.set_debugreg = pv_native_set_debugreg, .cpu.read_cr0 = native_read_cr0, .cpu.write_cr0 = native_write_cr0, .cpu.write_cr4 = native_write_cr4, .cpu.read_msr = native_read_msr, .cpu.write_msr = native_write_msr, .cpu.read_msr_safe = native_read_msr_safe, .cpu.write_msr_safe = native_write_msr_safe, .cpu.read_pmc = native_read_pmc, .cpu.load_tr_desc = native_load_tr_desc, .cpu.set_ldt = native_set_ldt, .cpu.load_gdt = native_load_gdt, .cpu.load_idt = native_load_idt, .cpu.store_tr = native_store_tr, .cpu.load_tls = native_load_tls, .cpu.load_gs_index = native_load_gs_index, .cpu.write_ldt_entry = native_write_ldt_entry, .cpu.write_gdt_entry = native_write_gdt_entry, .cpu.write_idt_entry = native_write_idt_entry, .cpu.alloc_ldt = paravirt_nop, .cpu.free_ldt = paravirt_nop, .cpu.load_sp0 = native_load_sp0, #ifdef CONFIG_X86_IOPL_IOPERM .cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap, .cpu.update_io_bitmap = native_tss_update_io_bitmap, #endif .cpu.start_context_switch = paravirt_nop, .cpu.end_context_switch = paravirt_nop, /* Irq ops. */ .irq.save_fl = __PV_IS_CALLEE_SAVE(pv_native_save_fl), .irq.irq_disable = __PV_IS_CALLEE_SAVE(pv_native_irq_disable), .irq.irq_enable = __PV_IS_CALLEE_SAVE(pv_native_irq_enable), #endif /* CONFIG_PARAVIRT_XXL */ /* Irq HLT ops. */ .irq.safe_halt = pv_native_safe_halt, .irq.halt = native_halt, /* Mmu ops. */ .mmu.flush_tlb_user = native_flush_tlb_local, .mmu.flush_tlb_kernel = native_flush_tlb_global, .mmu.flush_tlb_one_user = native_flush_tlb_one_user, .mmu.flush_tlb_multi = native_flush_tlb_multi, .mmu.exit_mmap = paravirt_nop, .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), .mmu.write_cr2 = pv_native_write_cr2, .mmu.read_cr3 = pv_native_read_cr3, .mmu.write_cr3 = pv_native_write_cr3, .mmu.pgd_alloc = __paravirt_pgd_alloc, .mmu.pgd_free = paravirt_nop, .mmu.alloc_pte = paravirt_nop, .mmu.alloc_pmd = paravirt_nop, .mmu.alloc_pud = paravirt_nop, .mmu.alloc_p4d = paravirt_nop, .mmu.release_pte = paravirt_nop, .mmu.release_pmd = paravirt_nop, .mmu.release_pud = paravirt_nop, .mmu.release_p4d = paravirt_nop, .mmu.set_pte = native_set_pte, .mmu.set_pmd = native_set_pmd, .mmu.ptep_modify_prot_start = __ptep_modify_prot_start, .mmu.ptep_modify_prot_commit = __ptep_modify_prot_commit, .mmu.set_pud = native_set_pud, .mmu.pmd_val = PTE_IDENT, .mmu.make_pmd = PTE_IDENT, .mmu.pud_val = PTE_IDENT, .mmu.make_pud = PTE_IDENT, .mmu.set_p4d = native_set_p4d, .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, .mmu.make_pte = PTE_IDENT, .mmu.make_pgd = PTE_IDENT, .mmu.enter_mmap = paravirt_nop, .mmu.lazy_mode = { .enter = paravirt_nop, .leave = paravirt_nop, .flush = paravirt_nop, }, .mmu.set_fixmap = native_set_fixmap, #endif /* CONFIG_PARAVIRT_XXL */ #if defined(CONFIG_PARAVIRT_SPINLOCKS) /* Lock ops. */ #ifdef CONFIG_SMP .lock.queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .lock.queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .lock.wait = paravirt_nop, .lock.kick = paravirt_nop, .lock.vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ #endif }; #ifdef CONFIG_PARAVIRT_XXL NOKPROBE_SYMBOL(native_load_idt); #endif EXPORT_SYMBOL(pv_ops); EXPORT_SYMBOL_GPL(pv_info);
43 5 21 72 4 67 68 72 21 21 21 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar * Copyright (C) 2005-2006, Thomas Gleixner * * This file contains the IRQ-resend code * * If the interrupt is waiting to be processed, we try to re-run it. * We can't directly run it from here since the caller might be in an * interrupt-protected region. Not all irq controller chips can * retrigger interrupts at the hardware level, so in those cases * we allow the resending of IRQs via a tasklet. */ #include <linux/irq.h> #include <linux/module.h> #include <linux/random.h> #include <linux/interrupt.h> #include "internals.h" #ifdef CONFIG_HARDIRQS_SW_RESEND /* hlist_head to handle software resend of interrupts: */ static HLIST_HEAD(irq_resend_list); static DEFINE_RAW_SPINLOCK(irq_resend_lock); /* * Run software resends of IRQ's */ static void resend_irqs(struct tasklet_struct *unused) { guard(raw_spinlock_irq)(&irq_resend_lock); while (!hlist_empty(&irq_resend_list)) { struct irq_desc *desc; desc = hlist_entry(irq_resend_list.first, struct irq_desc, resend_node); hlist_del_init(&desc->resend_node); raw_spin_unlock(&irq_resend_lock); desc->handle_irq(desc); raw_spin_lock(&irq_resend_lock); } } /* Tasklet to handle resend: */ static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { /* * Validate whether this interrupt can be safely injected from * non interrupt context */ if (irqd_is_handle_enforce_irqctx(&desc->irq_data)) return -EINVAL; /* * If the interrupt is running in the thread context of the parent * irq we need to be careful, because we cannot trigger it * directly. */ if (irq_settings_is_nested_thread(desc)) { /* * If the parent_irq is valid, we retrigger the parent, * otherwise we do nothing. */ if (!desc->parent_irq) return -EINVAL; desc = irq_to_desc(desc->parent_irq); if (!desc) return -EINVAL; } /* Add to resend_list and activate the softirq: */ scoped_guard(raw_spinlock, &irq_resend_lock) { if (hlist_unhashed(&desc->resend_node)) hlist_add_head(&desc->resend_node, &irq_resend_list); } tasklet_schedule(&resend_tasklet); return 0; } void clear_irq_resend(struct irq_desc *desc) { guard(raw_spinlock)(&irq_resend_lock); hlist_del_init(&desc->resend_node); } void irq_resend_init(struct irq_desc *desc) { INIT_HLIST_NODE(&desc->resend_node); } #else void clear_irq_resend(struct irq_desc *desc) {} void irq_resend_init(struct irq_desc *desc) {} static int irq_sw_resend(struct irq_desc *desc) { return -EINVAL; } #endif static int try_retrigger(struct irq_desc *desc) { if (desc->irq_data.chip->irq_retrigger) return desc->irq_data.chip->irq_retrigger(&desc->irq_data); #ifdef CONFIG_IRQ_DOMAIN_HIERARCHY return irq_chip_retrigger_hierarchy(&desc->irq_data); #else return 0; #endif } /* * IRQ resend * * Is called with interrupts disabled and desc->lock held. */ int check_irq_resend(struct irq_desc *desc, bool inject) { int err = 0; /* * We do not resend level type interrupts. Level type interrupts * are resent by hardware when they are still active. Clear the * pending bit so suspend/resume does not get confused. */ if (irq_settings_is_level(desc)) { desc->istate &= ~IRQS_PENDING; return -EINVAL; } if (desc->istate & IRQS_REPLAY) return -EBUSY; if (!(desc->istate & IRQS_PENDING) && !inject) return 0; desc->istate &= ~IRQS_PENDING; if (!try_retrigger(desc)) err = irq_sw_resend(desc); /* If the retrigger was successful, mark it with the REPLAY bit */ if (!err) desc->istate |= IRQS_REPLAY; return err; } #ifdef CONFIG_GENERIC_IRQ_INJECTION /** * irq_inject_interrupt - Inject an interrupt for testing/error injection * @irq: The interrupt number * * This function must only be used for debug and testing purposes! * * Especially on x86 this can cause a premature completion of an interrupt * affinity change causing the interrupt line to become stale. Very * unlikely, but possible. * * The injection can fail for various reasons: * - Interrupt is not activated * - Interrupt is NMI type or currently replaying * - Interrupt is level type * - Interrupt does not support hardware retrigger and software resend is * either not enabled or not possible for the interrupt. */ int irq_inject_interrupt(unsigned int irq) { int err = -EINVAL; /* Try the state injection hardware interface first */ if (!irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, true)) return 0; /* That failed, try via the resend mechanism */ scoped_irqdesc_get_and_buslock(irq, 0) { struct irq_desc *desc = scoped_irqdesc; /* * Only try to inject when the interrupt is: * - not NMI type * - activated */ if (!irq_is_nmi(desc) && irqd_is_activated(&desc->irq_data)) err = check_irq_resend(desc, true); } return err; } EXPORT_SYMBOL_GPL(irq_inject_interrupt); #endif
16757 16732 16893 16749 16757 16762 16901 16764 16892 16768 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 // SPDX-License-Identifier: GPL-2.0-only /* * AppArmor security module * * This file contains AppArmor network mediation * * Copyright (C) 1998-2008 Novell/SUSE * Copyright 2009-2017 Canonical Ltd. */ #include "include/af_unix.h" #include "include/apparmor.h" #include "include/audit.h" #include "include/cred.h" #include "include/label.h" #include "include/net.h" #include "include/policy.h" #include "include/secid.h" #include "net_names.h" struct aa_sfs_entry aa_sfs_entry_network[] = { AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), { } }; struct aa_sfs_entry aa_sfs_entry_networkv9[] = { AA_SFS_FILE_STRING("af_mask", AA_SFS_AF_MASK), AA_SFS_FILE_BOOLEAN("af_unix", 1), { } }; static const char * const net_mask_names[] = { "unknown", "send", "receive", "unknown", "create", "shutdown", "connect", "unknown", "setattr", "getattr", "setcred", "getcred", "chmod", "chown", "chgrp", "lock", "mmap", "mprot", "unknown", "unknown", "accept", "bind", "listen", "unknown", "setopt", "getopt", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", }; static void audit_unix_addr(struct audit_buffer *ab, const char *str, struct sockaddr_un *addr, int addrlen) { int len = unix_addr_len(addrlen); if (!addr || len <= 0) { audit_log_format(ab, " %s=none", str); } else if (addr->sun_path[0]) { audit_log_format(ab, " %s=", str); audit_log_untrustedstring(ab, addr->sun_path); } else { audit_log_format(ab, " %s=\"@", str); if (audit_string_contains_control(&addr->sun_path[1], len - 1)) audit_log_n_hex(ab, &addr->sun_path[1], len - 1); else audit_log_format(ab, "%.*s", len - 1, &addr->sun_path[1]); audit_log_format(ab, "\""); } } static void audit_unix_sk_addr(struct audit_buffer *ab, const char *str, const struct sock *sk) { const struct unix_sock *u = unix_sk(sk); if (u && u->addr) { int addrlen; struct sockaddr_un *addr = aa_sunaddr(u, &addrlen); audit_unix_addr(ab, str, addr, addrlen); } else { audit_unix_addr(ab, str, NULL, 0); } } /* audit callback for net specific fields */ void audit_net_cb(struct audit_buffer *ab, void *va) { struct common_audit_data *sa = va; struct apparmor_audit_data *ad = aad(sa); if (address_family_names[ad->common.u.net->family]) audit_log_format(ab, " family=\"%s\"", address_family_names[ad->common.u.net->family]); else audit_log_format(ab, " family=\"unknown(%d)\"", ad->common.u.net->family); if (sock_type_names[ad->net.type]) audit_log_format(ab, " sock_type=\"%s\"", sock_type_names[ad->net.type]); else audit_log_format(ab, " sock_type=\"unknown(%d)\"", ad->net.type); audit_log_format(ab, " protocol=%d", ad->net.protocol); if (ad->request & NET_PERMS_MASK) { audit_log_format(ab, " requested_mask="); aa_audit_perm_mask(ab, ad->request, NULL, 0, net_mask_names, NET_PERMS_MASK); if (ad->denied & NET_PERMS_MASK) { audit_log_format(ab, " denied_mask="); aa_audit_perm_mask(ab, ad->denied, NULL, 0, net_mask_names, NET_PERMS_MASK); } } if (ad->common.u.net->family == PF_UNIX) { if (ad->net.addr || !ad->common.u.net->sk) audit_unix_addr(ab, "addr", unix_addr(ad->net.addr), ad->net.addrlen); else audit_unix_sk_addr(ab, "addr", ad->common.u.net->sk); if (ad->request & NET_PEER_MASK) { audit_unix_addr(ab, "peer_addr", unix_addr(ad->net.peer.addr), ad->net.peer.addrlen); } } if (ad->peer) { audit_log_format(ab, " peer="); aa_label_xaudit(ab, labels_ns(ad->subj_label), ad->peer, FLAGS_NONE, GFP_ATOMIC); } } /* standard permission lookup pattern - supports early bailout */ int aa_do_perms(struct aa_profile *profile, struct aa_policydb *policy, aa_state_t state, u32 request, struct aa_perms *p, struct apparmor_audit_data *ad) { struct aa_perms perms; AA_BUG(!profile); AA_BUG(!policy); if (state || !p) p = aa_lookup_perms(policy, state); perms = *p; aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_net_cb); } /* only continue match if * insufficient current perms at current state * indicates there are more perms in later state * Returns: perms struct if early match */ static struct aa_perms *early_match(struct aa_policydb *policy, aa_state_t state, u32 request) { struct aa_perms *p; p = aa_lookup_perms(policy, state); if (((p->allow & request) != request) && (p->allow & AA_CONT_MATCH)) return NULL; return p; } static aa_state_t aa_dfa_match_be16(struct aa_dfa *dfa, aa_state_t state, u16 data) { __be16 buffer = cpu_to_be16(data); return aa_dfa_match_len(dfa, state, (char *) &buffer, 2); } /** * aa_match_to_prot - match the af, type, protocol triplet * @policy: policy being matched * @state: state to start in * @request: permissions being requested, ignored if @p == NULL * @af: socket address family * @type: socket type * @protocol: socket protocol * @p: output - pointer to permission associated with match * @info: output - pointer to string describing failure * * RETURNS: state match stopped in. * * If @(p) is assigned a value the returned state will be the * corresponding state. Will not set @p on failure or if match completes * only if an early match occurs */ aa_state_t aa_match_to_prot(struct aa_policydb *policy, aa_state_t state, u32 request, u16 af, int type, int protocol, struct aa_perms **p, const char **info) { state = aa_dfa_match_be16(policy->dfa, state, (u16)af); if (!state) { *info = "failed af match"; return state; } state = aa_dfa_match_be16(policy->dfa, state, (u16)type); if (state) { if (p) *p = early_match(policy, state, request); if (!p || !*p) { state = aa_dfa_match_be16(policy->dfa, state, (u16)protocol); if (!state) *info = "failed protocol match"; } } else { *info = "failed type match"; } return state; } /* Generic af perm */ int aa_profile_af_perm(struct aa_profile *profile, struct apparmor_audit_data *ad, u32 request, u16 family, int type, int protocol) { struct aa_ruleset *rules = profile->label.rules[0]; struct aa_perms *p = NULL; aa_state_t state; AA_BUG(family >= AF_MAX); AA_BUG(type < 0 || type >= SOCK_MAX); AA_BUG(profile_unconfined(profile)); if (profile_unconfined(profile)) return 0; state = RULE_MEDIATES_NET(rules); if (!state) return 0; state = aa_match_to_prot(rules->policy, state, request, family, type, protocol, &p, &ad->info); return aa_do_perms(profile, rules->policy, state, request, p, ad); } int aa_af_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, u16 family, int type, int protocol) { struct aa_profile *profile; DEFINE_AUDIT_NET(ad, op, subj_cred, NULL, family, type, protocol); return fn_for_each_confined(label, profile, aa_profile_af_perm(profile, &ad, request, family, type, protocol)); } static int aa_label_sk_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct sock *sk) { struct aa_sk_ctx *ctx = aa_sock(sk); int error = 0; AA_BUG(!label); AA_BUG(!sk); if (rcu_access_pointer(ctx->label) != kernel_t && !unconfined(label)) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, subj_cred, sk); ad.subj_cred = subj_cred; error = fn_for_each_confined(label, profile, aa_profile_af_sk_perm(profile, &ad, request, sk)); } return error; } int aa_sk_perm(const char *op, u32 request, struct sock *sk) { struct aa_label *label; int error; AA_BUG(!sk); AA_BUG(in_interrupt()); /* TODO: switch to begin_current_label ???? */ label = begin_current_label_crit_section(); error = aa_label_sk_perm(current_cred(), label, op, request, sk); end_current_label_crit_section(label); return error; } int aa_sock_file_perm(const struct cred *subj_cred, struct aa_label *label, const char *op, u32 request, struct file *file) { struct socket *sock = (struct socket *) file->private_data; AA_BUG(!label); AA_BUG(!sock); AA_BUG(!sock->sk); if (sock->sk->sk_family == PF_UNIX) return aa_unix_file_perm(subj_cred, label, op, request, file); return aa_label_sk_perm(subj_cred, label, op, request, sock->sk); } #ifdef CONFIG_NETWORK_SECMARK static int apparmor_secmark_init(struct aa_secmark *secmark) { struct aa_label *label; if (secmark->label[0] == '*') { secmark->secid = AA_SECID_WILDCARD; return 0; } label = aa_label_strn_parse(&root_ns->unconfined->label, secmark->label, strlen(secmark->label), GFP_ATOMIC, false, false); if (IS_ERR(label)) return PTR_ERR(label); secmark->secid = label->secid; return 0; } static int aa_secmark_perm(struct aa_profile *profile, u32 request, u32 secid, struct apparmor_audit_data *ad) { int i, ret; struct aa_perms perms = { }; struct aa_ruleset *rules = profile->label.rules[0]; if (rules->secmark_count == 0) return 0; for (i = 0; i < rules->secmark_count; i++) { if (!rules->secmark[i].secid) { ret = apparmor_secmark_init(&rules->secmark[i]); if (ret) return ret; } if (rules->secmark[i].secid == secid || rules->secmark[i].secid == AA_SECID_WILDCARD) { if (rules->secmark[i].deny) perms.deny = ALL_PERMS_MASK; else perms.allow = ALL_PERMS_MASK; if (rules->secmark[i].audit) perms.audit = ALL_PERMS_MASK; } } aa_apply_modes_to_perms(profile, &perms); return aa_check_perms(profile, &perms, request, ad, audit_net_cb); } int apparmor_secmark_check(struct aa_label *label, char *op, u32 request, u32 secid, const struct sock *sk) { struct aa_profile *profile; DEFINE_AUDIT_SK(ad, op, NULL, sk); return fn_for_each_confined(label, profile, aa_secmark_perm(profile, request, secid, &ad)); } #endif
136 114 44 34 335 4 966 116 116 115 115 38 10 225 222 445 5 111 787 670 54 324 324 385 12 5 533 19 2 70 8 421 623 667 436 802 408 179 360 190 216 6 351 397 355 359 438 528 2 571 1 10 11 428 11 1 167 775 142 772 37 341 335 118 334 46 145 145 68 620 314 82 479 3 90 4 4 35 361 361 361 360 356 622 839 309 528 488 786 106 93 536 345 232 110 110 110 110 2 89 87 91 616 34 326 615 625 95 625 95 604 572 1 774 263 40 476 423 256 36 35 855 862 145 146 96 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 // SPDX-License-Identifier: GPL-2.0 /* * ext4.h * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/include/linux/minix_fs.h * * Copyright (C) 1991, 1992 Linus Torvalds */ #ifndef _EXT4_H #define _EXT4_H #include <linux/refcount.h> #include <linux/types.h> #include <linux/blkdev.h> #include <linux/magic.h> #include <linux/jbd2.h> #include <linux/quota.h> #include <linux/rwsem.h> #include <linux/rbtree.h> #include <linux/seqlock.h> #include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/sched/signal.h> #include <linux/blockgroup_lock.h> #include <linux/percpu_counter.h> #include <linux/ratelimit.h> #include <linux/crc32c.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> #include <linux/fiemap.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif #include <uapi/linux/ext4.h> #include <linux/fscrypt.h> #include <linux/fsverity.h> #include <linux/compiler.h> /* * The fourth extended filesystem constants/structures */ /* * with AGGRESSIVE_CHECK allocator runs consistency checks over * structures. these checks slow things down a lot */ #define AGGRESSIVE_CHECK__ /* * with DOUBLE_CHECK defined mballoc creates persistent in-core * bitmaps, maintains and uses them to check for double allocations */ #define DOUBLE_CHECK__ /* * Define EXT4FS_DEBUG to produce debug messages */ #undef EXT4FS_DEBUG /* * Debug code */ #ifdef EXT4FS_DEBUG #define ext4_debug(f, a...) \ do { \ printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ printk(KERN_DEBUG f, ## a); \ } while (0) #else #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c */ #define EXT_DEBUG__ /* * Dynamic printk for controlled extents debugging. */ #ifdef CONFIG_EXT4_DEBUG #define ext_debug(ino, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ current->comm, task_pid_nr(current), \ ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ __func__, ##__VA_ARGS__) #else #define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define ASSERT(assert) \ do { \ if (unlikely(!(assert))) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: '%s'\n", \ __func__, __FILE__, __LINE__, #assert); \ BUG(); \ } \ } while (0) /* data type for block offset of block group */ typedef int ext4_grpblk_t; /* data type for filesystem-wide blocks number */ typedef unsigned long long ext4_fsblk_t; /* data type for file logical block number */ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; enum SHIFT_DIRECTION { SHIFT_LEFT = 0, SHIFT_RIGHT, }; /* * For each criteria, mballoc has slightly different way of finding * the required blocks nad usually, higher the criteria the slower the * allocation. We start at lower criterias and keep falling back to * higher ones if we are not able to find any blocks. Lower (earlier) * criteria are faster. */ enum criteria { /* * Used when number of blocks needed is a power of 2. This * doesn't trigger any disk IO except prefetch and is the * fastest criteria. */ CR_POWER2_ALIGNED, /* * Tries to lookup in-memory data structures to find the most * suitable group that satisfies goal request. No disk IO * except block prefetch. */ CR_GOAL_LEN_FAST, /* * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal * length to the best available length for faster allocation. */ CR_BEST_AVAIL_LEN, /* * Reads each block group sequentially, performing disk IO if * necessary, to find suitable block group. Tries to * allocate goal length but might trim the request if nothing * is found after enough tries. */ CR_GOAL_LEN_SLOW, /* * Finds the first free set of blocks and allocates * those. This is only used in rare cases when * CR_GOAL_LEN_SLOW also fails to allocate anything. */ CR_ANY_FREE, /* * Number of criterias defined. */ EXT4_MB_NUM_CRS }; /* * Flags used in mballoc's allocation_context flags field. * * Also used to show what's going on for debugging purposes when the * flag field is exported via the traceport interface */ /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 0x0001 /* first blocks in the file */ #define EXT4_MB_HINT_FIRST 0x0008 /* data is being allocated */ #define EXT4_MB_HINT_DATA 0x0020 /* don't preallocate (for tails) */ #define EXT4_MB_HINT_NOPREALLOC 0x0040 /* allocate for locality group */ #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 /* allocate goal blocks or none */ #define EXT4_MB_HINT_GOAL_ONLY 0x0100 /* goal is meaningful */ #define EXT4_MB_HINT_TRY_GOAL 0x0200 /* blocks already pre-reserved by delayed allocation */ #define EXT4_MB_DELALLOC_RESERVED 0x0400 /* We are doing stream allocation */ #define EXT4_MB_STREAM_ALLOC 0x0800 /* Use reserved root blocks if needed */ #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; /* how many blocks we want to allocate */ unsigned int len; /* logical block in target inode */ ext4_lblk_t logical; /* the closest logical allocated block to the left */ ext4_lblk_t lleft; /* the closest logical allocated block to the right */ ext4_lblk_t lright; /* phys. target (a hint) */ ext4_fsblk_t goal; /* phys. block for the closest logical allocated block to the left */ ext4_fsblk_t pleft; /* phys. block for the closest logical allocated block to the right */ ext4_fsblk_t pright; /* flags. see above EXT4_MB_HINT_* */ unsigned int flags; }; /* * Logical to physical block mapping, used by ext4_map_blocks() * * This structure is used to pass requests into ext4_map_blocks() as * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ #define EXT4_MAP_NEW BIT(BH_New) #define EXT4_MAP_MAPPED BIT(BH_Mapped) #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) /* * This is for use in ext4_map_query_blocks() for a special case where we can * have a physically and logically contiguous blocks split across two leaf * nodes instead of a single extent. This is required in case of atomic writes * to know whether the returned extent is last in leaf. If yes, then lookup for * next in leaf block in ext4_map_query_blocks_next_in_leaf(). * - This is never going to be added to any buffer head state. * - We use the next available bit after BH_BITMAP_UPTODATE. */ #define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; ext4_lblk_t m_lblk; unsigned int m_len; unsigned int m_flags; }; /* * Block validity checking, system zone rbtree. */ struct ext4_system_blocks { struct rb_root root; struct rcu_head rcu; }; /* * Flags for ext4_io_end->flags */ #define EXT4_IO_END_UNWRITTEN 0x0001 #define EXT4_IO_END_FAILED 0x0002 #define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ }; /* * For converting unwritten extents on a work queue. 'handle' is used for * buffered writeback. */ typedef struct ext4_io_end { struct list_head list; /* per-file finished IO list */ handle_t *handle; /* handle reserved for extent * conversion */ struct inode *inode; /* file being written to */ struct bio *bio; /* Linked list of completed * bios covering the extent */ unsigned int flag; /* unwritten or not */ refcount_t count; /* reference counter */ struct list_head list_vec; /* list of ext4_io_end_vec */ } ext4_io_end_t; struct ext4_io_submit { struct writeback_control *io_wbc; struct bio *io_bio; ext4_io_end_t *io_end; sector_t io_next_block; }; /* * Special inodes numbers */ #define EXT4_BAD_INO 1 /* Bad blocks inode */ #define EXT4_ROOT_INO 2 /* Root inode */ #define EXT4_USR_QUOTA_INO 3 /* User quota inode */ #define EXT4_GRP_QUOTA_INO 4 /* Group quota inode */ #define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ #define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ #define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ #define EXT4_JOURNAL_INO 8 /* Journal inode */ /* First non-reserved inode for old ext4 filesystems */ #define EXT4_GOOD_OLD_FIRST_INO 11 /* * Maximal count of links to a file */ #define EXT4_LINK_MAX 65000 /* * Macro-instructions used to manage several block sizes */ #define EXT4_MIN_BLOCK_SIZE 1024 #define EXT4_MAX_BLOCK_SIZE 65536 #define EXT4_MIN_BLOCK_LOG_SIZE 10 #define EXT4_MAX_BLOCK_LOG_SIZE 16 #define EXT4_MAX_CLUSTER_LOG_SIZE 30 #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif #define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #define EXT4_CLUSTER_SIZE(s) (EXT4_BLOCK_SIZE(s) << \ EXT4_SB(s)->s_cluster_bits) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) # define EXT4_CLUSTER_BITS(s) (EXT4_SB(s)->s_cluster_bits) #else # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) #endif #ifdef __KERNEL__ #define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) #define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) #define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) #else #define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_INODE_SIZE : \ (s)->s_inode_size) #define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ EXT4_GOOD_OLD_FIRST_INO : \ (s)->s_first_ino) #endif #define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) #define EXT4_B_TO_LBLK(inode, offset) \ (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) /* Translate a cluster number to a block number */ #define EXT4_C2B(sbi, cluster) ((cluster) << (sbi)->s_cluster_bits) /* Translate # of blks to # of clusters */ #define EXT4_NUM_B2C(sbi, blks) (((blks) + (sbi)->s_cluster_ratio - 1) >> \ (sbi)->s_cluster_bits) /* Mask out the low bits to get the starting block of the cluster */ #define EXT4_PBLK_CMASK(s, pblk) ((pblk) & \ ~((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_CMASK(s, lblk) ((lblk) & \ ~((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* Fill in the low bits to get the last block of the cluster */ #define EXT4_LBLK_CFILL(sbi, lblk) ((lblk) | \ ((ext4_lblk_t) (sbi)->s_cluster_ratio - 1)) /* Get the cluster offset */ #define EXT4_PBLK_COFF(s, pblk) ((pblk) & \ ((ext4_fsblk_t) (s)->s_cluster_ratio - 1)) #define EXT4_LBLK_COFF(s, lblk) ((lblk) & \ ((ext4_lblk_t) (s)->s_cluster_ratio - 1)) /* * Structure of a blocks group descriptor */ struct ext4_group_desc { __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ __le16 bg_free_blocks_count_lo;/* Free blocks count */ __le16 bg_free_inodes_count_lo;/* Free inodes count */ __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __le32 bg_exclude_bitmap_lo; /* Exclude bitmap for snapshots */ __le16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */ __le16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */ __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ __le32 bg_inode_table_hi; /* Inodes table block MSB */ __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __le32 bg_exclude_bitmap_hi; /* Exclude bitmap block MSB */ __le16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */ __le16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */ __u32 bg_reserved; }; #define EXT4_BG_INODE_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \ sizeof(__le16)) #define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END \ (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \ sizeof(__le16)) /* * Structure of a flex block group info */ struct flex_groups { atomic64_t free_clusters; atomic_t free_inodes; atomic_t used_dirs; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ /* * Macro-instructions used to manage group descriptors */ #define EXT4_MIN_DESC_SIZE 32 #define EXT4_MIN_DESC_SIZE_64BIT 64 #define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE #define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) #ifdef __KERNEL__ # define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) # define EXT4_CLUSTERS_PER_GROUP(s) (EXT4_SB(s)->s_clusters_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) # define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) # define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) #else # define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) # define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) # define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) #endif /* * Constants relative to the data blocks */ #define EXT4_NDIR_BLOCKS 12 #define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS #define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) #define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) #define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) /* * Inode flags */ #define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ #define EXT4_UNRM_FL 0x00000002 /* Undelete */ #define EXT4_COMPR_FL 0x00000004 /* Compress file */ #define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ #define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ #define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ #define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ #define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ /* Reserved for compression usage... */ #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ /* nb: was previously EXT2_ECOMPR_FL */ #define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ #define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ #define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ #define EXT4_VERITY_FL 0x00100000 /* Verity protected inode */ #define EXT4_EA_INODE_FL 0x00200000 /* Inode used for large EA */ /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_DAX_FL 0x02000000 /* Inode is DAX */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ /* User modifiable flags */ #define EXT4_FL_USER_MODIFIABLE (EXT4_SECRM_FL | \ EXT4_UNRM_FL | \ EXT4_COMPR_FL | \ EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ EXT4_NODUMP_FL | \ EXT4_NOATIME_FL | \ EXT4_JOURNAL_DATA_FL | \ EXT4_NOTAIL_FL | \ EXT4_DIRSYNC_FL | \ EXT4_TOPDIR_FL | \ EXT4_EXTENTS_FL | \ 0x00400000 /* EXT4_EOFBLOCKS_FL */ | \ EXT4_DAX_FL | \ EXT4_PROJINHERIT_FL | \ EXT4_CASEFOLD_FL) /* User visible flags */ #define EXT4_FL_USER_VISIBLE (EXT4_FL_USER_MODIFIABLE | \ EXT4_DIRTY_FL | \ EXT4_COMPRBLK_FL | \ EXT4_NOCOMPR_FL | \ EXT4_ENCRYPT_FL | \ EXT4_INDEX_FL | \ EXT4_VERITY_FL | \ EXT4_INLINE_DATA_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ EXT4_PROJINHERIT_FL | EXT4_CASEFOLD_FL |\ EXT4_DAX_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL | EXT4_CASEFOLD_FL |\ EXT4_PROJINHERIT_FL)) /* Flags that are appropriate for non-directories/regular files. */ #define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL) /* The only flags that should be swapped */ #define EXT4_FL_SHOULD_SWAP (EXT4_HUGE_FILE_FL | EXT4_EXTENTS_FL) /* Flags which are mutually exclusive to DAX */ #define EXT4_DAX_MUT_EXCL (EXT4_VERITY_FL | EXT4_ENCRYPT_FL |\ EXT4_JOURNAL_DATA_FL | EXT4_INLINE_DATA_FL) /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) { if (S_ISDIR(mode)) return flags; else if (S_ISREG(mode)) return flags & EXT4_REG_FLMASK; else return flags & EXT4_OTHER_FLMASK; } /* * Inode flags used for atomic set/get */ enum { EXT4_INODE_SECRM = 0, /* Secure deletion */ EXT4_INODE_UNRM = 1, /* Undelete */ EXT4_INODE_COMPR = 2, /* Compress file */ EXT4_INODE_SYNC = 3, /* Synchronous updates */ EXT4_INODE_IMMUTABLE = 4, /* Immutable file */ EXT4_INODE_APPEND = 5, /* writes to file may only append */ EXT4_INODE_NODUMP = 6, /* do not dump file */ EXT4_INODE_NOATIME = 7, /* do not update atime */ /* Reserved for compression usage... */ EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ EXT4_INODE_ENCRYPT = 11, /* Encrypted file */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */ EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */ EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */ EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/ EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */ EXT4_INODE_EXTENTS = 19, /* Inode uses extents */ EXT4_INODE_VERITY = 20, /* Verity protected inode */ EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */ /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_DAX = 25, /* Inode is DAX */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; /* * Since it's pretty easy to mix up bit numbers and hex values, we use a * build-time check to make sure that EXT4_XXX_FL is consistent with respect to * EXT4_INODE_XXX. If all is well, the macros will be dropped, so, it won't cost * any extra space in the compiled kernel image, otherwise, the build will fail. * It's important that these values are the same, since we are using * EXT4_INODE_XXX to test for flag values, but EXT4_XXX_FL must be consistent * with the values of FS_XXX_FL defined in include/linux/fs.h and the on-disk * values found in ext2, ext3 and ext4 filesystems, and of course the values * defined in e2fsprogs. * * It's not paranoia if the Murphy's Law really *is* out to get you. :-) */ #define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG)) #define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG)) static inline void ext4_check_flag_values(void) { CHECK_FLAG_VALUE(SECRM); CHECK_FLAG_VALUE(UNRM); CHECK_FLAG_VALUE(COMPR); CHECK_FLAG_VALUE(SYNC); CHECK_FLAG_VALUE(IMMUTABLE); CHECK_FLAG_VALUE(APPEND); CHECK_FLAG_VALUE(NODUMP); CHECK_FLAG_VALUE(NOATIME); CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); CHECK_FLAG_VALUE(NOTAIL); CHECK_FLAG_VALUE(DIRSYNC); CHECK_FLAG_VALUE(TOPDIR); CHECK_FLAG_VALUE(HUGE_FILE); CHECK_FLAG_VALUE(EXTENTS); CHECK_FLAG_VALUE(VERITY); CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } #if defined(__KERNEL__) && defined(CONFIG_COMPAT) struct compat_ext4_new_group_input { u32 group; compat_u64 block_bitmap; compat_u64 inode_bitmap; compat_u64 inode_table; u32 blocks_count; u16 reserved_blocks; u16 unused; }; #endif /* The struct ext4_new_group_input in kernel space, with free_blocks_count */ struct ext4_new_group_data { __u32 group; __u64 block_bitmap; __u64 inode_bitmap; __u64 inode_table; __u32 blocks_count; __u16 reserved_blocks; __u16 mdata_blocks; __u32 free_clusters_count; }; /* Indexes used to index group tables in ext4_new_group_data */ enum { BLOCK_BITMAP = 0, /* block bitmap */ INODE_BITMAP, /* inode bitmap */ INODE_TABLE, /* inode tables */ GROUP_TABLE_COUNT, }; /* * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unwritten extent to be an initialized ext4 */ #define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unwritten extent */ #define EXT4_GET_BLOCKS_UNWRIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT (EXT4_GET_BLOCKS_UNWRIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Caller is from the delayed allocation writeout path * finally doing the actual allocation of delayed blocks */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 /* caller is from the direct IO path, request to creation of an unwritten extents if not allocated, split the unwritten extent if blocks has been preallocated already*/ #define EXT4_GET_BLOCKS_PRE_IO 0x0008 #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) /* Caller is in the context of data submission, such as writeback, * fsync, etc. Especially, in the generic writeback path, caller will * submit data before dropping transaction handle. This allows jbd2 * to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* Convert extent to initialized after IO complete */ #define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 /* * Atomic write caller needs this to query in the slow path of mixed mapping * case, when a contiguous extent can be split across two adjacent leaf nodes. * Look EXT4_MAP_QUERY_LAST_IN_LEAF. */ #define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 /* * ext4_map_query_blocks() uses this filter mask to filter the flags needed to * pass while lookup/querying of on disk extent tree. */ #define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ EXT4_EX_NOFAIL |\ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks */ #define EXT4_FREE_BLOCKS_METADATA 0x0001 #define EXT4_FREE_BLOCKS_FORGET 0x0002 #define EXT4_FREE_BLOCKS_VALIDATED 0x0004 #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE 0x0008 #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER 0x0020 #define EXT4_FREE_BLOCKS_RERESERVE_CLUSTER 0x0040 #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation */ #define EXT4_IOC32_GETVERSION _IOR('f', 3, int) #define EXT4_IOC32_SETVERSION _IOW('f', 4, int) #define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) #define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) #define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) #define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input) #define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION #endif /* Max physical block we can address w/o extents */ #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF /* Max logical block we can support */ #define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFE /* * Structure of an inode on the disk */ struct ext4_inode { __le16 i_mode; /* File mode */ __le16 i_uid; /* Low 16 bits of Owner Uid */ __le32 i_size_lo; /* Size in bytes */ __le32 i_atime; /* Access time */ __le32 i_ctime; /* Inode Change time */ __le32 i_mtime; /* Modification time */ __le32 i_dtime; /* Deletion Time */ __le16 i_gid; /* Low 16 bits of Group Id */ __le16 i_links_count; /* Links count */ __le32 i_blocks_lo; /* Blocks count */ __le32 i_flags; /* File flags */ union { struct { __le32 l_i_version; } linux1; struct { __u32 h_i_translator; } hurd1; struct { __u32 m_i_reserved1; } masix1; } osd1; /* OS dependent 1 */ __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ __le32 i_generation; /* File version (for NFS) */ __le32 i_file_acl_lo; /* File ACL */ __le32 i_size_high; __le32 i_obso_faddr; /* Obsoleted fragment address */ union { struct { __le16 l_i_blocks_high; /* were l_i_reserved1 */ __le16 l_i_file_acl_high; __le16 l_i_uid_high; /* these 2 fields */ __le16 l_i_gid_high; /* were reserved2[0] */ __le16 l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */ __le16 l_i_reserved; } linux2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __u16 h_i_mode_high; __u16 h_i_uid_high; __u16 h_i_gid_high; __u32 h_i_author; } hurd2; struct { __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ __le16 m_i_file_acl_high; __u32 m_i_reserved2[2]; } masix2; } osd2; /* OS dependent 2 */ __le16 i_extra_isize; __le16 i_checksum_hi; /* crc32c(uuid+inum+inode) BE */ __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ __le32 i_crtime; /* File Creation time */ __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ __le32 i_version_hi; /* high 32 bits for 64-bit version */ __le32 i_projid; /* Project ID */ }; #define EXT4_EPOCH_BITS 2 #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) #define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) /* * Extended fields will fit into an inode if the filesystem was formatted * with large inodes (-I 256 or larger) and there are not currently any EAs * consuming all of the available space. For new inodes we always reserve * enough space for the kernel's known extended fields, but for inodes * created with an old kernel this might not have been the case. None of * the extended inode fields is critical for correct filesystem operation. * This macro checks if a certain field fits in the inode. Note that * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize */ #define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ ((offsetof(typeof(*ext4_inode), field) + \ sizeof((ext4_inode)->field)) \ <= (EXT4_GOOD_OLD_INODE_SIZE + \ (einode)->i_extra_isize)) \ /* * We use an encoding that preserves the times for extra epoch "00": * * extra msb of adjust for signed * epoch 32-bit 32-bit tv_sec to * bits time decoded 64-bit tv_sec 64-bit tv_sec valid time range * 0 0 1 -0x80000000..-0x00000001 0x000000000 1901-12-13..1969-12-31 * 0 0 0 0x000000000..0x07fffffff 0x000000000 1970-01-01..2038-01-19 * 0 1 1 0x080000000..0x0ffffffff 0x100000000 2038-01-19..2106-02-07 * 0 1 0 0x100000000..0x17fffffff 0x100000000 2106-02-07..2174-02-25 * 1 0 1 0x180000000..0x1ffffffff 0x200000000 2174-02-25..2242-03-16 * 1 0 0 0x200000000..0x27fffffff 0x200000000 2242-03-16..2310-04-04 * 1 1 1 0x280000000..0x2ffffffff 0x300000000 2310-04-04..2378-04-22 * 1 1 0 0x300000000..0x37fffffff 0x300000000 2378-04-22..2446-05-10 * * Note that previous versions of the kernel on 64-bit systems would * incorrectly use extra epoch bits 1,1 for dates between 1901 and * 1970. e2fsck will correct this, assuming that it is run on the * affected filesystem before 2242. */ static inline __le32 ext4_encode_extra_time(struct timespec64 ts) { u32 extra = ((ts.tv_sec - (s32)ts.tv_sec) >> 32) & EXT4_EPOCH_MASK; return cpu_to_le32(extra | (ts.tv_nsec << EXT4_EPOCH_BITS)); } static inline struct timespec64 ext4_decode_extra_time(__le32 base, __le32 extra) { struct timespec64 ts = { .tv_sec = (signed)le32_to_cpu(base) }; if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) ts.tv_sec += (u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) << 32; ts.tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> EXT4_EPOCH_BITS; return ts; } #define EXT4_INODE_SET_XTIME_VAL(xtime, inode, raw_inode, ts) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \ (raw_inode)->xtime = cpu_to_le32((ts).tv_sec); \ (raw_inode)->xtime ## _extra = ext4_encode_extra_time(ts); \ } else \ (raw_inode)->xtime = cpu_to_le32(clamp_t(int32_t, (ts).tv_sec, S32_MIN, S32_MAX)); \ } while (0) #define EXT4_INODE_SET_ATIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_atime, inode, raw_inode, inode_get_atime(inode)) #define EXT4_INODE_SET_MTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_mtime, inode, raw_inode, inode_get_mtime(inode)) #define EXT4_INODE_SET_CTIME(inode, raw_inode) \ EXT4_INODE_SET_XTIME_VAL(i_ctime, inode, raw_inode, inode_get_ctime(inode)) #define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ EXT4_INODE_SET_XTIME_VAL(xtime, &((einode)->vfs_inode), \ raw_inode, (einode)->xtime) #define EXT4_INODE_GET_XTIME_VAL(xtime, inode, raw_inode) \ (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra) ? \ ext4_decode_extra_time((raw_inode)->xtime, \ (raw_inode)->xtime ## _extra) : \ (struct timespec64) { \ .tv_sec = (signed)le32_to_cpu((raw_inode)->xtime) \ }) #define EXT4_INODE_GET_ATIME(inode, raw_inode) \ do { \ inode_set_atime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_atime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_MTIME(inode, raw_inode) \ do { \ inode_set_mtime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_mtime, inode, raw_inode)); \ } while (0) #define EXT4_INODE_GET_CTIME(inode, raw_inode) \ do { \ inode_set_ctime_to_ts(inode, \ EXT4_INODE_GET_XTIME_VAL(i_ctime, inode, raw_inode)); \ } while (0) #define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ do { \ if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ (einode)->xtime = \ EXT4_INODE_GET_XTIME_VAL(xtime, &(einode->vfs_inode), \ raw_inode); \ else \ (einode)->xtime = (struct timespec64){0, 0}; \ } while (0) #define i_disk_version osd1.linux1.l_i_version #if defined(__KERNEL__) || defined(__linux__) #define i_reserved1 osd1.linux1.l_i_reserved1 #define i_file_acl_high osd2.linux2.l_i_file_acl_high #define i_blocks_high osd2.linux2.l_i_blocks_high #define i_uid_low i_uid #define i_gid_low i_gid #define i_uid_high osd2.linux2.l_i_uid_high #define i_gid_high osd2.linux2.l_i_gid_high #define i_checksum_lo osd2.linux2.l_i_checksum_lo #elif defined(__GNU__) #define i_translator osd1.hurd1.h_i_translator #define i_uid_high osd2.hurd2.h_i_uid_high #define i_gid_high osd2.hurd2.h_i_gid_high #define i_author osd2.hurd2.h_i_author #elif defined(__masix__) #define i_reserved1 osd1.masix1.m_i_reserved1 #define i_file_acl_high osd2.masix2.m_i_file_acl_high #define i_reserved2 osd2.masix2.m_i_reserved2 #endif /* defined(__KERNEL__) || defined(__linux__) */ #include "extents_status.h" #include "fast_commit.h" /* * Lock subclasses for i_data_sem in the ext4_inode_info structure. * * These are needed to avoid lockdep false positives when we need to * allocate blocks to the quota inode during ext4_map_blocks(), while * holding i_data_sem for a normal (non-quota) inode. Since we don't * do quota tracking for the quota inode, this avoids deadlock (as * well as infinite recursion, since it isn't turtles all the way * down...) * * I_DATA_SEM_NORMAL - Used for most inodes * I_DATA_SEM_OTHER - Used by move_inode.c for the second normal inode * where the second inode has larger inode number * than the first * I_DATA_SEM_QUOTA - Used for quota inodes only * I_DATA_SEM_EA - Used for ea_inodes only */ enum { I_DATA_SEM_NORMAL = 0, I_DATA_SEM_OTHER, I_DATA_SEM_QUOTA, I_DATA_SEM_EA }; /* * fourth extended file system inode data in memory */ struct ext4_inode_info { __le32 i_data[15]; /* unconverted */ __u32 i_dtime; ext4_fsblk_t i_file_acl; /* * i_block_group is the number of the block group which contains * this file's inode. Constant across the lifetime of the inode, * it is used for making block allocation decisions - we try to * place a file's data blocks near its inode block, and new inodes * near to their parent directory's inode. */ ext4_group_t i_block_group; ext4_lblk_t i_dir_start_lookup; #if (BITS_PER_LONG < 64) unsigned long i_state_flags; /* Dynamic state flags */ #endif unsigned long i_flags; /* * Extended attributes can be read independently of the main file * data. Taking i_rwsem even when reading would cause contention * between readers of EAs and writers of regular file data, so * instead we synchronize on xattr_sem when reading or changing * EAs. */ struct rw_semaphore xattr_sem; /* * Inodes with EXT4_STATE_ORPHAN_FILE use i_orphan_idx. Otherwise * i_orphan is used. */ union { struct list_head i_orphan; /* unlinked but open inodes */ unsigned int i_orphan_idx; /* Index in orphan file */ }; /* Fast commit related info */ /* For tracking dentry create updates */ struct list_head i_fc_dilist; struct list_head i_fc_list; /* * inodes that need fast commit * protected by sbi->s_fc_lock. */ /* Start of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_start; /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; /* * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len * and inode's EXT4_FC_STATE_COMMITTING state bit. */ spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not * in memory. During truncate, i_size is set to the new size by * the VFS prior to calling ext4_truncate(), but the filesystem won't * set i_disksize to 0 until the truncate is actually under way. * * The intent is that i_disksize always represents the blocks which * are used by this file. This allows recovery to restart truncate * on orphans if we crash during truncate. We actually write i_disksize * into the on-disk inode when writing inodes out, instead of i_size. * * The only time when i_disksize and i_size may be different is when * a truncate is in progress. The only things which change i_disksize * are ext4_get_block (growth) and ext4_truncate (shrinkth). */ loff_t i_disksize; /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's * data tree are chopped off during truncate. We can't do that in * ext4 because whenever we perform intermediate commits during * truncate, the inode and all the metadata blocks *must* be in a * consistent state which allows truncation of the orphans to restart * during recovery. Hence we must fix the get_block-vs-truncate race * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; struct inode vfs_inode; struct jbd2_inode *jinode; /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. */ struct timespec64 i_crtime; /* mballoc */ atomic_t i_prealloc_active; /* allocation reservation info for delalloc */ /* In case of bigalloc, this refer to clusters rather than blocks */ unsigned int i_reserved_data_blocks; struct rb_root i_prealloc_node; rwlock_t i_prealloc_lock; /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_shk_nr; /* protected by i_es_lock */ ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; /* pending cluster reservations for bigalloc file systems */ struct ext4_pending_tree i_pending_tree; /* on-disk additional length */ __u16 i_extra_isize; /* Indicate the inline data space. */ u16 i_inline_off; u16 i_inline_size; #ifdef CONFIG_QUOTA /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; /* * Completed IOs that need unwritten extents handling and have * transaction reserved */ struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. */ tid_t i_sync_tid; tid_t i_datasync_tid; #ifdef CONFIG_QUOTA struct dquot __rcu *i_dquot[MAXQUOTAS]; #endif /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; kprojid_t i_projid; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_inode_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif }; /* * File system states */ #define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ #define EXT4_ERROR_FS 0x0002 /* Errors detected */ #define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ #define EXT4_FC_REPLAY 0x0020 /* Fast commit replay ongoing */ /* * Misc. filesystem flags */ #define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ #define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ #define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ /* * Mount flags set via mount options or defaults */ #define EXT4_MOUNT_NO_MBCACHE 0x00001 /* Do not use mbcache */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ #define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ #define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ #define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ #ifdef CONFIG_FS_DAX #define EXT4_MOUNT_DAX_ALWAYS 0x00200 /* Direct Access */ #else #define EXT4_MOUNT_DAX_ALWAYS 0 #endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ #define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ #define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ #define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ #define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ #define EXT4_MOUNT_QUOTA 0x40000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x80000 /* "old" user quota, * enable enforcement for hidden * quota files */ #define EXT4_MOUNT_GRPQUOTA 0x100000 /* "old" group quota, enable * enforcement for hidden quota * files */ #define EXT4_MOUNT_PRJQUOTA 0x200000 /* Enable project quota * enforcement */ #define EXT4_MOUNT_DIOREAD_NOLOCK 0x400000 /* Enable support for dio read nolocking */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ #define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ #define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */ #define EXT4_MOUNT_INIT_INODE_TABLE 0x80000000 /* Initialize uninitialized itables */ /* * Mount flags set either automatically (could not be set by mount option) * based on per file system feature or property or in special cases such as * distinguishing between explicit mount option definition and default. */ #define EXT4_MOUNT2_EXPLICIT_DELALLOC 0x00000001 /* User explicitly specified delalloc */ #define EXT4_MOUNT2_STD_GROUP_SIZE 0x00000002 /* We have standard group size of blocksize * 8 blocks */ #define EXT4_MOUNT2_HURD_COMPAT 0x00000004 /* Support HURD-castrated file systems */ #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ #define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group * scanning in mballoc */ #define EXT4_MOUNT2_ABORT 0x00000100 /* Abort filesystem */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ EXT4_MOUNT_##opt #define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ EXT4_MOUNT_##opt) #define clear_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 &= \ ~EXT4_MOUNT2_##opt #define set_opt2(sb, opt) EXT4_SB(sb)->s_mount_opt2 |= \ EXT4_MOUNT2_##opt #define test_opt2(sb, opt) (EXT4_SB(sb)->s_mount_opt2 & \ EXT4_MOUNT2_##opt) #define ext4_test_and_set_bit __test_and_set_bit_le #define ext4_set_bit __set_bit_le #define ext4_test_and_clear_bit __test_and_clear_bit_le #define ext4_clear_bit __clear_bit_le #define ext4_test_bit test_bit_le #define ext4_find_next_zero_bit find_next_zero_bit_le #define ext4_find_next_bit find_next_bit_le extern void mb_set_bits(void *bm, int cur, int len); /* * Maximal mount counts between two filesystem checks */ #define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ #define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ /* * Behaviour when detecting errors */ #define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ #define EXT4_ERRORS_RO 2 /* Remount fs read-only */ #define EXT4_ERRORS_PANIC 3 /* Panic */ #define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 #define EXT4_LABEL_MAX 16 /* * Structure of the super block */ struct ext4_super_block { /*00*/ __le32 s_inodes_count; /* Inodes count */ __le32 s_blocks_count_lo; /* Blocks count */ __le32 s_r_blocks_count_lo; /* Reserved blocks count */ __le32 s_free_blocks_count_lo; /* Free blocks count */ /*10*/ __le32 s_free_inodes_count; /* Free inodes count */ __le32 s_first_data_block; /* First Data Block */ __le32 s_log_block_size; /* Block size */ __le32 s_log_cluster_size; /* Allocation cluster size */ /*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ __le32 s_clusters_per_group; /* # Clusters per group */ __le32 s_inodes_per_group; /* # Inodes per group */ __le32 s_mtime; /* Mount time */ /*30*/ __le32 s_wtime; /* Write time */ __le16 s_mnt_count; /* Mount count */ __le16 s_max_mnt_count; /* Maximal mount count */ __le16 s_magic; /* Magic signature */ __le16 s_state; /* File system state */ __le16 s_errors; /* Behaviour when detecting errors */ __le16 s_minor_rev_level; /* minor revision level */ /*40*/ __le32 s_lastcheck; /* time of last check */ __le32 s_checkinterval; /* max. time between checks */ __le32 s_creator_os; /* OS */ __le32 s_rev_level; /* Revision level */ /*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ __le16 s_def_resgid; /* Default gid for reserved blocks */ /* * These fields are for EXT4_DYNAMIC_REV superblocks only. * * Note: the difference between the compatible feature set and * the incompatible feature set is that if there is a bit set * in the incompatible feature set that the kernel doesn't * know about, it should refuse to mount the filesystem. * * e2fsck's requirements are more strict; if it doesn't know * about a feature in either the compatible or incompatible * feature set, it must abort and not try to meddle with * things it doesn't understand... */ __le32 s_first_ino; /* First non-reserved inode */ __le16 s_inode_size; /* size of inode structure */ __le16 s_block_group_nr; /* block group # of this superblock */ __le32 s_feature_compat; /* compatible feature set */ /*60*/ __le32 s_feature_incompat; /* incompatible feature set */ __le32 s_feature_ro_compat; /* readonly-compatible feature set */ /*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ /*78*/ char s_volume_name[EXT4_LABEL_MAX] __nonstring; /* volume name */ /*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */ /*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ /* * Performance hints. Directory preallocation should only * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. */ __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ /* * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. */ /*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ /*E0*/ __le32 s_journal_inum; /* inode number of journal file */ __le32 s_journal_dev; /* device number of journal file */ __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ __le32 s_mkfs_time; /* When the filesystem was created */ __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ /* 64bit support valid if EXT4_FEATURE_INCOMPAT_64BIT */ /*150*/ __le32 s_blocks_count_hi; /* Blocks count */ __le32 s_r_blocks_count_hi; /* Reserved blocks count */ __le32 s_free_blocks_count_hi; /* Free blocks count */ __le16 s_min_extra_isize; /* All inodes have at least # bytes */ __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ __le32 s_flags; /* Miscellaneous flags */ __le16 s_raid_stride; /* RAID stride */ __le16 s_mmp_update_interval; /* # seconds to wait in MMP checking */ __le64 s_mmp_block; /* Block for multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ __u8 s_checksum_type; /* metadata checksum algorithm used */ __u8 s_encryption_level; /* versioning level for encryption */ __u8 s_reserved_pad; /* Padding to next 32bits */ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ __le32 s_snapshot_inum; /* Inode number of active snapshot */ __le32 s_snapshot_id; /* sequential ID of active snapshot */ __le64 s_snapshot_r_blocks_count; /* reserved blocks for active snapshot's future use */ __le32 s_snapshot_list; /* inode number of the head of the on-disk snapshot list */ #define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) __le32 s_error_count; /* number of fs errors */ __le32 s_first_error_time; /* first time an error happened */ __le32 s_first_error_ino; /* inode involved in first error */ __le64 s_first_error_block; /* block involved of first error */ __u8 s_first_error_func[32] __nonstring; /* function where the error happened */ __le32 s_first_error_line; /* line number where error happened */ __le32 s_last_error_time; /* most recent time of an error */ __le32 s_last_error_ino; /* inode involved in last error */ __le32 s_last_error_line; /* line number where error happened */ __le64 s_last_error_block; /* block involved of last error */ __u8 s_last_error_func[32] __nonstring; /* function where the error happened */ #define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) __u8 s_mount_opts[64]; __le32 s_usr_quota_inum; /* inode for tracking user quota */ __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ __u8 s_encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ __le32 s_lpf_ino; /* Location of the lost+found inode */ __le32 s_prj_quota_inum; /* inode for tracking project quota */ __le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */ __u8 s_wtime_hi; __u8 s_mtime_hi; __u8 s_mkfs_time_hi; __u8 s_lastcheck_hi; __u8 s_first_error_time_hi; __u8 s_last_error_time_hi; __u8 s_first_error_errcode; __u8 s_last_error_errcode; __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ __le16 s_def_resuid_hi; __le16 s_def_resgid_hi; __le32 s_reserved[93]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) #ifdef __KERNEL__ /* Number of quota types we support */ #define EXT4_MAXQUOTAS 3 #define EXT4_ENC_UTF8_12_1 1 /* Types of ext4 journal triggers */ enum ext4_journal_trigger_type { EXT4_JTR_ORPHAN_FILE, EXT4_JTR_NONE /* This must be the last entry for indexing to work! */ }; #define EXT4_JOURNAL_TRIGGER_COUNT EXT4_JTR_NONE struct ext4_journal_trigger { struct jbd2_buffer_trigger_type tr_triggers; struct super_block *sb; }; static inline struct ext4_journal_trigger *EXT4_TRIGGER( struct jbd2_buffer_trigger_type *trigger) { return container_of(trigger, struct ext4_journal_trigger, tr_triggers); } #define EXT4_ORPHAN_BLOCK_MAGIC 0x0b10ca04 /* Structure at the tail of orphan block */ struct ext4_orphan_block_tail { __le32 ob_magic; __le32 ob_checksum; }; static inline int ext4_inodes_per_orphan_block(struct super_block *sb) { return (sb->s_blocksize - sizeof(struct ext4_orphan_block_tail)) / sizeof(u32); } struct ext4_orphan_block { atomic_t ob_free_entries; /* Number of free orphan entries in block */ struct buffer_head *ob_bh; /* Buffer for orphan block */ }; /* * Info about orphan file. */ struct ext4_orphan_info { int of_blocks; /* Number of orphan blocks in a file */ __u32 of_csum_seed; /* Checksum seed for orphan file */ struct ext4_orphan_block *of_binfo; /* Array with info about orphan * file blocks */ }; /* * fourth extended-fs super-block data in memory */ struct ext4_sb_info { unsigned long s_desc_size; /* Size of a group descriptor in bytes */ unsigned long s_inodes_per_block;/* Number of inodes per block */ unsigned long s_blocks_per_group;/* Number of blocks in a group */ unsigned long s_clusters_per_group; /* Number of clusters in a group */ unsigned long s_inodes_per_group;/* Number of inodes in a group */ unsigned long s_itb_per_group; /* Number of inode table blocks per group */ unsigned long s_gdb_count; /* Number of group descriptor blocks */ unsigned long s_desc_per_block; /* Number of group descriptors per block */ ext4_group_t s_groups_count; /* Number of groups in the fs */ ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ unsigned long s_overhead; /* # of fs overhead clusters */ unsigned int s_cluster_ratio; /* Number of blocks per cluster */ unsigned int s_cluster_bits; /* log2 of s_cluster_ratio */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ /* Array of bh's for the block group descriptors */ struct buffer_head * __rcu *s_group_desc; unsigned int s_mount_opt; unsigned int s_mount_opt2; unsigned long s_mount_flags; unsigned int s_def_mount_opt; unsigned int s_def_mount_opt2; ext4_fsblk_t s_sb_block; atomic64_t s_resv_clusters; kuid_t s_resuid; kgid_t s_resgid; unsigned short s_mount_state; unsigned short s_pad; int s_addr_per_block_bits; int s_desc_per_block_bits; int s_inode_size; int s_first_ino; unsigned int s_inode_readahead_blks; unsigned int s_inode_goal; u32 s_hash_seed[4]; int s_def_hash_version; int s_hash_unsigned; /* 3 if hash should be unsigned, 0 if not */ struct percpu_counter s_freeclusters_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; struct percpu_counter s_dirtyclusters_counter; struct percpu_counter s_sra_exceeded_retry_limit; struct blockgroup_lock *s_blockgroup_lock; struct proc_dir_entry *s_proc; struct kobject s_kobj; struct completion s_kobj_unregister; struct super_block *s_sb; struct buffer_head *s_mmp_bh; /* Journaling */ struct journal_s *s_journal; unsigned long s_ext4_flags; /* Ext4 superblock flags */ struct mutex s_orphan_lock; /* Protects on disk list changes */ struct list_head s_orphan; /* List of orphaned inodes in on disk list */ struct ext4_orphan_info s_orphan_info; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; struct file *s_journal_bdev_file; #ifdef CONFIG_QUOTA /* Names of quota files with journalled quota */ char __rcu *s_qf_names[EXT4_MAXQUOTAS]; int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ struct ext4_system_blocks __rcu *s_system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ unsigned long s_ext_min; unsigned long s_ext_max; unsigned long s_depth_max; spinlock_t s_ext_stats_lock; unsigned long s_ext_blocks; unsigned long s_ext_extents; #endif /* for buddy allocator */ struct ext4_group_info ** __rcu *s_group_info; struct inode *s_buddy_cache; spinlock_t s_md_lock; unsigned short *s_mb_offsets; unsigned int *s_mb_maxs; unsigned int s_group_info_size; atomic_t s_mb_free_pending; struct list_head s_freed_data_list[2]; /* List of blocks to be freed after commit completed */ struct list_head s_discard_list; struct work_struct s_discard_work; atomic_t s_retry_alloc_pending; struct xarray *s_mb_avg_fragment_size; struct xarray *s_mb_largest_free_orders; /* tunables */ unsigned long s_stripe; unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; unsigned int s_mb_stats; unsigned int s_mb_order2_reqs; unsigned int s_mb_group_prealloc; unsigned int s_max_dir_size_kb; unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; unsigned int s_sb_update_sec; unsigned int s_sb_update_kb; /* where last allocation was done - for stream allocation */ ext4_group_t *s_mb_last_groups; unsigned int s_mb_nr_global_goals; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */ atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_stream_goals; /* stream allocation global goal hits */ atomic_t s_bal_len_goals; /* len goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS]; atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */ atomic_t s_mb_buddies_generated; /* number of buddies generated */ atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; atomic_t s_lock_busy; /* locality groups */ struct ext4_locality_group __percpu *s_locality_groups; /* for write statistics */ unsigned long s_sectors_written_start; u64 s_kbytes_written; /* the size of zero-out chunk */ unsigned int s_extent_max_zeroout_kb; unsigned int s_log_groups_per_flex; struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; /* timer for periodic error stats printing */ struct timer_list s_err_report; /* Lazy inode table initialization info */ struct ext4_li_request *s_li_request; /* Wait multiplier for lazy initialization thread */ unsigned int s_li_wait_mult; /* Kernel thread for multiple mount protection */ struct task_struct *s_mmp_tsk; /* record the last minlen when FITRIM is called. */ unsigned long s_last_trim_minblks; /* Precomputed FS UUID checksum for seeding other checksums */ __u32 s_csum_seed; /* Reclaim extents from extent status tree */ struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_ea_block_cache; struct mb_cache *s_ea_inode_cache; spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Journal triggers for checksum computation */ struct ext4_journal_trigger s_journal_triggers[EXT4_JOURNAL_TRIGGER_COUNT]; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; atomic_t s_warning_count; atomic_t s_msg_count; /* Encryption policy for '-o test_dummy_encryption' */ struct fscrypt_dummy_policy s_dummy_enc_policy; /* * Barrier between writepages ops and changing any inode's JOURNAL_DATA * or EXTENTS flag or between writepages ops and changing DELALLOC or * DIOREAD_NOLOCK mount options on remount. */ struct percpu_rw_semaphore s_writepages_rwsem; struct dax_device *s_daxdev; u64 s_dax_part_off; #ifdef CONFIG_EXT4_DEBUG unsigned long s_simulate_fail; #endif /* Record the errseq of the backing block device */ errseq_t s_bdev_wb_err; spinlock_t s_bdev_wb_lock; /* Information about errors that happened during this mount */ spinlock_t s_error_lock; int s_add_error_count; int s_first_error_code; __u32 s_first_error_line; __u32 s_first_error_ino; __u64 s_first_error_block; const char *s_first_error_func; time64_t s_first_error_time; int s_last_error_code; __u32 s_last_error_line; __u32 s_last_error_ino; __u64 s_last_error_block; const char *s_last_error_func; time64_t s_last_error_time; /* * If we are in a context where we cannot update the on-disk * superblock, we queue the work here. This is used to update * the error information in the superblock, and for periodic * updates of the superblock called from the commit callback * function. */ struct work_struct s_sb_upd_work; /* Atomic write unit values in bytes */ unsigned int s_awu_min; unsigned int s_awu_max; /* Ext4 fast commit sub transaction ID */ atomic_t s_fc_subtid; /* * After commit starts, the main queue gets locked, and the further * updates get added in the staging queue. */ #define FC_Q_MAIN 0 #define FC_Q_STAGING 1 struct list_head s_fc_q[2]; /* Inodes staged for fast commit * that have data changes in them. */ struct list_head s_fc_dentry_q[2]; /* directory entry updates */ unsigned int s_fc_bytes; /* * Main fast commit lock. This lock protects accesses to the * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. */ struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; #ifdef CONFIG_EXT4_DEBUG int s_fc_debug_max_replay; #endif struct ext4_fc_replay_state s_fc_replay_state; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct ext4_inode_info *EXT4_I(struct inode *inode) { return container_of(inode, struct ext4_inode_info, vfs_inode); } static inline int ext4_writepages_down_read(struct super_block *sb) { percpu_down_read(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_read(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_read(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_writepages_down_write(struct super_block *sb) { percpu_down_write(&EXT4_SB(sb)->s_writepages_rwsem); return memalloc_nofs_save(); } static inline void ext4_writepages_up_write(struct super_block *sb, int ctx) { memalloc_nofs_restore(ctx); percpu_up_write(&EXT4_SB(sb)->s_writepages_rwsem); } static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } static inline int ext4_get_resuid(struct ext4_super_block *es) { return le16_to_cpu(es->s_def_resuid) | le16_to_cpu(es->s_def_resuid_hi) << 16; } static inline int ext4_get_resgid(struct ext4_super_block *es) { return le16_to_cpu(es->s_def_resgid) | le16_to_cpu(es->s_def_resgid_hi) << 16; } /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require * rcu protection to avoid dereferencing an invalid pointer due to reassignment * - s_group_desc * - s_group_info * - s_flex_group */ #define sbi_array_rcu_deref(sbi, field, index) \ ({ \ typeof(*((sbi)->field)) _v; \ rcu_read_lock(); \ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \ rcu_read_unlock(); \ _v; \ }) /* * run-time mount flags */ enum { EXT4_MF_MNTDIR_SAMPLED, EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) { set_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline void ext4_clear_mount_flag(struct super_block *sb, int bit) { clear_bit(bit, &EXT4_SB(sb)->s_mount_flags); } static inline int ext4_test_mount_flag(struct super_block *sb, int bit) { return test_bit(bit, &EXT4_SB(sb)->s_mount_flags); } /* * Simulate_fail codes */ #define EXT4_SIM_BBITMAP_EIO 1 #define EXT4_SIM_BBITMAP_CRC 2 #define EXT4_SIM_IBITMAP_EIO 3 #define EXT4_SIM_IBITMAP_CRC 4 #define EXT4_SIM_INODE_EIO 5 #define EXT4_SIM_INODE_CRC 6 #define EXT4_SIM_DIRBLOCK_EIO 7 #define EXT4_SIM_DIRBLOCK_CRC 8 static inline bool ext4_simulate_fail(struct super_block *sb, unsigned long code) { #ifdef CONFIG_EXT4_DEBUG struct ext4_sb_info *sbi = EXT4_SB(sb); if (unlikely(sbi->s_simulate_fail == code)) { sbi->s_simulate_fail = 0; return true; } #endif return false; } /* * Error number codes for s_{first,last}_error_errno * * Linux errno numbers are architecture specific, so we need to translate * them into something which is architecture independent. We don't define * codes for all errno's; just the ones which are most likely to be the cause * of an ext4_error() call. */ #define EXT4_ERR_UNKNOWN 1 #define EXT4_ERR_EIO 2 #define EXT4_ERR_ENOMEM 3 #define EXT4_ERR_EFSBADCRC 4 #define EXT4_ERR_EFSCORRUPTED 5 #define EXT4_ERR_ENOSPC 6 #define EXT4_ERR_ENOKEY 7 #define EXT4_ERR_EROFS 8 #define EXT4_ERR_EFBIG 9 #define EXT4_ERR_EEXIST 10 #define EXT4_ERR_ERANGE 11 #define EXT4_ERR_EOVERFLOW 12 #define EXT4_ERR_EBUSY 13 #define EXT4_ERR_ENOTDIR 14 #define EXT4_ERR_ENOTEMPTY 15 #define EXT4_ERR_ESHUTDOWN 16 #define EXT4_ERR_EFAULT 17 /* * Inode dynamic state flags */ enum { EXT4_STATE_NEW, /* inode is newly created */ EXT4_STATE_XATTR, /* has in-inode xattrs */ EXT4_STATE_NO_EXPAND, /* No space for expansion */ EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_NEWENTRY, /* File just added to dir */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; #define EXT4_INODE_BIT_FNS(name, field, offset) \ static inline int ext4_test_inode_##name(struct inode *inode, int bit) \ { \ return test_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_set_inode_##name(struct inode *inode, int bit) \ { \ set_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } \ static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \ { \ clear_bit(bit + (offset), &EXT4_I(inode)->i_##field); \ } /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_flag(struct inode *inode, int bit); static inline void ext4_set_inode_flag(struct inode *inode, int bit); static inline void ext4_clear_inode_flag(struct inode *inode, int bit); EXT4_INODE_BIT_FNS(flag, flags, 0) /* Add these declarations here only so that these functions can be * found by name. Otherwise, they are very hard to locate. */ static inline int ext4_test_inode_state(struct inode *inode, int bit); static inline void ext4_set_inode_state(struct inode *inode, int bit); static inline void ext4_clear_inode_state(struct inode *inode, int bit); #if (BITS_PER_LONG < 64) EXT4_INODE_BIT_FNS(state, state_flags, 0) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { (ei)->i_state_flags = 0; } #else EXT4_INODE_BIT_FNS(state, flags, 32) static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) { /* We depend on the fact that callers will set i_flags */ } #endif #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test * macros from user land. */ #define EXT4_SB(sb) (sb) #endif static inline bool ext4_verity_in_progress(struct inode *inode) { return IS_ENABLED(CONFIG_FS_VERITY) && ext4_test_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS); } #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* * Check whether the inode is tracked as orphan (either in orphan file or * orphan list). */ static inline bool ext4_inode_orphan_tracked(struct inode *inode) { return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || !list_empty(&EXT4_I(inode)->i_orphan); } /* * Codes for operating systems */ #define EXT4_OS_LINUX 0 #define EXT4_OS_HURD 1 #define EXT4_OS_MASIX 2 #define EXT4_OS_FREEBSD 3 #define EXT4_OS_LITES 4 /* * Revision levels */ #define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ #define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ #define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV #define EXT4_GOOD_OLD_INODE_SIZE 128 #define EXT4_EXTRA_TIMESTAMP_MAX (((s64)1 << 34) - 1 + S32_MIN) #define EXT4_NON_EXTRA_TIMESTAMP_MAX S32_MAX #define EXT4_TIMESTAMP_MIN S32_MIN /* * Feature set definitions */ #define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 #define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 #define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 #define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 #define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 #define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 #define EXT4_FEATURE_COMPAT_SPARSE_SUPER2 0x0200 /* * The reason why "FAST_COMMIT" is a compat feature is that, FS becomes * incompatible only if fast commit blocks are present in the FS. Since we * clear the journal (and thus the fast commit blocks), we don't mark FS as * incompatible. We also have a JBD2 incompat feature, which gets set when * there are fast commit blocks present in the journal. */ #define EXT4_FEATURE_COMPAT_FAST_COMMIT 0x0400 #define EXT4_FEATURE_COMPAT_STABLE_INODES 0x0800 #define EXT4_FEATURE_COMPAT_ORPHAN_FILE 0x1000 /* Orphan file exists */ #define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 #define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 #define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 #define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 #define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 #define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 #define EXT4_FEATURE_RO_COMPAT_QUOTA 0x0100 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC 0x0200 /* * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM). When * METADATA_CSUM is set, group descriptor checksums use the same algorithm as * all other data structures' checksums. However, the METADATA_CSUM and * GDT_CSUM bits are mutually exclusive. */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 #define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_RO_COMPAT_PROJECT 0x2000 #define EXT4_FEATURE_RO_COMPAT_VERITY 0x8000 #define EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT 0x10000 /* Orphan file may be non-empty */ #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 #define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ #define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 #define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ #define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 #define EXT4_FEATURE_INCOMPAT_MMP 0x0100 #define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 #define EXT4_FEATURE_INCOMPAT_EA_INODE 0x0400 /* EA in inode */ #define EXT4_FEATURE_INCOMPAT_DIRDATA 0x1000 /* data in dirent */ #define EXT4_FEATURE_INCOMPAT_CSUM_SEED 0x2000 #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ #define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT4_FEATURE_INCOMPAT_CASEFOLD 0x20000 extern void ext4_update_dynamic_rev(struct super_block *sb); #define EXT4_FEATURE_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_compat |= \ cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_compat &= \ ~cpu_to_le32(EXT4_FEATURE_COMPAT_##flagname); \ } #define EXT4_FEATURE_RO_COMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_ro_compat |= \ cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_ro_compat &= \ ~cpu_to_le32(EXT4_FEATURE_RO_COMPAT_##flagname); \ } #define EXT4_FEATURE_INCOMPAT_FUNCS(name, flagname) \ static inline bool ext4_has_feature_##name(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname)) != 0); \ } \ static inline void ext4_set_feature_##name(struct super_block *sb) \ { \ ext4_update_dynamic_rev(sb); \ EXT4_SB(sb)->s_es->s_feature_incompat |= \ cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } \ static inline void ext4_clear_feature_##name(struct super_block *sb) \ { \ EXT4_SB(sb)->s_es->s_feature_incompat &= \ ~cpu_to_le32(EXT4_FEATURE_INCOMPAT_##flagname); \ } EXT4_FEATURE_COMPAT_FUNCS(dir_prealloc, DIR_PREALLOC) EXT4_FEATURE_COMPAT_FUNCS(imagic_inodes, IMAGIC_INODES) EXT4_FEATURE_COMPAT_FUNCS(journal, HAS_JOURNAL) EXT4_FEATURE_COMPAT_FUNCS(xattr, EXT_ATTR) EXT4_FEATURE_COMPAT_FUNCS(resize_inode, RESIZE_INODE) EXT4_FEATURE_COMPAT_FUNCS(dir_index, DIR_INDEX) EXT4_FEATURE_COMPAT_FUNCS(sparse_super2, SPARSE_SUPER2) EXT4_FEATURE_COMPAT_FUNCS(fast_commit, FAST_COMMIT) EXT4_FEATURE_COMPAT_FUNCS(stable_inodes, STABLE_INODES) EXT4_FEATURE_COMPAT_FUNCS(orphan_file, ORPHAN_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(sparse_super, SPARSE_SUPER) EXT4_FEATURE_RO_COMPAT_FUNCS(large_file, LARGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(btree_dir, BTREE_DIR) EXT4_FEATURE_RO_COMPAT_FUNCS(huge_file, HUGE_FILE) EXT4_FEATURE_RO_COMPAT_FUNCS(gdt_csum, GDT_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(dir_nlink, DIR_NLINK) EXT4_FEATURE_RO_COMPAT_FUNCS(extra_isize, EXTRA_ISIZE) EXT4_FEATURE_RO_COMPAT_FUNCS(quota, QUOTA) EXT4_FEATURE_RO_COMPAT_FUNCS(bigalloc, BIGALLOC) EXT4_FEATURE_RO_COMPAT_FUNCS(metadata_csum, METADATA_CSUM) EXT4_FEATURE_RO_COMPAT_FUNCS(readonly, READONLY) EXT4_FEATURE_RO_COMPAT_FUNCS(project, PROJECT) EXT4_FEATURE_RO_COMPAT_FUNCS(verity, VERITY) EXT4_FEATURE_RO_COMPAT_FUNCS(orphan_present, ORPHAN_PRESENT) EXT4_FEATURE_INCOMPAT_FUNCS(compression, COMPRESSION) EXT4_FEATURE_INCOMPAT_FUNCS(filetype, FILETYPE) EXT4_FEATURE_INCOMPAT_FUNCS(journal_needs_recovery, RECOVER) EXT4_FEATURE_INCOMPAT_FUNCS(journal_dev, JOURNAL_DEV) EXT4_FEATURE_INCOMPAT_FUNCS(meta_bg, META_BG) EXT4_FEATURE_INCOMPAT_FUNCS(extents, EXTENTS) EXT4_FEATURE_INCOMPAT_FUNCS(64bit, 64BIT) EXT4_FEATURE_INCOMPAT_FUNCS(mmp, MMP) EXT4_FEATURE_INCOMPAT_FUNCS(flex_bg, FLEX_BG) EXT4_FEATURE_INCOMPAT_FUNCS(ea_inode, EA_INODE) EXT4_FEATURE_INCOMPAT_FUNCS(dirdata, DIRDATA) EXT4_FEATURE_INCOMPAT_FUNCS(csum_seed, CSUM_SEED) EXT4_FEATURE_INCOMPAT_FUNCS(largedir, LARGEDIR) EXT4_FEATURE_INCOMPAT_FUNCS(inline_data, INLINE_DATA) EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_INCOMPAT_FUNCS(casefold, CASEFOLD) #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT2_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT3_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT3_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG) #define EXT3_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR) #define EXT4_FEATURE_COMPAT_SUPP (EXT4_FEATURE_COMPAT_EXT_ATTR| \ EXT4_FEATURE_COMPAT_ORPHAN_FILE) #define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ EXT4_FEATURE_INCOMPAT_RECOVER| \ EXT4_FEATURE_INCOMPAT_META_BG| \ EXT4_FEATURE_INCOMPAT_EXTENTS| \ EXT4_FEATURE_INCOMPAT_64BIT| \ EXT4_FEATURE_INCOMPAT_FLEX_BG| \ EXT4_FEATURE_INCOMPAT_EA_INODE| \ EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_CASEFOLD | \ EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ EXT4_FEATURE_INCOMPAT_LARGEDIR) #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ EXT4_FEATURE_RO_COMPAT_QUOTA |\ EXT4_FEATURE_RO_COMPAT_PROJECT |\ EXT4_FEATURE_RO_COMPAT_VERITY |\ EXT4_FEATURE_RO_COMPAT_ORPHAN_PRESENT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_ro_compat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_ro_compat & \ cpu_to_le32(~EXT##ver##_FEATURE_RO_COMPAT_SUPP)) != 0); \ } \ static inline bool ext4_has_unknown_ext##ver##_incompat_features(struct super_block *sb) \ { \ return ((EXT4_SB(sb)->s_es->s_feature_incompat & \ cpu_to_le32(~EXT##ver##_FEATURE_INCOMPAT_SUPP)) != 0); \ } EXTN_FEATURE_FUNCS(2) EXTN_FEATURE_FUNCS(3) EXTN_FEATURE_FUNCS(4) static inline bool ext4_has_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_compat != 0); } static inline bool ext4_has_ro_compat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_ro_compat != 0); } static inline bool ext4_has_incompat_features(struct super_block *sb) { return (EXT4_SB(sb)->s_es->s_feature_incompat != 0); } extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ enum { EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ }; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_ro(struct super_block *sb) { return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static inline int ext4_emergency_state(struct super_block *sb) { if (unlikely(ext4_forced_shutdown(sb))) return -EIO; if (unlikely(ext4_emergency_ro(sb))) return -EROFS; return 0; } /* * Default values for user and/or group using reserved blocks */ #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 /* * Default project ID */ #define EXT4_DEF_PROJID 0 #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* * Default mount options */ #define EXT4_DEFM_DEBUG 0x0001 #define EXT4_DEFM_BSDGROUPS 0x0002 #define EXT4_DEFM_XATTR_USER 0x0004 #define EXT4_DEFM_ACL 0x0008 #define EXT4_DEFM_UID16 0x0010 #define EXT4_DEFM_JMODE 0x0060 #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 #define EXT4_DEFM_NOBARRIER 0x0100 #define EXT4_DEFM_BLOCK_VALIDITY 0x0200 #define EXT4_DEFM_DISCARD 0x0400 #define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ #define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) /* * Default values for superblock update */ #define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ #define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ /* * Minimum number of groups in a flexgroup before we separate out * directories into the first block group of a flexgroup */ #define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4 /* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 /* * Base length of the ext4 directory entry excluding the name length */ #define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN) struct ext4_dir_entry { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __le16 name_len; /* Name length */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Encrypted Casefolded entries require saving the hash on disk. This structure * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned * boundary. */ struct ext4_dir_entry_hash { __le32 hash; __le32 minor_hash; }; /* * The new version of the directory entry. Since EXT4 structures are * stored in intel byte order, and the name_len field could never be * bigger than 255 chars, it's safe to reclaim the extra byte for the * file_type field. */ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; /* * Access the hashes at the end of ext4_dir_entry_2 */ #define EXT4_DIRENT_HASHES(entry) \ ((struct ext4_dir_entry_hash *) \ (((void *)(entry)) + \ ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) #define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(entry)->hash) #define EXT4_DIRENT_MINOR_HASH(entry) \ le32_to_cpu(EXT4_DIRENT_HASHES(entry)->minor_hash) static inline bool ext4_hash_in_dirent(const struct inode *inode) { return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); } /* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ struct ext4_dir_entry_tail { __le32 det_reserved_zero1; /* Pretend to be unused */ __le16 det_rec_len; /* 12 */ __u8 det_reserved_zero2; /* Zero name length */ __u8 det_reserved_ft; /* 0xDE, fake file type */ __le32 det_checksum; /* crc32c(uuid+inum+dirblock) */ }; #define EXT4_DIRENT_TAIL(block, blocksize) \ ((struct ext4_dir_entry_tail *)(((void *)(block)) + \ ((blocksize) - \ sizeof(struct ext4_dir_entry_tail)))) /* * Ext4 directory file types. Only the low 3 bits are used. The * other bits are reserved for now. */ #define EXT4_FT_UNKNOWN 0 #define EXT4_FT_REG_FILE 1 #define EXT4_FT_DIR 2 #define EXT4_FT_CHRDEV 3 #define EXT4_FT_BLKDEV 4 #define EXT4_FT_FIFO 5 #define EXT4_FT_SOCK 6 #define EXT4_FT_SYMLINK 7 #define EXT4_FT_MAX 8 #define EXT4_FT_DIR_CSUM 0xDE /* * EXT4_DIR_PAD defines the directory entries boundaries * * NOTE: It must be a multiple of 4 */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* * The rec_len is dependent on the type of directory. Directories that are * casefolded and encrypted need to store the hash as well, so we add room for * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should * pass NULL for dir, as those entries do not use the extra fields. */ static inline unsigned int ext4_dir_rec_len(__u8 name_len, const struct inode *dir) { int rec_len = (name_len + 8 + EXT4_DIR_ROUND); if (dir && ext4_hash_in_dirent(dir)) rec_len += sizeof(struct ext4_dir_entry_hash); return (rec_len & ~EXT4_DIR_ROUND); } /* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... */ static inline unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) { unsigned len = le16_to_cpu(dlen); #if (PAGE_SIZE >= 65536) if (len == EXT4_MAX_REC_LEN || len == 0) return blocksize; return (len & 65532) | ((len & 3) << 16); #else return len; #endif } static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) { BUG_ON((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)); #if (PAGE_SIZE >= 65536) if (len < 65536) return cpu_to_le16(len); if (len == blocksize) { if (blocksize == 65536) return cpu_to_le16(EXT4_MAX_REC_LEN); else return cpu_to_le16(0); } return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); #else return cpu_to_le16(len); #endif } /* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ #define is_dx(dir) (ext4_has_feature_dir_index((dir)->i_sb) && \ ext4_test_inode_flag((dir), EXT4_INODE_INDEX)) #define EXT4_DIR_LINK_MAX(dir) unlikely((dir)->i_nlink >= EXT4_LINK_MAX && \ !(ext4_has_feature_dir_nlink((dir)->i_sb) && is_dx(dir))) #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) /* Legal values for the dx_root hash_version field: */ #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } #ifdef __KERNEL__ /* hash info structure used by the directory hash */ struct dx_hash_info { u32 hash; u32 minor_hash; int hash_version; u32 *seed; }; /* 32 and 64 bit signed EOF for dx directories */ #define EXT4_HTREE_EOF_32BIT ((1UL << (32 - 1)) - 1) #define EXT4_HTREE_EOF_64BIT ((1ULL << (64 - 1)) - 1) /* * Control parameters used by ext4_htree_next_block */ #define HASH_NB_ALWAYS 1 struct ext4_filename { const struct qstr *usr_fname; struct fscrypt_str disk_name; struct dx_hash_info hinfo; #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_str crypto_buf; #endif #if IS_ENABLED(CONFIG_UNICODE) struct qstr cf_name; #endif }; #define fname_name(p) ((p)->disk_name.name) #define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory */ struct ext4_iloc { struct buffer_head *bh; unsigned long offset; ext4_group_t block_group; }; static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) { return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); } static inline bool ext4_is_quota_file(struct inode *inode) { return IS_NOQUOTA(inode) && !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL); } /* * This structure is stuffed into the struct file's private_data field * for directories. It is where we put information so that we can do * readdir operations in hash tree order. */ struct dir_private_info { struct rb_root root; struct rb_node *curr_node; struct fname *extra_fname; loff_t last_pos; __u32 curr_hash; __u32 curr_minor_hash; __u32 next_hash; u64 cookie; bool initialized; }; /* calculate the first block number of the group */ static inline ext4_fsblk_t ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) { return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); } /* * Special error return code only used by dx_probe() and its callers. */ #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) /* htree levels for ext4 */ #define EXT4_HTREE_LEVEL_COMPAT 2 #define EXT4_HTREE_LEVEL 3 static inline int ext4_dir_htree_level(struct super_block *sb) { return ext4_has_feature_largedir(sb) ? EXT4_HTREE_LEVEL : EXT4_HTREE_LEVEL_COMPAT; } /* * Timeout and state flag for lazy initialization inode thread. */ #define EXT4_DEF_LI_WAIT_MULT 10 #define EXT4_DEF_LI_MAX_START_DELAY 5 #define EXT4_LAZYINIT_QUIT 0x0001 #define EXT4_LAZYINIT_RUNNING 0x0002 /* * Lazy inode table initialization info */ struct ext4_lazy_init { unsigned long li_state; struct list_head li_request_list; struct mutex li_list_mtx; }; enum ext4_li_mode { EXT4_LI_MODE_PREFETCH_BBITMAP, EXT4_LI_MODE_ITABLE, }; struct ext4_li_request { struct super_block *lr_super; enum ext4_li_mode lr_mode; ext4_group_t lr_first_not_zeroed; ext4_group_t lr_next_group; struct list_head lr_request; unsigned long lr_next_sched; unsigned long lr_timeout; }; struct ext4_features { struct kobject f_kobj; struct completion f_kobj_unregister; }; /* * This structure will be used for multiple mount protection. It will be * written into the block number saved in the s_mmp_block field in the * superblock. Programs that check MMP should assume that if * SEQ_FSCK (or any unknown code above SEQ_MAX) is present then it is NOT safe * to use the filesystem, regardless of how old the timestamp is. */ #define EXT4_MMP_MAGIC 0x004D4D50U /* ASCII for MMP */ #define EXT4_MMP_SEQ_CLEAN 0xFF4D4D50U /* mmp_seq value for clean unmount */ #define EXT4_MMP_SEQ_FSCK 0xE24D4D50U /* mmp_seq value when being fscked */ #define EXT4_MMP_SEQ_MAX 0xE24D4D4FU /* maximum valid mmp_seq value */ struct mmp_struct { __le32 mmp_magic; /* Magic number for MMP */ __le32 mmp_seq; /* Sequence no. updated periodically */ /* * mmp_time, mmp_nodename & mmp_bdevname are only used for information * purposes and do not affect the correctness of the algorithm */ __le64 mmp_time; /* Time last updated */ char mmp_nodename[64]; /* Node which last updated MMP block */ char mmp_bdevname[32]; /* Bdev which last updated MMP block */ /* * mmp_check_interval is used to verify if the MMP block has been * updated on the block device. The value is updated based on the * maximum time to write the MMP block during an update cycle. */ __le16 mmp_check_interval; __le16 mmp_pad1; __le32 mmp_pad2[226]; __le32 mmp_checksum; /* crc32c(uuid+mmp_block) */ }; /* arguments passed to the mmp thread */ struct mmpd_data { struct buffer_head *bh; /* bh from initial read_mmp_block() */ struct super_block *sb; /* super block of the fs */ }; /* * Check interval multiplier * The MMP block is written every update interval and initially checked every * update interval x the multiplier (the value is then adapted based on the * write latency). The reason is that writes can be delayed under load and we * don't want readers to incorrectly assume that the filesystem is no longer * in use. */ #define EXT4_MMP_CHECK_MULT 2UL /* * Minimum interval for MMP checking in seconds. */ #define EXT4_MMP_MIN_CHECK_INTERVAL 5UL /* * Maximum interval for MMP checking in seconds. */ #define EXT4_MMP_MAX_CHECK_INTERVAL 300UL /* * Function prototypes */ /* * Ok, these declarations are also in <linux/kernel.h> but none of the * ext4 source programs needs to include it so they are duplicated here. */ # define NORET_TYPE /**/ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, /* bitmap.c */ extern unsigned int ext4_count_free(char *bitmap, unsigned numchars); void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); void ext4_block_bitmap_csum_set(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_group_desc *gdp, struct buffer_head *bh); /* balloc.c */ extern void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block); extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp); extern int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags); extern ext4_fsblk_t ext4_count_free_clusters(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern struct ext4_group_info *ext4_get_group_info(struct super_block *sb, ext4_group_t group); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked); extern int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh); extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group); extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #if IS_ENABLED(CONFIG_UNICODE) extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname); static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { kfree(fname->cf_name.name); fname->cf_name.name = NULL; } #else static inline int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, struct ext4_filename *fname) { return 0; } static inline void ext4_fname_free_ci_filename(struct ext4_filename *fname) { } #endif /* ext4 encryption related stuff goes here crypto.c */ #ifdef CONFIG_FS_ENCRYPTION extern const struct fscrypt_operations ext4_cryptops; int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname); int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname); void ext4_fname_free_filename(struct ext4_filename *fname); int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg); #else /* !CONFIG_FS_ENCRYPTION */ static inline int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct ext4_filename *fname) { fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; return ext4_fname_setup_ci_filename(dir, iname, fname); } static inline int ext4_fname_prepare_lookup(struct inode *dir, struct dentry *dentry, struct ext4_filename *fname) { return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname); } static inline void ext4_fname_free_filename(struct ext4_filename *fname) { ext4_fname_free_ci_filename(fname); } static inline int ext4_ioctl_get_encryption_pwsalt(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } #endif /* !CONFIG_FS_ENCRYPTION */ /* dir.c */ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct file *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); #define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { /* ext4_iget() should have caught this... */ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb)); ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } } static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; static inline unsigned char get_dtype(struct super_block *sb, int filetype) { if (!ext4_has_feature_filetype(sb) || filetype >= EXT4_FT_MAX) return DT_UNKNOWN; return ext4_filetype_table[filetype]; } extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size); /* fsync.c */ extern int ext4_sync_file(struct file *, loff_t, loff_t, int); /* hash.c */ extern int ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ extern int ext4_mark_inode_used(struct super_block *sb, int ino); extern struct inode *__ext4_new_inode(struct mnt_idmap *, handle_t *, struct inode *, umode_t, const struct qstr *qstr, __u32 goal, uid_t *owner, __u32 i_flags, int handle_type, unsigned int line_no, int nblocks); #define ext4_new_inode(handle, dir, mode, qstr, goal, owner, i_flags) \ __ext4_new_inode(&nop_mnt_idmap, (handle), (dir), (mode), (qstr), \ (goal), (owner), i_flags, 0, 0, 0) #define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \ type, nblocks) \ __ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \ 0, (type), __LINE__, (nblocks)) extern void ext4_free_inode(handle_t *, struct inode *); extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap); extern int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, int barrier); extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* fast_commit.c */ int ext4_fc_info_show(struct seq_file *seq, void *v); void ext4_fc_init(struct super_block *sb, journal_t *journal); void ext4_fc_init_inode(struct inode *inode); void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); void __ext4_fc_track_unlink(handle_t *handle, struct inode *inode, struct dentry *dentry); void __ext4_fc_track_link(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry); void ext4_fc_track_link(handle_t *handle, struct dentry *dentry); void __ext4_fc_track_create(handle_t *handle, struct inode *inode, struct dentry *dentry); void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); int ext4_fc_commit(journal_t *journal, tid_t commit_tid); int __init ext4_fc_init_dentry_cache(void); void ext4_fc_destroy_dentry_cache(void); int ext4_fc_record_regions(struct super_block *sb, int ino, ext4_lblk_t lblk, ext4_fsblk_t pblk, int len, int replay); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern void ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern void ext4_discard_preallocations(struct inode *); extern int __init ext4_init_mballoc(void); extern void ext4_exit_mballoc(void); extern ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt); extern void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags); extern int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups); extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count); extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); extern void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid); extern void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state); static inline bool ext4_mb_cr_expensive(enum criteria cr) { return cr >= CR_GOAL_LEN_SLOW; } /* inode.c */ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create); int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); void ext4_set_inode_mapping_order(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 typedef enum { EXT4_IGET_NORMAL = 0, EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */ } ext4_iget_flags; extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line); #define ext4_iget(sb, ino, flags) \ __ext4_iget((sb), (ino), (flags), __func__, __LINE__) extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct mnt_idmap *, struct dentry *, struct iattr *); extern u32 ext4_dio_alignment(struct inode *inode); extern int ext4_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_evict_inode(struct inode *); extern void ext4_clear_inode(struct inode *); extern int ext4_file_getattr(struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); extern void ext4_dirty_inode(struct inode *, int); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); extern int ext4_truncate_page_cache_block_range(struct inode *inode, loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_chunk_trans_extent(struct inode *inode, int nrblocks); extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_release_space(struct inode *inode, int to_free); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); static inline bool is_special_ino(struct super_block *sb, unsigned long ino) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum); } /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); extern int ext4_ind_remove_space(handle_t *handle, struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); int ext4_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct file_kattr *fa); int ext4_fileattr_get(struct dentry *dentry, struct file_kattr *fa); extern void ext4_reset_inode_seed(struct inode *inode); int ext4_update_overhead(struct super_block *sb, bool force); int ext4_force_shutdown(struct super_block *sb, u32 flags); /* migrate.c */ extern int ext4_ext_migrate(struct inode *); extern int ext4_ind_migrate(struct inode *inode); /* namei.c */ extern int ext4_init_new_dir(handle_t *handle, struct inode *dir, struct inode *inode); extern int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); extern int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct inode *dir, struct ext4_filename *fname, unsigned int offset, struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, void *entry_buf, int buf_size, int csum_size); extern bool ext4_empty_dir(struct inode *inode); /* resize.c */ extern void ext4_kvfree_array_rcu(void *to_free); extern int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input); extern int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count); extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count); extern unsigned int ext4_list_backups(struct super_block *sb, unsigned int *three, unsigned int *five, unsigned int *seven); /* super.c */ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wait); extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, char nbuf[16]); extern void ext4_mark_group_bitmap_corrupted(struct super_block *sb, ext4_group_t block_group, unsigned int flags); extern unsigned int ext4_num_base_meta_blocks(struct super_block *sb, ext4_group_t block_group); extern __printf(7, 8) void __ext4_error(struct super_block *, const char *, unsigned int, bool, int, __u64, const char *, ...); extern __printf(6, 7) void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t, int, const char *, ...); extern __printf(5, 6) void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t, const char *, ...); extern void __ext4_std_error(struct super_block *, const char *, unsigned int, int); extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); extern __printf(4, 5) void __ext4_warning_inode(const struct inode *inode, const char *function, unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, const char *, unsigned int, const char *); extern __printf(7, 8) void __ext4_grp_locked_error(const char *, unsigned int, struct super_block *, ext4_group_t, unsigned long, ext4_fsblk_t, const char *, ...); #define EXT4_ERROR_INODE(inode, fmt, a...) \ ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) #define EXT4_ERROR_INODE_ERR(inode, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, 0, (err), (fmt), ## a) #define ext4_error_inode_block(inode, block, err, fmt, a...) \ __ext4_error_inode((inode), __func__, __LINE__, (block), (err), \ (fmt), ## a) #define EXT4_ERROR_FILE(file, block, fmt, a...) \ ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) #define ext4_abort(sb, err, fmt, a...) \ __ext4_error((sb), __func__, __LINE__, true, (err), 0, (fmt), ## a) #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ __ext4_error_inode(inode, func, line, block, 0, fmt, ##__VA_ARGS__) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ __ext4_error_inode((inode), (func), (line), (block), \ (err), (fmt), ##__VA_ARGS__) #define ext4_error_file(file, func, line, block, fmt, ...) \ __ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__) #define ext4_error(sb, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, 0, 0, (fmt), \ ##__VA_ARGS__) #define ext4_error_err(sb, err, fmt, ...) \ __ext4_error((sb), __func__, __LINE__, false, (err), 0, (fmt), \ ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning_inode(inode, fmt, ...) \ __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, __func__, __LINE__, msg) #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ __ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \ fmt, ##__VA_ARGS__) #else #define ext4_error_inode(inode, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, 0, " "); \ } while (0) #define ext4_error_inode_err(inode, func, line, block, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_inode(inode, "", 0, block, err, " "); \ } while (0) #define ext4_error_file(file, func, line, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error_file(file, "", 0, block, " "); \ } while (0) #define ext4_error(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, 0, 0, " "); \ } while (0) #define ext4_error_err(sb, err, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_error(sb, "", 0, false, err, 0, " "); \ } while (0) #define ext4_warning(sb, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) #define ext4_warning_inode(inode, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning_inode(inode, "", 0, " "); \ } while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_msg(sb, "", " "); \ } while (0) #define dump_mmp_msg(sb, mmp, msg) \ __dump_mmp_msg(sb, mmp, "", 0, "") #define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_grp_locked_error("", 0, sb, grp, ino, block, " "); \ } while (0) #endif extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_group_clusters(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_free_inodes_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_used_dirs_count(struct super_block *sb, struct ext4_group_desc *bg); extern __u32 ext4_itable_unused_count(struct super_block *sb, struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_free_group_clusters_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_free_inodes_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, struct ext4_group_desc *gdp); extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); static inline int ext4_has_group_desc_csum(struct super_block *sb) { return ext4_has_feature_gdt_csum(sb) || ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ (((es)->s_feature_incompat & cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT) \ ? (ext4_fsblk_t)le32_to_cpu(es->name##_hi) << 32 : 0) | \ le32_to_cpu(es->name##_lo)) static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_blocks_count); } static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_r_blocks_count); } static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) { return ext4_read_incompat_64bit_val(es, s_free_blocks_count); } static inline void ext4_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_blocks_count_lo = cpu_to_le32((u32)blk); es->s_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, ext4_fsblk_t blk) { es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); } static inline loff_t ext4_isize(struct super_block *sb, struct ext4_inode *raw_inode) { if (ext4_has_feature_largedir(sb) || S_ISREG(le16_to_cpu(raw_inode->i_mode))) return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | le32_to_cpu(raw_inode->i_size_lo); return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) { raw_inode->i_size_lo = cpu_to_le32(i_size); raw_inode->i_size_high = cpu_to_le32(i_size >> 32); } /* * Reading s_groups_count requires using smp_rmb() afterwards. See * the locking protocol documented in the comments of ext4_group_add() * in resize.c */ static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) { ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; smp_rmb(); return ngroups; } static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) { return block_group >> sbi->s_log_groups_per_flex; } static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) { return 1 << sbi->s_log_groups_per_flex; } static inline loff_t ext4_get_maxbytes(struct inode *inode) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return inode->i_sb->s_maxbytes; return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; } #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP /* Each CPU can accumulate percpu_counter_batch clusters in their local * counters. So we need to make sure we have free clusters more * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ #define EXT4_FREECLUSTERS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREECLUSTERS_WATERMARK 0 #endif /* Update i_disksize. Requires i_rwsem to avoid races with truncate */ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize); up_write(&EXT4_I(inode)->i_data_sem); } /* Update i_size, i_disksize. Requires i_rwsem to avoid races with truncate */ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) { int changed = 0; if (newsize > inode->i_size) { i_size_write(inode, newsize); changed = 1; } if (newsize > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, newsize); changed |= 2; } return changed; } int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len); struct ext4_group_info { unsigned long bb_state; #ifdef AGGRESSIVE_CHECK unsigned long bb_check_counter; #endif struct rb_root bb_free_root; ext4_grpblk_t bb_first_free; /* first free block */ ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ int bb_avg_fragment_size_order; /* order of average fragment in BG */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means * 5 free 8-block regions. */ }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT 1 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT 2 #define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT 3 #define EXT4_GROUP_INFO_BBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_IBITMAP_CORRUPT \ (1 << EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT) #define EXT4_GROUP_INFO_BBITMAP_READ_BIT 4 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_BBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_IBITMAP_CORRUPT(grp) \ (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_WAS_TRIMMED(grp) \ (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_SET_TRIMMED(grp) \ (set_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_CLEAR_TRIMMED(grp) \ (clear_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state))) #define EXT4_MB_GRP_TEST_AND_SET_READ(grp) \ (test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_READ_BIT, &((grp)->bb_state))) #define EXT4_MAX_CONTENTION 8 #define EXT4_CONTENTION_THRESHOLD 2 static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, ext4_group_t group) { return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); } /* * Returns true if the filesystem is busy enough that attempts to * access the block group locks has run into contention. */ static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) { return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); } static inline bool ext4_try_lock_group(struct super_block *sb, ext4_group_t group) { if (!spin_trylock(ext4_group_lock_ptr(sb, group))) return false; /* * We're able to grab the lock right away, so drop the lock * contention counter. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); return true; } static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { if (!ext4_try_lock_group(sb, group)) { /* * The lock is busy, so bump the contention counter, * and then wait on the spin lock. */ atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, EXT4_MAX_CONTENTION); spin_lock(ext4_group_lock_ptr(sb, group)); } } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { spin_unlock(ext4_group_lock_ptr(sb, group)); } #ifdef CONFIG_QUOTA static inline bool ext4_quota_capable(struct super_block *sb) { return (test_opt(sb, QUOTA) || ext4_has_feature_quota(sb)); } static inline bool ext4_is_quota_journalled(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); return (ext4_has_feature_quota(sb) || sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]); } int ext4_enable_quotas(struct super_block *sb); #endif /* * Block validity checking */ #define ext4_check_indirect_blockref(inode, bh) \ ext4_check_blockref(__func__, __LINE__, inode, \ (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_ind_check_inode(inode) \ ext4_check_blockref(__func__, __LINE__, inode, \ EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /* * Inodes and files operations */ /* dir.c */ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ extern int ext4_get_max_inline_size(struct inode *inode); extern int ext4_find_inline_data_nolock(struct inode *inode); extern int ext4_destroy_inline_data(handle_t *handle, struct inode *inode); extern void ext4_update_final_de(void *de_buf, int old_size, int new_size); int ext4_readpage_inline(struct inode *inode, struct folio *folio); extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); extern int ext4_generic_write_inline_data(struct address_space *mapping, struct inode *inode, loff_t pos, unsigned len, struct folio **foliop, void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); extern int ext4_read_inline_dir(struct file *filp, struct dir_context *ctx, int *has_inline_data); extern int ext4_inlinedir_to_tree(struct file *dir_file, struct inode *dir, ext4_lblk_t block, struct dx_hash_info *hinfo, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); extern int ext4_delete_inline_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, struct buffer_head *bh, int *has_inline_data); extern bool empty_inline_dir(struct inode *dir, int *has_inline_data); extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, struct ext4_dir_entry_2 **parent_de, int *retval); extern void *ext4_read_inline_link(struct inode *inode); struct iomap; extern int ext4_inline_data_iomap(struct inode *inode, struct iomap *iomap); extern int ext4_inline_data_truncate(struct inode *inode, int *has_inline); extern int ext4_convert_inline_data(struct inode *inode); static inline int ext4_has_inline_data(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && EXT4_I(inode)->i_inline_off; } /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; extern struct dentry *ext4_get_parent(struct dentry *child); extern int ext4_init_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *dir_block, unsigned int parent_ino, void *inline_buf, int inline_size); extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); extern int __ext4_unlink(struct inode *dir, const struct qstr *d_name, struct inode *inode, struct dentry *dentry); extern int __ext4_link(struct inode *dir, struct inode *inode, struct dentry *dentry); #define S_SHIFT 12 static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, [S_IFBLK >> S_SHIFT] = EXT4_FT_BLKDEV, [S_IFIFO >> S_SHIFT] = EXT4_FT_FIFO, [S_IFSOCK >> S_SHIFT] = EXT4_FT_SOCK, [S_IFLNK >> S_SHIFT] = EXT4_FT_SYMLINK, }; static inline void ext4_set_de_type(struct super_block *sb, struct ext4_dir_entry_2 *de, umode_t mode) { if (ext4_has_feature_filetype(sb)) de->file_type = ext4_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } /* readpages.c */ extern int ext4_mpage_readpages(struct inode *inode, struct readahead_control *rac, struct folio *folio); extern int __init ext4_init_post_read_processing(void); extern void ext4_exit_post_read_processing(void); /* symlink.c */ extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; /* sysfs.c */ extern void ext4_notify_error_sysfs(struct ext4_sb_info *sbi); extern int ext4_register_sysfs(struct super_block *sb); extern void ext4_unregister_sysfs(struct super_block *sb); extern int __init ext4_init_sysfs(void); extern void ext4_exit_sysfs(void); /* block_validity */ extern void ext4_release_system_zone(struct super_block *sb); extern int ext4_setup_system_zone(struct super_block *sb); extern int __init ext4_init_system_zone(void); extern void ext4_exit_system_zone(void); extern int ext4_inode_block_valid(struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); extern int ext4_check_blockref(const char *, unsigned int, struct inode *, __le32 *, unsigned int); extern int ext4_sb_block_valid(struct super_block *sb, struct inode *inode, ext4_fsblk_t start_blk, unsigned int count); /* extents.c */ struct ext4_ext_path; struct ext4_extent; /* * Maximum number of logical blocks in a file; ext4_extent's ee_block is * __le32. */ #define EXT_MAX_BLOCKS 0xffffffff extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int num, struct ext4_ext_path *path); extern struct ext4_ext_path *ext4_ext_insert_extent( handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext, int gb_flags); extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *, int flags); extern void ext4_free_ext_path(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, int mark_unwritten,int *err); extern int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu); extern int ext4_datasem_ensure_credits(handle_t *handle, struct inode *inode, int check_cred, int restart_cred, int revoke_cred); extern void ext4_ext_replay_shrink_inode(struct inode *inode, ext4_lblk_t end); extern int ext4_ext_replay_set_iblocks(struct inode *inode); extern int ext4_ext_replay_update_ex(struct inode *inode, ext4_lblk_t start, int len, int unwritten, ext4_fsblk_t pblk); extern int ext4_ext_clear_bb(struct inode *inode); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, struct inode *second); extern void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode); extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); /* page-io.c */ extern int __init ext4_init_pageio(void); extern void ext4_exit_pageio(void); extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end); extern int ext4_put_io_end(ext4_io_end_t *io_end); extern void ext4_put_io_end_defer(ext4_io_end_t *io_end); extern void ext4_io_submit_init(struct ext4_io_submit *io, struct writeback_control *wbc); extern void ext4_end_io_rsv_work(struct work_struct *work); extern void ext4_io_submit(struct ext4_io_submit *io); int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *page, size_t len); extern struct ext4_io_end_vec *ext4_alloc_io_end_vec(ext4_io_end_t *io_end); extern struct ext4_io_end_vec *ext4_last_io_end_vec(ext4_io_end_t *io_end); /* mmp.c */ extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* mmp.c */ extern void ext4_stop_mmpd(struct ext4_sb_info *sbi); /* verity.c */ extern const struct fsverity_operations ext4_verityops; /* orphan.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es); extern void ext4_release_orphan_info(struct super_block *sb); extern int ext4_init_orphan_info(struct super_block *sb); extern int ext4_orphan_file_empty(struct super_block *sb); extern void ext4_orphan_file_block_trigger( struct jbd2_buffer_trigger_type *triggers, struct buffer_head *bh, void *data, size_t size); /* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap */ #define BH_BITMAP_UPTODATE BH_JBDPrivateStart static inline int bitmap_uptodate(struct buffer_head *bh) { return (buffer_uptodate(bh) && test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); } static inline void set_bitmap_uptodate(struct buffer_head *bh) { set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; } extern const struct iomap_ops ext4_iomap_ops; extern const struct iomap_ops ext4_iomap_overwrite_ops; extern const struct iomap_ops ext4_iomap_report_ops; static inline int ext4_buffer_uptodate(struct buffer_head *bh) { /* * If the buffer has the write error flag, we have failed * to write out data in the block. In this case, we don't * have to read the block because we may read the old data * successfully. */ if (buffer_write_io_error(bh)) set_buffer_uptodate(bh); return buffer_uptodate(bh); } static inline bool ext4_inode_can_atomic_write(struct inode *inode) { return S_ISREG(inode->i_mode) && ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block); #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #endif /* _EXT4_H */
8 8 1977 1991 1982 1989 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _X_TABLES_H #define _X_TABLES_H #include <linux/netdevice.h> #include <linux/static_key.h> #include <linux/netfilter.h> #include <uapi/linux/netfilter/x_tables.h> /* Test a struct->invflags and a boolean for inequality */ #define NF_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->invflags & (flag))) /** * struct xt_action_param - parameters for matches/targets * * @match: the match extension * @target: the target extension * @matchinfo: per-match data * @targetinfo: per-target data * @state: pointer to hook state this packet came from * @fragoff: packet is a fragment, this is the data offset * @thoff: position of transport header relative to skb->data * * Fields written to by extensions: * * @hotdrop: drop packet if we had inspection problems */ struct xt_action_param { union { const struct xt_match *match; const struct xt_target *target; }; union { const void *matchinfo, *targinfo; }; const struct nf_hook_state *state; unsigned int thoff; u16 fragoff; bool hotdrop; }; static inline struct net *xt_net(const struct xt_action_param *par) { return par->state->net; } static inline struct net_device *xt_in(const struct xt_action_param *par) { return par->state->in; } static inline struct net_device *xt_out(const struct xt_action_param *par) { return par->state->out; } static inline unsigned int xt_hooknum(const struct xt_action_param *par) { return par->state->hook; } static inline u_int8_t xt_family(const struct xt_action_param *par) { return par->state->pf; } /** * struct xt_mtchk_param - parameters for match extensions' * checkentry functions * * @net: network namespace through which the check was invoked * @table: table the rule is tried to be inserted into * @entryinfo: the family-specific rule data * (struct ipt_ip, ip6t_ip, arpt_arp or (note) ebt_entry) * @match: struct xt_match through which this function was invoked * @matchinfo: per-match data * @hook_mask: via which hooks the new rule is reachable * Other fields as above. */ struct xt_mtchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_match *match; void *matchinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /** * struct xt_mdtor_param - match destructor parameters * Fields as above. */ struct xt_mtdtor_param { struct net *net; const struct xt_match *match; void *matchinfo; u_int8_t family; }; /** * struct xt_tgchk_param - parameters for target extensions' * checkentry functions * * @entryinfo: the family-specific rule data * (struct ipt_entry, ip6t_entry, arpt_entry, ebt_entry) * * Other fields see above. */ struct xt_tgchk_param { struct net *net; const char *table; const void *entryinfo; const struct xt_target *target; void *targinfo; unsigned int hook_mask; u_int8_t family; bool nft_compat; }; /* Target destructor parameters */ struct xt_tgdtor_param { struct net *net; const struct xt_target *target; void *targinfo; u_int8_t family; }; struct xt_match { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Return true or false: return FALSE and set *hotdrop = 1 to force immediate packet drop. */ /* Arguments changed since 2.6.9, as this must now handle non-linear skb, using skb_header_pointer and skb_ip_make_writable. */ bool (*match)(const struct sk_buff *skb, struct xt_action_param *); /* Called when user tries to insert an entry of this type. */ int (*checkentry)(const struct xt_mtchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_mtdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int matchsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Registration hooks for targets. */ struct xt_target { struct list_head list; const char name[XT_EXTENSION_MAXNAMELEN]; u_int8_t revision; /* Returns verdict. Argument order changed since 2.6.9, as this must now handle non-linear skbs, using skb_copy_bits and skb_ip_make_writable. */ unsigned int (*target)(struct sk_buff *skb, const struct xt_action_param *); /* Called when user tries to insert an entry of this type: hook_mask is a bitmask of hooks from which it can be called. */ /* Should return 0 on success or an error code otherwise (-Exxxx). */ int (*checkentry)(const struct xt_tgchk_param *); /* Called when entry of this type deleted. */ void (*destroy)(const struct xt_tgdtor_param *); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT /* Called when userspace align differs from kernel space one */ void (*compat_from_user)(void *dst, const void *src); int (*compat_to_user)(void __user *dst, const void *src); #endif /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; const char *table; unsigned int targetsize; unsigned int usersize; #ifdef CONFIG_NETFILTER_XTABLES_COMPAT unsigned int compatsize; #endif unsigned int hooks; unsigned short proto; unsigned short family; }; /* Furniture shopping... */ struct xt_table { struct list_head list; /* What hooks you will enter on */ unsigned int valid_hooks; /* Man behind the curtain... */ struct xt_table_info *private; /* hook ops that register the table with the netfilter core */ struct nf_hook_ops *ops; /* Set this to THIS_MODULE if you are a module, otherwise NULL */ struct module *me; u_int8_t af; /* address/protocol family */ int priority; /* hook order */ /* A unique name... */ const char name[XT_TABLE_MAXNAMELEN]; }; #include <linux/netfilter_ipv4.h> /* The table itself */ struct xt_table_info { /* Size per table */ unsigned int size; /* Number of entries: FIXME. --RR */ unsigned int number; /* Initial number of entries. Needed for module usage count */ unsigned int initial_entries; /* Entry points and underflows */ unsigned int hook_entry[NF_INET_NUMHOOKS]; unsigned int underflow[NF_INET_NUMHOOKS]; /* * Number of user chains. Since tables cannot have loops, at most * @stacksize jumps (number of user chains) can possibly be made. */ unsigned int stacksize; void ***jumpstack; unsigned char entries[] __aligned(8); }; int xt_register_target(struct xt_target *target); void xt_unregister_target(struct xt_target *target); int xt_register_targets(struct xt_target *target, unsigned int n); void xt_unregister_targets(struct xt_target *target, unsigned int n); int xt_register_match(struct xt_match *target); void xt_unregister_match(struct xt_match *target); int xt_register_matches(struct xt_match *match, unsigned int n); void xt_unregister_matches(struct xt_match *match, unsigned int n); int xt_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); int xt_check_table_hooks(const struct xt_table_info *info, unsigned int valid_hooks); unsigned int *xt_alloc_entry_offsets(unsigned int size); bool xt_find_jump_offset(const unsigned int *offsets, unsigned int target, unsigned int size); int xt_check_proc_name(const char *name, unsigned int size); int xt_check_match(struct xt_mtchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_check_target(struct xt_tgchk_param *, unsigned int size, u16 proto, bool inv_proto); int xt_match_to_user(const struct xt_entry_match *m, struct xt_entry_match __user *u); int xt_target_to_user(const struct xt_entry_target *t, struct xt_entry_target __user *u); int xt_data_to_user(void __user *dst, const void *src, int usersize, int size, int aligned_size); void *xt_copy_counters(sockptr_t arg, unsigned int len, struct xt_counters_info *info); struct xt_counters *xt_counters_alloc(unsigned int counters); struct xt_table *xt_register_table(struct net *net, const struct xt_table *table, struct xt_table_info *bootstrap, struct xt_table_info *newinfo); void *xt_unregister_table(struct xt_table *table); struct xt_table_info *xt_replace_table(struct xt_table *table, unsigned int num_counters, struct xt_table_info *newinfo, int *error); struct xt_match *xt_find_match(u8 af, const char *name, u8 revision); struct xt_match *xt_request_find_match(u8 af, const char *name, u8 revision); struct xt_target *xt_request_find_target(u8 af, const char *name, u8 revision); int xt_find_revision(u8 af, const char *name, u8 revision, int target, int *err); struct xt_table *xt_find_table(struct net *net, u8 af, const char *name); struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name); struct xt_table *xt_request_find_table_lock(struct net *net, u_int8_t af, const char *name); void xt_table_unlock(struct xt_table *t); int xt_proto_init(struct net *net, u_int8_t af); void xt_proto_fini(struct net *net, u_int8_t af); struct xt_table_info *xt_alloc_table_info(unsigned int size); void xt_free_table_info(struct xt_table_info *info); /** * xt_recseq - recursive seqcount for netfilter use * * Packet processing changes the seqcount only if no recursion happened * get_counters() can use read_seqcount_begin()/read_seqcount_retry(), * because we use the normal seqcount convention : * Low order bit set to 1 if a writer is active. */ DECLARE_PER_CPU(seqcount_t, xt_recseq); /* xt_tee_enabled - true if x_tables needs to handle reentrancy * * Enabled if current ip(6)tables ruleset has at least one -j TEE rule. */ extern struct static_key xt_tee_enabled; /** * xt_write_recseq_begin - start of a write section * * Begin packet processing : all readers must wait the end * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) * Returns: * 1 if no recursion on this cpu * 0 if recursion detected */ static inline unsigned int xt_write_recseq_begin(void) { unsigned int addend; /* * Low order bit of sequence is set if we already * called xt_write_recseq_begin(). */ addend = (__this_cpu_read(xt_recseq.sequence) + 1) & 1; /* * This is kind of a write_seqcount_begin(), but addend is 0 or 1 * We dont check addend value to avoid a test and conditional jump, * since addend is most likely 1 */ __this_cpu_add(xt_recseq.sequence, addend); smp_mb(); return addend; } /** * xt_write_recseq_end - end of a write section * @addend: return value from previous xt_write_recseq_begin() * * End packet processing : all readers can proceed * 1) Must be called with preemption disabled * 2) softirqs must be disabled too (or we should use this_cpu_add()) */ static inline void xt_write_recseq_end(unsigned int addend) { /* this is kind of a write_seqcount_end(), but addend is 0 or 1 */ smp_wmb(); __this_cpu_add(xt_recseq.sequence, addend); } /* * This helper is performance critical and must be inlined */ static inline unsigned long ifname_compare_aligned(const char *_a, const char *_b, const char *_mask) { const unsigned long *a = (const unsigned long *)_a; const unsigned long *b = (const unsigned long *)_b; const unsigned long *mask = (const unsigned long *)_mask; unsigned long ret; ret = (a[0] ^ b[0]) & mask[0]; if (IFNAMSIZ > sizeof(unsigned long)) ret |= (a[1] ^ b[1]) & mask[1]; if (IFNAMSIZ > 2 * sizeof(unsigned long)) ret |= (a[2] ^ b[2]) & mask[2]; if (IFNAMSIZ > 3 * sizeof(unsigned long)) ret |= (a[3] ^ b[3]) & mask[3]; BUILD_BUG_ON(IFNAMSIZ > 4 * sizeof(unsigned long)); return ret; } struct xt_percpu_counter_alloc_state { unsigned int off; const char __percpu *mem; }; bool xt_percpu_counter_alloc(struct xt_percpu_counter_alloc_state *state, struct xt_counters *counter); void xt_percpu_counter_free(struct xt_counters *cnt); static inline struct xt_counters * xt_get_this_cpu_counter(struct xt_counters *cnt) { if (nr_cpu_ids > 1) return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt); return cnt; } static inline struct xt_counters * xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu) { if (nr_cpu_ids > 1) return per_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt, cpu); return cnt; } struct nf_hook_ops *xt_hook_ops_alloc(const struct xt_table *, nf_hookfn *); int xt_register_template(const struct xt_table *t, int(*table_init)(struct net *net)); void xt_unregister_template(const struct xt_table *t); #ifdef CONFIG_NETFILTER_XTABLES_COMPAT #include <net/compat.h> struct compat_xt_entry_match { union { struct { u_int16_t match_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t match_size; compat_uptr_t match; } kernel; u_int16_t match_size; } u; unsigned char data[]; }; struct compat_xt_entry_target { union { struct { u_int16_t target_size; char name[XT_FUNCTION_MAXNAMELEN - 1]; u_int8_t revision; } user; struct { u_int16_t target_size; compat_uptr_t target; } kernel; u_int16_t target_size; } u; unsigned char data[]; }; /* FIXME: this works only on 32 bit tasks * need to change whole approach in order to calculate align as function of * current task alignment */ struct compat_xt_counters { compat_u64 pcnt, bcnt; /* Packet and byte counters */ }; struct compat_xt_counters_info { char name[XT_TABLE_MAXNAMELEN]; compat_uint_t num_counters; struct compat_xt_counters counters[]; }; struct _compat_xt_align { __u8 u8; __u16 u16; __u32 u32; compat_u64 u64; }; #define COMPAT_XT_ALIGN(s) __ALIGN_KERNEL((s), __alignof__(struct _compat_xt_align)) void xt_compat_lock(u_int8_t af); void xt_compat_unlock(u_int8_t af); int xt_compat_add_offset(u_int8_t af, unsigned int offset, int delta); void xt_compat_flush_offsets(u_int8_t af); int xt_compat_init_offsets(u8 af, unsigned int number); int xt_compat_calc_jump(u_int8_t af, unsigned int offset); int xt_compat_match_offset(const struct xt_match *match); void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, unsigned int *size); int xt_compat_match_to_user(const struct xt_entry_match *m, void __user **dstptr, unsigned int *size); int xt_compat_target_offset(const struct xt_target *target); void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, unsigned int *size); int xt_compat_target_to_user(const struct xt_entry_target *t, void __user **dstptr, unsigned int *size); int xt_compat_check_entry_offsets(const void *base, const char *elems, unsigned int target_offset, unsigned int next_offset); #endif /* CONFIG_NETFILTER_XTABLES_COMPAT */ #endif /* _X_TABLES_H */
33 17 17 15 15 17 13 9 21 21 21 21 8 21 21 21 14 9 32 32 23 22 21 20 9 8 7 1 1 1 7 5 7 27 27 3 25 3 3 23 4 20 22 22 1 27 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 /* * Copyright (c) 2006, 2019 Oracle and/or its affiliates. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. * */ #include <linux/kernel.h> #include <net/sock.h> #include <linux/in.h> #include <linux/ipv6.h> #include <linux/if_arp.h> #include <linux/jhash.h> #include <linux/ratelimit.h> #include "rds.h" static struct rhashtable bind_hash_table; static const struct rhashtable_params ht_parms = { .nelem_hint = 768, .key_len = RDS_BOUND_KEY_LEN, .key_offset = offsetof(struct rds_sock, rs_bound_key), .head_offset = offsetof(struct rds_sock, rs_bound_node), .max_size = 16384, .min_size = 1024, }; /* Create a key for the bind hash table manipulation. Port is in network byte * order. */ static inline void __rds_create_bind_key(u8 *key, const struct in6_addr *addr, __be16 port, __u32 scope_id) { memcpy(key, addr, sizeof(*addr)); key += sizeof(*addr); memcpy(key, &port, sizeof(port)); key += sizeof(port); memcpy(key, &scope_id, sizeof(scope_id)); } /* * Return the rds_sock bound at the given local address. * * The rx path can race with rds_release. We notice if rds_release() has * marked this socket and don't return a rs ref to the rx path. */ struct rds_sock *rds_find_bound(const struct in6_addr *addr, __be16 port, __u32 scope_id) { u8 key[RDS_BOUND_KEY_LEN]; struct rds_sock *rs; __rds_create_bind_key(key, addr, port, scope_id); rcu_read_lock(); rs = rhashtable_lookup(&bind_hash_table, key, ht_parms); if (rs && (sock_flag(rds_rs_to_sk(rs), SOCK_DEAD) || !refcount_inc_not_zero(&rds_rs_to_sk(rs)->sk_refcnt))) rs = NULL; rcu_read_unlock(); rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr, ntohs(port)); return rs; } /* returns -ve errno or +ve port */ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, __be16 *port, __u32 scope_id) { int ret = -EADDRINUSE; u16 rover, last; u8 key[RDS_BOUND_KEY_LEN]; if (*port != 0) { rover = be16_to_cpu(*port); if (rover == RDS_FLAG_PROBE_PORT) return -EINVAL; last = rover; } else { rover = max_t(u16, get_random_u16(), 2); last = rover - 1; } do { if (rover == 0) rover++; if (rover == RDS_FLAG_PROBE_PORT) continue; __rds_create_bind_key(key, addr, cpu_to_be16(rover), scope_id); if (rhashtable_lookup_fast(&bind_hash_table, key, ht_parms)) continue; memcpy(rs->rs_bound_key, key, sizeof(rs->rs_bound_key)); rs->rs_bound_addr = *addr; net_get_random_once(&rs->rs_hash_initval, sizeof(rs->rs_hash_initval)); rs->rs_bound_port = cpu_to_be16(rover); rs->rs_bound_node.next = NULL; rds_sock_addref(rs); if (!rhashtable_insert_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms)) { *port = rs->rs_bound_port; rs->rs_bound_scope_id = scope_id; ret = 0; rdsdebug("rs %p binding to %pI6c:%d\n", rs, addr, (int)ntohs(*port)); break; } else { rs->rs_bound_addr = in6addr_any; rds_sock_put(rs); ret = -ENOMEM; break; } } while (rover++ != last); return ret; } void rds_remove_bound(struct rds_sock *rs) { if (ipv6_addr_any(&rs->rs_bound_addr)) return; rdsdebug("rs %p unbinding from %pI6c:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); rhashtable_remove_fast(&bind_hash_table, &rs->rs_bound_node, ht_parms); rds_sock_put(rs); rs->rs_bound_addr = in6addr_any; } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct rds_sock *rs = rds_sk_to_rs(sk); struct in6_addr v6addr, *binding_addr; struct rds_transport *trans; __u32 scope_id = 0; int ret = 0; __be16 port; /* We allow an RDS socket to be bound to either IPv4 or IPv6 * address. */ if (addr_len < offsetofend(struct sockaddr, sa_family)) return -EINVAL; if (uaddr->sa_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; if (addr_len < sizeof(struct sockaddr_in) || sin->sin_addr.s_addr == htonl(INADDR_ANY) || sin->sin_addr.s_addr == htonl(INADDR_BROADCAST) || ipv4_is_multicast(sin->sin_addr.s_addr)) return -EINVAL; ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &v6addr); binding_addr = &v6addr; port = sin->sin_port; #if IS_ENABLED(CONFIG_IPV6) } else if (uaddr->sa_family == AF_INET6) { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)uaddr; int addr_type; if (addr_len < sizeof(struct sockaddr_in6)) return -EINVAL; addr_type = ipv6_addr_type(&sin6->sin6_addr); if (!(addr_type & IPV6_ADDR_UNICAST)) { __be32 addr4; if (!(addr_type & IPV6_ADDR_MAPPED)) return -EINVAL; /* It is a mapped address. Need to do some sanity * checks. */ addr4 = sin6->sin6_addr.s6_addr32[3]; if (addr4 == htonl(INADDR_ANY) || addr4 == htonl(INADDR_BROADCAST) || ipv4_is_multicast(addr4)) return -EINVAL; } /* The scope ID must be specified for link local address. */ if (addr_type & IPV6_ADDR_LINKLOCAL) { if (sin6->sin6_scope_id == 0) return -EINVAL; scope_id = sin6->sin6_scope_id; } binding_addr = &sin6->sin6_addr; port = sin6->sin6_port; #endif } else { return -EINVAL; } lock_sock(sk); /* RDS socket does not allow re-binding. */ if (!ipv6_addr_any(&rs->rs_bound_addr)) { ret = -EINVAL; goto out; } /* Socket is connected. The binding address should have the same * scope ID as the connected address, except the case when one is * non-link local address (scope_id is 0). */ if (!ipv6_addr_any(&rs->rs_conn_addr) && scope_id && rs->rs_bound_scope_id && scope_id != rs->rs_bound_scope_id) { ret = -EINVAL; goto out; } /* The transport can be set using SO_RDS_TRANSPORT option before the * socket is bound. */ if (rs->rs_transport) { trans = rs->rs_transport; if (!trans->laddr_check || trans->laddr_check(sock_net(sock->sk), binding_addr, scope_id) != 0) { ret = -ENOPROTOOPT; goto out; } } else { trans = rds_trans_get_preferred(sock_net(sock->sk), binding_addr, scope_id); if (!trans) { ret = -EADDRNOTAVAIL; pr_info_ratelimited("RDS: %s could not find a transport for %pI6c, load rds_tcp or rds_rdma?\n", __func__, binding_addr); goto out; } rs->rs_transport = trans; } sock_set_flag(sk, SOCK_RCU_FREE); ret = rds_add_bound(rs, binding_addr, &port, scope_id); if (ret) rs->rs_transport = NULL; out: release_sock(sk); return ret; } void rds_bind_lock_destroy(void) { rhashtable_destroy(&bind_hash_table); } int rds_bind_lock_init(void) { return rhashtable_init(&bind_hash_table, &ht_parms); }
40 40 3 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOU_NAPI_H #define IOU_NAPI_H #include <linux/kernel.h> #include <linux/io_uring.h> #include <net/busy_poll.h> #ifdef CONFIG_NET_RX_BUSY_POLL void io_napi_init(struct io_ring_ctx *ctx); void io_napi_free(struct io_ring_ctx *ctx); int io_register_napi(struct io_ring_ctx *ctx, void __user *arg); int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg); int __io_napi_add_id(struct io_ring_ctx *ctx, unsigned int napi_id); void __io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq); int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx); static inline bool io_napi(struct io_ring_ctx *ctx) { return !list_empty(&ctx->napi_list); } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { if (!io_napi(ctx)) return; __io_napi_busy_loop(ctx, iowq); } /* * io_napi_add() - Add napi id to the busy poll list * @req: pointer to io_kiocb request * * Add the napi id of the socket to the napi busy poll list and hash table. */ static inline void io_napi_add(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct socket *sock; if (READ_ONCE(ctx->napi_track_mode) != IO_URING_NAPI_TRACKING_DYNAMIC) return; sock = sock_from_file(req->file); if (sock && sock->sk) __io_napi_add_id(ctx, READ_ONCE(sock->sk->sk_napi_id)); } #else static inline void io_napi_init(struct io_ring_ctx *ctx) { } static inline void io_napi_free(struct io_ring_ctx *ctx) { } static inline int io_register_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline int io_unregister_napi(struct io_ring_ctx *ctx, void __user *arg) { return -EOPNOTSUPP; } static inline bool io_napi(struct io_ring_ctx *ctx) { return false; } static inline void io_napi_add(struct io_kiocb *req) { } static inline void io_napi_busy_loop(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { } static inline int io_napi_sqpoll_busy_poll(struct io_ring_ctx *ctx) { return 0; } #endif /* CONFIG_NET_RX_BUSY_POLL */ #endif
77 79 79 77 77 3 3 3 2 3 2 2 168 308 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 // SPDX-License-Identifier: GPL-2.0-or-later /* SCTP kernel implementation * (C) Copyright Red Hat Inc. 2017 * * This file is part of the SCTP kernel implementation * * These functions implement sctp stream message interleaving, mostly * including I-DATA and I-FORWARD-TSN chunks process. * * Please send any bug reports or fixes you make to the * email addresched(es): * lksctp developers <linux-sctp@vger.kernel.org> * * Written or modified by: * Xin Long <lucien.xin@gmail.com> */ #include <net/busy_poll.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <net/sctp/ulpevent.h> #include <linux/sctp.h> static struct sctp_chunk *sctp_make_idatafrag_empty( const struct sctp_association *asoc, const struct sctp_sndrcvinfo *sinfo, int len, __u8 flags, gfp_t gfp) { struct sctp_chunk *retval; struct sctp_idatahdr dp; memset(&dp, 0, sizeof(dp)); dp.stream = htons(sinfo->sinfo_stream); if (sinfo->sinfo_flags & SCTP_UNORDERED) flags |= SCTP_DATA_UNORDERED; retval = sctp_make_idata(asoc, flags, sizeof(dp) + len, gfp); if (!retval) return NULL; retval->subh.idata_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); return retval; } static void sctp_chunk_assign_mid(struct sctp_chunk *chunk) { struct sctp_stream *stream; struct sctp_chunk *lchunk; __u32 cfsn = 0; __u16 sid; if (chunk->has_mid) return; sid = sctp_chunk_stream_no(chunk); stream = &chunk->asoc->stream; list_for_each_entry(lchunk, &chunk->msg->chunks, frag_list) { struct sctp_idatahdr *hdr; __u32 mid; lchunk->has_mid = 1; hdr = lchunk->subh.idata_hdr; if (lchunk->chunk_hdr->flags & SCTP_DATA_FIRST_FRAG) hdr->ppid = lchunk->sinfo.sinfo_ppid; else hdr->fsn = htonl(cfsn++); if (lchunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_uo_next(stream, out, sid) : sctp_mid_uo_peek(stream, out, sid); } else { mid = lchunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG ? sctp_mid_next(stream, out, sid) : sctp_mid_peek(stream, out, sid); } hdr->mid = htonl(mid); } } static bool sctp_validate_data(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u16 sid, ssn; if (chunk->chunk_hdr->type != SCTP_CID_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); ssn = ntohs(chunk->subh.data_hdr->ssn); return !SSN_lt(ssn, sctp_ssn_peek(stream, in, sid)); } static bool sctp_validate_idata(struct sctp_chunk *chunk) { struct sctp_stream *stream; __u32 mid; __u16 sid; if (chunk->chunk_hdr->type != SCTP_CID_I_DATA) return false; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) return true; stream = &chunk->asoc->stream; sid = sctp_chunk_stream_no(chunk); mid = ntohl(chunk->subh.idata_hdr->mid); return !MID_lt(mid, sctp_mid_peek(stream, in, sid)); } static void sctp_intl_store_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->reasm); if (!pos) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->reasm, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) { loc = pos; break; } if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); else __skb_queue_before(&ulpq->reasm, loc, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream || cevent->mid != sin->mid) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (cevent->mid == sin->mid) { pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, pd_first, pd_last); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode && event->mid == sin->mid && event->fsn == sin->fsn) retval = sctp_intl_retrieve_partial(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled(ulpq, event); return retval; } static void sctp_intl_store_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos, *loc; pos = skb_peek_tail(&ulpq->lobby); if (!pos) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } cevent = (struct sctp_ulpevent *)pos->cb; if (event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } if (event->stream > cevent->stream) { __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); return; } loc = NULL; skb_queue_walk(&ulpq->lobby, pos) { cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > event->stream) { loc = pos; break; } if (cevent->stream == event->stream && MID_lt(event->mid, cevent->mid)) { loc = pos; break; } } if (!loc) __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); else __skb_queue_before(&ulpq->lobby, loc, sctp_event2skb(event)); } static void sctp_intl_retrieve_ordered(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head *event_list; struct sctp_stream *stream; struct sk_buff *pos, *tmp; __u16 sid = event->stream; stream = &ulpq->asoc->stream; event_list = (struct sk_buff_head *)sctp_event2skb(event)->prev; sctp_skb_for_each(pos, &ulpq->lobby, tmp) { struct sctp_ulpevent *cevent = (struct sctp_ulpevent *)pos->cb; if (cevent->stream > sid) break; if (cevent->stream < sid) continue; if (cevent->mid != sctp_mid_peek(stream, in, sid)) break; sctp_mid_next(stream, in, sid); __skb_unlink(pos, &ulpq->lobby); __skb_queue_tail(event_list, pos); } } static struct sctp_ulpevent *sctp_intl_order(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_stream *stream; __u16 sid; stream = &ulpq->asoc->stream; sid = event->stream; if (event->mid != sctp_mid_peek(stream, in, sid)) { sctp_intl_store_ordered(ulpq, event); return NULL; } sctp_mid_next(stream, in, sid); sctp_intl_retrieve_ordered(ulpq, event); return event; } static int sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_sock *sp = sctp_sk(sk); struct sctp_ulpevent *event; struct sk_buff *skb; skb = __skb_peek(skb_list); event = sctp_skb2event(skb); if (sk->sk_shutdown & RCV_SHUTDOWN && (sk->sk_shutdown & SEND_SHUTDOWN || !sctp_ulpevent_is_notification(event))) goto out_free; if (!sctp_ulpevent_is_notification(event)) { sk_mark_napi_id(sk, skb); sk_incoming_cpu_update(sk); } if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; skb_queue_splice_tail_init(skb_list, &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } return 1; out_free: sctp_queue_purge_ulpevents(skb_list); return 0; } static void sctp_intl_store_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *cevent; struct sk_buff *pos; pos = skb_peek_tail(&ulpq->reasm_uo); if (!pos) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } cevent = sctp_skb2event(pos); if (event->stream == cevent->stream && event->mid == cevent->mid && (cevent->msg_flags & SCTP_DATA_FIRST_FRAG || (!(event->msg_flags & SCTP_DATA_FIRST_FRAG) && event->fsn > cevent->fsn))) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } if ((event->stream == cevent->stream && MID_lt(cevent->mid, event->mid)) || event->stream > cevent->stream) { __skb_queue_tail(&ulpq->reasm_uo, sctp_event2skb(event)); return; } skb_queue_walk(&ulpq->reasm_uo, pos) { cevent = sctp_skb2event(pos); if (event->stream < cevent->stream || (event->stream == cevent->stream && MID_lt(event->mid, cevent->mid))) break; if (event->stream == cevent->stream && event->mid == cevent->mid && !(cevent->msg_flags & SCTP_DATA_FIRST_FRAG) && (event->msg_flags & SCTP_DATA_FIRST_FRAG || event->fsn < cevent->fsn)) break; } __skb_queue_before(&ulpq->reasm_uo, pos, sctp_event2skb(event)); } static struct sctp_ulpevent *sctp_intl_retrieve_partial_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sctp_stream_in *sin; struct sk_buff *pos; __u32 next_fsn = 0; int is_last = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, sin->mid_uo)) continue; if (MID_lt(sin->mid_uo, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: goto out; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = cevent->fsn + 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn++; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (!first_frag) { if (cevent->fsn == sin->fsn_uo) { first_frag = pos; last_frag = pos; next_fsn = 0; is_last = 1; } } else if (cevent->fsn == next_fsn) { last_frag = pos; next_fsn = 0; is_last = 1; } goto out; default: goto out; } } out: if (!first_frag) return NULL; retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; if (is_last) { retval->msg_flags |= MSG_EOR; sin->pd_mode_uo = 0; } } return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_reassembled_uo( struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_association *asoc = ulpq->asoc; struct sk_buff *pos, *first_frag = NULL; struct sctp_ulpevent *retval = NULL; struct sk_buff *pd_first = NULL; struct sk_buff *pd_last = NULL; struct sctp_stream_in *sin; __u32 next_fsn = 0; __u32 pd_point = 0; __u32 pd_len = 0; __u32 mid = 0; sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); if (cevent->stream < event->stream) continue; if (cevent->stream > event->stream) break; if (MID_lt(cevent->mid, event->mid)) continue; if (MID_lt(event->mid, cevent->mid)) break; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (!sin->pd_mode_uo) { sin->mid_uo = cevent->mid; pd_first = pos; pd_last = pos; pd_len = pos->len; } first_frag = pos; next_fsn = 0; mid = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) { next_fsn++; if (pd_first) { pd_last = pos; pd_len += pos->len; } } else { first_frag = NULL; } break; case SCTP_DATA_LAST_FRAG: if (first_frag && cevent->mid == mid && cevent->fsn == next_fsn) goto found; else first_frag = NULL; break; } } if (!pd_first) goto out; pd_point = sctp_sk(asoc->base.sk)->pd_point; if (pd_point && pd_point <= pd_len) { retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, pd_first, pd_last); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } } goto out; found: retval = sctp_make_reassembled_event(asoc->base.net, &ulpq->reasm_uo, first_frag, pos); if (retval) retval->msg_flags |= MSG_EOR; out: return retval; } static struct sctp_ulpevent *sctp_intl_reasm_uo(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sctp_ulpevent *retval = NULL; struct sctp_stream_in *sin; if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { event->msg_flags |= MSG_EOR; return event; } sctp_intl_store_reasm_uo(ulpq, event); sin = sctp_stream_in(&ulpq->asoc->stream, event->stream); if (sin->pd_mode_uo && event->mid == sin->mid_uo && event->fsn == sin->fsn_uo) retval = sctp_intl_retrieve_partial_uo(ulpq, event); if (!retval) retval = sctp_intl_retrieve_reassembled_uo(ulpq, event); return retval; } static struct sctp_ulpevent *sctp_intl_retrieve_first_uo(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm_uo, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode_uo) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; sin->mid_uo = cevent->mid; break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid_uo && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm_uo, first_frag, last_frag); if (retval) { sin->fsn_uo = next_fsn; sin->pd_mode_uo = 1; } return retval; } static int sctp_ulpevent_idata(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; int event_eor = 0; event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); if (!event) return -ENOMEM; event->mid = ntohl(chunk->subh.idata_hdr->mid); if (event->msg_flags & SCTP_DATA_FIRST_FRAG) event->ppid = chunk->subh.idata_hdr->ppid; else event->fsn = ntohl(chunk->subh.idata_hdr->fsn); if (!(event->msg_flags & SCTP_DATA_UNORDERED)) { event = sctp_intl_reasm(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); if (event->msg_flags & MSG_EOR) event = sctp_intl_order(ulpq, event); } } else { event = sctp_intl_reasm_uo(ulpq, event); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); } } if (event) { event_eor = (event->msg_flags & MSG_EOR) ? 1 : 0; sctp_enqueue_event(ulpq, &temp); } return event_eor; } static struct sctp_ulpevent *sctp_intl_retrieve_first(struct sctp_ulpq *ulpq) { struct sctp_stream_in *csin, *sin = NULL; struct sk_buff *first_frag = NULL; struct sk_buff *last_frag = NULL; struct sctp_ulpevent *retval; struct sk_buff *pos; __u32 next_fsn = 0; __u16 sid = 0; skb_queue_walk(&ulpq->reasm, pos) { struct sctp_ulpevent *cevent = sctp_skb2event(pos); csin = sctp_stream_in(&ulpq->asoc->stream, cevent->stream); if (csin->pd_mode) continue; switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { case SCTP_DATA_FIRST_FRAG: if (first_frag) goto out; if (cevent->mid == csin->mid) { first_frag = pos; last_frag = pos; next_fsn = 0; sin = csin; sid = cevent->stream; } break; case SCTP_DATA_MIDDLE_FRAG: if (!first_frag) break; if (cevent->stream == sid && cevent->mid == sin->mid && cevent->fsn == next_fsn) { next_fsn++; last_frag = pos; } else { goto out; } break; case SCTP_DATA_LAST_FRAG: if (first_frag) goto out; break; default: break; } } if (!first_frag) return NULL; out: retval = sctp_make_reassembled_event(ulpq->asoc->base.net, &ulpq->reasm, first_frag, last_frag); if (retval) { sin->fsn = next_fsn; sin->pd_mode = 1; } return retval; } static void sctp_intl_start_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_ulpevent *event; struct sk_buff_head temp; if (!skb_queue_empty(&ulpq->reasm)) { do { event = sctp_intl_retrieve_first(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } if (!skb_queue_empty(&ulpq->reasm_uo)) { do { event = sctp_intl_retrieve_first_uo(ulpq); if (event) { skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); sctp_enqueue_event(ulpq, &temp); } } while (event); } } static void sctp_renege_events(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, gfp_t gfp) { struct sctp_association *asoc = ulpq->asoc; __u32 freed = 0; __u16 needed; needed = ntohs(chunk->chunk_hdr->length) - sizeof(struct sctp_idata_chunk); if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { freed = sctp_ulpq_renege_list(ulpq, &ulpq->lobby, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm, needed); if (freed < needed) freed += sctp_ulpq_renege_list(ulpq, &ulpq->reasm_uo, needed); } if (freed >= needed && sctp_ulpevent_idata(ulpq, chunk, gfp) <= 0) sctp_intl_start_pd(ulpq, gfp); } static void sctp_intl_stream_abort_pd(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u16 flags, gfp_t gfp) { struct sock *sk = ulpq->asoc->base.sk; struct sctp_ulpevent *ev = NULL; if (!sctp_ulpevent_type_enabled(ulpq->asoc->subscribe, SCTP_PARTIAL_DELIVERY_EVENT)) return; ev = sctp_ulpevent_make_pdapi(ulpq->asoc, SCTP_PARTIAL_DELIVERY_ABORTED, sid, mid, flags, gfp); if (ev) { struct sctp_sock *sp = sctp_sk(sk); __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; sk->sk_data_ready(sk); } } } static void sctp_intl_reap_ordered(struct sctp_ulpq *ulpq, __u16 sid) { struct sctp_stream *stream = &ulpq->asoc->stream; struct sctp_ulpevent *cevent, *event = NULL; struct sk_buff_head *lobby = &ulpq->lobby; struct sk_buff *pos, *tmp; struct sk_buff_head temp; __u16 csid; __u32 cmid; skb_queue_head_init(&temp); sctp_skb_for_each(pos, lobby, tmp) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid > sid) break; if (csid < sid) continue; if (!MID_lt(cmid, sctp_mid_peek(stream, in, csid))) break; __skb_unlink(pos, lobby); if (!event) event = sctp_skb2event(pos); __skb_queue_tail(&temp, pos); } if (!event && pos != (struct sk_buff *)lobby) { cevent = (struct sctp_ulpevent *)pos->cb; csid = cevent->stream; cmid = cevent->mid; if (csid == sid && cmid == sctp_mid_peek(stream, in, csid)) { sctp_mid_next(stream, in, csid); __skb_unlink(pos, lobby); __skb_queue_tail(&temp, pos); event = sctp_skb2event(pos); } } if (event) { sctp_intl_retrieve_ordered(ulpq, event); sctp_enqueue_event(ulpq, &temp); } } static void sctp_intl_abort_pd(struct sctp_ulpq *ulpq, gfp_t gfp) { struct sctp_stream *stream = &ulpq->asoc->stream; __u16 sid; for (sid = 0; sid < stream->incnt; sid++) { struct sctp_stream_in *sin = SCTP_SI(stream, sid); __u32 mid; if (sin->pd_mode_uo) { sin->pd_mode_uo = 0; mid = sin->mid_uo; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, gfp); } if (sin->pd_mode) { sin->pd_mode = 0; mid = sin->mid; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0, gfp); sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } } /* intl abort pd happens only when all data needs to be cleaned */ sctp_ulpq_flush(ulpq); } static inline int sctp_get_skip_pos(struct sctp_ifwdtsn_skip *skiplist, int nskips, __be16 stream, __u8 flags) { int i; for (i = 0; i < nskips; i++) if (skiplist[i].stream == stream && skiplist[i].flags == flags) return i; return i; } #define SCTP_FTSN_U_BIT 0x1 static void sctp_generate_iftsn(struct sctp_outq *q, __u32 ctsn) { struct sctp_ifwdtsn_skip ftsn_skip_arr[10]; struct sctp_association *asoc = q->asoc; struct sctp_chunk *ftsn_chunk = NULL; struct list_head *lchunk, *temp; int nskips = 0, skip_pos; struct sctp_chunk *chunk; __u32 tsn; if (!asoc->peer.prsctp_capable) return; if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) asoc->adv_peer_ack_point = ctsn; list_for_each_safe(lchunk, temp, &q->abandoned) { chunk = list_entry(lchunk, struct sctp_chunk, transmitted_list); tsn = ntohl(chunk->subh.data_hdr->tsn); if (TSN_lte(tsn, ctsn)) { list_del_init(lchunk); sctp_chunk_free(chunk); } else if (TSN_lte(tsn, asoc->adv_peer_ack_point + 1)) { __be16 sid = chunk->subh.idata_hdr->stream; __be32 mid = chunk->subh.idata_hdr->mid; __u8 flags = 0; if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) flags |= SCTP_FTSN_U_BIT; asoc->adv_peer_ack_point = tsn; skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], nskips, sid, flags); ftsn_skip_arr[skip_pos].stream = sid; ftsn_skip_arr[skip_pos].reserved = 0; ftsn_skip_arr[skip_pos].flags = flags; ftsn_skip_arr[skip_pos].mid = mid; if (skip_pos == nskips) nskips++; if (nskips == 10) break; } else { break; } } if (asoc->adv_peer_ack_point > ctsn) ftsn_chunk = sctp_make_ifwdtsn(asoc, asoc->adv_peer_ack_point, nskips, &ftsn_skip_arr[0]); if (ftsn_chunk) { list_add_tail(&ftsn_chunk->list, &q->control_chunk_list); SCTP_INC_STATS(asoc->base.net, SCTP_MIB_OUTCTRLCHUNKS); } } #define _sctp_walk_ifwdtsn(pos, chunk, end) \ for (pos = (void *)(chunk->subh.ifwdtsn_hdr + 1); \ (void *)pos <= (void *)(chunk->subh.ifwdtsn_hdr + 1) + (end) - \ sizeof(struct sctp_ifwdtsn_skip); pos++) #define sctp_walk_ifwdtsn(pos, ch) \ _sctp_walk_ifwdtsn((pos), (ch), ntohs((ch)->chunk_hdr->length) - \ sizeof(struct sctp_ifwdtsn_chunk)) static bool sctp_validate_fwdtsn(struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_fwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static bool sctp_validate_iftsn(struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; __u16 incnt; if (chunk->chunk_hdr->type != SCTP_CID_I_FWD_TSN) return false; incnt = chunk->asoc->stream.incnt; sctp_walk_ifwdtsn(skip, chunk) if (ntohs(skip->stream) >= incnt) return false; return true; } static void sctp_report_fwdtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_ulpq_reasm_flushtsn(ulpq, ftsn); /* Abort any in progress partial delivery. */ sctp_ulpq_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_intl_reasm_flushtsn(struct sctp_ulpq *ulpq, __u32 ftsn) { struct sk_buff *pos, *tmp; skb_queue_walk_safe(&ulpq->reasm, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm); sctp_ulpevent_free(event); } } skb_queue_walk_safe(&ulpq->reasm_uo, pos, tmp) { struct sctp_ulpevent *event = sctp_skb2event(pos); __u32 tsn = event->tsn; if (TSN_lte(tsn, ftsn)) { __skb_unlink(pos, &ulpq->reasm_uo); sctp_ulpevent_free(event); } } } static void sctp_report_iftsn(struct sctp_ulpq *ulpq, __u32 ftsn) { /* Move the Cumulattive TSN Ack ahead. */ sctp_tsnmap_skip(&ulpq->asoc->peer.tsn_map, ftsn); /* purge the fragmentation queue */ sctp_intl_reasm_flushtsn(ulpq, ftsn); /* abort only when it's for all data */ if (ftsn == sctp_tsnmap_get_max_tsn_seen(&ulpq->asoc->peer.tsn_map)) sctp_intl_abort_pd(ulpq, GFP_ATOMIC); } static void sctp_handle_fwdtsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_fwdtsn_skip *skip; /* Walk through all the skipped SSNs */ sctp_walk_fwdtsn(skip, chunk) sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); } static void sctp_intl_skip(struct sctp_ulpq *ulpq, __u16 sid, __u32 mid, __u8 flags) { struct sctp_stream_in *sin = sctp_stream_in(&ulpq->asoc->stream, sid); struct sctp_stream *stream = &ulpq->asoc->stream; if (flags & SCTP_FTSN_U_BIT) { if (sin->pd_mode_uo && MID_lt(sin->mid_uo, mid)) { sin->pd_mode_uo = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x1, GFP_ATOMIC); } return; } if (MID_lt(mid, sctp_mid_peek(stream, in, sid))) return; if (sin->pd_mode) { sin->pd_mode = 0; sctp_intl_stream_abort_pd(ulpq, sid, mid, 0x0, GFP_ATOMIC); } sctp_mid_skip(stream, in, sid, mid); sctp_intl_reap_ordered(ulpq, sid); } static void sctp_handle_iftsn(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk) { struct sctp_ifwdtsn_skip *skip; /* Walk through all the skipped MIDs and abort stream pd if possible */ sctp_walk_ifwdtsn(skip, chunk) sctp_intl_skip(ulpq, ntohs(skip->stream), ntohl(skip->mid), skip->flags); } static int do_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_ulpq_tail_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_0 = { .data_chunk_len = sizeof(struct sctp_data_chunk), .ftsn_chunk_len = sizeof(struct sctp_fwdtsn_chunk), /* DATA process functions */ .make_datafrag = sctp_make_datafrag_empty, .assign_number = sctp_chunk_assign_ssn, .validate_data = sctp_validate_data, .ulpevent_data = sctp_ulpq_tail_data, .enqueue_event = do_ulpq_tail_event, .renege_events = sctp_ulpq_renege, .start_pd = sctp_ulpq_partial_delivery, .abort_pd = sctp_ulpq_abort_pd, /* FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_fwdtsn, .validate_ftsn = sctp_validate_fwdtsn, .report_ftsn = sctp_report_fwdtsn, .handle_ftsn = sctp_handle_fwdtsn, }; static int do_sctp_enqueue_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) { struct sk_buff_head temp; skb_queue_head_init(&temp); __skb_queue_tail(&temp, sctp_event2skb(event)); return sctp_enqueue_event(ulpq, &temp); } static struct sctp_stream_interleave sctp_stream_interleave_1 = { .data_chunk_len = sizeof(struct sctp_idata_chunk), .ftsn_chunk_len = sizeof(struct sctp_ifwdtsn_chunk), /* I-DATA process functions */ .make_datafrag = sctp_make_idatafrag_empty, .assign_number = sctp_chunk_assign_mid, .validate_data = sctp_validate_idata, .ulpevent_data = sctp_ulpevent_idata, .enqueue_event = do_sctp_enqueue_event, .renege_events = sctp_renege_events, .start_pd = sctp_intl_start_pd, .abort_pd = sctp_intl_abort_pd, /* I-FORWARD-TSN process functions */ .generate_ftsn = sctp_generate_iftsn, .validate_ftsn = sctp_validate_iftsn, .report_ftsn = sctp_report_iftsn, .handle_ftsn = sctp_handle_iftsn, }; void sctp_stream_interleave_init(struct sctp_stream *stream) { struct sctp_association *asoc; asoc = container_of(stream, struct sctp_association, stream); stream->si = asoc->peer.intl_capable ? &sctp_stream_interleave_1 : &sctp_stream_interleave_0; }
71 4 67 2 65 66 66 71 19 19 6 14 19 61 61 61 61 61 61 61 61 1 320 310 310 311 28 27 28 28 10 5 5 2 5 91 8 90 91 288 287 288 288 33 33 33 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "accessors.h" #include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash * collision. data_size indicates how big the item inserted should be. On * success a struct btrfs_dir_item pointer is returned, otherwise it is * an ERR_PTR. * * The name is not copied into the dir item, you have to do that yourself. */ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 data_size, const char *name, int name_len) { int ret; char *ptr; struct extent_buffer *leaf; ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); if (ret == -EEXIST) { struct btrfs_dir_item *di; di = btrfs_match_dir_item_name(path, name, name_len); if (di) return ERR_PTR(-EEXIST); btrfs_extend_item(trans, path, data_size); } else if (ret < 0) return ERR_PTR(ret); WARN_ON(ret > 0); leaf = path->nodes[0]; ptr = btrfs_item_ptr(leaf, path->slots[0], char); ASSERT(data_size <= btrfs_item_size(leaf, path->slots[0])); ptr += btrfs_item_size(leaf, path->slots[0]) - data_size; return (struct btrfs_dir_item *)ptr; } /* * xattrs work a lot like directories, this inserts an xattr item * into the tree */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, const char *name, u16 name_len, const void *data, u16 data_len) { int ret = 0; struct btrfs_dir_item *dir_item; unsigned long name_ptr, data_ptr; struct btrfs_key key, location; struct btrfs_disk_key disk_key; struct extent_buffer *leaf; u32 data_size; if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root->fs_info)) return -ENOSPC; key.objectid = objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name, name_len); if (IS_ERR(dir_item)) return PTR_ERR(dir_item); memset(&location, 0, sizeof(location)); leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); name_ptr = (unsigned long)(dir_item + 1); data_ptr = (unsigned long)((char *)name_ptr + name_len); write_extent_buffer(leaf, name, name_ptr, name_len); write_extent_buffer(leaf, data, data_ptr, data_len); return ret; } /* * insert a directory item in the tree, doing all the magic for * both indexes. 'dir' indicates which objectid to insert it into, * 'location' is the key to stuff into the directory item, 'type' is the * type of the inode we're pointing to, and 'index' is the sequence number * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_inode *dir, const struct btrfs_key *location, u8 type, u64 index) { int ret = 0; int ret2 = 0; struct btrfs_root *root = dir->root; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *leaf; unsigned long name_ptr; struct btrfs_key key; struct btrfs_disk_key disk_key; u32 data_size; key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) return -ENOMEM; btrfs_cpu_key_to_disk(&disk_key, location); data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) goto second_insert; goto out_free; } if (IS_ENCRYPTED(&dir->vfs_inode)) type |= BTRFS_FT_ENCRYPTED; leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); write_extent_buffer(leaf, name->name, name_ptr, name->len); second_insert: /* FIXME, use some real flag for selecting the extra index */ if (root == root->fs_info->tree_root) { ret = 0; goto out_free; } btrfs_release_path(path); ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); if (ret) return ret; if (ret2) return ret2; return 0; } static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, const char *name, int name_len, int mod) { const int ins_len = (mod < 0 ? -1 : 0); const int cow = (mod != 0); int ret; ret = btrfs_search_slot(trans, root, key, path, ins_len, cow); if (ret < 0) return ERR_PTR(ret); if (ret > 0) return ERR_PTR(-ENOENT); return btrfs_match_dir_item_name(path, name, name_len); } /* * Lookup for a directory item by name. * * @trans: The transaction handle to use. Can be NULL if @mod is 0. * @root: The root of the target tree. * @path: Path to use for the search. * @dir: The inode number (objectid) of the directory. * @name: The name associated to the directory entry we are looking for. * @name_len: The length of the name. * @mod: Used to indicate if the tree search is meant for a read only * lookup, for a modification lookup or for a deletion lookup, so * its value should be 0, 1 or -1, respectively. * * Returns: NULL if the dir item does not exists, an error pointer if an error * happened, or a pointer to a dir item if a dir item exists for the given name. */ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const struct fscrypt_str *name, int mod) { struct btrfs_key key; struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; return di; } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, const struct fscrypt_str *name) { int ret; struct btrfs_key key; struct btrfs_dir_item *di; int data_size; struct extent_buffer *leaf; int slot; BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = dir_ino; key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name->name, name->len); di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ if (ret == -ENOENT) return 0; if (ret < 0) return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ return -EEXIST; } /* See if there is room in the item to insert this name. */ data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { return -EOVERFLOW; } /* Plenty of insertion room. */ return 0; } /* * Lookup for a directory index item by name and index number. * * @trans: The transaction handle to use. Can be NULL if @mod is 0. * @root: The root of the target tree. * @path: Path to use for the search. * @dir: The inode number (objectid) of the directory. * @index: The index number. * @name: The name associated to the directory entry we are looking for. * @name_len: The length of the name. * @mod: Used to indicate if the tree search is meant for a read only * lookup, for a modification lookup or for a deletion lookup, so * its value should be 0, 1 or -1, respectively. * * Returns: NULL if the dir index item does not exists, an error pointer if an * error happened, or a pointer to a dir item if the dir index item exists and * matches the criteria (name and index number). */ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; return di; } struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; di = btrfs_match_dir_item_name(path, name->name, name->len); if (di) return di; } /* Adjust return code if the key was not found in the next leaf. */ if (ret >= 0) ret = -ENOENT; return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, const char *name, u16 name_len, int mod) { struct btrfs_key key; struct btrfs_dir_item *di; key.objectid = dir; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; return di; } /* * helper function to look at the directory item pointed to by 'path' * this walks through all the entries in a dir item and finds one * for a specific name. */ struct btrfs_dir_item *btrfs_match_dir_item_name(const struct btrfs_path *path, const char *name, int name_len) { struct btrfs_dir_item *dir_item; unsigned long name_ptr; u32 total_len; u32 cur = 0; u32 this_len; struct extent_buffer *leaf; leaf = path->nodes[0]; dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); total_len = btrfs_item_size(leaf, path->slots[0]); while (cur < total_len) { this_len = sizeof(*dir_item) + btrfs_dir_name_len(leaf, dir_item) + btrfs_dir_data_len(leaf, dir_item); name_ptr = (unsigned long)(dir_item + 1); if (btrfs_dir_name_len(leaf, dir_item) == name_len && memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) return dir_item; cur += this_len; dir_item = (struct btrfs_dir_item *)((char *)dir_item + this_len); } return NULL; } /* * given a pointer into a directory item, delete it. This * handles items that have more than one entry in them. */ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_dir_item *di) { struct extent_buffer *leaf; u32 sub_item_len; u32 item_len; int ret = 0; leaf = path->nodes[0]; sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + btrfs_dir_data_len(leaf, di); item_len = btrfs_item_size(leaf, path->slots[0]); if (sub_item_len == item_len) { ret = btrfs_del_item(trans, root, path); } else { /* MARKER */ unsigned long ptr = (unsigned long)di; unsigned long start; start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_len - (ptr + sub_item_len - start)); btrfs_truncate_item(trans, path, item_len - sub_item_len, 1); } return ret; }
22 22 22 22 22 22 22 22 22 22 22 22 223 223 222 14 8 22 22 22 22 22 22 22 14 8 22 22 22 22 20 22 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 /* * Rusty Russell (C)2000 -- This code is GPL. * Patrick McHardy (c) 2006-2012 */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/skbuff.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/protocol.h> #include <net/netfilter/nf_queue.h> #include <net/dst.h> #include "nf_internals.h" static const struct nf_queue_handler __rcu *nf_queue_handler; /* * Hook for nfnetlink_queue to register its queue handler. * We do this so that most of the NFQUEUE code can be modular. * * Once the queue is registered it must reinject all packets it * receives, no matter what. */ void nf_register_queue_handler(const struct nf_queue_handler *qh) { /* should never happen, we only have one queueing backend in kernel */ WARN_ON(rcu_access_pointer(nf_queue_handler)); rcu_assign_pointer(nf_queue_handler, qh); } EXPORT_SYMBOL(nf_register_queue_handler); /* The caller must flush their queue before this */ void nf_unregister_queue_handler(void) { RCU_INIT_POINTER(nf_queue_handler, NULL); } EXPORT_SYMBOL(nf_unregister_queue_handler); static void nf_queue_sock_put(struct sock *sk) { #ifdef CONFIG_INET sock_gen_put(sk); #else sock_put(sk); #endif } static void nf_queue_entry_release_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; /* Release those devices we held, or Alexey will kill me. */ dev_put(state->in); dev_put(state->out); if (state->sk) nf_queue_sock_put(state->sk); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_put(entry->physin); dev_put(entry->physout); #endif } void nf_queue_entry_free(struct nf_queue_entry *entry) { nf_queue_entry_release_refs(entry); kfree(entry); } EXPORT_SYMBOL_GPL(nf_queue_entry_free); static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry) { #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) const struct sk_buff *skb = entry->skb; if (nf_bridge_info_exists(skb)) { entry->physin = nf_bridge_get_physindev(skb, entry->state.net); entry->physout = nf_bridge_get_physoutdev(skb); } else { entry->physin = NULL; entry->physout = NULL; } #endif } /* Bump dev refs so they don't vanish while packet is out */ bool nf_queue_entry_get_refs(struct nf_queue_entry *entry) { struct nf_hook_state *state = &entry->state; if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt)) return false; dev_hold(state->in); dev_hold(state->out); #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) dev_hold(entry->physin); dev_hold(entry->physout); #endif return true; } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); void nf_queue_nf_hook_drop(struct net *net) { const struct nf_queue_handler *qh; rcu_read_lock(); qh = rcu_dereference(nf_queue_handler); if (qh) qh->nf_hook_drop(net); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_queue_nf_hook_drop); static void nf_ip_saveroute(const struct sk_buff *skb, struct nf_queue_entry *entry) { struct ip_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct iphdr *iph = ip_hdr(skb); rt_info->tos = iph->tos; rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; rt_info->mark = skb->mark; } } static void nf_ip6_saveroute(const struct sk_buff *skb, struct nf_queue_entry *entry) { struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry); if (entry->state.hook == NF_INET_LOCAL_OUT) { const struct ipv6hdr *iph = ipv6_hdr(skb); rt_info->daddr = iph->daddr; rt_info->saddr = iph->saddr; rt_info->mark = skb->mark; } } static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, unsigned int index, unsigned int queuenum) { struct nf_queue_entry *entry = NULL; const struct nf_queue_handler *qh; unsigned int route_key_size; int status; /* QUEUE == DROP if no one is waiting, to be safe. */ qh = rcu_dereference(nf_queue_handler); if (!qh) return -ESRCH; switch (state->pf) { case AF_INET: route_key_size = sizeof(struct ip_rt_info); break; case AF_INET6: route_key_size = sizeof(struct ip6_rt_info); break; default: route_key_size = 0; break; } if (skb_sk_is_prefetched(skb)) { struct sock *sk = skb->sk; if (!sk_is_refcounted(sk)) { if (!refcount_inc_not_zero(&sk->sk_refcnt)) return -ENOTCONN; /* drop refcount on skb_orphan */ skb->destructor = sock_edemux; } } entry = kmalloc(sizeof(*entry) + route_key_size, GFP_ATOMIC); if (!entry) return -ENOMEM; if (skb_dst(skb) && !skb_dst_force(skb)) { kfree(entry); return -ENETDOWN; } *entry = (struct nf_queue_entry) { .skb = skb, .state = *state, .hook_index = index, .size = sizeof(*entry) + route_key_size, }; __nf_queue_entry_init_physdevs(entry); if (!nf_queue_entry_get_refs(entry)) { kfree(entry); return -ENOTCONN; } switch (entry->state.pf) { case AF_INET: nf_ip_saveroute(skb, entry); break; case AF_INET6: nf_ip6_saveroute(skb, entry); break; } status = qh->outfn(entry, queuenum); if (status < 0) { nf_queue_entry_free(entry); return status; } return 0; } /* Packets leaving via this function must come back through nf_reinject(). */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, unsigned int index, unsigned int verdict) { int ret; ret = __nf_queue(skb, state, index, verdict >> NF_VERDICT_QBITS); if (ret < 0) { if (ret == -ESRCH && (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS)) return 1; kfree_skb(skb); } return 0; } EXPORT_SYMBOL_GPL(nf_queue);
151 29 150 30 137 2 307 152 96 157 96 160 41 310 43 1 43 2 43 41 43 15 15 15 15 8 8 15 15 15 15 15 15 15 27 34 34 24 20 19 15 20 15 15 15 15 15 15 14 15 15 15 33 34 14 15 15 63 63 62 57 57 27 27 30 40 44 33 19 31 28 25 25 15 28 31 44 43 44 44 34 15 30 30 30 30 30 30 30 30 30 30 30 30 30 44 44 29 35 34 2 34 35 35 35 35 34 35 35 11 11 11 11 17 17 17 17 17 17 17 17 2 17 15 17 17 17 15 14 14 1 15 5 10 15 19 19 19 19 19 12 12 7 19 19 5 5 17 16 16 16 16 16 19 1 15 15 15 5 5 3 3 15 3 15 15 19 6 19 10 19 19 83 19 6 83 84 83 19 83 67 63 35 34 34 4 9 4 4 4 33 34 34 32 2 33 34 34 92 92 92 90 91 91 92 54 54 54 54 54 140 140 96 96 95 3 92 96 92 141 139 54 138 5 1 56 140 55 56 22 21 56 51 51 51 31 56 56 54 138 58 8 8 8 73 74 74 9 8 10 74 74 93 93 48 46 74 73 93 38 24 24 24 24 100 100 3 3 3 3 3 31 20 19 20 2 2 33 1 33 3 223 223 222 222 222 222 222 221 2 2 2 2 2 2 222 284 11 284 284 271 271 46 46 46 46 284 271 46 26 26 301 301 301 301 301 301 301 301 287 287 6 285 284 253 284 284 284 284 284 3 283 275 275 275 275 284 64 283 271 284 63 253 3 284 6 285 52 9 9 9 9 170 171 164 165 164 171 8 8 62 62 62 31 31 31 31 31 20 20 24 31 24 22 22 31 31 31 22 4 22 16 12 12 12 12 12 12 11 12 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 4 4 4 4 4 4 4 1 4 4 4 4 4 60 60 60 41 60 59 17 44 2 43 1 1 42 42 42 11 1 11 41 42 41 42 42 37 7 42 42 42 42 59 60 35 36 41 40 40 36 36 36 36 35 36 36 36 36 36 19 15 4 15 5 1 4 4 14 14 7 14 24 24 23 19 4 24 22 22 7 57 1 1 1 57 29 29 29 29 29 29 29 57 57 57 57 57 57 31 17 26 55 55 55 26 29 29 28 29 29 28 29 57 111 110 111 111 77 78 8 1 77 54 7 7 7 7 7 7 7 7 31 30 14 1 1 287 287 287 287 21 21 290 290 290 290 287 287 6 284 101 111 109 111 77 73 73 7 7 7 7 7 7 70 21 21 71 71 111 110 244 244 244 244 244 10 244 244 234 242 239 239 242 242 242 241 242 242 242 1 1 17 9 3 9 14 19 19 19 17 17 17 17 16 10 10 10 9 9 9 17 17 14 13 5 13 17 14 90 90 90 89 6 1 5 89 89 75 75 75 75 51 4 7 54 24 9 45 45 43 45 45 45 1 45 2 45 45 45 45 43 44 45 45 1 42 1 43 1 1 43 45 43 45 45 43 2 45 45 45 45 8 43 43 45 13 45 43 45 45 28 9 28 5 28 45 45 43 2 43 41 8 45 1 45 43 2 40 45 2 2 43 45 45 43 45 44 45 8 43 1 1 1 45 54 2 52 3 5 54 54 54 53 54 37 37 37 37 1 1 37 6 6 6 25 25 25 3 3 3 3 3 3 3 3 3 3 3 3 3 3 11 11 11 3 3 3 3 3 46 35 35 35 3 32 32 112 87 87 2 89 88 89 89 89 58 88 88 71 26 26 50 50 87 55 23 23 1 33 57 46 46 46 46 45 46 55 89 89 89 113 113 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 41 15 15 32 32 32 41 41 40 40 46 46 44 44 44 46 24 27 22 18 22 18 4 17 18 18 18 16 18 18 18 17 18 14 10 14 14 14 14 14 14 1 1 1 14 14 14 14 14 10 2 2 2 341 341 340 339 341 56 57 57 57 56 57 57 53 53 53 57 48 57 57 57 57 65 65 51 21 54 1 54 1 54 1 54 1 54 1 54 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 3 8 8 8 8 8 8 8 8 7 8 7 8 8 8 8 7 7 18 18 18 11 11 2 17 18 17 17 10 10 17 7 17 7 13 17 17 17 15 3 17 10 17 17 17 17 17 3 17 17 17 10 10 10 10 9 17 17 15 17 17 7 7 13 17 17 7 7 17 24 24 8 18 24 24 2 2 15 2 15 2 2 2 1 2 2 2 2 2 2 2 2 15 15 2 15 15 15 15 2 2 2 2 2 2 2 4 3 3 3 3 3 3 3 3 4 27 27 27 27 27 1 1 27 26 27 27 1 26 26 26 24 23 23 26 25 26 27 1 26 26 312 108 108 106 313 8 8 8 8 8 8 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 44 36 44 13 44 40 44 24 3 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 7057 7058 7059 7060 7061 7062 7063 7064 7065 7066 7067 7068 7069 7070 7071 7072 7073 7074 7075 7076 7077 7078 7079 7080 7081 7082 7083 7084 7085 7086 7087 7088 7089 7090 7091 7092 7093 7094 7095 7096 7097 7098 7099 7100 7101 7102 7103 7104 7105 7106 7107 7108 7109 7110 7111 7112 7113 7114 7115 7116 7117 7118 7119 7120 7121 7122 7123 7124 7125 7126 7127 7128 7129 7130 7131 7132 7133 7134 7135 7136 7137 7138 7139 7140 7141 7142 7143 7144 7145 7146 7147 7148 7149 7150 7151 7152 7153 7154 7155 7156 7157 7158 7159 7160 7161 7162 7163 7164 7165 7166 7167 7168 7169 7170 7171 7172 7173 7174 7175 7176 7177 7178 7179 7180 7181 7182 7183 7184 7185 7186 7187 7188 7189 7190 7191 7192 7193 7194 7195 7196 7197 7198 7199 7200 7201 7202 7203 7204 7205 7206 7207 7208 7209 7210 7211 7212 7213 7214 7215 7216 7217 7218 7219 7220 7221 7222 7223 7224 7225 7226 7227 7228 7229 7230 7231 7232 7233 7234 7235 7236 7237 7238 7239 7240 7241 7242 7243 7244 7245 7246 7247 7248 7249 7250 7251 7252 7253 7254 7255 7256 7257 7258 7259 7260 7261 7262 7263 7264 7265 7266 7267 7268 7269 7270 7271 7272 7273 7274 7275 7276 7277 7278 7279 7280 7281 7282 7283 7284 7285 7286 7287 7288 7289 7290 7291 7292 7293 7294 7295 7296 7297 7298 7299 7300 7301 7302 7303 7304 7305 7306 7307 7308 7309 7310 7311 7312 7313 7314 7315 7316 7317 7318 7319 7320 7321 7322 7323 7324 7325 7326 7327 7328 7329 7330 7331 7332 7333 7334 7335 7336 7337 7338 7339 7340 7341 7342 7343 7344 7345 7346 7347 7348 7349 7350 7351 7352 7353 7354 7355 7356 7357 7358 7359 7360 7361 7362 7363 7364 7365 7366 7367 7368 7369 7370 7371 7372 7373 7374 7375 7376 7377 7378 7379 7380 7381 7382 7383 7384 7385 7386 7387 7388 7389 7390 7391 7392 7393 7394 7395 7396 7397 7398 7399 7400 7401 7402 7403 7404 7405 7406 7407 7408 7409 7410 7411 7412 7413 7414 7415 7416 7417 7418 7419 7420 7421 7422 7423 7424 7425 7426 7427 7428 7429 7430 7431 7432 7433 7434 7435 7436 7437 7438 7439 7440 7441 7442 7443 7444 7445 7446 7447 7448 7449 7450 7451 7452 7453 7454 7455 7456 7457 7458 7459 7460 7461 7462 7463 7464 7465 7466 7467 7468 7469 7470 7471 7472 7473 7474 7475 7476 7477 7478 7479 7480 7481 7482 7483 7484 7485 7486 7487 7488 7489 7490 7491 7492 7493 7494 7495 7496 7497 7498 7499 7500 7501 7502 7503 7504 7505 7506 7507 7508 7509 7510 7511 7512 7513 7514 7515 7516 7517 7518 7519 7520 7521 7522 7523 7524 7525 7526 7527 7528 7529 7530 7531 7532 7533 7534 7535 7536 7537 7538 7539 7540 7541 7542 7543 7544 7545 7546 7547 7548 7549 7550 7551 7552 7553 7554 7555 7556 7557 7558 7559 7560 7561 7562 7563 7564 7565 7566 7567 7568 7569 7570 7571 7572 7573 7574 7575 7576 7577 7578 7579 7580 7581 7582 7583 7584 7585 7586 7587 7588 7589 7590 7591 7592 7593 7594 7595 7596 7597 7598 7599 7600 7601 7602 7603 7604 7605 7606 7607 7608 7609 7610 7611 7612 7613 7614 7615 7616 7617 7618 7619 7620 7621 7622 7623 7624 7625 7626 7627 7628 7629 7630 7631 7632 7633 7634 7635 7636 7637 7638 7639 7640 7641 7642 7643 7644 7645 7646 7647 7648 7649 7650 7651 7652 7653 7654 7655 7656 7657 7658 7659 7660 7661 7662 7663 7664 7665 7666 7667 7668 7669 7670 7671 7672 7673 7674 7675 7676 7677 7678 7679 7680 7681 7682 7683 7684 7685 7686 7687 7688 7689 7690 7691 7692 7693 7694 7695 7696 7697 7698 7699 7700 7701 7702 7703 7704 7705 7706 7707 7708 7709 7710 7711 7712 7713 7714 7715 7716 7717 7718 7719 7720 7721 7722 7723 7724 7725 7726 7727 7728 7729 7730 7731 7732 7733 7734 7735 7736 7737 7738 7739 7740 7741 7742 7743 7744 7745 7746 7747 7748 7749 7750 7751 7752 7753 7754 7755 7756 7757 7758 7759 7760 7761 7762 7763 7764 7765 7766 7767 7768 7769 7770 7771 7772 7773 7774 7775 7776 7777 7778 7779 7780 7781 7782 7783 7784 7785 7786 7787 7788 7789 7790 7791 7792 7793 7794 7795 7796 7797 7798 7799 7800 7801 7802 7803 7804 7805 7806 7807 7808 7809 7810 7811 7812 7813 7814 7815 7816 7817 7818 7819 7820 7821 7822 7823 7824 7825 7826 7827 7828 7829 7830 7831 7832 7833 7834 7835 7836 7837 7838 7839 7840 7841 7842 7843 7844 7845 7846 7847 7848 7849 7850 7851 7852 7853 7854 7855 7856 7857 7858 7859 7860 7861 7862 7863 7864 7865 7866 7867 7868 7869 7870 7871 7872 7873 7874 7875 7876 7877 7878 7879 7880 7881 7882 7883 7884 7885 7886 7887 7888 7889 7890 7891 7892 7893 7894 7895 7896 7897 7898 7899 7900 7901 7902 7903 7904 7905 7906 7907 7908 7909 7910 7911 7912 7913 7914 7915 7916 7917 7918 7919 7920 7921 7922 7923 7924 7925 7926 7927 7928 7929 7930 7931 7932 7933 7934 7935 7936 7937 7938 7939 7940 7941 7942 7943 7944 7945 7946 7947 7948 7949 7950 7951 7952 7953 7954 7955 7956 7957 7958 7959 7960 7961 7962 7963 7964 7965 7966 7967 7968 7969 7970 7971 7972 7973 7974 7975 7976 7977 7978 7979 7980 7981 7982 7983 7984 7985 7986 7987 7988 7989 7990 7991 7992 7993 7994 7995 7996 7997 7998 7999 8000 8001 8002 8003 8004 8005 8006 8007 8008 8009 8010 8011 8012 8013 8014 8015 8016 8017 8018 8019 8020 8021 8022 8023 8024 8025 8026 8027 8028 8029 8030 8031 8032 8033 8034 8035 8036 8037 8038 8039 8040 8041 8042 8043 8044 8045 8046 8047 8048 8049 8050 8051 8052 8053 8054 8055 8056 8057 8058 8059 8060 8061 8062 8063 8064 8065 8066 8067 8068 8069 8070 8071 8072 8073 8074 8075 8076 8077 8078 8079 8080 8081 8082 8083 8084 8085 8086 8087 8088 8089 8090 8091 8092 8093 8094 8095 8096 8097 8098 8099 8100 8101 8102 8103 8104 8105 8106 8107 8108 8109 8110 8111 8112 8113 8114 8115 8116 8117 8118 8119 8120 8121 8122 8123 8124 8125 8126 8127 8128 8129 8130 8131 8132 8133 8134 8135 8136 8137 8138 8139 8140 8141 8142 8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178 8179 8180 8181 8182 8183 8184 8185 8186 8187 8188 8189 8190 8191 8192 8193 8194 8195 8196 8197 8198 8199 8200 8201 8202 8203 8204 8205 8206 8207 8208 8209 8210 8211 8212 8213 8214 8215 8216 8217 8218 8219 8220 8221 8222 8223 8224 8225 8226 8227 8228 8229 8230 8231 8232 8233 8234 8235 8236 8237 8238 8239 8240 8241 8242 8243 8244 8245 8246 8247 8248 8249 8250 8251 8252 8253 8254 8255 8256 8257 8258 8259 8260 8261 8262 8263 8264 8265 8266 8267 8268 8269 8270 8271 8272 8273 8274 8275 8276 8277 8278 8279 8280 8281 8282 8283 8284 8285 8286 8287 8288 8289 8290 8291 8292 8293 8294 8295 8296 8297 8298 8299 8300 8301 8302 8303 8304 8305 8306 8307 8308 8309 8310 8311 8312 8313 8314 8315 8316 8317 8318 8319 8320 8321 8322 8323 8324 8325 8326 8327 8328 8329 8330 8331 8332 8333 8334 8335 8336 8337 8338 8339 8340 8341 8342 8343 8344 8345 8346 8347 8348 8349 8350 8351 8352 8353 8354 8355 8356 8357 8358 8359 8360 8361 8362 8363 8364 8365 8366 8367 8368 8369 8370 8371 8372 8373 8374 8375 8376 8377 8378 8379 8380 8381 8382 8383 8384 8385 8386 8387 8388 8389 8390 8391 8392 8393 8394 8395 8396 8397 8398 8399 8400 8401 8402 8403 8404 8405 8406 8407 8408 8409 8410 8411 8412 8413 8414 8415 8416 8417 8418 8419 8420 8421 8422 8423 8424 8425 8426 8427 8428 8429 8430 8431 8432 8433 8434 8435 8436 8437 8438 8439 8440 8441 8442 8443 8444 8445 8446 8447 8448 8449 8450 8451 8452 8453 8454 8455 8456 8457 8458 8459 8460 8461 8462 8463 8464 8465 8466 8467 8468 8469 8470 8471 8472 8473 8474 8475 8476 8477 8478 8479 8480 8481 8482 8483 8484 8485 8486 8487 8488 8489 8490 8491 8492 8493 8494 8495 8496 8497 8498 8499 8500 8501 8502 8503 8504 8505 8506 8507 8508 8509 8510 8511 8512 8513 8514 8515 8516 8517 8518 8519 8520 8521 8522 8523 8524 8525 8526 8527 8528 8529 8530 8531 8532 8533 8534 8535 8536 8537 8538 8539 8540 8541 8542 8543 8544 8545 8546 8547 8548 8549 8550 8551 8552 8553 8554 8555 8556 8557 8558 8559 8560 8561 8562 8563 8564 8565 8566 8567 8568 8569 8570 8571 8572 8573 8574 8575 8576 8577 8578 8579 8580 8581 8582 8583 8584 8585 8586 8587 8588 8589 8590 8591 8592 8593 8594 8595 8596 8597 8598 8599 8600 8601 8602 8603 8604 8605 8606 8607 8608 8609 8610 8611 8612 8613 8614 8615 8616 8617 8618 8619 8620 8621 8622 8623 8624 8625 8626 8627 8628 8629 8630 8631 8632 8633 8634 8635 8636 8637 8638 8639 8640 8641 8642 8643 8644 8645 8646 8647 8648 8649 8650 8651 8652 8653 8654 8655 8656 8657 8658 8659 8660 8661 8662 8663 8664 8665 8666 8667 8668 8669 8670 8671 8672 8673 8674 8675 8676 8677 8678 8679 8680 8681 8682 8683 8684 8685 8686 8687 8688 8689 8690 8691 8692 8693 8694 8695 8696 8697 8698 8699 8700 8701 8702 8703 8704 8705 8706 8707 8708 8709 8710 8711 8712 8713 8714 8715 8716 8717 8718 8719 8720 8721 8722 8723 8724 8725 8726 8727 8728 8729 8730 8731 8732 8733 8734 8735 8736 8737 8738 8739 8740 8741 8742 8743 8744 8745 8746 8747 8748 8749 8750 8751 8752 8753 8754 8755 8756 8757 8758 8759 8760 8761 8762 8763 8764 8765 8766 8767 8768 8769 8770 8771 8772 8773 8774 8775 8776 8777 8778 8779 8780 8781 8782 8783 8784 8785 8786 8787 8788 8789 8790 8791 8792 8793 8794 8795 8796 8797 8798 8799 8800 8801 8802 8803 8804 8805 8806 8807 8808 8809 8810 8811 8812 8813 8814 8815 8816 8817 8818 8819 8820 8821 8822 8823 8824 8825 8826 8827 8828 8829 8830 8831 8832 8833 8834 8835 8836 8837 8838 8839 8840 8841 8842 8843 8844 8845 8846 8847 8848 8849 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 8860 8861 8862 8863 8864 8865 8866 8867 8868 8869 8870 8871 8872 8873 8874 8875 8876 8877 8878 8879 8880 8881 8882 8883 8884 8885 8886 8887 8888 8889 8890 8891 8892 8893 8894 8895 8896 8897 8898 8899 8900 8901 8902 8903 8904 8905 8906 8907 8908 8909 8910 8911 8912 8913 8914 8915 8916 8917 8918 8919 8920 8921 8922 8923 8924 8925 8926 8927 8928 8929 8930 8931 8932 8933 8934 8935 8936 8937 8938 8939 8940 8941 8942 8943 8944 8945 8946 8947 8948 8949 8950 8951 8952 8953 8954 8955 8956 8957 8958 8959 8960 8961 8962 8963 8964 8965 8966 8967 8968 8969 8970 8971 8972 8973 8974 8975 8976 8977 8978 8979 8980 8981 8982 8983 8984 8985 8986 8987 8988 8989 8990 8991 8992 8993 8994 8995 8996 8997 8998 8999 9000 9001 9002 9003 9004 9005 9006 9007 9008 9009 9010 9011 9012 9013 9014 9015 9016 9017 9018 9019 9020 9021 9022 9023 9024 9025 9026 9027 9028 9029 9030 9031 9032 9033 9034 9035 9036 9037 9038 9039 9040 9041 9042 9043 9044 9045 9046 9047 9048 9049 9050 9051 9052 9053 9054 9055 9056 9057 9058 9059 9060 9061 9062 9063 9064 9065 9066 9067 9068 9069 9070 9071 9072 9073 9074 9075 9076 9077 9078 9079 9080 9081 9082 9083 9084 9085 9086 9087 9088 9089 9090 9091 9092 9093 9094 9095 9096 9097 9098 9099 9100 9101 9102 9103 9104 9105 9106 9107 9108 9109 9110 9111 9112 9113 9114 9115 9116 9117 9118 9119 9120 9121 9122 9123 9124 9125 9126 9127 9128 9129 9130 9131 9132 9133 9134 9135 9136 9137 9138 9139 9140 9141 9142 9143 9144 9145 9146 9147 9148 9149 9150 9151 9152 9153 9154 9155 9156 9157 9158 9159 9160 9161 9162 9163 9164 9165 9166 9167 9168 9169 9170 9171 9172 9173 9174 9175 9176 9177 9178 9179 9180 9181 9182 9183 9184 9185 9186 9187 9188 9189 9190 9191 9192 9193 9194 9195 9196 9197 9198 9199 9200 9201 9202 9203 9204 9205 9206 9207 9208 9209 9210 9211 9212 9213 9214 9215 9216 9217 9218 9219 9220 9221 9222 9223 9224 9225 9226 9227 9228 9229 9230 9231 9232 9233 9234 9235 9236 9237 9238 9239 9240 9241 9242 9243 9244 9245 9246 9247 9248 9249 9250 9251 9252 9253 9254 9255 9256 9257 9258 9259 9260 9261 9262 9263 9264 9265 9266 9267 9268 9269 9270 9271 9272 9273 9274 9275 9276 9277 9278 9279 9280 9281 9282 9283 9284 9285 9286 9287 9288 9289 9290 9291 9292 9293 9294 9295 9296 9297 9298 9299 9300 9301 9302 9303 9304 9305 9306 9307 9308 9309 9310 9311 9312 9313 9314 9315 9316 9317 9318 9319 9320 9321 9322 9323 9324 9325 9326 9327 9328 9329 9330 9331 9332 9333 9334 9335 9336 9337 9338 9339 9340 9341 9342 9343 9344 9345 9346 9347 9348 9349 9350 9351 9352 9353 9354 9355 9356 9357 9358 9359 9360 9361 9362 9363 9364 9365 9366 9367 9368 9369 9370 9371 9372 9373 9374 9375 9376 9377 9378 9379 9380 9381 9382 9383 9384 9385 9386 9387 9388 9389 9390 9391 9392 9393 9394 9395 9396 9397 9398 9399 9400 9401 9402 9403 9404 9405 9406 9407 9408 9409 9410 9411 9412 9413 9414 9415 9416 9417 9418 9419 9420 9421 9422 9423 9424 9425 9426 9427 9428 9429 9430 9431 9432 9433 9434 9435 9436 9437 9438 9439 9440 9441 9442 9443 9444 9445 9446 9447 9448 9449 9450 9451 9452 9453 9454 9455 9456 9457 9458 9459 9460 9461 9462 9463 9464 9465 9466 9467 9468 9469 9470 9471 9472 9473 9474 9475 9476 9477 9478 9479 9480 9481 9482 9483 9484 9485 9486 9487 9488 9489 9490 9491 9492 9493 9494 9495 9496 9497 9498 9499 9500 9501 9502 9503 9504 9505 9506 9507 9508 9509 9510 9511 9512 9513 9514 9515 9516 9517 9518 9519 9520 9521 9522 9523 9524 9525 9526 9527 9528 9529 9530 9531 9532 9533 9534 9535 9536 9537 9538 9539 9540 9541 9542 9543 9544 9545 9546 9547 9548 9549 9550 9551 9552 9553 9554 9555 9556 9557 9558 9559 9560 9561 9562 9563 9564 9565 9566 9567 9568 9569 9570 9571 9572 9573 9574 9575 9576 9577 9578 9579 9580 9581 9582 9583 9584 9585 9586 9587 9588 9589 9590 9591 9592 9593 9594 9595 9596 9597 9598 9599 9600 9601 9602 9603 9604 9605 9606 9607 9608 9609 9610 9611 9612 9613 9614 9615 9616 9617 9618 9619 9620 9621 9622 9623 9624 9625 9626 9627 9628 9629 9630 9631 9632 9633 9634 9635 9636 9637 9638 9639 9640 9641 9642 9643 9644 9645 9646 9647 9648 9649 9650 9651 9652 9653 9654 9655 9656 9657 9658 9659 9660 9661 9662 9663 9664 9665 9666 9667 9668 9669 9670 9671 9672 9673 9674 9675 9676 9677 9678 9679 9680 9681 9682 9683 9684 9685 9686 9687 9688 9689 9690 9691 9692 9693 9694 9695 9696 9697 9698 9699 9700 9701 9702 9703 9704 9705 9706 9707 9708 9709 9710 9711 9712 9713 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723 9724 9725 9726 9727 9728 9729 9730 9731 9732 9733 9734 9735 9736 9737 9738 9739 9740 9741 9742 9743 9744 9745 9746 9747 9748 9749 9750 9751 9752 9753 9754 9755 9756 9757 9758 9759 9760 9761 9762 9763 9764 9765 9766 9767 9768 9769 9770 9771 9772 9773 9774 9775 9776 9777 9778 9779 9780 9781 9782 9783 9784 9785 9786 9787 9788 9789 9790 9791 9792 9793 9794 9795 9796 9797 9798 9799 9800 9801 9802 9803 9804 9805 9806 9807 9808 9809 9810 9811 9812 9813 9814 9815 9816 9817 9818 9819 9820 9821 9822 9823 9824 9825 9826 9827 9828 9829 9830 9831 9832 9833 9834 9835 9836 9837 9838 9839 9840 9841 9842 9843 9844 9845 9846 9847 9848 9849 9850 9851 9852 9853 9854 9855 9856 9857 9858 9859 9860 9861 9862 9863 9864 9865 9866 9867 9868 9869 9870 9871 9872 9873 9874 9875 9876 9877 9878 9879 9880 9881 9882 9883 9884 9885 9886 9887 9888 9889 9890 9891 9892 9893 9894 9895 9896 9897 9898 9899 9900 9901 9902 9903 9904 9905 9906 9907 9908 9909 9910 9911 9912 9913 9914 9915 9916 9917 9918 9919 9920 9921 9922 9923 9924 9925 9926 9927 9928 9929 9930 9931 9932 9933 9934 9935 9936 9937 9938 9939 9940 9941 9942 9943 9944 9945 9946 9947 9948 9949 9950 9951 9952 9953 9954 9955 9956 9957 9958 9959 9960 9961 9962 9963 9964 9965 9966 9967 9968 9969 9970 9971 9972 9973 9974 9975 9976 9977 9978 9979 9980 9981 9982 9983 9984 9985 9986 9987 9988 9989 9990 9991 9992 9993 9994 9995 9996 9997 9998 9999 10000 10001 10002 10003 10004 10005 10006 10007 10008 10009 10010 10011 10012 10013 10014 10015 10016 10017 10018 10019 10020 10021 10022 10023 10024 10025 10026 10027 10028 10029 10030 10031 10032 10033 10034 10035 10036 10037 10038 10039 10040 10041 10042 10043 10044 10045 10046 10047 10048 10049 10050 10051 10052 10053 10054 10055 10056 10057 10058 10059 10060 10061 10062 10063 10064 10065 10066 10067 10068 10069 10070 10071 10072 10073 10074 10075 10076 10077 10078 10079 10080 10081 10082 10083 10084 10085 10086 10087 10088 10089 10090 10091 10092 10093 10094 10095 10096 10097 10098 10099 10100 10101 10102 10103 10104 10105 10106 10107 10108 10109 10110 10111 10112 10113 10114 10115 10116 10117 10118 10119 10120 10121 10122 10123 10124 10125 10126 10127 10128 10129 10130 10131 10132 10133 10134 10135 10136 10137 10138 10139 10140 10141 10142 10143 10144 10145 10146 10147 10148 10149 10150 10151 10152 10153 10154 10155 10156 10157 10158 10159 10160 10161 10162 10163 10164 10165 10166 10167 10168 10169 10170 10171 10172 10173 10174 10175 10176 10177 10178 10179 10180 10181 10182 10183 10184 10185 10186 10187 10188 10189 10190 10191 10192 10193 10194 10195 10196 10197 10198 10199 10200 10201 10202 10203 10204 10205 10206 10207 10208 10209 10210 10211 10212 10213 10214 10215 10216 10217 10218 10219 10220 10221 10222 10223 10224 10225 10226 10227 10228 10229 10230 10231 10232 10233 10234 10235 10236 10237 10238 10239 10240 10241 10242 10243 10244 10245 10246 10247 10248 10249 10250 10251 10252 10253 10254 10255 10256 10257 10258 10259 10260 10261 10262 10263 10264 10265 10266 10267 10268 10269 10270 10271 10272 10273 10274 10275 10276 10277 10278 10279 10280 10281 10282 10283 10284 10285 10286 10287 10288 10289 10290 10291 10292 10293 10294 10295 10296 10297 10298 10299 10300 10301 10302 10303 10304 10305 10306 10307 10308 10309 10310 10311 10312 10313 10314 10315 10316 10317 10318 10319 10320 10321 10322 10323 10324 10325 10326 10327 10328 10329 10330 10331 10332 10333 10334 10335 10336 10337 10338 10339 10340 10341 10342 10343 10344 10345 10346 10347 10348 10349 10350 10351 10352 10353 10354 10355 10356 10357 10358 10359 10360 10361 10362 10363 10364 10365 10366 10367 10368 10369 10370 10371 10372 10373 10374 10375 10376 10377 10378 10379 10380 10381 10382 10383 10384 10385 10386 10387 10388 10389 10390 10391 10392 10393 10394 10395 10396 10397 10398 10399 10400 10401 10402 10403 10404 10405 10406 10407 10408 10409 10410 10411 10412 10413 10414 10415 10416 10417 10418 10419 10420 10421 10422 10423 10424 10425 10426 10427 10428 10429 10430 10431 10432 10433 10434 10435 10436 10437 10438 10439 10440 10441 10442 10443 10444 10445 10446 10447 10448 10449 10450 10451 10452 10453 10454 10455 10456 10457 10458 10459 10460 10461 10462 10463 10464 10465 10466 10467 10468 10469 10470 10471 10472 10473 10474 10475 10476 10477 10478 10479 10480 10481 10482 10483 10484 10485 10486 10487 10488 10489 10490 10491 10492 10493 10494 10495 10496 10497 10498 10499 10500 10501 10502 10503 10504 10505 10506 10507 10508 10509 10510 10511 10512 10513 10514 10515 10516 10517 10518 10519 10520 10521 10522 10523 10524 10525 10526 10527 10528 10529 10530 10531 10532 10533 10534 10535 10536 10537 10538 10539 10540 10541 10542 10543 10544 10545 10546 10547 10548 10549 10550 10551 10552 10553 10554 10555 10556 10557 10558 10559 10560 10561 10562 10563 10564 10565 10566 10567 10568 10569 10570 10571 10572 10573 10574 10575 10576 10577 10578 10579 10580 10581 10582 10583 10584 10585 10586 10587 10588 10589 10590 10591 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <crypto/hash.h> #include <linux/kernel.h> #include <linux/bio.h> #include <linux/blk-cgroup.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/highmem.h> #include <linux/time.h> #include <linux/init.h> #include <linux/string.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/compat.h> #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/falloc.h> #include <linux/slab.h> #include <linux/ratelimit.h> #include <linux/btrfs.h> #include <linux/blkdev.h> #include <linux/posix_acl_xattr.h> #include <linux/uio.h> #include <linux/magic.h> #include <linux/iversion.h> #include <linux/swap.h> #include <linux/migrate.h> #include <linux/sched/mm.h> #include <linux/iomap.h> #include <linux/unaligned.h> #include <linux/fsverity.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" #include "bio.h" #include "compression.h" #include "locking.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" #include "block-group.h" #include "space-info.h" #include "zoned.h" #include "subpage.h" #include "inode-item.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "dir-item.h" #include "file-item.h" #include "uuid-tree.h" #include "ioctl.h" #include "file.h" #include "acl.h" #include "relocation.h" #include "verity.h" #include "super.h" #include "orphan.h" #include "backref.h" #include "raid-stripe-tree.h" #include "fiemap.h" #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) #define COW_FILE_RANGE_NO_INLINE (1UL << 1) struct btrfs_iget_args { u64 ino; struct btrfs_root *root; }; struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. */ u64 index; }; /* * Used by data_reloc_print_warning_inode() to pass needed info for filename * resolution and output of error message. */ struct data_reloc_warn { struct btrfs_path path; struct btrfs_fs_info *fs_info; u64 extent_item_size; u64 logical; int mirror_num; }; /* * For the file_extent_tree, we want to hold the inode lock when we lookup and * update the disk_i_size, but lockdep will complain because our io_tree we hold * the tree lock and get the inode lock when setting delalloc. These two things * are unrelated, so make a class for the file_extent_tree so we don't get the * two locking patterns mixed up. */ static struct lock_class_key file_extent_tree_class; static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty); static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, u64 root, void *warn_ctx) { struct data_reloc_warn *warn = warn_ctx; struct btrfs_fs_info *fs_info = warn->fs_info; struct extent_buffer *eb; struct btrfs_inode_item *inode_item; struct inode_fs_paths *ipath = NULL; struct btrfs_root *local_root; struct btrfs_key key; unsigned int nofs_flag; u32 nlink; int ret; local_root = btrfs_get_fs_root(fs_info, root, true); if (IS_ERR(local_root)) { ret = PTR_ERR(local_root); goto err; } /* This makes the path point to (inum INODE_ITEM ioff). */ key.objectid = inum; key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0); if (ret) { btrfs_put_root(local_root); btrfs_release_path(&warn->path); goto err; } eb = warn->path.nodes[0]; inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item); nlink = btrfs_inode_nlink(eb, inode_item); btrfs_release_path(&warn->path); nofs_flag = memalloc_nofs_save(); ipath = init_ipath(4096, local_root, &warn->path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(ipath)) { btrfs_put_root(local_root); ret = PTR_ERR(ipath); ipath = NULL; /* * -ENOMEM, not a critical error, just output an generic error * without filename. */ btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu", warn->logical, warn->mirror_num, root, inum, offset); return ret; } ret = paths_from_inode(inum, ipath); if (ret < 0) goto err; /* * We deliberately ignore the bit ipath might have been too small to * hold all of the paths here */ for (int i = 0; i < ipath->fspath->elem_cnt; i++) { btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)", warn->logical, warn->mirror_num, root, inum, offset, fs_info->sectorsize, nlink, (char *)(unsigned long)ipath->fspath->val[i]); } btrfs_put_root(local_root); free_ipath(ipath); return 0; err: btrfs_warn(fs_info, "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d", warn->logical, warn->mirror_num, root, inum, offset, ret); free_ipath(ipath); return ret; } /* * Do extra user-friendly error output (e.g. lookup all the affected files). * * Return true if we succeeded doing the backref lookup. * Return false if such lookup failed, and has to fallback to the old error message. */ static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off, const u8 *csum, const u8 *csum_expected, int mirror_num) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_path path = { 0 }; struct btrfs_key found_key = { 0 }; struct extent_buffer *eb; struct btrfs_extent_item *ei; const u32 csum_size = fs_info->csum_size; u64 logical; u64 flags; u32 item_size; int ret; mutex_lock(&fs_info->reloc_mutex); logical = btrfs_get_reloc_bg_bytenr(fs_info); mutex_unlock(&fs_info->reloc_mutex); if (logical == U64_MAX) { btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation"); btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); return; } logical += file_off; btrfs_warn_rl(fs_info, "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(inode->root), btrfs_ino(inode), file_off, logical, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags); if (ret < 0) { btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d", logical, ret); return; } eb = path.nodes[0]; ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item); item_size = btrfs_item_size(eb, path.slots[0]); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { unsigned long ptr = 0; u64 ref_root; u8 ref_level; while (true) { ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, item_size, &ref_root, &ref_level); if (ret < 0) { btrfs_warn_rl(fs_info, "failed to resolve tree backref for logical %llu: %d", logical, ret); break; } if (ret > 0) break; btrfs_warn_rl(fs_info, "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu", logical, mirror_num, (ref_level ? "node" : "leaf"), ref_level, ref_root); } btrfs_release_path(&path); } else { struct btrfs_backref_walk_ctx ctx = { 0 }; struct data_reloc_warn reloc_warn = { 0 }; btrfs_release_path(&path); ctx.bytenr = found_key.objectid; ctx.extent_item_pos = logical - found_key.objectid; ctx.fs_info = fs_info; reloc_warn.logical = logical; reloc_warn.extent_item_size = found_key.offset; reloc_warn.mirror_num = mirror_num; reloc_warn.fs_info = fs_info; iterate_extent_inodes(&ctx, true, data_reloc_print_warning_inode, &reloc_warn); } } static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; const u32 csum_size = root->fs_info->csum_size; /* For data reloc tree, it's better to do a backref lookup instead. */ if (btrfs_is_data_reloc_root(root)) return print_data_reloc_error(inode, logical_start, csum, csum_expected, mirror_num); /* Output without objectid, which is more meaningful */ if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) { btrfs_warn_rl(root->fs_info, "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } else { btrfs_warn_rl(root->fs_info, "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", btrfs_root_id(root), btrfs_ino(inode), logical_start, CSUM_FMT_VALUE(csum_size, csum), CSUM_FMT_VALUE(csum_size, csum_expected), mirror_num); } } /* * Lock inode i_rwsem based on arguments passed. * * ilock_flags can have the following bit set: * * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) down_write(&inode->i_mmap_lock); return 0; } /* * Unlock inode i_rwsem. * * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. */ void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(&inode->vfs_inode); else inode_unlock(&inode->vfs_inode); } /* * Cleanup all submitted ordered extents in specified range to handle errors * from the btrfs_run_delalloc_range() callback. * * NOTE: caller must ensure that when an error happens, it can not call * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata * to be released, which we want to happen only when finishing the ordered * extent (btrfs_finish_ordered_io()). */ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, u64 offset, u64 bytes) { pgoff_t index = offset >> PAGE_SHIFT; const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT; struct folio *folio; while (index <= end_index) { folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { index++; continue; } index = folio_end(folio) >> PAGE_SHIFT; /* * Here we just clear all Ordered bits for every page in the * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio, offset, bytes); folio_put(folio); } return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) { int ret; if (args->default_acl) { ret = __btrfs_set_acl(trans, args->inode, args->default_acl, ACL_TYPE_DEFAULT); if (ret) return ret; } if (args->acl) { ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); if (ret) return ret; } if (!args->default_acl && !args->acl) cache_no_acl(args->inode); return btrfs_xattr_security_init(trans, args->inode, args->dir, &args->dentry->d_name); } /* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that * no overlapping inline items exist in the btree */ static int insert_inline_extent(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_inode *inode, bool extent_inserted, size_t size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size) { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; const u32 sectorsize = trans->fs_info->sectorsize; char *kaddr; unsigned long ptr; struct btrfs_file_extent_item *ei; int ret; size_t cur_size = size; u64 i_size; /* * The decompressed size must still be no larger than a sector. Under * heavy race, we can have size == 0 passed in, but that shouldn't be a * big deal and we can continue the insertion. */ ASSERT(size <= sectorsize); /* * The compressed size also needs to be no larger than a sector. * That's also why we only need one page as the parameter. */ if (compressed_folio) ASSERT(compressed_size <= sectorsize); else ASSERT(compressed_size == 0); if (compressed_size && compressed_folio) cur_size = compressed_size; if (!extent_inserted) { struct btrfs_key key; size_t datasize; key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (ret) goto fail; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, ei, trans->transid); btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_encryption(leaf, ei, 0); btrfs_set_file_extent_other_encoding(leaf, ei, 0); btrfs_set_file_extent_ram_bytes(leaf, ei, size); ptr = btrfs_file_extent_inline_start(ei); if (compress_type != BTRFS_COMPRESS_NONE) { kaddr = kmap_local_folio(compressed_folio, 0); write_extent_buffer(leaf, kaddr, ptr, compressed_size); kunmap_local(kaddr); btrfs_set_file_extent_compression(leaf, ei, compress_type); } else { struct folio *folio; folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0); ASSERT(!IS_ERR(folio)); btrfs_set_file_extent_compression(leaf, ei, 0); kaddr = kmap_local_folio(folio, 0); write_extent_buffer(leaf, kaddr, ptr, size); kunmap_local(kaddr); folio_put(folio); } btrfs_release_path(path); /* * We align size to sectorsize for inline extents just for simplicity * sake. */ ret = btrfs_inode_set_file_extent_range(inode, 0, ALIGN(size, root->fs_info->sectorsize)); if (ret) goto fail; /* * We're an inline extent, so nobody can extend the file past i_size * without locking a page we already have locked. * * We must do any i_size and inode updates before we unlock the pages. * Otherwise we could end up racing with unlink. */ i_size = i_size_read(&inode->vfs_inode); if (update_i_size && size > i_size) { i_size_write(&inode->vfs_inode, size); i_size = size; } inode->disk_i_size = i_size; fail: return ret; } static bool can_cow_file_range_inline(struct btrfs_inode *inode, u64 offset, u64 size, size_t compressed_size) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 data_len = (compressed_size ?: size); /* Inline extents must start at offset 0. */ if (offset != 0) return false; /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; /* We do not allow a non-compressed extent to be as large as block size. */ if (data_len >= fs_info->sectorsize) return false; /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; /* We cannot exceed the user specified max_inline size. */ if (data_len > fs_info->max_inline) return false; /* Inline extents must be the entirety of the file. */ if (size < i_size_read(&inode->vfs_inode)) return false; return true; } /* * conditionally insert an inline extent into the file. This * does the checks required to make sure the data is small enough * to fit as an inline extent. * * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). */ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size) { struct btrfs_drop_extents_args drop_args = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; u64 data_len = (compressed_size ?: size); int ret; struct btrfs_path *path; path = btrfs_alloc_path(); if (!path) return -ENOMEM; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { btrfs_free_path(path); return PTR_ERR(trans); } trans->block_rsv = &inode->block_rsv; drop_args.path = path; drop_args.start = 0; drop_args.end = fs_info->sectorsize; drop_args.drop_cache = true; drop_args.replace_extent = true; drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted, size, compressed_size, compress_type, compressed_folio, update_i_size); if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_update_inode_bytes(inode, size, drop_args.bytes_found); ret = btrfs_update_inode(trans, inode); if (unlikely(ret && ret != -ENOSPC)) { btrfs_abort_transaction(trans, ret); goto out; } else if (ret == -ENOSPC) { ret = 1; goto out; } btrfs_set_inode_full_sync(inode); out: /* * Don't forget to free the reserved space, as for inlined extent * it won't count as data extent, free them directly here. * And at reserve time, it's always aligned to page size, so * just free one page here. */ btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; } static noinline int cow_file_range_inline(struct btrfs_inode *inode, struct folio *locked_folio, u64 offset, u64 end, size_t compressed_size, int compress_type, struct folio *compressed_folio, bool update_i_size) { struct extent_state *cached = NULL; unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); int ret; if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; btrfs_lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } /* * In the successful case (ret == 0 here), cow_file_range will return 1. * * Quite a bit further up the callstack in extent_writepage(), ret == 1 * is treated as a short circuited success and does not unlock the folio, * so we must do it here. * * In the failure case, the locked_folio does get unlocked by * btrfs_folio_end_all_writers, which asserts that it is still locked * at that point, so we must *not* unlock it here. * * The other two callsites in compress_file_range do not have a * locked_folio, so they are not relevant to this logic. */ if (ret == 0) locked_folio = NULL; extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, clear_flags, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); return ret; } struct async_extent { u64 start; u64 ram_size; u64 compressed_size; struct folio **folios; unsigned long nr_folios; int compress_type; struct list_head list; }; struct async_chunk { struct btrfs_inode *inode; struct folio *locked_folio; u64 start; u64 end; blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; struct async_cow *async_cow; }; struct async_cow { atomic_t num_chunks; struct async_chunk chunks[]; }; static noinline int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, u64 compressed_size, struct folio **folios, unsigned long nr_folios, int compress_type) { struct async_extent *async_extent; async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); if (!async_extent) return -ENOMEM; async_extent->start = start; async_extent->ram_size = ram_size; async_extent->compressed_size = compressed_size; async_extent->folios = folios; async_extent->nr_folios = nr_folios; async_extent->compress_type = compress_type; list_add_tail(&async_extent->list, &cow->extents); return 0; } /* * Check if the inode needs to be submitted to compression, based on mount * options, defragmentation, properties or heuristics. */ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } /* Defrag ioctl takes precedence over mount options and properties. */ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) return 0; if (BTRFS_COMPRESS_NONE < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) return 1; /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) return 1; /* bad compression ratios */ if (inode->flags & BTRFS_INODE_NOCOMPRESS) return 0; if (btrfs_test_opt(fs_info, COMPRESS) || inode->flags & BTRFS_INODE_COMPRESS || inode->prop_compress) return btrfs_compress_heuristic(inode, start, end); return 0; } static inline void inode_should_defrag(struct btrfs_inode *inode, u64 start, u64 end, u64 num_bytes, u32 small_write) { /* If this is a small write inside eof, kick off a defrag */ if (num_bytes < small_write && (start > 0 || end + 1 < inode->disk_i_size)) btrfs_add_inode_defrag(inode, small_write); } static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { const pgoff_t end_index = end >> PAGE_SHIFT; struct folio *folio; int ret = 0; for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) { folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } return ret; } /* * Work queue call back to started compression on a file and pages. * * This is done inside an ordered work queue, and the compression is spread * across many cpus. The actual IO submission is step two, and the ordered work * queue takes care of making sure that happens in the same order things were * put onto the queue by writepages and friends. * * If this code finds it can't get good compression, it puts an entry onto the * work queue to write the uncompressed bytes. This makes sure that both * compressed inodes and uncompressed inodes are written in the same order that * the flusher thread sent them down. */ static void compress_file_range(struct btrfs_work *work) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order; const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; u64 actual_end; u64 i_size; int ret = 0; struct folio **folios; unsigned long nr_folios; unsigned long total_compressed = 0; unsigned long total_in = 0; unsigned int loff; int i; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to call clear_page_dirty_for_io on each page in the range. * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. * * And even if some folios are missing, btrfs_compress_folios() * would handle them correctly, so here just do an ASSERT() check for * early logic errors. */ ASSERT(ret == 0); /* * We need to save i_size before now because it could change in between * us evaluating the size and assigning it. This is because we lock and * unlock the page in truncate and fallocate, and then modify the i_size * later on. * * The barriers are to emulate READ_ONCE, remove that once i_size_read * does that for us. */ barrier(); i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: folios = NULL; nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1; nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift); /* * we don't want to send crud past the end of i_size through * compression, that's just a waste of CPU time. So, if the * end of the file is before the start of our current * requested range of bytes, we bail out to the uncompressed * cleanup code that can deal with all of this. * * It isn't really the fastest way to fix things, but this is a * very uncommon corner. */ if (actual_end <= start) goto cleanup_and_bail_uncompressed; total_compressed = actual_end - start; /* * Skip compression for a small file range(<=blocksize) that * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; total_compressed = min_t(unsigned long, total_compressed, BTRFS_MAX_UNCOMPRESSED); total_in = 0; ret = 0; /* * We do compression for mount -o compress and when the inode has not * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios. */ if (!inode_need_compress(inode, start, end)) goto cleanup_and_bail_uncompressed; folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS); if (!folios) { /* * Memory allocation failure is not a fatal error, we can fall * back to uncompressed code. */ goto cleanup_and_bail_uncompressed; } if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { compress_type = inode->defrag_compress; compress_level = inode->defrag_compress_level; } else if (inode->prop_compress) { compress_type = inode->prop_compress; } /* Compression level is applied here. */ ret = btrfs_compress_folios(compress_type, compress_level, inode, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) goto mark_incompressible; /* * Zero the tail end of the last folio, as we might be sending it down * to disk. */ loff = (total_compressed & (min_folio_size - 1)); if (loff) folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff); /* * Try to create an inline extent. * * If we didn't compress the entire range, try to create an uncompressed * inline extent, else a compressed one. * * Check cow_file_range() for why we don't even try to create inline * extent for the subpage case. */ if (total_in < actual_end) ret = cow_file_range_inline(inode, NULL, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); else ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, compress_type, folios[0], false); if (ret <= 0) { if (ret < 0) mapping_set_error(mapping, -EIO); goto free_pages; } /* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things. */ total_compressed = ALIGN(total_compressed, blocksize); /* * One last check to make sure the compression is really a win, compare * the page count read with the blocks on disk, compression must free at * least one sector. */ total_in = round_up(total_in, fs_info->sectorsize); if (total_compressed + blocksize > total_in) goto mark_incompressible; /* * The async work queues will take care of doing actual allocation on * disk for these compressed pages, and will submit the bios. */ ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios, nr_folios, compress_type); BUG_ON(ret); if (start + total_in < end) { start += total_in; cond_resched(); goto again; } return; mark_incompressible: if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress) inode->flags |= BTRFS_INODE_NOCOMPRESS; cleanup_and_bail_uncompressed: ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); BUG_ON(ret); free_pages: if (folios) { for (i = 0; i < nr_folios; i++) { WARN_ON(folios[i]->mapping); btrfs_free_compr_folio(folios[i]); } kfree(folios); } } static void free_async_extent_pages(struct async_extent *async_extent) { int i; if (!async_extent->folios) return; for (i = 0; i < async_extent->nr_folios; i++) { WARN_ON(async_extent->folios[i]->mapping); btrfs_free_compr_folio(async_extent->folios[i]); } kfree(async_extent->folios); async_extent->nr_folios = 0; async_extent->folios = NULL; } static void submit_uncompressed_range(struct btrfs_inode *inode, struct async_extent *async_extent, struct folio *locked_folio) { u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .range_start = start, .range_end = end, .no_cgroup_owner = 1, }; wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode); ret = run_delalloc_cow(inode, locked_folio, start, end, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); btrfs_err_rl(inode->root->fs_info, "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d", __func__, btrfs_root_id(inode->root), btrfs_ino(inode), start, async_extent->ram_size, ret); } } static void submit_one_async_extent(struct async_chunk *async_chunk, struct async_extent *async_extent, u64 *alloc_hint) { struct btrfs_inode *inode = async_chunk->inode; struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; struct btrfs_key ins; struct folio *locked_folio = NULL; struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; if (async_chunk->blkcg_css) kthread_associate_blkcg(async_chunk->blkcg_css); /* * If async_chunk->locked_folio is in the async_extent range, we need to * handle it. */ if (async_chunk->locked_folio) { u64 locked_folio_start = folio_pos(async_chunk->locked_folio); u64 locked_folio_end = locked_folio_start + folio_size(async_chunk->locked_folio) - 1; if (!(start >= locked_folio_end || end <= locked_folio_start)) locked_folio = async_chunk->locked_folio; } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { ASSERT(!async_extent->folios); ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); free_pages = true; goto done; } ret = btrfs_reserve_extent(root, async_extent->ram_size, async_extent->compressed_size, async_extent->compressed_size, 0, *alloc_hint, &ins, 1, 1); if (ret) { /* * We can't reserve contiguous space for the compressed size. * Unlikely, but it's possible that we could have enough * non-contiguous space for the uncompressed size instead. So * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); free_pages = true; goto done; } btrfs_lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.ram_bytes = async_extent->ram_size; file_extent.num_bytes = async_extent->ram_size; file_extent.offset = 0; file_extent.compression = async_extent->compress_type; em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out_free_reserve; } btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); goto out_free_reserve; } btrfs_dec_block_group_reservations(fs_info, ins.objectid); /* Clear dirty, set writeback and unlock the pages. */ extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK); btrfs_submit_compressed_write(ordered, async_extent->folios, /* compressed_folios */ async_extent->nr_folios, async_chunk->write_flags, true); *alloc_hint = ins.objectid + ins.offset; done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); if (free_pages) free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); free_async_extent_pages(async_extent); if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); btrfs_debug(fs_info, "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d", btrfs_root_id(root), btrfs_ino(inode), start, async_extent->ram_size, ret); kfree(async_extent); } u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 num_bytes) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; u64 alloc_hint = 0; read_lock(&em_tree->lock); em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the * first block in this inode and use that as a hint. If that * block is also bogus then just don't worry about it. */ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { btrfs_free_extent_map(em); em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) alloc_hint = btrfs_extent_map_block_start(em); if (em) btrfs_free_extent_map(em); } else { alloc_hint = btrfs_extent_map_block_start(em); btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); return alloc_hint; } /* * when extent_io.c finds a delayed allocation range in the file, * the call backs end up in this code. The basic idea is to * allocate extents on disk for the range, and create ordered data structs * in ram to track those extents. * * locked_folio is the folio that writepage had locked already. We use * it to make sure we don't do extra locks or unlocks. * * When this function fails, it unlocks all folios except @locked_folio. * * When this function successfully creates an inline extent, it returns 1 and * unlocks all folios including locked_folio and starts I/O on them. * (In reality inline extents are limited to a single block, so locked_folio is * the only folio handled anyway). * * When this function succeed and creates a normal extent, the folio locking * status depends on the passed in flags: * * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked. * - Else all folios except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, u64 *done_offset, unsigned long flags) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_state *cached = NULL; u64 alloc_hint = 0; u64 orig_start = start; u64 num_bytes; u64 cur_alloc_size = 0; u64 min_alloc_size; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; unsigned clear_bits; unsigned long page_ops; int ret = 0; if (btrfs_is_free_space_inode(inode)) { ret = -EINVAL; goto out_unlock; } num_bytes = ALIGN(end - start + 1, blocksize); num_bytes = max(blocksize, num_bytes); ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(inode, start, end, num_bytes, SZ_64K); if (!(flags & COW_FILE_RANGE_NO_INLINE)) { /* lets try to make an inline extent */ ret = cow_file_range_inline(inode, locked_folio, start, end, 0, BTRFS_COMPRESS_NONE, NULL, false); if (ret <= 0) { /* * We succeeded, return 1 so the caller knows we're done * with this page and already handled the IO. * * If there was an error then cow_file_range_inline() has * already done the cleanup. */ if (ret == 0) ret = 1; goto done; } } alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* * We're not doing compressed IO, don't unlock the first page (which * the caller expects to stay locked), don't clear any dirty bits and * don't set any writeback bits. * * Do set the Ordered (Private2) bit so we know this page was properly * setup for writepage. */ page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK); page_ops |= PAGE_SET_ORDERED; /* * Relocation relies on the relocated extents to have exactly the same * size as the original extents. Normally writeback for relocation data * extents follows a NOCOW path because relocation preallocates the * extents. However, due to an operation such as scrub turning a block * group to RO mode, it may fallback to COW mode, so we must make sure * an extent allocated during COW has exactly the requested size and can * not be split into smaller extents, otherwise relocation breaks and * fails during the stage where it updates the bytenr of file extent * items. */ if (btrfs_is_data_reloc_root(root)) min_alloc_size = num_bytes; else min_alloc_size = fs_info->sectorsize; while (num_bytes > 0) { struct btrfs_ordered_extent *ordered; struct btrfs_file_extent file_extent; ret = btrfs_reserve_extent(root, num_bytes, num_bytes, min_alloc_size, 0, alloc_hint, &ins, 1, 1); if (ret == -EAGAIN) { /* * btrfs_reserve_extent only returns -EAGAIN for zoned * file systems, which is an indication that there are * no active zones to allocate from at the moment. * * If this is the first loop iteration, wait for at * least one zone to finish before retrying the * allocation. Otherwise ask the caller to write out * the already allocated blocks before coming back to * us, or return -ENOSPC if it can't handle retries. */ ASSERT(btrfs_is_zoned(fs_info)); if (start == orig_start) { wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, TASK_UNINTERRUPTIBLE); continue; } if (done_offset) { /* * Move @end to the end of the processed range, * and exit the loop to unlock the processed extents. */ end = start - 1; ret = 0; break; } ret = -ENOSPC; } if (ret < 0) goto out_unlock; cur_alloc_size = ins.offset; file_extent.disk_bytenr = ins.objectid; file_extent.disk_num_bytes = ins.offset; file_extent.num_bytes = ins.offset; file_extent.ram_bytes = ins.offset; file_extent.offset = 0; file_extent.compression = BTRFS_COMPRESS_NONE; /* * Locked range will be released either during error clean up or * after the whole range is finished. */ btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { btrfs_unlock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { btrfs_unlock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } if (btrfs_is_data_reloc_root(root)) { ret = btrfs_reloc_clone_csums(ordered); /* * Only drop cache here, and process as normal. * * We must not allow extent_clear_unlock_delalloc() * at out_unlock label to free meta of this ordered * extent, as its meta should be freed by * btrfs_finish_ordered_io(). * * So we must continue until @start is increased to * skip current ordered extent. */ if (ret) btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); } btrfs_put_ordered_extent(ordered); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (num_bytes < cur_alloc_size) num_bytes = 0; else num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; cur_alloc_size = 0; /* * btrfs_reloc_clone_csums() error, since start is increased * extent_clear_unlock_delalloc() at out_unlock label won't * free metadata of current ordered extent, we're OK to exit. */ if (ret) goto out_unlock; } extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); done: if (done_offset) *done_offset = end; return ret; out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: * * |-------(1)----|---(2)---|-------------(3)----------| * `- orig_start `- start `- start + cur_alloc_size `- end * * We process each region below. */ /* * For the range (1). We have already instantiated the ordered extents * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV * are also handled by the ordered extents cleanup. * * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted. */ if (orig_start < start) { clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_folio, NULL, clear_bits, page_ops); } clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; /* * For the range (2). If we reserved an extent for our delalloc range * (or a subrange) and failed to create the respective ordered extent, * then it means that when we reserved the extent we decremented the * extent's size from the data space_info's bytes_may_use counter and * incremented the space_info's bytes_reserved counter by the same * amount. We must make sure extent_clear_unlock_delalloc() does not try * to decrement again the data space_info's bytes_may_use counter, * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (cur_alloc_size) { extent_clear_unlock_delalloc(inode, start, start + cur_alloc_size - 1, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL); } /* * For the range (3). We never touched the region. In addition to the * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data * space_info's bytes_may_use counter, reserved in * btrfs_check_data_free_space(). */ if (start + cur_alloc_size < end) { clear_bits |= EXTENT_CLEAR_DATA_RESV; extent_clear_unlock_delalloc(inode, start + cur_alloc_size, end, locked_folio, &cached, clear_bits, page_ops); btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size, end - start - cur_alloc_size + 1, NULL); } btrfs_err(fs_info, "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d", __func__, btrfs_root_id(inode->root), btrfs_ino(inode), orig_start, end + 1 - orig_start, start, cur_alloc_size, ret); return ret; } /* * Phase two of compressed writeback. This is the ordered portion of the code, * which only gets called in the order the work was queued. We walk all the * async extents created by compress_file_range and send them down to the disk. * * If called with @do_free == true then it'll try to finish the work and free * the work struct eventually. */ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free) { struct async_chunk *async_chunk = container_of(work, struct async_chunk, work); struct btrfs_fs_info *fs_info = btrfs_work_owner(work); struct async_extent *async_extent; unsigned long nr_pages; u64 alloc_hint = 0; if (do_free) { struct async_cow *async_cow; btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); async_cow = async_chunk->async_cow; if (atomic_dec_and_test(&async_cow->num_chunks)) kvfree(async_cow); return; } nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >> PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { async_extent = list_first_entry(&async_chunk->extents, struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } /* atomic_sub_return implies a barrier */ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) < 5 * SZ_1M) cond_wake_up_nomb(&fs_info->async_submit_wait); } static bool run_delalloc_compressed(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc); struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); int i; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); nofs_flag = memalloc_nofs_save(); ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL); memalloc_nofs_restore(nofs_flag); if (!ctx) return false; set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); async_chunk = ctx->chunks; atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { u64 cur_end = min(end, start + SZ_512K - 1); /* * igrab is called higher up in the call chain, take only the * lightweight reference for the callback lifetime */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; INIT_LIST_HEAD(&async_chunk[i].extents); /* * The locked_folio comes all the way from writepage and its * the original folio we were actually given. As we spread * this large delalloc region across multiple async_chunk * structs, only the first struct needs a pointer to * locked_folio. * * This way we don't need racey decisions about who is supposed * to unlock it. */ if (locked_folio) { /* * Depending on the compressibility, the pages might or * might not go through async. We want all of them to * be accounted against wbc once. Let's do it here * before the paths diverge. wbc accounting is used * only for foreign writeback detection and doesn't * need full accuracy. Just account the whole thing * against the first page. */ wbc_account_cgroup_owner(wbc, locked_folio, cur_end - start); async_chunk[i].locked_folio = locked_folio; locked_folio = NULL; } else { async_chunk[i].locked_folio = NULL; } if (blkcg_css != blkcg_root_css) { css_get(blkcg_css); async_chunk[i].blkcg_css = blkcg_css; async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT; } else { async_chunk[i].blkcg_css = NULL; } btrfs_init_work(&async_chunk[i].work, compress_file_range, submit_compressed_extents); nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE); atomic_add(nr_pages, &fs_info->async_delalloc_pages); btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work); start = cur_end + 1; } return true; } /* * Run the delalloc range from start to end, and write back any dirty pages * covered by the range. */ static noinline int run_delalloc_cow(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc, bool pages_dirty) { u64 done_offset = end; int ret; while (start <= end) { ret = cow_file_range(inode, locked_folio, start, end, &done_offset, COW_FILE_RANGE_KEEP_LOCKED); if (ret) return ret; extent_write_locked_range(&inode->vfs_inode, locked_folio, start, done_offset, wbc, pages_dirty); start = done_offset + 1; } return 1; } static int fallback_to_cow(struct btrfs_inode *inode, struct folio *locked_folio, const u64 start, const u64 end) { const bool is_space_ino = btrfs_is_free_space_inode(inode); const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root); const u64 range_bytes = end + 1 - start; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 range_start = start; u64 count; int ret; /* * If EXTENT_NORESERVE is set it means that when the buffered write was * made we had not enough available data space and therefore we did not * reserve data space for it, since we though we could do NOCOW for the * respective file range (either there is prealloc extent or the inode * has the NOCOW bit set). * * However when we need to fallback to COW mode (because for example the * block group for the corresponding extent was turned to RO mode by a * scrub or relocation) we need to do the following: * * 1) We increment the bytes_may_use counter of the data space info. * If COW succeeds, it allocates a new data extent and after doing * that it decrements the space info's bytes_may_use counter and * increments its bytes_reserved counter by the same amount (we do * this at btrfs_add_reserved_bytes()). So we need to increment the * bytes_may_use counter to compensate (when space is reserved at * buffered write time, the bytes_may_use counter is incremented); * * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so * that if the COW path fails for any reason, it decrements (through * extent_clear_unlock_delalloc()) the bytes_may_use counter of the * data space info, which we incremented in the step above. * * If we need to fallback to cow and the inode corresponds to a free * space cache inode or an inode of the data relocation tree, we must * also increment bytes_may_use of the data space_info for the same * reason. Space caches and relocated data extents always get a prealloc * extent for them, however scrub or balance may have set the block * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ btrfs_lock_extent(io_tree, start, end, &cached_state); count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_space_info *sinfo = fs_info->data_sinfo; if (is_space_ino || is_reloc_ino) bytes = range_bytes; spin_lock(&sinfo->lock); btrfs_space_info_update_bytes_may_use(sinfo, bytes); spin_unlock(&sinfo->lock); if (count > 0) btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, &cached_state); } btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that * is written out and unlocked directly and a normal NOCOW extent * doesn't work. * * And here we do not unlock the folio after a successful run. * The folios will be unlocked after everything is finished, or by error handling. * * This is to ensure error handling won't need to clear dirty/ordered flags without * a locked folio, which can race with writeback. */ ret = cow_file_range(inode, locked_folio, start, end, NULL, COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); ASSERT(ret != 1); return ret; } struct can_nocow_file_extent_args { /* Input fields. */ /* Start file offset of the range we want to NOCOW. */ u64 start; /* End file offset (inclusive) of the range we want to NOCOW. */ u64 end; bool writeback_path; /* * Free the path passed to can_nocow_file_extent() once it's not needed * anymore. */ bool free_path; /* * Output fields. Only set when can_nocow_file_extent() returns 1. * The expected file extent for the NOCOW write. */ struct btrfs_file_extent file_extent; }; /* * Check if we can NOCOW the file extent that the path points to. * This function may return with the path released, so the caller should check * if path->nodes[0] is NULL or not if it needs to use the path afterwards. * * Returns: < 0 on error * 0 if we can not NOCOW * 1 if we can NOCOW */ static int can_nocow_file_extent(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_inode *inode, struct can_nocow_file_extent_args *args) { const bool is_freespace_inode = btrfs_is_free_space_inode(inode); struct extent_buffer *leaf = path->nodes[0]; struct btrfs_root *root = inode->root; struct btrfs_file_extent_item *fi; struct btrfs_root *csum_root; u64 io_start; u64 extent_end; u8 extent_type; int can_nocow = 0; int ret = 0; bool nowait = path->nowait; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); if (extent_type == BTRFS_FILE_EXTENT_INLINE) goto out; if (!(inode->flags & BTRFS_INODE_NODATACOW) && extent_type == BTRFS_FILE_EXTENT_REG) goto out; /* * If the extent was created before the generation where the last snapshot * for its subvolume was created, then this implies the extent is shared, * hence we must COW. */ if (btrfs_file_extent_generation(leaf, fi) <= btrfs_root_last_snapshot(&root->root_item)) goto out; /* An explicit hole, must COW. */ if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) goto out; /* Compressed/encrypted/encoded extents must be COWed. */ if (btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) goto out; extent_end = btrfs_file_extent_end(path); args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); args->file_extent.offset = btrfs_file_extent_offset(leaf, fi); args->file_extent.compression = btrfs_file_extent_compression(leaf, fi); /* * The following checks can be expensive, as they need to take other * locks and do btree or rbtree searches, so release the path to avoid * blocking other tasks for too long. */ btrfs_release_path(path); ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset, args->file_extent.disk_bytenr, path); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; if (args->free_path) { /* * We don't need the path anymore, plus through the * btrfs_lookup_csums_list() call below we will end up allocating * another path. So free the path to avoid unnecessary extra * memory usage. */ btrfs_free_path(path); path = NULL; } /* If there are pending snapshots for this root, we must COW. */ if (args->writeback_path && !is_freespace_inode && atomic_read(&root->snapshot_force_cow)) goto out; args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; args->file_extent.offset += args->start - key->offset; io_start = args->file_extent.disk_bytenr + args->file_extent.offset; /* * Force COW if csums exist in the range. This ensures that csums for a * given extent are either valid or do not exist. */ csum_root = btrfs_csum_root(root->fs_info, io_start); ret = btrfs_lookup_csums_list(csum_root, io_start, io_start + args->file_extent.num_bytes - 1, NULL, nowait); WARN_ON_ONCE(ret > 0 && is_freespace_inode); if (ret != 0) goto out; can_nocow = 1; out: if (args->free_path && path) btrfs_free_path(path); return ret < 0 ? ret : can_nocow; } static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, struct extent_state **cached, struct can_nocow_file_extent_args *nocow_args, u64 file_pos, bool is_prealloc) { struct btrfs_ordered_extent *ordered; const u64 len = nocow_args->file_extent.num_bytes; const u64 end = file_pos + len - 1; int ret = 0; btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); if (is_prealloc) { struct extent_map *em; em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { ret = PTR_ERR(em); goto error; } btrfs_free_extent_map(em); } ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, is_prealloc ? (1U << BTRFS_ORDERED_PREALLOC) : (1U << BTRFS_ORDERED_NOCOW)); if (IS_ERR(ordered)) { if (is_prealloc) btrfs_drop_extent_map_range(inode, file_pos, end, false); ret = PTR_ERR(ordered); goto error; } if (btrfs_is_data_reloc_root(inode->root)) /* * Errors are handled later, as we must prevent * extent_clear_unlock_delalloc() in error handler from freeing * metadata of the created ordered extent. */ ret = btrfs_reloc_clone_csums(ordered); btrfs_put_ordered_extent(ordered); if (ret < 0) goto error; extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_SET_ORDERED); return ret; error: btrfs_cleanup_ordered_extents(inode, file_pos, len); extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_err(inode->root->fs_info, "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d", __func__, btrfs_root_id(inode->root), btrfs_ino(inode), file_pos, len, ret); return ret; } /* * When nocow writeback calls back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. * * If no cow copies or snapshots exist, we write directly to the existing * blocks on disk */ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct folio *locked_folio, const u64 start, const u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; struct btrfs_path *path; u64 cow_start = (u64)-1; /* * If not 0, represents the inclusive end of the last fallback_to_cow() * range. Only for error handling. * * The same for nocow_end, it's to avoid double cleaning up the range * already cleaned by nocow_one_range(). */ u64 cow_end = 0; u64 nocow_end = 0; u64 cur_offset = start; int ret; bool check_prev = true; u64 ino = btrfs_ino(inode); struct can_nocow_file_extent_args nocow_args = { 0 }; /* The range that has ordered extent(s). */ u64 oe_cleanup_start; u64 oe_cleanup_len = 0; /* The range that is untouched. */ u64 untouched_start; u64 untouched_len = 0; /* * Normally on a zoned device we're only doing COW writes, but in case * of relocation on a zoned filesystem serializes I/O so that we're only * writing sequentially and can end up here as well. */ ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root)); path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto error; } nocow_args.end = end; nocow_args.writeback_path = true; while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; int extent_type; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); if (ret < 0) goto error; /* * If there is no extent for our range when doing the initial * search, then go back to the previous slot as it will be the * one containing the search offset */ if (ret > 0 && path->slots[0] > 0 && check_prev) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); if (found_key.objectid == ino && found_key.type == BTRFS_EXTENT_DATA_KEY) path->slots[0]--; } check_prev = false; next_slot: /* Go to next leaf if we have exhausted the current one */ leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto error; if (ret > 0) break; leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* Didn't find anything for our INO */ if (found_key.objectid > ino) break; /* * Keep searching until we find an EXTENT_ITEM or there are no * more extents for this inode */ if (WARN_ON_ONCE(found_key.objectid < ino) || found_key.type < BTRFS_EXTENT_DATA_KEY) { path->slots[0]++; goto next_slot; } /* Found key is not EXTENT_DATA_KEY or starts after req range */ if (found_key.type > BTRFS_EXTENT_DATA_KEY || found_key.offset > end) break; /* * If the found extent starts after requested offset, then * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = found_key.offset; goto next_slot; } /* * Found extent which begins before our range and potentially * intersect it */ fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); /* If this is triggered then we have a memory corruption. */ ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { ret = -EUCLEAN; goto error; } extent_end = btrfs_file_extent_end(path); /* * If the extent we got ends before our current offset, skip to * the next extent. */ if (extent_end <= cur_offset) { path->slots[0]++; goto next_slot; } nocow_args.start = cur_offset; ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); if (ret < 0) goto error; if (ret == 0) goto must_cow; ret = 0; nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset); if (!nocow_bg) { must_cow: /* * If we can't perform NOCOW writeback for the range, * then record the beginning of the range that needs to * be COWed. It will be written out before the next * NOCOW range if we find one, or when exiting this * loop. */ if (cow_start == (u64)-1) cow_start = cur_offset; cur_offset = extent_end; if (cur_offset > end) break; if (!path->nodes[0]) continue; path->slots[0]++; goto next_slot; } /* * COW range from cow_start to found_key.offset - 1. As the key * will contain the beginning of the first extent that can be * NOCOW, following one which needs to be COW'ed */ if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); if (ret) { cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } cow_start = (u64)-1; } ret = nocow_one_range(inode, locked_folio, &cached_state, &nocow_args, cur_offset, extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); if (ret < 0) { nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; goto error; } cur_offset = extent_end; } btrfs_release_path(path); if (cur_offset <= end && cow_start == (u64)-1) cow_start = cur_offset; if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, end); if (ret) { cow_end = end; goto error; } cow_start = (u64)-1; } /* * Everything is finished without an error, can unlock the folios now. * * No need to touch the io tree range nor set folio ordered flag, as * fallback_to_cow() and nocow_one_range() have already handled them. */ extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK); btrfs_free_path(path); return 0; error: if (cow_start == (u64)-1) { /* * case a) * start cur_offset end * | OE cleanup | Untouched | * * We finished a fallback_to_cow() or nocow_one_range() call, * but failed to check the next range. * * or * start cur_offset nocow_end end * | OE cleanup | Skip | Untouched | * * nocow_one_range() failed, the range [cur_offset, nocow_end] is * already cleaned up. */ oe_cleanup_start = start; oe_cleanup_len = cur_offset - start; if (nocow_end) untouched_start = nocow_end + 1; else untouched_start = cur_offset; untouched_len = end + 1 - untouched_start; } else if (cow_start != (u64)-1 && cow_end == 0) { /* * case b) * start cow_start cur_offset end * | OE cleanup | Untouched | * * We got a range that needs COW, but before we hit the next NOCOW range, * thus [cow_start, cur_offset) doesn't yet have any OE. */ oe_cleanup_start = start; oe_cleanup_len = cow_start - start; untouched_start = cow_start; untouched_len = end + 1 - untouched_start; } else { /* * case c) * start cow_start cow_end end * | OE cleanup | Skip | Untouched | * * fallback_to_cow() failed, and fallback_to_cow() will do the * cleanup for its range, we shouldn't touch the range * [cow_start, cow_end]. */ ASSERT(cow_start != (u64)-1 && cow_end != 0); oe_cleanup_start = start; oe_cleanup_len = cow_start - start; untouched_start = cow_end + 1; untouched_len = end + 1 - untouched_start; } if (oe_cleanup_len) { const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1; btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len); extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end, locked_folio, NULL, EXTENT_LOCKED | EXTENT_DELALLOC, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); } if (untouched_len) { struct extent_state *cached = NULL; const u64 untouched_end = untouched_start + untouched_len - 1; /* * We need to lock the extent here because we're clearing DELALLOC and * we're not locked at this point. */ btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached); extent_clear_unlock_delalloc(inode, untouched_start, untouched_end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL); } btrfs_free_path(path); btrfs_err(fs_info, "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d", __func__, btrfs_root_id(inode->root), btrfs_ino(inode), start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len, untouched_start, untouched_len, ret); return ret; } static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } return false; } /* * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. */ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, u64 end, struct writeback_control *wbc) { const bool zoned = btrfs_is_zoned(inode->root->fs_info); int ret; /* * The range must cover part of the @locked_folio, or a return of 1 * can confuse the caller. */ ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio))); if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); return ret; } if (btrfs_inode_can_compress(inode) && inode_need_compress(inode, start, end) && run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; if (zoned) ret = run_delalloc_cow(inode, locked_folio, start, end, wbc, true); else ret = cow_file_range(inode, locked_folio, start, end, NULL, 0); return ret; } void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; lockdep_assert_held(&inode->io_tree.lock); /* not delalloc, ignore it */ if (!(orig->state & EXTENT_DELALLOC)) return; size = orig->end - orig->start + 1; if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; /* * See the explanation in btrfs_merge_delalloc_extent, the same * applies here, just in reverse. */ new_size = orig->end - split + 1; num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; num_extents += count_max_extents(fs_info, new_size); if (count_max_extents(fs_info, size) >= num_extents) return; } spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); } /* * Handle merged delayed allocation extents so we can keep track of new extents * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. */ void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; lockdep_assert_held(&inode->io_tree.lock); /* not delalloc, ignore it */ if (!(other->state & EXTENT_DELALLOC)) return; if (new->start > other->start) new_size = new->end - other->start + 1; else new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -1); spin_unlock(&inode->lock); return; } /* * We have to add up either side to figure out how many extents were * accounted for before we merged into one big extent. If the number of * extents we accounted for is <= the amount we need for the new range * then we can return, otherwise drop. Think of it like this * * [ 4k][MAX_SIZE] * * So we've grown the extent by a MAX_SIZE extent, this would mean we * need 2 outstanding extents, on one side we have 1 and the other side * we have 1 so they are == and we can return. But in this case * * [MAX_SIZE+4k][MAX_SIZE+4k] * * Each range on their own accounts for 2 extents, but merged together * they are only 3 extents worth of accounting, so we need to drop in * this case. */ old_size = other->end - other->start + 1; num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; num_extents += count_max_extents(fs_info, old_size); if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -1); spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&root->delalloc_lock); ASSERT(list_empty(&inode->delalloc_inodes)); list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); ASSERT(list_empty(&root->delalloc_root)); list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); } spin_unlock(&root->delalloc_lock); } void btrfs_del_delalloc_inode(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; lockdep_assert_held(&root->delalloc_lock); /* * We may be called after the inode was already deleted from the list, * namely in the transaction abort path btrfs_destroy_delalloc_inodes(), * and then later through btrfs_clear_delalloc_extent() while the inode * still has ->delalloc_bytes > 0. */ if (!list_empty(&inode->delalloc_inodes)) { list_del_init(&inode->delalloc_inodes); root->nr_delalloc_inodes--; if (!root->nr_delalloc_inodes) { ASSERT(list_empty(&root->delalloc_inodes)); spin_lock(&fs_info->delalloc_root_lock); ASSERT(!list_empty(&root->delalloc_root)); list_del_init(&root->delalloc_root); spin_unlock(&fs_info->delalloc_root_lock); } } } /* * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. */ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { struct btrfs_fs_info *fs_info = inode->root->fs_info; lockdep_assert_held(&inode->io_tree.lock); if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { u64 len = state->end + 1 - state->start; u64 prev_delalloc_bytes; u32 num_extents = count_max_extents(fs_info, len); spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, num_extents); spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) return; percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); spin_lock(&inode->lock); prev_delalloc_bytes = inode->delalloc_bytes; inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) inode->defrag_bytes += len; spin_unlock(&inode->lock); /* * We don't need to be under the protection of the inode's lock, * because we are called while holding the inode's io_tree lock * and are therefore protected against concurrent calls of this * function and btrfs_clear_delalloc_extent(). */ if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0) btrfs_add_delalloc_inode(inode); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); inode->new_delalloc_bytes += state->end + 1 - state->start; spin_unlock(&inode->lock); } } /* * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); lockdep_assert_held(&inode->io_tree.lock); if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); } /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; u64 new_delalloc_bytes; spin_lock(&inode->lock); btrfs_mod_outstanding_extents(inode, -num_extents); spin_unlock(&inode->lock); /* * We don't reserve metadata space for space cache inodes so we * don't need to call delalloc_release_metadata if there is an * error. */ if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, true); /* For sanity tests. */ if (btrfs_is_testing(fs_info)) return; if (!btrfs_is_data_reloc_root(root) && !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); spin_lock(&inode->lock); inode->delalloc_bytes -= len; new_delalloc_bytes = inode->delalloc_bytes; spin_unlock(&inode->lock); /* * We don't need to be under the protection of the inode's lock, * because we are called while holding the inode's io_tree lock * and are therefore protected against concurrent calls of this * function and btrfs_set_delalloc_extent(). */ if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) { spin_lock(&root->delalloc_lock); btrfs_del_delalloc_inode(inode); spin_unlock(&root->delalloc_lock); } } if ((state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } } /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ static int add_pending_csums(struct btrfs_trans_handle *trans, struct list_head *list) { struct btrfs_ordered_sum *sum; struct btrfs_root *csum_root = NULL; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; if (!csum_root) csum_root = btrfs_csum_root(trans->fs_info, sum->logical); ret = btrfs_csum_file_blocks(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; } return 0; } static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, const u64 start, const u64 len, struct extent_state **cached_state) { u64 search_start = start; const u64 end = start + len - 1; while (search_start < end) { const u64 search_len = end - search_start + 1; struct extent_map *em; u64 em_len; int ret = 0; em = btrfs_get_extent(inode, NULL, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); if (em->disk_bytenr != EXTENT_MAP_HOLE) goto next; em_len = em->len; if (em->start < search_start) em_len -= search_start - em->start; if (em_len > search_len) em_len = search_len; ret = btrfs_set_extent_bit(&inode->io_tree, search_start, search_start + em_len - 1, EXTENT_DELALLOC_NEW, cached_state); next: search_start = btrfs_extent_map_end(em); btrfs_free_extent_map(em); if (ret) return ret; } return 0; } int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state) { WARN_ON(PAGE_ALIGNED(end)); if (start >= i_size_read(&inode->vfs_inode) && !(inode->flags & BTRFS_INODE_PREALLOC)) { /* * There can't be any extents following eof in this case so just * set the delalloc new bit for the range directly. */ extra_bits |= EXTENT_DELALLOC_NEW; } else { int ret; ret = btrfs_find_new_delalloc_bytes(inode, start, end + 1 - start, cached_state); if (ret) return ret; } return btrfs_set_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct folio *folio; struct btrfs_inode *inode; struct btrfs_work work; }; static void btrfs_writepage_fixup_worker(struct btrfs_work *work) { struct btrfs_writepage_fixup *fixup = container_of(work, struct btrfs_writepage_fixup, work); struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; struct folio *folio = fixup->folio; struct btrfs_inode *inode = fixup->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 page_start = folio_pos(folio); u64 page_end = folio_end(folio) - 1; int ret = 0; bool free_delalloc_space = true; /* * This is similar to page_mkwrite, we need to reserve the space before * we take the folio lock. */ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, folio_size(folio)); again: folio_lock(folio); /* * Before we queued this fixup, we took a reference on the folio. * folio->mapping may go NULL, but it shouldn't be moved to a different * address space. */ if (!folio->mapping || !folio_test_dirty(folio) || !folio_test_checked(folio)) { /* * Unfortunately this is a little tricky, either * * 1) We got here and our folio had already been dealt with and * we reserved our space, thus ret == 0, so we need to just * drop our space reservation and bail. This can happen the * first time we come into the fixup worker, or could happen * while waiting for the ordered extent. * 2) Our folio was already dealt with, but we happened to get an * ENOSPC above from the btrfs_delalloc_reserve_space. In * this case we obviously don't have anything to release, but * because the folio was already dealt with we don't want to * mark the folio with an error, so make sure we're resetting * ret to 0. This is why we have this check _before_ the ret * check, because we do not want to have a surprise ENOSPC * when the folio was already properly dealt with. */ if (!ret) { btrfs_delalloc_release_extents(inode, folio_size(folio)); btrfs_delalloc_release_space(inode, data_reserved, page_start, folio_size(folio), true); } ret = 0; goto out_page; } /* * We can't mess with the folio state unless it is locked, so now that * it is locked bail if we failed to make our space reservation. */ if (ret) goto out_page; btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (folio_test_ordered(folio)) goto out_reserved; ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); goto again; } ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); if (ret) goto out_reserved; /* * Everything went as planned, we're now the owner of a dirty page with * delayed allocation bits set and space reserved for our COW * destination. * * The page was dirty when we started, nothing should have cleaned it. */ BUG_ON(!folio_test_dirty(folio)); free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(inode, PAGE_SIZE); if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* * We hit ENOSPC or other errors. Update the mapping and page * to reflect the errors and clean the page. */ mapping_set_error(folio->mapping, ret); btrfs_mark_ordered_io_finished(inode, folio, page_start, folio_size(folio), !ret); folio_clear_dirty_for_io(folio); } btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); folio_unlock(folio); folio_put(folio); kfree(fixup); extent_changeset_free(data_reserved); /* * As a precaution, do a delayed iput in case it would be the last iput * that could need flushing space. Recursing back to fixup worker would * deadlock. */ btrfs_add_delayed_iput(inode); } /* * There are a few paths in the higher layers of the kernel that directly * set the folio dirty bit without asking the filesystem if it is a * good idea. This causes problems because we want to make sure COW * properly happens and the data=ordered rules are followed. * * In our case any range that doesn't have the ORDERED bit set * hasn't been properly setup for IO. We kick off an async process * to fix it up. The async helper will wait for ordered extents, set * the delalloc bit and make it safe to write the folio. */ int btrfs_writepage_cow_fixup(struct folio *folio) { struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); struct btrfs_writepage_fixup *fixup; /* This folio has ordered extent covering it already */ if (folio_test_ordered(folio)) return 0; /* * For experimental build, we error out instead of EAGAIN. * * We should not hit such out-of-band dirty folios anymore. */ if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { DEBUG_WARN(); btrfs_err_rl(fs_info, "root %lld ino %llu folio %llu is marked dirty without notifying the fs", btrfs_root_id(BTRFS_I(inode)->root), btrfs_ino(BTRFS_I(inode)), folio_pos(folio)); return -EUCLEAN; } /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. * * The extent_io writepage code will redirty the foio if we send back * EAGAIN. */ if (folio_test_checked(folio)) return -EAGAIN; fixup = kzalloc(sizeof(*fixup), GFP_NOFS); if (!fixup) return -EAGAIN; /* * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust * folio->mapping outside of the folio lock. */ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); folio_get(folio); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->folio = folio; fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 file_pos, struct btrfs_file_extent_item *stack_fi, const bool update_inode_bytes, u64 qgroup_reserved) { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi); u64 offset = btrfs_stack_file_extent_offset(stack_fi); u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi); u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi); struct btrfs_drop_extents_args drop_args = { 0 }; int ret; path = btrfs_alloc_path(); if (!path) return -ENOMEM; /* * we may be replacing one extent in the tree with another. * The new extent is pinned in the extent map, and we don't want * to drop it from the cache until it is completely in the btree. * * So, tell btrfs_drop_extents to leave this extent in the cache. * the caller is expected to unpin it and allow it to be merged * with the others. */ drop_args.path = path; drop_args.start = file_pos; drop_args.end = file_pos + num_bytes; drop_args.replace_extent = true; drop_args.extent_item_size = sizeof(*stack_fi); ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); ins.type = BTRFS_EXTENT_DATA_KEY; ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); if (ret) goto out; } leaf = path->nodes[0]; btrfs_set_stack_file_extent_generation(stack_fi, trans->transid); write_extent_buffer(leaf, stack_fi, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(struct btrfs_file_extent_item)); btrfs_release_path(path); /* * If we dropped an inline extent here, we know the range where it is * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the * number of bytes only for that range containing the inline extent. * The remaining of the range will be processed when clearing the * EXTENT_DELALLOC_BIT bit through the ordered extent completion. */ if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) { u64 inline_size = round_down(drop_args.bytes_found, sectorsize); inline_size = drop_args.bytes_found - inline_size; btrfs_update_inode_bytes(inode, sectorsize, inline_size); drop_args.bytes_found -= inline_size; num_bytes -= sectorsize; } if (update_inode_bytes) btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) goto out; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode), file_pos - offset, qgroup_reserved, &ins); out: return ret; } static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { struct btrfs_block_group *cache; cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); spin_lock(&cache->lock); cache->delalloc_bytes -= len; spin_unlock(&cache->lock); btrfs_put_block_group(cache); } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; bool update_inode_bytes; u64 num_bytes = oe->num_bytes; u64 ram_bytes = oe->ram_bytes; memset(&stack_fi, 0, sizeof(stack_fi)); btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG); btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr); btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) num_bytes = oe->truncated_len; btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ /* * For delalloc, when completing an ordered extent we update the inode's * bytes when clearing the range in the inode's io tree, so pass false * as the argument 'update_inode_bytes' to insert_reserved_file_extent(), * except if the ordered extent was truncated. */ update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) || test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) || test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags); return insert_reserved_file_extent(trans, oe->inode, oe->file_offset, &stack_fi, update_inode_bytes, oe->qgroup_rsv); } /* * As ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. */ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = ordered_extent->inode; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &inode->io_tree; struct extent_state *cached_state = NULL; u64 start, end; int compress_type = 0; int ret = 0; u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool clear_reserved_extent = true; unsigned int clear_bits = EXTENT_DEFRAG; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; freespace_inode = btrfs_is_free_space_inode(inode); if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) { ret = -EIO; goto out; } ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); if (ret) goto out; if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; /* Truncated the entire extent, don't bother adding */ if (!logical_len) goto out; } /* * If it's a COW write we need to lock the extent range as we will be * inserting/replacing file extent items and unpinning an extent map. * This must be taken before joining a transaction, as it's a higher * level lock (like the inode's VFS lock), otherwise we can run into an * ABBA deadlock with other tasks (transactions work like a lock, * depending on their current state). */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; btrfs_lock_extent_bits(io_tree, start, end, EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, &cached_state); } if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); else trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; goto out; } trans->block_rsv = &inode->block_rsv; ret = btrfs_insert_raid_extent(trans, ordered_extent); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ ASSERT(list_empty(&ordered_extent->list)); if (unlikely(!list_empty(&ordered_extent->list))) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; } btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); } goto out; } if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + logical_len); btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes); } } if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); goto out; } ret = add_pending_csums(trans, &ordered_extent->list); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; } /* * If this is a new delalloc range, clear its new delalloc flag to * update the inode's number of bytes. This needs to be done first * before updating the inode item. */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) btrfs_clear_extent_bit(&inode->io_tree, start, end, EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); if (unlikely(ret)) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); goto out; } out: btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered * extent, and mark the inode with the error if it wasn't * already set. Any error during writeback would have already * set the mapping error, so we need to set it if we're the ones * marking this ordered extent as failed. */ if (ret) btrfs_mark_ordered_extent_error(ordered_extent); /* * Drop extent maps for the part of the extent we didn't write. * * We have an exception here for the free_space_inode, this is * because when we do btrfs_get_extent() on the free space inode * we will search the commit root. If this is a new block group * we won't find anything, and we will trip over the assert in * writepage where we do ASSERT(em->block_start != * EXTENT_MAP_HOLE). * * Theoretically we could also skip this for any NOCOW extent as * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ if (!btrfs_is_free_space_inode(inode)) { u64 unwritten_start = start; if (truncated) unwritten_start += logical_len; btrfs_drop_extent_map_range(inode, unwritten_start, end, false); } /* * If the ordered extent had an IOERR or something else went * wrong we need to return the space for this ordered extent * back to the allocator. We only free the extent in the * truncated case if we didn't write out the extent at all. * * If we made it past insert_reserved_file_extent before we * errored out then we don't need to do this as the accounting * has already been done. */ if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { /* * Discard the range before returning it back to the * free space pool */ if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) btrfs_discard_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root), ordered_extent->qgroup_rsv, BTRFS_QGROUP_RSV_DATA); } } /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); return ret; } int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) { if (btrfs_is_zoned(ordered->inode->root->fs_info) && !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && list_empty(&ordered->bioc_list)) btrfs_finish_ordered_zoned(ordered); return btrfs_finish_one_ordered(ordered); } void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *dest) { struct folio *folio = page_folio(phys_to_page(paddr)); const u32 blocksize = fs_info->sectorsize; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); shash->tfm = fs_info->csum_shash; /* The full block must be inside the folio. */ ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); if (folio_test_partial_kmap(folio)) { size_t cur = paddr; crypto_shash_init(shash); while (cur < paddr + blocksize) { void *kaddr; size_t len = min(paddr + blocksize - cur, PAGE_SIZE - offset_in_page(cur)); kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur)); crypto_shash_update(shash, kaddr, len); kunmap_local(kaddr); cur += len; } crypto_shash_final(shash, dest); } else { crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest); } } /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. * * @kaddr must be a properly kmapped address. */ int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum, const u8 * const csum_expected) { btrfs_calculate_block_csum(fs_info, paddr, csum); if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0)) return -EIO; return 0; } /* * Verify the checksum of a single data sector. * * @bbio: btrfs_io_bio which contains the csum * @dev: device the sector is on * @bio_offset: offset to the beginning of the bio (in bytes) * @bv: bio_vec to check * * Check if the checksum on a data block is valid. When a checksum mismatch is * detected, report the error and fill the corrupted range with zero. * * Return %true if the sector is ok or had no checksum to start with, else %false. */ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, phys_addr_t paddr) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 blocksize = fs_info->sectorsize; struct folio *folio; u64 file_offset = bbio->file_offset + bio_offset; u64 end = file_offset + blocksize - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; if (!bbio->csum) return true; if (btrfs_is_data_reloc_root(inode->root) && btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, NULL)) { /* Skip the range without csum for data reloc inode */ btrfs_clear_extent_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, NULL); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected)) goto zeroit; return true; zeroit: btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected, bbio->mirror_num); if (dev) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); folio = page_folio(phys_to_page(paddr)); ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio)); folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize); return false; } /* * Perform a delayed iput on @inode. * * @inode: The inode we want to perform iput on * * This function uses the generic vfs_inode::i_count to track whether we should * just decrement it (in case it's > 1) or if this is the last iput then link * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. */ void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq * context. */ spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { /* * btrfs_put_ordered_extent() can run in irq context (see bio.c), which * calls btrfs_add_delayed_iput() and that needs to lock * fs_info->delayed_iput_lock. So we need to disable irqs here to * prevent a deadlock. */ spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); if (need_resched()) { spin_unlock_irq(&fs_info->delayed_iput_lock); cond_resched(); spin_lock_irq(&fs_info->delayed_iput_lock); } } spin_unlock_irq(&fs_info->delayed_iput_lock); } /* * Wait for flushing all delayed iputs * * @fs_info: the filesystem * * This will wait on any delayed iputs that are currently running with KILLABLE * set. Once they are all done running we will return, unless we are killed in * which case we return EINTR. This helps in user operations like fallocate etc * that might get blocked on the iputs. * * Return EINTR if we were killed, 0 if nothing's pending */ int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info) { int ret = wait_event_killable(fs_info->delayed_iputs_wait, atomic_read(&fs_info->nr_delayed_iputs) == 0); if (ret) return -EINTR; return 0; } /* * This creates an orphan entry for the given inode in case something goes wrong * in the middle of an unlink. */ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { int ret; ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode)); if (unlikely(ret && ret != -EEXIST)) { btrfs_abort_transaction(trans, ret); return ret; } return 0; } /* * We have done the delete so we can go ahead and remove the orphan item for * this particular inode. */ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode)); } /* * this cleans up any orphans that may be left on the list from the last use * of this root. */ int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state)) return 0; path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto out; } path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { struct btrfs_inode *inode; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; /* * if ret == 0 means we found what we were searching for, which * is weird, but possible, so only screw with path if we didn't * find the key and see if we have stuff that matches */ if (ret > 0) { ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; } /* pull out the item */ leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ btrfs_release_path(path); /* * this is where we are basically btrfs_lookup, without the * crossing root thing. we store the inode number in the * offset of the orphan item. */ if (found_key.offset == last_objectid) { /* * We found the same inode as before. This means we were * not able to remove its items via eviction triggered * by an iput(). A transaction abort may have happened, * due to -ENOSPC for example, so try to grab the error * that lead to a transaction abort, if any. */ btrfs_err(fs_info, "Error removing orphan entry, stopping orphan cleanup"); ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL; goto out; } last_objectid = found_key.offset; found_key.objectid = found_key.offset; found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(last_objectid, root); if (IS_ERR(inode)) { ret = PTR_ERR(inode); inode = NULL; if (ret != -ENOENT) goto out; } if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; /* * This is an orphan in the