| 16 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | // SPDX-License-Identifier: GPL-2.0 /* * fs-verity module initialization and logging * * Copyright 2019 Google LLC */ #include "fsverity_private.h" #include <linux/ratelimit.h> #ifdef CONFIG_SYSCTL static const struct ctl_table fsverity_sysctl_table[] = { #ifdef CONFIG_FS_VERITY_BUILTIN_SIGNATURES { .procname = "require_signatures", .data = &fsverity_require_signatures, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, #endif }; static void __init fsverity_init_sysctl(void) { register_sysctl_init("fs/verity", fsverity_sysctl_table); } #else /* CONFIG_SYSCTL */ static inline void fsverity_init_sysctl(void) { } #endif /* !CONFIG_SYSCTL */ void fsverity_msg(const struct inode *inode, const char *level, const char *fmt, ...) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct va_format vaf; va_list args; if (!__ratelimit(&rs)) return; va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; if (inode) printk("%sfs-verity (%s, inode %lu): %pV\n", level, inode->i_sb->s_id, inode->i_ino, &vaf); else printk("%sfs-verity: %pV\n", level, &vaf); va_end(args); } static int __init fsverity_init(void) { fsverity_check_hash_algs(); fsverity_init_info_cache(); fsverity_init_workqueue(); fsverity_init_sysctl(); fsverity_init_signature(); fsverity_init_bpf(); return 0; } late_initcall(fsverity_init) |
| 2 1 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 | // SPDX-License-Identifier: GPL-2.0-or-later /* * X.25 Packet Layer release 002 * * This is ALPHA test software. This code may break your machine, * randomly fail to work with new releases, misbehave and/or generally * screw up. It might even work. * * This code REQUIRES 2.1.15 or higher * * History * X.25 001 Jonathan Naylor Started coding. * X.25 002 Jonathan Naylor Centralised disconnect handling. * New timer architecture. * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of * facilities negotiation and increased * the throughput upper limit. * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). * Fixed x25_output() related skb leakage. * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to * x25_proc.c, using seq_file * 2005-04-02 Shaun Pereira Selective sub address matching * with call user data * 2005-04-15 Shaun Pereira Fast select with no restriction on * response */ #define pr_fmt(fmt) "X25: " fmt #include <linux/module.h> #include <linux/capability.h> #include <linux/errno.h> #include <linux/kernel.h> #include <linux/sched/signal.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/net.h> #include <linux/netdevice.h> #include <linux/if_arp.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/uaccess.h> #include <linux/fcntl.h> #include <linux/termios.h> /* For TIOCINQ/OUTQ */ #include <linux/notifier.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ctype.h> #include <net/x25.h> #include <net/compat.h> int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; int sysctl_x25_forward = 0; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); static const struct proto_ops x25_proto_ops; static const struct x25_address null_x25_address = {" "}; #ifdef CONFIG_COMPAT struct compat_x25_subscrip_struct { char device[200-sizeof(compat_ulong_t)]; compat_ulong_t global_facil_mask; compat_uint_t extended; }; #endif int x25_parse_address_block(struct sk_buff *skb, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned char len; int needed; int rc; if (!pskb_may_pull(skb, 1)) { /* packet has no address block */ rc = 0; goto empty; } len = *skb->data; needed = 1 + ((len >> 4) + (len & 0x0f) + 1) / 2; if (!pskb_may_pull(skb, needed)) { /* packet is too short to hold the addresses it claims to hold */ rc = -1; goto empty; } return x25_addr_ntoa(skb->data, called_addr, calling_addr); empty: *called_addr->x25_addr = 0; *calling_addr->x25_addr = 0; return rc; } int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; unsigned int i; called_len = (*p >> 0) & 0x0F; calling_len = (*p >> 4) & 0x0F; called = called_addr->x25_addr; calling = calling_addr->x25_addr; p++; for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *called++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *called++ = ((*p >> 4) & 0x0F) + '0'; } } else { if (i % 2 != 0) { *calling++ = ((*p >> 0) & 0x0F) + '0'; p++; } else { *calling++ = ((*p >> 4) & 0x0F) + '0'; } } } *called = *calling = '\0'; return 1 + (called_len + calling_len + 1) / 2; } int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, struct x25_address *calling_addr) { unsigned int called_len, calling_len; char *called, *calling; int i; called = called_addr->x25_addr; calling = calling_addr->x25_addr; called_len = strlen(called); calling_len = strlen(calling); *p++ = (calling_len << 4) | (called_len << 0); for (i = 0; i < (called_len + calling_len); i++) { if (i < called_len) { if (i % 2 != 0) { *p |= (*called++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*called++ - '0') << 4; } } else { if (i % 2 != 0) { *p |= (*calling++ - '0') << 0; p++; } else { *p = 0x00; *p |= (*calling++ - '0') << 4; } } } return 1 + (called_len + calling_len + 1) / 2; } /* * Socket removal during an interrupt is now safe. */ static void x25_remove_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_del_node_init(sk); write_unlock_bh(&x25_list_lock); } /* * Handle device status changes. */ static int x25_device_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct x25_neigh *nb; if (!net_eq(dev_net(dev), &init_net)) return NOTIFY_DONE; if (dev->type == ARPHRD_X25) { switch (event) { case NETDEV_REGISTER: case NETDEV_POST_TYPE_CHANGE: x25_link_device_up(dev); break; case NETDEV_DOWN: nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } x25_route_device_down(dev); break; case NETDEV_PRE_TYPE_CHANGE: case NETDEV_UNREGISTER: x25_link_device_down(dev); break; case NETDEV_CHANGE: if (!netif_carrier_ok(dev)) { nb = x25_get_neigh(dev); if (nb) { x25_link_terminated(nb); x25_neigh_put(nb); } } break; } } return NOTIFY_DONE; } /* * Add a socket to the bound sockets list. */ static void x25_insert_socket(struct sock *sk) { write_lock_bh(&x25_list_lock); sk_add_node(sk, &x25_list); write_unlock_bh(&x25_list_lock); } /* * Find a socket that wants to accept the Call Request we just * received. Check the full list for an address/cud match. * If no cuds match return the next_best thing, an address match. * Note: if a listening socket has cud set it must only get calls * with matching cud. */ static struct sock *x25_find_listener(struct x25_address *addr, struct sk_buff *skb) { struct sock *s; struct sock *next_best; read_lock_bh(&x25_list_lock); next_best = NULL; sk_for_each(s, &x25_list) if ((!strcmp(addr->x25_addr, x25_sk(s)->source_addr.x25_addr) || !strcmp(x25_sk(s)->source_addr.x25_addr, null_x25_address.x25_addr)) && s->sk_state == TCP_LISTEN) { /* * Found a listening socket, now check the incoming * call user data vs this sockets call user data */ if (x25_sk(s)->cudmatchlength > 0 && skb->len >= x25_sk(s)->cudmatchlength) { if((memcmp(x25_sk(s)->calluserdata.cuddata, skb->data, x25_sk(s)->cudmatchlength)) == 0) { sock_hold(s); goto found; } } else next_best = s; } if (next_best) { s = next_best; sock_hold(s); goto found; } s = NULL; found: read_unlock_bh(&x25_list_lock); return s; } /* * Find a connected X.25 socket given my LCI and neighbour. */ static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; sk_for_each(s, &x25_list) if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { sock_hold(s); goto found; } s = NULL; found: return s; } struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) { struct sock *s; read_lock_bh(&x25_list_lock); s = __x25_find_socket(lci, nb); read_unlock_bh(&x25_list_lock); return s; } /* * Find a unique LCI for a given device. */ static unsigned int x25_new_lci(struct x25_neigh *nb) { unsigned int lci = 1; struct sock *sk; while ((sk = x25_find_socket(lci, nb)) != NULL) { sock_put(sk); if (++lci == 4096) { lci = 0; break; } cond_resched(); } return lci; } /* * Deferred destroy. */ static void __x25_destroy_socket(struct sock *); /* * handler for deferred kills. */ static void x25_destroy_timer(struct timer_list *t) { struct sock *sk = from_timer(sk, t, sk_timer); x25_destroy_socket_from_timer(sk); } /* * This is called from user mode and the timers. Thus it protects itself * against interrupting users but doesn't worry about being called during * work. Once it is removed from the queue no interrupt or bottom half * will touch it and we are (fairly 8-) ) safe. * Not static as it's used by the timer */ static void __x25_destroy_socket(struct sock *sk) { struct sk_buff *skb; x25_stop_heartbeat(sk); x25_stop_timer(sk); x25_remove_socket(sk); x25_clear_queues(sk); /* Flush the queues */ while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { if (skb->sk != sk) { /* A pending connection */ /* * Queue the unaccepted socket for death */ skb->sk->sk_state = TCP_LISTEN; sock_set_flag(skb->sk, SOCK_DEAD); x25_start_heartbeat(skb->sk); x25_sk(skb->sk)->state = X25_STATE_0; } kfree_skb(skb); } if (sk_has_allocations(sk)) { /* Defer: outstanding buffers */ sk->sk_timer.expires = jiffies + 10 * HZ; sk->sk_timer.function = x25_destroy_timer; add_timer(&sk->sk_timer); } else { /* drop last reference so sock_put will free */ __sock_put(sk); } } void x25_destroy_socket_from_timer(struct sock *sk) { sock_hold(sk); bh_lock_sock(sk); __x25_destroy_socket(sk); bh_unlock_sock(sk); sock_put(sk); } /* * Handling for system calls applied via the various interfaces to a * X.25 socket object. */ static int x25_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { int opt; struct sock *sk = sock->sk; int rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EINVAL; if (optlen < sizeof(int)) goto out; rc = -EFAULT; if (copy_from_sockptr(&opt, optval, sizeof(int))) goto out; if (opt) set_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); else clear_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = 0; out: return rc; } static int x25_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; int val, len, rc = -ENOPROTOOPT; if (level != SOL_X25 || optname != X25_QBITINCL) goto out; rc = -EFAULT; if (get_user(len, optlen)) goto out; rc = -EINVAL; if (len < 0) goto out; len = min_t(unsigned int, len, sizeof(int)); rc = -EFAULT; if (put_user(len, optlen)) goto out; val = test_bit(X25_Q_BIT_FLAG, &x25_sk(sk)->flags); rc = copy_to_user(optval, &val, len) ? -EFAULT : 0; out: return rc; } static int x25_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; lock_sock(sk); if (sock->state != SS_UNCONNECTED) { rc = -EINVAL; release_sock(sk); return rc; } if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } release_sock(sk); return rc; } static struct proto x25_proto = { .name = "X25", .owner = THIS_MODULE, .obj_size = sizeof(struct x25_sock), }; static struct sock *x25_alloc_socket(struct net *net, int kern) { struct x25_sock *x25; struct sock *sk = sk_alloc(net, AF_X25, GFP_ATOMIC, &x25_proto, kern); if (!sk) goto out; sock_init_data(NULL, sk); x25 = x25_sk(sk); skb_queue_head_init(&x25->ack_queue); skb_queue_head_init(&x25->fragment_queue); skb_queue_head_init(&x25->interrupt_in_queue); skb_queue_head_init(&x25->interrupt_out_queue); out: return sk; } static int x25_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct x25_sock *x25; int rc = -EAFNOSUPPORT; if (!net_eq(net, &init_net)) goto out; rc = -ESOCKTNOSUPPORT; if (sock->type != SOCK_SEQPACKET) goto out; rc = -EINVAL; if (protocol) goto out; rc = -ENOMEM; if ((sk = x25_alloc_socket(net, kern)) == NULL) goto out; x25 = x25_sk(sk); sock_init_data(sock, sk); x25_init_timers(sk); sock->ops = &x25_proto_ops; sk->sk_protocol = protocol; sk->sk_backlog_rcv = x25_backlog_rcv; x25->t21 = sysctl_x25_call_request_timeout; x25->t22 = sysctl_x25_reset_request_timeout; x25->t23 = sysctl_x25_clear_request_timeout; x25->t2 = sysctl_x25_ack_holdback_timeout; x25->state = X25_STATE_0; x25->cudmatchlength = 0; set_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); /* normally no cud */ /* on call accept */ x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; x25->facilities.throughput = 0; /* by default don't negotiate throughput */ x25->facilities.reverse = X25_DEFAULT_REVERSE; x25->dte_facilities.calling_len = 0; x25->dte_facilities.called_len = 0; memset(x25->dte_facilities.called_ae, '\0', sizeof(x25->dte_facilities.called_ae)); memset(x25->dte_facilities.calling_ae, '\0', sizeof(x25->dte_facilities.calling_ae)); rc = 0; out: return rc; } static struct sock *x25_make_new(struct sock *osk) { struct sock *sk = NULL; struct x25_sock *x25, *ox25; if (osk->sk_type != SOCK_SEQPACKET) goto out; if ((sk = x25_alloc_socket(sock_net(osk), 0)) == NULL) goto out; x25 = x25_sk(sk); sk->sk_type = osk->sk_type; sk->sk_priority = READ_ONCE(osk->sk_priority); sk->sk_protocol = osk->sk_protocol; sk->sk_rcvbuf = osk->sk_rcvbuf; sk->sk_sndbuf = osk->sk_sndbuf; sk->sk_state = TCP_ESTABLISHED; sk->sk_backlog_rcv = osk->sk_backlog_rcv; sock_copy_flags(sk, osk); ox25 = x25_sk(osk); x25->t21 = ox25->t21; x25->t22 = ox25->t22; x25->t23 = ox25->t23; x25->t2 = ox25->t2; x25->flags = ox25->flags; x25->facilities = ox25->facilities; x25->dte_facilities = ox25->dte_facilities; x25->cudmatchlength = ox25->cudmatchlength; clear_bit(X25_INTERRUPT_FLAG, &x25->flags); x25_init_timers(sk); out: return sk; } static int x25_release(struct socket *sock) { struct sock *sk = sock->sk; struct x25_sock *x25; if (!sk) return 0; x25 = x25_sk(sk); sock_hold(sk); lock_sock(sk); switch (x25->state) { case X25_STATE_0: case X25_STATE_2: x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; case X25_STATE_1: case X25_STATE_3: case X25_STATE_4: x25_clear_queues(sk); x25_write_internal(sk, X25_CLEAR_REQUEST); x25_start_t23timer(sk); x25->state = X25_STATE_2; sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); sock_set_flag(sk, SOCK_DESTROY); break; case X25_STATE_5: x25_write_internal(sk, X25_CLEAR_REQUEST); x25_disconnect(sk, 0, 0, 0); __x25_destroy_socket(sk); goto out; } sock_orphan(sk); out: release_sock(sk); sock_put(sk); return 0; } static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) { rc = -EINVAL; goto out; } /* check for the null_x25_address */ if (strcmp(addr->sx25_addr.x25_addr, null_x25_address.x25_addr)) { len = strlen(addr->sx25_addr.x25_addr); for (i = 0; i < len; i++) { if (!isdigit(addr->sx25_addr.x25_addr[i])) { rc = -EINVAL; goto out; } } } lock_sock(sk); if (sock_flag(sk, SOCK_ZAPPED)) { x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); } else { rc = -EINVAL; } release_sock(sk); net_dbg_ratelimited("x25_bind: socket is bound\n"); out: return rc; } static int x25_wait_for_connection_establishment(struct sock *sk) { DECLARE_WAITQUEUE(wait, current); int rc; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = sock_error(sk); if (rc) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = -ENOTCONN; if (sk->sk_state == TCP_CLOSE) { sk->sk_socket->state = SS_UNCONNECTED; break; } rc = 0; if (sk->sk_state != TCP_ESTABLISHED) { release_sock(sk); schedule(); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; struct x25_route *rt; int rc = 0; lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; goto out; /* Connect completed during a ERESTARTSYS event */ } rc = -ECONNREFUSED; if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { sock->state = SS_UNCONNECTED; goto out; } rc = -EISCONN; /* No reconnect on a seqpacket socket */ if (sk->sk_state == TCP_ESTABLISHED) goto out; rc = -EALREADY; /* Do nothing if call is already in progress */ if (sk->sk_state == TCP_SYN_SENT) goto out; sk->sk_state = TCP_CLOSE; sock->state = SS_UNCONNECTED; rc = -EINVAL; if (addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25 || strnlen(addr->sx25_addr.x25_addr, X25_ADDR_LEN) == X25_ADDR_LEN) goto out; rc = -ENETUNREACH; rt = x25_get_route(&addr->sx25_addr); if (!rt) goto out; x25->neighbour = x25_get_neigh(rt->dev); if (!x25->neighbour) goto out_put_route; x25_limit_facilities(&x25->facilities, x25->neighbour); x25->lci = x25_new_lci(x25->neighbour); if (!x25->lci) goto out_put_neigh; rc = -EINVAL; if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ goto out_put_neigh; if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) memset(&x25->source_addr, '\0', X25_ADDR_LEN); x25->dest_addr = addr->sx25_addr; /* Move to connecting socket, start sending Connect Requests */ sock->state = SS_CONNECTING; sk->sk_state = TCP_SYN_SENT; x25->state = X25_STATE_1; x25_write_internal(sk, X25_CALL_REQUEST); x25_start_heartbeat(sk); x25_start_t21timer(sk); /* Now the loop */ rc = -EINPROGRESS; if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) goto out; rc = x25_wait_for_connection_establishment(sk); if (rc) goto out_put_neigh; sock->state = SS_CONNECTED; rc = 0; out_put_neigh: if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; read_unlock_bh(&x25_list_lock); x25->state = X25_STATE_0; } out_put_route: x25_route_put(rt); out: release_sock(sk); return rc; } static int x25_wait_for_data(struct sock *sk, long timeout) { DECLARE_WAITQUEUE(wait, current); int rc = 0; add_wait_queue_exclusive(sk_sleep(sk), &wait); for (;;) { __set_current_state(TASK_INTERRUPTIBLE); if (sk->sk_shutdown & RCV_SHUTDOWN) break; rc = -ERESTARTSYS; if (signal_pending(current)) break; rc = -EAGAIN; if (!timeout) break; rc = 0; if (skb_queue_empty(&sk->sk_receive_queue)) { release_sock(sk); timeout = schedule_timeout(timeout); lock_sock(sk); } else break; } __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); return rc; } static int x25_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { struct sock *sk = sock->sk; struct sock *newsk; struct sk_buff *skb; int rc = -EINVAL; if (!sk) goto out; rc = -EOPNOTSUPP; if (sk->sk_type != SOCK_SEQPACKET) goto out; lock_sock(sk); rc = -EINVAL; if (sk->sk_state != TCP_LISTEN) goto out2; rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); if (rc) goto out2; skb = skb_dequeue(&sk->sk_receive_queue); rc = -EINVAL; if (!skb->sk) goto out2; newsk = skb->sk; sock_graft(newsk, newsock); /* Now attach up the new socket */ skb->sk = NULL; kfree_skb(skb); sk_acceptq_removed(sk); newsock->state = SS_CONNECTED; rc = 0; out2: release_sock(sk); out: return rc; } static int x25_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); int rc = 0; if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; goto out; } sx25->sx25_addr = x25->dest_addr; } else sx25->sx25_addr = x25->source_addr; sx25->sx25_family = AF_X25; rc = sizeof(*sx25); out: return rc; } int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { struct sock *sk; struct sock *make; struct x25_sock *makex25; struct x25_address source_addr, dest_addr; struct x25_facilities facilities; struct x25_dte_facilities dte_facilities; int len, addr_len, rc; /* * Remove the LCI and frame type. */ skb_pull(skb, X25_STD_MIN_LEN); /* * Extract the X.25 addresses and convert them to ASCII strings, * and remove them. * * Address block is mandatory in call request packets */ addr_len = x25_parse_address_block(skb, &source_addr, &dest_addr); if (addr_len <= 0) goto out_clear_request; skb_pull(skb, addr_len); /* * Get the length of the facilities, skip past them for the moment * get the call user data because this is needed to determine * the correct listener * * Facilities length is mandatory in call request packets */ if (!pskb_may_pull(skb, 1)) goto out_clear_request; len = skb->data[0] + 1; if (!pskb_may_pull(skb, len)) goto out_clear_request; skb_pull(skb,len); /* * Ensure that the amount of call user data is valid. */ if (skb->len > X25_MAX_CUD_LEN) goto out_clear_request; /* * Get all the call user data so it can be used in * x25_find_listener and skb_copy_from_linear_data up ahead. */ if (!pskb_may_pull(skb, skb->len)) goto out_clear_request; /* * Find a listener for the particular address/cud pair. */ sk = x25_find_listener(&source_addr,skb); skb_push(skb,len); if (sk != NULL && sk_acceptq_is_full(sk)) { goto out_sock_put; } /* * We dont have any listeners for this incoming call. * Try forwarding it. */ if (sk == NULL) { skb_push(skb, addr_len + X25_STD_MIN_LEN); if (sysctl_x25_forward && x25_forward_call(&dest_addr, nb, skb, lci) > 0) { /* Call was forwarded, dont process it any more */ kfree_skb(skb); rc = 1; goto out; } else { /* No listeners, can't forward, clear the call */ goto out_clear_request; } } /* * Try to reach a compromise on the requested facilities. */ len = x25_negotiate_facilities(skb, sk, &facilities, &dte_facilities); if (len == -1) goto out_sock_put; /* * current neighbour/link might impose additional limits * on certain facilities */ x25_limit_facilities(&facilities, nb); /* * Try to create a new socket. */ make = x25_make_new(sk); if (!make) goto out_sock_put; /* * Remove the facilities */ skb_pull(skb, len); skb->sk = make; make->sk_state = TCP_ESTABLISHED; makex25 = x25_sk(make); makex25->lci = lci; makex25->dest_addr = dest_addr; makex25->source_addr = source_addr; x25_neigh_hold(nb); makex25->neighbour = nb; makex25->facilities = facilities; makex25->dte_facilities= dte_facilities; makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; /* ensure no reverse facil on accept */ makex25->vc_facil_mask &= ~X25_MASK_REVERSE; /* ensure no calling address extension on accept */ makex25->vc_facil_mask &= ~X25_MASK_CALLING_AE; makex25->cudmatchlength = x25_sk(sk)->cudmatchlength; /* Normally all calls are accepted immediately */ if (test_bit(X25_ACCPT_APPRV_FLAG, &makex25->flags)) { x25_write_internal(make, X25_CALL_ACCEPTED); makex25->state = X25_STATE_3; } else { makex25->state = X25_STATE_5; } /* * Incoming Call User Data. */ skb_copy_from_linear_data(skb, makex25->calluserdata.cuddata, skb->len); makex25->calluserdata.cudlength = skb->len; sk_acceptq_added(sk); x25_insert_socket(make); skb_queue_head(&sk->sk_receive_queue, skb); x25_start_heartbeat(make); if (!sock_flag(sk, SOCK_DEAD)) sk->sk_data_ready(sk); rc = 1; sock_put(sk); out: return rc; out_sock_put: sock_put(sk); out_clear_request: rc = 0; x25_transmit_clear_request(nb, lci, 0x01); goto out; } static int x25_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, usx25, msg->msg_name); struct sockaddr_x25 sx25; struct sk_buff *skb; unsigned char *asmptr; int noblock = msg->msg_flags & MSG_DONTWAIT; size_t size; int qbit = 0, rc = -EINVAL; lock_sock(sk); if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) goto out; /* we currently don't support segmented records at the user interface */ if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) goto out; rc = -EADDRNOTAVAIL; if (sock_flag(sk, SOCK_ZAPPED)) goto out; rc = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); goto out; } rc = -ENETUNREACH; if (!x25->neighbour) goto out; if (usx25) { rc = -EINVAL; if (msg->msg_namelen < sizeof(sx25)) goto out; memcpy(&sx25, usx25, sizeof(sx25)); rc = -EISCONN; if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) goto out; rc = -EINVAL; if (sx25.sx25_family != AF_X25) goto out; } else { /* * FIXME 1003.1g - if the socket is like this because * it has become closed (not started closed) we ought * to SIGPIPE, EPIPE; */ rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out; sx25.sx25_family = AF_X25; sx25.sx25_addr = x25->dest_addr; } /* Sanity check the packet size */ if (len > 65535) { rc = -EMSGSIZE; goto out; } net_dbg_ratelimited("x25_sendmsg: sendto: Addresses built.\n"); /* Build a packet */ net_dbg_ratelimited("x25_sendmsg: sendto: building packet.\n"); if ((msg->msg_flags & MSG_OOB) && len > 32) len = 32; size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; release_sock(sk); skb = sock_alloc_send_skb(sk, size, noblock, &rc); lock_sock(sk); if (!skb) goto out; X25_SKB_CB(skb)->flags = msg->msg_flags; skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); /* * Put the data on the end */ net_dbg_ratelimited("x25_sendmsg: Copying user data\n"); skb_reset_transport_header(skb); skb_put(skb, len); rc = memcpy_from_msg(skb_transport_header(skb), msg, len); if (rc) goto out_kfree_skb; /* * If the Q BIT Include socket option is in force, the first * byte of the user data is the logical value of the Q Bit. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { if (!pskb_may_pull(skb, 1)) goto out_kfree_skb; qbit = skb->data[0]; skb_pull(skb, 1); } /* * Push down the X.25 header */ net_dbg_ratelimited("x25_sendmsg: Building X.25 Header.\n"); if (msg->msg_flags & MSG_OOB) { if (x25->neighbour->extended) { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } else { asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_INTERRUPT; } } else { if (x25->neighbour->extended) { /* Build an Extended X.25 header */ asmptr = skb_push(skb, X25_EXT_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; *asmptr++ = X25_DATA; } else { /* Build an Standard X.25 header */ asmptr = skb_push(skb, X25_STD_MIN_LEN); *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; *asmptr++ = (x25->lci >> 0) & 0xFF; *asmptr++ = X25_DATA; } if (qbit) skb->data[0] |= X25_Q_BIT; } net_dbg_ratelimited("x25_sendmsg: Built header.\n"); net_dbg_ratelimited("x25_sendmsg: Transmitting buffer\n"); rc = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) goto out_kfree_skb; if (msg->msg_flags & MSG_OOB) skb_queue_tail(&x25->interrupt_out_queue, skb); else { rc = x25_output(sk, skb); len = rc; if (rc < 0) kfree_skb(skb); else if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) len++; } x25_kick(sk); rc = len; out: release_sock(sk); return rc; out_kfree_skb: kfree_skb(skb); goto out; } static int x25_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); DECLARE_SOCKADDR(struct sockaddr_x25 *, sx25, msg->msg_name); size_t copied; int qbit, header_len; struct sk_buff *skb; unsigned char *asmptr; int rc = -ENOTCONN; lock_sock(sk); if (x25->neighbour == NULL) goto out; header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : X25_STD_MIN_LEN; /* * This works for seqpacket too. The receiver has ordered the queue for * us! We do one quick check first though */ if (sk->sk_state != TCP_ESTABLISHED) goto out; if (flags & MSG_OOB) { rc = -EINVAL; if (sock_flag(sk, SOCK_URGINLINE) || !skb_peek(&x25->interrupt_in_queue)) goto out; skb = skb_dequeue(&x25->interrupt_in_queue); if (!pskb_may_pull(skb, X25_STD_MIN_LEN)) goto out_free_dgram; skb_pull(skb, X25_STD_MIN_LEN); /* * No Q bit information on Interrupt data. */ if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = 0x00; } msg->msg_flags |= MSG_OOB; } else { /* Now we can treat all alike */ release_sock(sk); skb = skb_recv_datagram(sk, flags, &rc); lock_sock(sk); if (!skb) goto out; if (!pskb_may_pull(skb, header_len)) goto out_free_dgram; qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; skb_pull(skb, header_len); if (test_bit(X25_Q_BIT_FLAG, &x25->flags)) { asmptr = skb_push(skb, 1); *asmptr = qbit; } } skb_reset_transport_header(skb); copied = skb->len; if (copied > size) { copied = size; msg->msg_flags |= MSG_TRUNC; } /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; if (sx25) { sx25->sx25_family = AF_X25; sx25->sx25_addr = x25->dest_addr; msg->msg_namelen = sizeof(*sx25); } x25_check_rbuf(sk); rc = copied; out_free_dgram: skb_free_datagram(sk, skb); out: release_sock(sk); return rc; } static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct x25_sock *x25 = x25_sk(sk); void __user *argp = (void __user *)arg; int rc; switch (cmd) { case TIOCOUTQ: { int amount; amount = sk->sk_sndbuf - sk_wmem_alloc_get(sk); if (amount < 0) amount = 0; rc = put_user(amount, (unsigned int __user *)argp); break; } case TIOCINQ: { struct sk_buff *skb; int amount = 0; /* * These two are safe on a single CPU system as * only user tasks fiddle here */ lock_sock(sk); if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; release_sock(sk); rc = put_user(amount, (unsigned int __user *)argp); break; } case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->facilities, sizeof(x25->facilities)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SFACILITIES: { struct x25_facilities facilities; rc = -EFAULT; if (copy_from_user(&facilities, argp, sizeof(facilities))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_fac_release; if (facilities.pacsize_in < X25_PS16 || facilities.pacsize_in > X25_PS4096) goto out_fac_release; if (facilities.pacsize_out < X25_PS16 || facilities.pacsize_out > X25_PS4096) goto out_fac_release; if (facilities.winsize_in < 1 || facilities.winsize_in > 127) goto out_fac_release; if (facilities.throughput) { int out = facilities.throughput & 0xf0; int in = facilities.throughput & 0x0f; if (!out) facilities.throughput |= X25_DEFAULT_THROUGHPUT << 4; else if (out < 0x30 || out > 0xD0) goto out_fac_release; if (!in) facilities.throughput |= X25_DEFAULT_THROUGHPUT; else if (in < 0x03 || in > 0x0D) goto out_fac_release; } if (facilities.reverse && (facilities.reverse & 0x81) != 0x81) goto out_fac_release; x25->facilities = facilities; rc = 0; out_fac_release: release_sock(sk); break; } case SIOCX25GDTEFACILITIES: { lock_sock(sk); rc = copy_to_user(argp, &x25->dte_facilities, sizeof(x25->dte_facilities)); release_sock(sk); if (rc) rc = -EFAULT; break; } case SIOCX25SDTEFACILITIES: { struct x25_dte_facilities dtefacs; rc = -EFAULT; if (copy_from_user(&dtefacs, argp, sizeof(dtefacs))) break; rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_LISTEN && sk->sk_state != TCP_CLOSE) goto out_dtefac_release; if (dtefacs.calling_len > X25_MAX_AE_LEN) goto out_dtefac_release; if (dtefacs.called_len > X25_MAX_AE_LEN) goto out_dtefac_release; x25->dte_facilities = dtefacs; rc = 0; out_dtefac_release: release_sock(sk); break; } case SIOCX25GCALLUSERDATA: { lock_sock(sk); rc = copy_to_user(argp, &x25->calluserdata, sizeof(x25->calluserdata)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCALLUSERDATA: { struct x25_calluserdata calluserdata; rc = -EFAULT; if (copy_from_user(&calluserdata, argp, sizeof(calluserdata))) break; rc = -EINVAL; if (calluserdata.cudlength > X25_MAX_CUD_LEN) break; lock_sock(sk); x25->calluserdata = calluserdata; release_sock(sk); rc = 0; break; } case SIOCX25GCAUSEDIAG: { lock_sock(sk); rc = copy_to_user(argp, &x25->causediag, sizeof(x25->causediag)) ? -EFAULT : 0; release_sock(sk); break; } case SIOCX25SCAUSEDIAG: { struct x25_causediag causediag; rc = -EFAULT; if (copy_from_user(&causediag, argp, sizeof(causediag))) break; lock_sock(sk); x25->causediag = causediag; release_sock(sk); rc = 0; break; } case SIOCX25SCUDMATCHLEN: { struct x25_subaddr sub_addr; rc = -EINVAL; lock_sock(sk); if(sk->sk_state != TCP_CLOSE) goto out_cud_release; rc = -EFAULT; if (copy_from_user(&sub_addr, argp, sizeof(sub_addr))) goto out_cud_release; rc = -EINVAL; if (sub_addr.cudmatchlength > X25_MAX_CUD_LEN) goto out_cud_release; x25->cudmatchlength = sub_addr.cudmatchlength; rc = 0; out_cud_release: release_sock(sk); break; } case SIOCX25CALLACCPTAPPRV: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state == TCP_CLOSE) { clear_bit(X25_ACCPT_APPRV_FLAG, &x25->flags); rc = 0; } release_sock(sk); break; } case SIOCX25SENDCALLACCPT: { rc = -EINVAL; lock_sock(sk); if (sk->sk_state != TCP_ESTABLISHED) goto out_sendcallaccpt_release; /* must call accptapprv above */ if (test_bit(X25_ACCPT_APPRV_FLAG, &x25->flags)) goto out_sendcallaccpt_release; x25_write_internal(sk, X25_CALL_ACCEPTED); x25->state = X25_STATE_3; rc = 0; out_sendcallaccpt_release: release_sock(sk); break; } default: rc = -ENOIOCTLCMD; break; } return rc; } static const struct net_proto_family x25_family_ops = { .family = AF_X25, .create = x25_create, .owner = THIS_MODULE, }; #ifdef CONFIG_COMPAT static int compat_x25_subscr_ioctl(unsigned int cmd, struct compat_x25_subscrip_struct __user *x25_subscr32) { struct compat_x25_subscrip_struct x25_subscr; struct x25_neigh *nb; struct net_device *dev; int rc = -EINVAL; rc = -EFAULT; if (copy_from_user(&x25_subscr, x25_subscr32, sizeof(*x25_subscr32))) goto out; rc = -EINVAL; dev = x25_dev_get(x25_subscr.device); if (dev == NULL) goto out; nb = x25_get_neigh(dev); if (nb == NULL) goto out_dev_put; dev_put(dev); if (cmd == SIOCX25GSUBSCRIP) { read_lock_bh(&x25_neigh_list_lock); x25_subscr.extended = nb->extended; x25_subscr.global_facil_mask = nb->global_facil_mask; read_unlock_bh(&x25_neigh_list_lock); rc = copy_to_user(x25_subscr32, &x25_subscr, sizeof(*x25_subscr32)) ? -EFAULT : 0; } else { rc = -EINVAL; if (x25_subscr.extended == 0 || x25_subscr.extended == 1) { rc = 0; write_lock_bh(&x25_neigh_list_lock); nb->extended = x25_subscr.extended; nb->global_facil_mask = x25_subscr.global_facil_mask; write_unlock_bh(&x25_neigh_list_lock); } } x25_neigh_put(nb); out: return rc; out_dev_put: dev_put(dev); goto out; } static int compat_x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); int rc = -ENOIOCTLCMD; switch(cmd) { case TIOCOUTQ: case TIOCINQ: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; case SIOCGIFADDR: case SIOCSIFADDR: case SIOCGIFDSTADDR: case SIOCSIFDSTADDR: case SIOCGIFBRDADDR: case SIOCSIFBRDADDR: case SIOCGIFNETMASK: case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: rc = -EINVAL; break; case SIOCADDRT: case SIOCDELRT: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = x25_route_ioctl(cmd, argp); break; case SIOCX25GSUBSCRIP: rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25SSUBSCRIP: rc = -EPERM; if (!capable(CAP_NET_ADMIN)) break; rc = compat_x25_subscr_ioctl(cmd, argp); break; case SIOCX25GFACILITIES: case SIOCX25SFACILITIES: case SIOCX25GDTEFACILITIES: case SIOCX25SDTEFACILITIES: case SIOCX25GCALLUSERDATA: case SIOCX25SCALLUSERDATA: case SIOCX25GCAUSEDIAG: case SIOCX25SCAUSEDIAG: case SIOCX25SCUDMATCHLEN: case SIOCX25CALLACCPTAPPRV: case SIOCX25SENDCALLACCPT: rc = x25_ioctl(sock, cmd, (unsigned long)argp); break; default: rc = -ENOIOCTLCMD; break; } return rc; } #endif static const struct proto_ops x25_proto_ops = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, .bind = x25_bind, .connect = x25_connect, .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, #endif .gettstamp = sock_gettstamp, .listen = x25_listen, .shutdown = sock_no_shutdown, .setsockopt = x25_setsockopt, .getsockopt = x25_getsockopt, .sendmsg = x25_sendmsg, .recvmsg = x25_recvmsg, .mmap = sock_no_mmap, }; static struct packet_type x25_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_X25), .func = x25_lapb_receive_frame, }; static struct notifier_block x25_dev_notifier = { .notifier_call = x25_device_event, }; void x25_kill_by_neigh(struct x25_neigh *nb) { struct sock *s; write_lock_bh(&x25_list_lock); sk_for_each(s, &x25_list) { if (x25_sk(s)->neighbour == nb) { write_unlock_bh(&x25_list_lock); lock_sock(s); x25_disconnect(s, ENETUNREACH, 0, 0); release_sock(s); write_lock_bh(&x25_list_lock); } } write_unlock_bh(&x25_list_lock); /* Remove any related forwards */ x25_clear_forward_by_dev(nb->dev); } static int __init x25_init(void) { int rc; rc = proto_register(&x25_proto, 0); if (rc) goto out; rc = sock_register(&x25_family_ops); if (rc) goto out_proto; dev_add_pack(&x25_packet_type); rc = register_netdevice_notifier(&x25_dev_notifier); if (rc) goto out_sock; rc = x25_register_sysctl(); if (rc) goto out_dev; rc = x25_proc_init(); if (rc) goto out_sysctl; pr_info("Linux Version 0.2\n"); out: return rc; out_sysctl: x25_unregister_sysctl(); out_dev: unregister_netdevice_notifier(&x25_dev_notifier); out_sock: dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); out_proto: proto_unregister(&x25_proto); goto out; } module_init(x25_init); static void __exit x25_exit(void) { x25_proc_exit(); x25_link_free(); x25_route_free(); x25_unregister_sysctl(); unregister_netdevice_notifier(&x25_dev_notifier); dev_remove_pack(&x25_packet_type); sock_unregister(AF_X25); proto_unregister(&x25_proto); } module_exit(x25_exit); MODULE_AUTHOR("Jonathan Naylor <g4klx@g4klx.demon.co.uk>"); MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_X25); |
| 4882 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H #include <linux/cache.h> #include <linux/limits.h> #include <linux/math64.h> #include <linux/minmax.h> #include <linux/types.h> #include <linux/time.h> #include <linux/timex.h> #include <vdso/jiffies.h> #include <asm/param.h> /* for HZ */ #include <generated/timeconst.h> /* * The following defines establish the engineering parameters of the PLL * model. The HZ variable establishes the timer interrupt frequency, 100 Hz * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the * nearest power of two in order to avoid hardware multiply operations. */ #if HZ >= 12 && HZ < 24 # define SHIFT_HZ 4 #elif HZ >= 24 && HZ < 48 # define SHIFT_HZ 5 #elif HZ >= 48 && HZ < 96 # define SHIFT_HZ 6 #elif HZ >= 96 && HZ < 192 # define SHIFT_HZ 7 #elif HZ >= 192 && HZ < 384 # define SHIFT_HZ 8 #elif HZ >= 384 && HZ < 768 # define SHIFT_HZ 9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #elif HZ >= 1536 && HZ < 3072 # define SHIFT_HZ 11 #elif HZ >= 3072 && HZ < 6144 # define SHIFT_HZ 12 #elif HZ >= 6144 && HZ < 12288 # define SHIFT_HZ 13 #else # error Invalid value of HZ. #endif /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN * This however means trouble for large NOM, because (NOM << LSH) may no * longer fit in 32 bits. The following way of calculating this gives us * some slack, under the following conditions: * - (NOM / DEN) fits in (32 - LSH) bits. * - (NOM % DEN) fits in (32 - LSH) bits. */ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) #ifndef __jiffy_arch_data #define __jiffy_arch_data #endif /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. * * jiffies and jiffies_64 are at the same address for little-endian systems * and for 64-bit big-endian systems. * On 32-bit big-endian systems, jiffies is the lower 32 bits of jiffies_64 * (i.e., at address @jiffies_64 + 4). * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); #else /** * get_jiffies_64 - read the 64-bit non-atomic jiffies_64 value * * When BITS_PER_LONG < 64, this uses sequence number sampling using * jiffies_lock to protect the 64-bit read. * * Return: current 64-bit jiffies value */ static inline u64 get_jiffies_64(void) { return (u64)jiffies; } #endif /** * DOC: General information about time_* inlines * * These inlines deal with timer wrapping correctly. You are strongly encouraged * to use them: * * #. Because people otherwise forget * #. Because if the timer wrap changes in future you won't have to alter your * driver code. */ /** * time_after - returns true if the time a is after time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Do this with "<0" and ">=0" to only test the sign of the result. A * good compiler would generate better code (and a really good compiler * wouldn't care). Gcc is currently neither. * * Return: %true is time a is after time b, otherwise %false. */ #define time_after(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((b) - (a)) < 0)) /** * time_before - returns true if the time a is before time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before time b, otherwise %false. */ #define time_before(a,b) time_after(b,a) /** * time_after_eq - returns true if the time a is after or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) /** * time_before_eq - returns true if the time a is before or the same as time b. * @a: first comparable as unsigned long * @b: second comparable as unsigned long * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq(a,b) time_after_eq(b,a) /** * time_in_range - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) /** * time_in_range_open - Calculate whether a is in the range of [b, c). * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c), otherwise %false. */ #define time_in_range_open(a,b,c) \ (time_after_eq(a,b) && \ time_before(a,c)) /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). */ /** * time_after64 - returns true if the time a is after time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after time b, otherwise %false. */ #define time_after64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((b) - (a)) < 0)) /** * time_before64 - returns true if the time a is before time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before time b, otherwise %false. */ #define time_before64(a,b) time_after64(b,a) /** * time_after_eq64 - returns true if the time a is after or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is after or the same as time b, otherwise %false. */ #define time_after_eq64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((a) - (b)) >= 0)) /** * time_before_eq64 - returns true if the time a is before or the same as time b. * @a: first comparable as __u64 * @b: second comparable as __u64 * * This must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64()). * * Return: %true is time a is before or the same as time b, otherwise %false. */ #define time_before_eq64(a,b) time_after_eq64(b,a) /** * time_in_range64 - Calculate whether a is in the range of [b, c]. * @a: time to test * @b: beginning of the range * @c: end of the range * * Return: %true is time a is in the range [b, c], otherwise %false. */ #define time_in_range64(a, b, c) \ (time_after_eq64(a, b) && \ time_before_eq64(a, c)) /* * These eight macros compare jiffies[_64] and 'a' for convenience. */ /** * time_is_before_jiffies - return true if a is before jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before jiffies, otherwise %false. */ #define time_is_before_jiffies(a) time_after(jiffies, a) /** * time_is_before_jiffies64 - return true if a is before jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before jiffies_64, otherwise %false. */ #define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) /** * time_is_after_jiffies - return true if a is after jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after jiffies, otherwise %false. */ #define time_is_after_jiffies(a) time_before(jiffies, a) /** * time_is_after_jiffies64 - return true if a is after jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after jiffies_64, otherwise %false. */ #define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a) /** * time_is_before_eq_jiffies - return true if a is before or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is before or the same as jiffies, otherwise %false. */ #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) /** * time_is_before_eq_jiffies64 - return true if a is before or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is before or the same jiffies_64, otherwise %false. */ #define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a) /** * time_is_after_eq_jiffies - return true if a is after or equal to jiffies * @a: time (unsigned long) to compare to jiffies * * Return: %true is time a is after or the same as jiffies, otherwise %false. */ #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) /** * time_is_after_eq_jiffies64 - return true if a is after or equal to jiffies_64 * @a: time (__u64) to compare to jiffies_64 * * Return: %true is time a is after or the same as jiffies_64, otherwise %false. */ #define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) /* * Have the 32-bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) /* * Change timeval to jiffies, trying to avoid the * most obvious overflows.. * * And some not so obvious. * * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ #define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) extern unsigned long preset_lpj; /* * We want to do realistic conversions of time so we need to use the same * values the update wall clock code uses as the jiffies size. This value * is: TICK_NSEC (which is defined in timex.h). This * is a constant and is in nanoseconds. We will use scaled math * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and * NSEC_JIFFIE_SC. Note that these defines contain nothing but * constants and so are computed at compile time. SHIFT_HZ (computed in * timex.h) adjusts the scaling for different HZ values. * Scaled math??? What is that? * * Scaled math is a way to do integer math on values that would, * otherwise, either overflow, underflow, or cause undesired div * instructions to appear in the execution path. In short, we "scale" * up the operands so they take more bits (more precision, less * underflow), do the desired operation and then "scale" the result back * by the same amount. If we do the scaling by shifting we avoid the * costly mpy and the dastardly div instructions. * Suppose, for example, we want to convert from seconds to jiffies * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we * might calculate at compile time, however, the result will only have * about 3-4 bits of precision (less for smaller values of HZ). * * So, we scale as follows: * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE); * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE; * Then we make SCALE a power of two so: * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE; * Now we define: * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) * jiff = (sec * SEC_CONV) >> SCALE; * * Often the math we use will expand beyond 32-bits so we tell C how to * do this and pass the 64-bit result of the mpy through the ">> SCALE" * which should take the result back to 32-bits. We want this expansion * to capture as much precision as possible. At the same time we don't * want to overflow so we pick the SCALE to avoid this. In this file, * that means using a different scale for each range of HZ values (as * defined in timex.h). * * For those who want to know, gcc will give a 64-bit result from a "*" * operator if the result is a long long AND at least one of the * operands is cast to long long (usually just prior to the "*" so as * not to confuse it into thinking it really has a 64-bit operand, * which, buy the way, it can do, but it takes more code and at least 2 * mpys). * We also need to be aware that one second in nanoseconds is only a * couple of bits away from overflowing a 32-bit word, so we MUST use * 64-bits to get the full range time in nanoseconds. */ /* * Here are the scales we will use. One for seconds, nanoseconds and * microseconds. * * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and * check if the sign bit is set. If not, we bump the shift count by 1. * (Gets an extra bit of precision where we can use it.) * We know it is set for HZ = 1024 and HZ = 100 not for 1000. * Haven't tested others. * Limits of cpp (for #if expressions) only long (no long long), but * then we only need the most signicant bit. */ #define SEC_JIFFIE_SC (31 - SHIFT_HZ) #if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000) #undef SEC_JIFFIE_SC #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* * The maximum jiffy value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ #if BITS_PER_LONG < 64 # define MAX_SEC_IN_JIFFIES \ (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC) #else /* take care of overflow on 64-bit machines */ # define MAX_SEC_IN_JIFFIES \ (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1) #endif /* * Convert various time units to each other: */ extern unsigned int jiffies_to_msecs(const unsigned long j); extern unsigned int jiffies_to_usecs(const unsigned long j); /** * jiffies_to_nsecs - Convert jiffies to nanoseconds * @j: jiffies value * * Return: nanoseconds value */ static inline u64 jiffies_to_nsecs(const unsigned long j) { return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } extern u64 jiffies64_to_nsecs(u64 j); extern u64 jiffies64_to_msecs(u64 j); extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /* * HZ is equal to or smaller than 1000, and 1000 is a nice round * multiple of HZ, divide with the factor between them, but round * upwards: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - * simply multiply with the factor between them. * * But first make sure the multiplication result cannot overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return m * (HZ / MSEC_PER_SEC); } #else /* * Generic case - multiply, round and divide. But first check that if * we are doing a net multiplication, that we wouldn't overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see _msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _msecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. * * Return: jiffies value */ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } else { return __msecs_to_jiffies(m); } } /** * secs_to_jiffies: - convert seconds to jiffies * @_secs: time in seconds * * Conversion is done by simple multiplication with HZ * * secs_to_jiffies() is defined as a macro rather than a static inline * function so it can be used in static initializers. * * Return: jiffies value */ #define secs_to_jiffies(_secs) (unsigned long)((_secs) * HZ) extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); } #else static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; } #endif /** * usecs_to_jiffies: - convert microseconds to jiffies * @u: time in microseconds * * conversion is done as follows: * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows as for msecs_to_jiffies. * * usecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code. __usecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * The HZ range specific helpers _usecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. * * Return: jiffies value */ static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } else { return __usecs_to_jiffies(u); } } extern unsigned long timespec64_to_jiffies(const struct timespec64 *value); extern void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value); extern clock_t jiffies_to_clock_t(unsigned long x); static inline clock_t jiffies_delta_to_clock_t(long delta) { return jiffies_to_clock_t(max(0L, delta)); } static inline unsigned int jiffies_delta_to_msecs(long delta) { return jiffies_to_msecs(max(0L, delta)); } extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 #endif |
| 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 4 1 3 3 2 2 1 1 1 1 1 1 3 1 7 1 4 4 4 5 3 2 3 1 1 4 3 4 3 1 4 4 4 4 4 4 4 4 4 4 5 5 5 2 3 1 5 3 5 1 4 2 2 4 4 2 1 1 1 1 1 1 1 2 1 1 6 7 7 7 7 1 6 5 1 7 7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 | // SPDX-License-Identifier: GPL-2.0 /* * mm/mremap.c * * (C) Copyright 1996 Linus Torvalds * * Address space accounting code <alan@lxorguk.ukuu.org.uk> * (C) Copyright 2002 Red Hat Inc, All Rights Reserved */ #include <linux/mm.h> #include <linux/mm_inline.h> #include <linux/hugetlb.h> #include <linux/shm.h> #include <linux/ksm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/swapops.h> #include <linux/highmem.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/mmu_notifier.h> #include <linux/uaccess.h> #include <linux/userfaultfd_k.h> #include <linux/mempolicy.h> #include <asm/cacheflush.h> #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" /* Classify the kind of remap operation being performed. */ enum mremap_type { MREMAP_INVALID, /* Initial state. */ MREMAP_NO_RESIZE, /* old_len == new_len, if not moved, do nothing. */ MREMAP_SHRINK, /* old_len > new_len. */ MREMAP_EXPAND, /* old_len < new_len. */ }; /* * Describes a VMA mremap() operation and is threaded throughout it. * * Any of the fields may be mutated by the operation, however these values will * always accurately reflect the remap (for instance, we may adjust lengths and * delta to account for hugetlb alignment). */ struct vma_remap_struct { /* User-provided state. */ unsigned long addr; /* User-specified address from which we remap. */ unsigned long old_len; /* Length of range being remapped. */ unsigned long new_len; /* Desired new length of mapping. */ unsigned long flags; /* user-specified MREMAP_* flags. */ unsigned long new_addr; /* Optionally, desired new address. */ /* uffd state. */ struct vm_userfaultfd_ctx *uf; struct list_head *uf_unmap_early; struct list_head *uf_unmap; /* VMA state, determined in do_mremap(). */ struct vm_area_struct *vma; /* Internal state, determined in do_mremap(). */ unsigned long delta; /* Absolute delta of old_len,new_len. */ bool mlocked; /* Was the VMA mlock()'d? */ enum mremap_type remap_type; /* expand, shrink, etc. */ bool mmap_locked; /* Is mm currently write-locked? */ unsigned long charged; /* If VM_ACCOUNT, # pages to account. */ }; static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pgd = pgd_offset(mm, addr); if (pgd_none_or_clear_bad(pgd)) return NULL; p4d = p4d_offset(pgd, addr); if (p4d_none_or_clear_bad(p4d)) return NULL; pud = pud_offset(p4d, addr); if (pud_none_or_clear_bad(pud)) return NULL; return pud; } static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pud_t *pud; pmd_t *pmd; pud = get_old_pud(mm, addr); if (!pud) return NULL; pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) return NULL; return pmd; } static pud_t *alloc_new_pud(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; return pud_alloc(mm, p4d, addr); } static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) { pud_t *pud; pmd_t *pmd; pud = alloc_new_pud(mm, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } static void take_rmap_locks(struct vm_area_struct *vma) { if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); if (vma->anon_vma) anon_vma_lock_write(vma->anon_vma); } static void drop_rmap_locks(struct vm_area_struct *vma) { if (vma->anon_vma) anon_vma_unlock_write(vma->anon_vma); if (vma->vm_file) i_mmap_unlock_write(vma->vm_file->f_mapping); } static pte_t move_soft_dirty_pte(pte_t pte) { /* * Set soft dirty bit so we can notice * in userspace the ptes were moved. */ #ifdef CONFIG_MEM_SOFT_DIRTY if (pte_present(pte)) pte = pte_mksoft_dirty(pte); else if (is_swap_pte(pte)) pte = pte_swp_mksoft_dirty(pte); #endif return pte; } static int move_ptes(struct pagetable_move_control *pmc, unsigned long extent, pmd_t *old_pmd, pmd_t *new_pmd) { struct vm_area_struct *vma = pmc->old; bool need_clear_uffd_wp = vma_has_uffd_without_event_remap(vma); struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; unsigned long old_addr = pmc->old_addr; unsigned long new_addr = pmc->new_addr; unsigned long old_end = old_addr + extent; unsigned long len = old_end - old_addr; int err = 0; /* * When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma * locks to ensure that rmap will always observe either the old or the * new ptes. This is the easiest way to avoid races with * truncate_pagecache(), page migration, etc... * * When need_rmap_locks is false, we use other ways to avoid * such races: * * - During exec() shift_arg_pages(), we use a specially tagged vma * which rmap call sites look for using vma_is_temporary_stack(). * * - During mremap(), new_vma is often known to be placed after vma * in rmap traversal order. This ensures rmap will always observe * either the old pte, or the new pte, or both (the page table locks * serialize access to individual ptes, but only rmap traversal * order guarantees that we won't miss both the old and new ptes). */ if (pmc->need_rmap_locks) take_rmap_locks(vma); /* * We don't have to worry about the ordering of src and dst * pte locks because exclusive mmap_lock prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); if (!old_pte) { err = -EAGAIN; goto out; } /* * Now new_pte is none, so hpage_collapse_scan_file() path can not find * this by traversing file->f_mapping, so there is no concurrency with * retract_page_tables(). In addition, we already hold the exclusive * mmap_lock, so this new_pte page is stable, so there is no need to get * pmdval and do pmd_same() check. */ new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, &new_ptl); if (!new_pte) { pte_unmap_unlock(old_pte, old_ptl); err = -EAGAIN; goto out; } if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); flush_tlb_batched_pending(vma->vm_mm); arch_enter_lazy_mmu_mode(); for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE, new_pte++, new_addr += PAGE_SIZE) { if (pte_none(ptep_get(old_pte))) continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); /* * If we are remapping a valid PTE, make sure * to flush TLB before we drop the PTL for the * PTE. * * NOTE! Both old and new PTL matter: the old one * for racing with folio_mkclean(), the new one to * make sure the physical page stays valid until * the TLB entry for the old mapping has been * flushed. */ if (pte_present(pte)) force_flush = true; pte = move_pte(pte, old_addr, new_addr); pte = move_soft_dirty_pte(pte); if (need_clear_uffd_wp && pte_marker_uffd_wp(pte)) pte_clear(mm, new_addr, new_pte); else { if (need_clear_uffd_wp) { if (pte_present(pte)) pte = pte_clear_uffd_wp(pte); else if (is_swap_pte(pte)) pte = pte_swp_clear_uffd_wp(pte); } set_pte_at(mm, new_addr, new_pte, pte); } } arch_leave_lazy_mmu_mode(); if (force_flush) flush_tlb_range(vma, old_end - len, old_end); if (new_ptl != old_ptl) spin_unlock(new_ptl); pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); out: if (pmc->need_rmap_locks) drop_rmap_locks(vma); return err; } #ifndef arch_supports_page_table_move #define arch_supports_page_table_move arch_supports_page_table_move static inline bool arch_supports_page_table_move(void) { return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) || IS_ENABLED(CONFIG_HAVE_MOVE_PUD); } #endif #ifdef CONFIG_HAVE_MOVE_PMD static bool move_normal_pmd(struct pagetable_move_control *pmc, pmd_t *old_pmd, pmd_t *new_pmd) { spinlock_t *old_ptl, *new_ptl; struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; bool res = false; pmd_t pmd; if (!arch_supports_page_table_move()) return false; /* * The destination pmd shouldn't be established, free_pgtables() * should have released it. * * However, there's a case during execve() where we use mremap * to move the initial stack, and in that case the target area * may overlap the source area (always moving down). * * If everything is PMD-aligned, that works fine, as moving * each pmd down will clear the source pmd. But if we first * have a few 4kB-only pages that get moved down, and then * hit the "now the rest is PMD-aligned, let's do everything * one pmd at a time", we will still have the old (now empty * of any 4kB pages, but still there) PMD in the page table * tree. * * Warn on it once - because we really should try to figure * out how to do this better - but then say "I won't move * this pmd". * * One alternative might be to just unmap the target pmd at * this point, and verify that it really is empty. We'll see. */ if (WARN_ON_ONCE(!pmd_none(*new_pmd))) return false; /* If this pmd belongs to a uffd vma with remap events disabled, we need * to ensure that the uffd-wp state is cleared from all pgtables. This * means recursing into lower page tables in move_page_tables(), and we * can reuse the existing code if we simply treat the entry as "not * moved". */ if (vma_has_uffd_without_event_remap(vma)) return false; /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pmd_lock(mm, old_pmd); new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = *old_pmd; /* Racing with collapse? */ if (unlikely(!pmd_present(pmd) || pmd_leaf(pmd))) goto out_unlock; /* Clear the pmd */ pmd_clear(old_pmd); res = true; VM_BUG_ON(!pmd_none(*new_pmd)); pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PMD_SIZE); out_unlock: if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return res; } #else static inline bool move_normal_pmd(struct pagetable_move_control *pmc, pmd_t *old_pmd, pmd_t *new_pmd) { return false; } #endif #if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) static bool move_normal_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { spinlock_t *old_ptl, *new_ptl; struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; pud_t pud; if (!arch_supports_page_table_move()) return false; /* * The destination pud shouldn't be established, free_pgtables() * should have released it. */ if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; /* If this pud belongs to a uffd vma with remap events disabled, we need * to ensure that the uffd-wp state is cleared from all pgtables. This * means recursing into lower page tables in move_page_tables(), and we * can reuse the existing code if we simply treat the entry as "not * moved". */ if (vma_has_uffd_without_event_remap(vma)) return false; /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pud_lock(mm, old_pud); new_ptl = pud_lockptr(mm, new_pud); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pud */ pud = *old_pud; pud_clear(old_pud); VM_BUG_ON(!pud_none(*new_pud)); pud_populate(mm, new_pud, pud_pgtable(pud)); flush_tlb_range(vma, pmc->old_addr, pmc->old_addr + PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } #else static inline bool move_normal_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { return false; } #endif #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD) static bool move_huge_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { spinlock_t *old_ptl, *new_ptl; struct vm_area_struct *vma = pmc->old; struct mm_struct *mm = vma->vm_mm; pud_t pud; /* * The destination pud shouldn't be established, free_pgtables() * should have released it. */ if (WARN_ON_ONCE(!pud_none(*new_pud))) return false; /* * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_lock prevents deadlock. */ old_ptl = pud_lock(mm, old_pud); new_ptl = pud_lockptr(mm, new_pud); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); /* Clear the pud */ pud = *old_pud; pud_clear(old_pud); VM_BUG_ON(!pud_none(*new_pud)); /* Set the new pud */ /* mark soft_ditry when we add pud level soft dirty support */ set_pud_at(mm, pmc->new_addr, new_pud, pud); flush_pud_tlb_range(vma, pmc->old_addr, pmc->old_addr + HPAGE_PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); spin_unlock(old_ptl); return true; } #else static bool move_huge_pud(struct pagetable_move_control *pmc, pud_t *old_pud, pud_t *new_pud) { WARN_ON_ONCE(1); return false; } #endif enum pgt_entry { NORMAL_PMD, HPAGE_PMD, NORMAL_PUD, HPAGE_PUD, }; /* * Returns an extent of the corresponding size for the pgt_entry specified if * valid. Else returns a smaller extent bounded by the end of the source and * destination pgt_entry. */ static __always_inline unsigned long get_extent(enum pgt_entry entry, struct pagetable_move_control *pmc) { unsigned long next, extent, mask, size; unsigned long old_addr = pmc->old_addr; unsigned long old_end = pmc->old_end; unsigned long new_addr = pmc->new_addr; switch (entry) { case HPAGE_PMD: case NORMAL_PMD: mask = PMD_MASK; size = PMD_SIZE; break; case HPAGE_PUD: case NORMAL_PUD: mask = PUD_MASK; size = PUD_SIZE; break; default: BUILD_BUG(); break; } next = (old_addr + size) & mask; /* even if next overflowed, extent below will be ok */ extent = next - old_addr; if (extent > old_end - old_addr) extent = old_end - old_addr; next = (new_addr + size) & mask; if (extent > next - new_addr) extent = next - new_addr; return extent; } /* * Should move_pgt_entry() acquire the rmap locks? This is either expressed in * the PMC, or overridden in the case of normal, larger page tables. */ static bool should_take_rmap_locks(struct pagetable_move_control *pmc, enum pgt_entry entry) { switch (entry) { case NORMAL_PMD: case NORMAL_PUD: return true; default: return pmc->need_rmap_locks; } } /* * Attempts to speedup the move by moving entry at the level corresponding to * pgt_entry. Returns true if the move was successful, else false. */ static bool move_pgt_entry(struct pagetable_move_control *pmc, enum pgt_entry entry, void *old_entry, void *new_entry) { bool moved = false; bool need_rmap_locks = should_take_rmap_locks(pmc, entry); /* See comment in move_ptes() */ if (need_rmap_locks) take_rmap_locks(pmc->old); switch (entry) { case NORMAL_PMD: moved = move_normal_pmd(pmc, old_entry, new_entry); break; case NORMAL_PUD: moved = move_normal_pud(pmc, old_entry, new_entry); break; case HPAGE_PMD: moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && move_huge_pmd(pmc->old, pmc->old_addr, pmc->new_addr, old_entry, new_entry); break; case HPAGE_PUD: moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && move_huge_pud(pmc, old_entry, new_entry); break; default: WARN_ON_ONCE(1); break; } if (need_rmap_locks) drop_rmap_locks(pmc->old); return moved; } /* * A helper to check if aligning down is OK. The aligned address should fall * on *no mapping*. For the stack moving down, that's a special move within * the VMA that is created to span the source and destination of the move, * so we make an exception for it. */ static bool can_align_down(struct pagetable_move_control *pmc, struct vm_area_struct *vma, unsigned long addr_to_align, unsigned long mask) { unsigned long addr_masked = addr_to_align & mask; /* * If @addr_to_align of either source or destination is not the beginning * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ if (!pmc->for_stack && vma->vm_start != addr_to_align) return false; /* In the stack case we explicitly permit in-VMA alignment. */ if (pmc->for_stack && addr_masked >= vma->vm_start) return true; /* * Make sure the realignment doesn't cause the address to fall on an * existing mapping. */ return find_vma_intersection(vma->vm_mm, addr_masked, vma->vm_start) == NULL; } /* * Determine if are in fact able to realign for efficiency to a higher page * table boundary. */ static bool can_realign_addr(struct pagetable_move_control *pmc, unsigned long pagetable_mask) { unsigned long align_mask = ~pagetable_mask; unsigned long old_align = pmc->old_addr & align_mask; unsigned long new_align = pmc->new_addr & align_mask; unsigned long pagetable_size = align_mask + 1; unsigned long old_align_next = pagetable_size - old_align; /* * We don't want to have to go hunting for VMAs from the end of the old * VMA to the next page table boundary, also we want to make sure the * operation is wortwhile. * * So ensure that we only perform this realignment if the end of the * range being copied reaches or crosses the page table boundary. * * boundary boundary * .<- old_align -> . * . |----------------.-----------| * . | vma . | * . |----------------.-----------| * . <----------------.-----------> * . len_in * <-------------------------------> * . pagetable_size . * . <----------------> * . old_align_next . */ if (pmc->len_in < old_align_next) return false; /* Skip if the addresses are already aligned. */ if (old_align == 0) return false; /* Only realign if the new and old addresses are mutually aligned. */ if (old_align != new_align) return false; /* Ensure realignment doesn't cause overlap with existing mappings. */ if (!can_align_down(pmc, pmc->old, pmc->old_addr, pagetable_mask) || !can_align_down(pmc, pmc->new, pmc->new_addr, pagetable_mask)) return false; return true; } /* * Opportunistically realign to specified boundary for faster copy. * * Consider an mremap() of a VMA with page table boundaries as below, and no * preceding VMAs from the lower page table boundary to the start of the VMA, * with the end of the range reaching or crossing the page table boundary. * * boundary boundary * . |----------------.-----------| * . | vma . | * . |----------------.-----------| * . pmc->old_addr . pmc->old_end * . <----------------------------> * . move these page tables * * If we proceed with moving page tables in this scenario, we will have a lot of * work to do traversing old page tables and establishing new ones in the * destination across multiple lower level page tables. * * The idea here is simply to align pmc->old_addr, pmc->new_addr down to the * page table boundary, so we can simply copy a single page table entry for the * aligned portion of the VMA instead: * * boundary boundary * . |----------------.-----------| * . | vma . | * . |----------------.-----------| * pmc->old_addr . pmc->old_end * <-------------------------------------------> * . move these page tables */ static void try_realign_addr(struct pagetable_move_control *pmc, unsigned long pagetable_mask) { if (!can_realign_addr(pmc, pagetable_mask)) return; /* * Simply align to page table boundaries. Note that we do NOT update the * pmc->old_end value, and since the move_page_tables() operation spans * from [old_addr, old_end) (offsetting new_addr as it is performed), * this simply changes the start of the copy, not the end. */ pmc->old_addr &= pagetable_mask; pmc->new_addr &= pagetable_mask; } /* Is the page table move operation done? */ static bool pmc_done(struct pagetable_move_control *pmc) { return pmc->old_addr >= pmc->old_end; } /* Advance to the next page table, offset by extent bytes. */ static void pmc_next(struct pagetable_move_control *pmc, unsigned long extent) { pmc->old_addr += extent; pmc->new_addr += extent; } /* * Determine how many bytes in the specified input range have had their page * tables moved so far. */ static unsigned long pmc_progress(struct pagetable_move_control *pmc) { unsigned long orig_old_addr = pmc->old_end - pmc->len_in; unsigned long old_addr = pmc->old_addr; /* * Prevent negative return values when {old,new}_addr was realigned but * we broke out of the loop in move_page_tables() for the first PMD * itself. */ return old_addr < orig_old_addr ? 0 : old_addr - orig_old_addr; } unsigned long move_page_tables(struct pagetable_move_control *pmc) { unsigned long extent; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; pud_t *old_pud, *new_pud; struct mm_struct *mm = pmc->old->vm_mm; if (!pmc->len_in) return 0; if (is_vm_hugetlb_page(pmc->old)) return move_hugetlb_page_tables(pmc->old, pmc->new, pmc->old_addr, pmc->new_addr, pmc->len_in); /* * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ try_realign_addr(pmc, PMD_MASK); flush_cache_range(pmc->old, pmc->old_addr, pmc->old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, mm, pmc->old_addr, pmc->old_end); mmu_notifier_invalidate_range_start(&range); for (; !pmc_done(pmc); pmc_next(pmc, extent)) { cond_resched(); /* * If extent is PUD-sized try to speed up the move by moving at the * PUD level if possible. */ extent = get_extent(NORMAL_PUD, pmc); old_pud = get_old_pud(mm, pmc->old_addr); if (!old_pud) continue; new_pud = alloc_new_pud(mm, pmc->new_addr); if (!new_pud) break; if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { if (extent == HPAGE_PUD_SIZE) { move_pgt_entry(pmc, HPAGE_PUD, old_pud, new_pud); /* We ignore and continue on error? */ continue; } } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { if (move_pgt_entry(pmc, NORMAL_PUD, old_pud, new_pud)) continue; } extent = get_extent(NORMAL_PMD, pmc); old_pmd = get_old_pmd(mm, pmc->old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(mm, pmc->new_addr); if (!new_pmd) break; again: if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) { if (extent == HPAGE_PMD_SIZE && move_pgt_entry(pmc, HPAGE_PMD, old_pmd, new_pmd)) continue; split_huge_pmd(pmc->old, old_pmd, pmc->old_addr); } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) && extent == PMD_SIZE) { /* * If the extent is PMD-sized, try to speed the move by * moving at the PMD level if possible. */ if (move_pgt_entry(pmc, NORMAL_PMD, old_pmd, new_pmd)) continue; } if (pmd_none(*old_pmd)) continue; if (pte_alloc(pmc->new->vm_mm, new_pmd)) break; if (move_ptes(pmc, extent, old_pmd, new_pmd) < 0) goto again; } mmu_notifier_invalidate_range_end(&range); return pmc_progress(pmc); } /* Set vrm->delta to the difference in VMA size specified by user. */ static void vrm_set_delta(struct vma_remap_struct *vrm) { vrm->delta = abs_diff(vrm->old_len, vrm->new_len); } /* Determine what kind of remap this is - shrink, expand or no resize at all. */ static enum mremap_type vrm_remap_type(struct vma_remap_struct *vrm) { if (vrm->delta == 0) return MREMAP_NO_RESIZE; if (vrm->old_len > vrm->new_len) return MREMAP_SHRINK; return MREMAP_EXPAND; } /* * When moving a VMA to vrm->new_adr, does this result in the new and old VMAs * overlapping? */ static bool vrm_overlaps(struct vma_remap_struct *vrm) { unsigned long start_old = vrm->addr; unsigned long start_new = vrm->new_addr; unsigned long end_old = vrm->addr + vrm->old_len; unsigned long end_new = vrm->new_addr + vrm->new_len; /* * start_old end_old * |-----------| * | | * |-----------| * |-------------| * | | * |-------------| * start_new end_new */ if (end_old > start_new && end_new > start_old) return true; return false; } /* Do the mremap() flags require that the new_addr parameter be specified? */ static bool vrm_implies_new_addr(struct vma_remap_struct *vrm) { return vrm->flags & (MREMAP_FIXED | MREMAP_DONTUNMAP); } /* * Find an unmapped area for the requested vrm->new_addr. * * If MREMAP_FIXED then this is equivalent to a MAP_FIXED mmap() call. If only * MREMAP_DONTUNMAP is set, then this is equivalent to providing a hint to * mmap(), otherwise this is equivalent to mmap() specifying a NULL address. * * Returns 0 on success (with vrm->new_addr updated), or an error code upon * failure. */ static unsigned long vrm_set_new_addr(struct vma_remap_struct *vrm) { struct vm_area_struct *vma = vrm->vma; unsigned long map_flags = 0; /* Page Offset _into_ the VMA. */ pgoff_t internal_pgoff = (vrm->addr - vma->vm_start) >> PAGE_SHIFT; pgoff_t pgoff = vma->vm_pgoff + internal_pgoff; unsigned long new_addr = vrm_implies_new_addr(vrm) ? vrm->new_addr : 0; unsigned long res; if (vrm->flags & MREMAP_FIXED) map_flags |= MAP_FIXED; if (vma->vm_flags & VM_MAYSHARE) map_flags |= MAP_SHARED; res = get_unmapped_area(vma->vm_file, new_addr, vrm->new_len, pgoff, map_flags); if (IS_ERR_VALUE(res)) return res; vrm->new_addr = res; return 0; } /* * Keep track of pages which have been added to the memory mapping. If the VMA * is accounted, also check to see if there is sufficient memory. * * Returns true on success, false if insufficient memory to charge. */ static bool vrm_charge(struct vma_remap_struct *vrm) { unsigned long charged; if (!(vrm->vma->vm_flags & VM_ACCOUNT)) return true; /* * If we don't unmap the old mapping, then we account the entirety of * the length of the new one. Otherwise it's just the delta in size. */ if (vrm->flags & MREMAP_DONTUNMAP) charged = vrm->new_len >> PAGE_SHIFT; else charged = vrm->delta >> PAGE_SHIFT; /* This accounts 'charged' pages of memory. */ if (security_vm_enough_memory_mm(current->mm, charged)) return false; vrm->charged = charged; return true; } /* * an error has occurred so we will not be using vrm->charged memory. Unaccount * this memory if the VMA is accounted. */ static void vrm_uncharge(struct vma_remap_struct *vrm) { if (!(vrm->vma->vm_flags & VM_ACCOUNT)) return; vm_unacct_memory(vrm->charged); vrm->charged = 0; } /* * Update mm exec_vm, stack_vm, data_vm, and locked_vm fields as needed to * account for 'bytes' memory used, and if locked, indicate this in the VRM so * we can handle this correctly later. */ static void vrm_stat_account(struct vma_remap_struct *vrm, unsigned long bytes) { unsigned long pages = bytes >> PAGE_SHIFT; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = vrm->vma; vm_stat_account(mm, vma->vm_flags, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; vrm->mlocked = true; } } /* * Perform checks before attempting to write a VMA prior to it being * moved. */ static unsigned long prep_move_vma(struct vma_remap_struct *vrm) { unsigned long err = 0; struct vm_area_struct *vma = vrm->vma; unsigned long old_addr = vrm->addr; unsigned long old_len = vrm->old_len; unsigned long dummy = vma->vm_flags; /* * We'd prefer to avoid failure later on in do_munmap: * which may split one vma into three before unmapping. */ if (current->mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; if (vma->vm_ops && vma->vm_ops->may_split) { if (vma->vm_start != old_addr) err = vma->vm_ops->may_split(vma, old_addr); if (!err && vma->vm_end != old_addr + old_len) err = vma->vm_ops->may_split(vma, old_addr + old_len); if (err) return err; } /* * Advise KSM to break any KSM pages in the area to be moved: * it would be confusing if they were to turn up at the new * location, where they happen to coincide with different KSM * pages recently unmapped. But leave vma->vm_flags as it was, * so KSM can come around to merge on vma and new_vma afterwards. */ err = ksm_madvise(vma, old_addr, old_addr + old_len, MADV_UNMERGEABLE, &dummy); if (err) return err; return 0; } /* * Unmap source VMA for VMA move, turning it from a copy to a move, being * careful to ensure we do not underflow memory account while doing so if an * accountable move. * * This is best effort, if we fail to unmap then we simply try to correct * accounting and exit. */ static void unmap_source_vma(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; unsigned long addr = vrm->addr; unsigned long len = vrm->old_len; struct vm_area_struct *vma = vrm->vma; VMA_ITERATOR(vmi, mm, addr); int err; unsigned long vm_start; unsigned long vm_end; /* * It might seem odd that we check for MREMAP_DONTUNMAP here, given this * function implies that we unmap the original VMA, which seems * contradictory. * * However, this occurs when this operation was attempted and an error * arose, in which case we _do_ wish to unmap the _new_ VMA, which means * we actually _do_ want it be unaccounted. */ bool accountable_move = (vma->vm_flags & VM_ACCOUNT) && !(vrm->flags & MREMAP_DONTUNMAP); /* * So we perform a trick here to prevent incorrect accounting. Any merge * or new VMA allocation performed in copy_vma() does not adjust * accounting, it is expected that callers handle this. * * And indeed we already have, accounting appropriately in the case of * both in vrm_charge(). * * However, when we unmap the existing VMA (to effect the move), this * code will, if the VMA has VM_ACCOUNT set, attempt to unaccount * removed pages. * * To avoid this we temporarily clear this flag, reinstating on any * portions of the original VMA that remain. */ if (accountable_move) { vm_flags_clear(vma, VM_ACCOUNT); /* We are about to split vma, so store the start/end. */ vm_start = vma->vm_start; vm_end = vma->vm_end; } err = do_vmi_munmap(&vmi, mm, addr, len, vrm->uf_unmap, /* unlock= */false); vrm->vma = NULL; /* Invalidated. */ if (err) { /* OOM: unable to split vma, just get accounts right */ vm_acct_memory(len >> PAGE_SHIFT); return; } /* * If we mremap() from a VMA like this: * * addr end * | | * v v * |-------------| * | | * |-------------| * * Having cleared VM_ACCOUNT from the whole VMA, after we unmap above * we'll end up with: * * addr end * | | * v v * |---| |---| * | A | | B | * |---| |---| * * The VMI is still pointing at addr, so vma_prev() will give us A, and * a subsequent or lone vma_next() will give as B. * * do_vmi_munmap() will have restored the VMI back to addr. */ if (accountable_move) { unsigned long end = addr + len; if (vm_start < addr) { struct vm_area_struct *prev = vma_prev(&vmi); vm_flags_set(prev, VM_ACCOUNT); /* Acquires VMA lock. */ } if (vm_end > end) { struct vm_area_struct *next = vma_next(&vmi); vm_flags_set(next, VM_ACCOUNT); /* Acquires VMA lock. */ } } } /* * Copy vrm->vma over to vrm->new_addr possibly adjusting size as part of the * process. Additionally handle an error occurring on moving of page tables, * where we reset vrm state to cause unmapping of the new VMA. * * Outputs the newly installed VMA to new_vma_ptr. Returns 0 on success or an * error code. */ static int copy_vma_and_data(struct vma_remap_struct *vrm, struct vm_area_struct **new_vma_ptr) { unsigned long internal_offset = vrm->addr - vrm->vma->vm_start; unsigned long internal_pgoff = internal_offset >> PAGE_SHIFT; unsigned long new_pgoff = vrm->vma->vm_pgoff + internal_pgoff; unsigned long moved_len; struct vm_area_struct *vma = vrm->vma; struct vm_area_struct *new_vma; int err = 0; PAGETABLE_MOVE(pmc, NULL, NULL, vrm->addr, vrm->new_addr, vrm->old_len); new_vma = copy_vma(&vma, vrm->new_addr, vrm->new_len, new_pgoff, &pmc.need_rmap_locks); if (!new_vma) { vrm_uncharge(vrm); *new_vma_ptr = NULL; return -ENOMEM; } vrm->vma = vma; pmc.old = vma; pmc.new = new_vma; moved_len = move_page_tables(&pmc); if (moved_len < vrm->old_len) err = -ENOMEM; else if (vma->vm_ops && vma->vm_ops->mremap) err = vma->vm_ops->mremap(new_vma); if (unlikely(err)) { PAGETABLE_MOVE(pmc_revert, new_vma, vma, vrm->new_addr, vrm->addr, moved_len); /* * On error, move entries back from new area to old, * which will succeed since page tables still there, * and then proceed to unmap new area instead of old. */ pmc_revert.need_rmap_locks = true; move_page_tables(&pmc_revert); vrm->vma = new_vma; vrm->old_len = vrm->new_len; vrm->addr = vrm->new_addr; } else { mremap_userfaultfd_prep(new_vma, vrm->uf); } if (is_vm_hugetlb_page(vma)) clear_vma_resv_huge_pages(vma); /* Tell pfnmap has moved from this vma */ if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn_clear(vma); *new_vma_ptr = new_vma; return err; } /* * Perform final tasks for MADV_DONTUNMAP operation, clearing mlock() and * account flags on remaining VMA by convention (it cannot be mlock()'d any * longer, as pages in range are no longer mapped), and removing anon_vma_chain * links from it (if the entire VMA was copied over). */ static void dontunmap_complete(struct vma_remap_struct *vrm, struct vm_area_struct *new_vma) { unsigned long start = vrm->addr; unsigned long end = vrm->addr + vrm->old_len; unsigned long old_start = vrm->vma->vm_start; unsigned long old_end = vrm->vma->vm_end; /* * We always clear VM_LOCKED[ONFAULT] | VM_ACCOUNT on the old * vma. */ vm_flags_clear(vrm->vma, VM_LOCKED_MASK | VM_ACCOUNT); /* * anon_vma links of the old vma is no longer needed after its page * table has been moved. */ if (new_vma != vrm->vma && start == old_start && end == old_end) unlink_anon_vmas(vrm->vma); /* Because we won't unmap we don't need to touch locked_vm. */ } static unsigned long move_vma(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; struct vm_area_struct *new_vma; unsigned long hiwater_vm; int err; err = prep_move_vma(vrm); if (err) return err; /* If accounted, charge the number of bytes the operation will use. */ if (!vrm_charge(vrm)) return -ENOMEM; /* We don't want racing faults. */ vma_start_write(vrm->vma); /* Perform copy step. */ err = copy_vma_and_data(vrm, &new_vma); /* * If we established the copied-to VMA, we attempt to recover from the * error by setting the destination VMA to the source VMA and unmapping * it below. */ if (err && !new_vma) return err; /* * If we failed to move page tables we still do total_vm increment * since do_munmap() will decrement it by old_len == new_len. * * Since total_vm is about to be raised artificially high for a * moment, we need to restore high watermark afterwards: if stats * are taken meanwhile, total_vm and hiwater_vm appear too high. * If this were a serious issue, we'd add a flag to do_munmap(). */ hiwater_vm = mm->hiwater_vm; vrm_stat_account(vrm, vrm->new_len); if (unlikely(!err && (vrm->flags & MREMAP_DONTUNMAP))) dontunmap_complete(vrm, new_vma); else unmap_source_vma(vrm); mm->hiwater_vm = hiwater_vm; return err ? (unsigned long)err : vrm->new_addr; } /* * resize_is_valid() - Ensure the vma can be resized to the new length at the give * address. * * Return 0 on success, error otherwise. */ static int resize_is_valid(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = vrm->vma; unsigned long addr = vrm->addr; unsigned long old_len = vrm->old_len; unsigned long new_len = vrm->new_len; unsigned long pgoff; /* * !old_len is a special case where an attempt is made to 'duplicate' * a mapping. This makes no sense for private mappings as it will * instead create a fresh/new mapping unrelated to the original. This * is contrary to the basic idea of mremap which creates new mappings * based on the original. There are no known use cases for this * behavior. As a result, fail such attempts. */ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); return -EINVAL; } if ((vrm->flags & MREMAP_DONTUNMAP) && (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) return -EINVAL; /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) return -EFAULT; if (new_len == old_len) return 0; /* Need to be careful about a growing mapping */ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) return -EINVAL; if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) return -EFAULT; if (!mlock_future_ok(mm, vma->vm_flags, vrm->delta)) return -EAGAIN; if (!may_expand_vm(mm, vma->vm_flags, vrm->delta >> PAGE_SHIFT)) return -ENOMEM; return 0; } /* * The user has requested that the VMA be shrunk (i.e., old_len > new_len), so * execute this, optionally dropping the mmap lock when we do so. * * In both cases this invalidates the VMA, however if we don't drop the lock, * then load the correct VMA into vrm->vma afterwards. */ static unsigned long shrink_vma(struct vma_remap_struct *vrm, bool drop_lock) { struct mm_struct *mm = current->mm; unsigned long unmap_start = vrm->addr + vrm->new_len; unsigned long unmap_bytes = vrm->delta; unsigned long res; VMA_ITERATOR(vmi, mm, unmap_start); VM_BUG_ON(vrm->remap_type != MREMAP_SHRINK); res = do_vmi_munmap(&vmi, mm, unmap_start, unmap_bytes, vrm->uf_unmap, drop_lock); vrm->vma = NULL; /* Invalidated. */ if (res) return res; /* * If we've not dropped the lock, then we should reload the VMA to * replace the invalidated VMA with the one that may have now been * split. */ if (drop_lock) { vrm->mmap_locked = false; } else { vrm->vma = vma_lookup(mm, vrm->addr); if (!vrm->vma) return -EFAULT; } return 0; } /* * mremap_to() - remap a vma to a new location. * Returns: The new address of the vma or an error. */ static unsigned long mremap_to(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; unsigned long err; /* Is the new length or address silly? */ if (vrm->new_len > TASK_SIZE || vrm->new_addr > TASK_SIZE - vrm->new_len) return -EINVAL; if (vrm_overlaps(vrm)) return -EINVAL; if (vrm->flags & MREMAP_FIXED) { /* * In mremap_to(). * VMA is moved to dst address, and munmap dst first. * do_munmap will check if dst is sealed. */ err = do_munmap(mm, vrm->new_addr, vrm->new_len, vrm->uf_unmap_early); vrm->vma = NULL; /* Invalidated. */ if (err) return err; /* * If we remap a portion of a VMA elsewhere in the same VMA, * this can invalidate the old VMA. Reset. */ vrm->vma = vma_lookup(mm, vrm->addr); if (!vrm->vma) return -EFAULT; } if (vrm->remap_type == MREMAP_SHRINK) { err = shrink_vma(vrm, /* drop_lock= */false); if (err) return err; /* Set up for the move now shrink has been executed. */ vrm->old_len = vrm->new_len; } err = resize_is_valid(vrm); if (err) return err; /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (vrm->flags & MREMAP_DONTUNMAP) { vm_flags_t vm_flags = vrm->vma->vm_flags; unsigned long pages = vrm->old_len >> PAGE_SHIFT; if (!may_expand_vm(mm, vm_flags, pages)) return -ENOMEM; } err = vrm_set_new_addr(vrm); if (err) return err; return move_vma(vrm); } static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; if (end < vma->vm_end) /* overflow */ return 0; if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) return 0; return 1; } /* Determine whether we are actually able to execute an in-place expansion. */ static bool vrm_can_expand_in_place(struct vma_remap_struct *vrm) { /* Number of bytes from vrm->addr to end of VMA. */ unsigned long suffix_bytes = vrm->vma->vm_end - vrm->addr; /* If end of range aligns to end of VMA, we can just expand in-place. */ if (suffix_bytes != vrm->old_len) return false; /* Check whether this is feasible. */ if (!vma_expandable(vrm->vma, vrm->delta)) return false; return true; } /* * Are the parameters passed to mremap() valid? If so return 0, otherwise return * error. */ static unsigned long check_mremap_params(struct vma_remap_struct *vrm) { unsigned long addr = vrm->addr; unsigned long flags = vrm->flags; /* Ensure no unexpected flag values. */ if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE | MREMAP_DONTUNMAP)) return -EINVAL; /* Start address must be page-aligned. */ if (offset_in_page(addr)) return -EINVAL; /* * We allow a zero old-len as a special case * for DOS-emu "duplicate shm area" thing. But * a zero new-len is nonsensical. */ if (!PAGE_ALIGN(vrm->new_len)) return -EINVAL; /* Remainder of checks are for cases with specific new_addr. */ if (!vrm_implies_new_addr(vrm)) return 0; /* The new address must be page-aligned. */ if (offset_in_page(vrm->new_addr)) return -EINVAL; /* A fixed address implies a move. */ if (!(flags & MREMAP_MAYMOVE)) return -EINVAL; /* MREMAP_DONTUNMAP does not allow resizing in the process. */ if (flags & MREMAP_DONTUNMAP && vrm->old_len != vrm->new_len) return -EINVAL; /* * move_vma() need us to stay 4 maps below the threshold, otherwise * it will bail out at the very beginning. * That is a problem if we have already unmaped the regions here * (new_addr, and old_addr), because userspace will not know the * state of the vma's after it gets -ENOMEM. * So, to avoid such scenario we can pre-compute if the whole * operation has high chances to success map-wise. * Worst-scenario case is when both vma's (new_addr and old_addr) get * split in 3 before unmapping it. * That means 2 more maps (1 for each) to the ones we already hold. * Check whether current map count plus 2 still leads us to 4 maps below * the threshold, otherwise return -ENOMEM here to be more safe. */ if ((current->mm->map_count + 2) >= sysctl_max_map_count - 3) return -ENOMEM; return 0; } /* * We know we can expand the VMA in-place by delta pages, so do so. * * If we discover the VMA is locked, update mm_struct statistics accordingly and * indicate so to the caller. */ static unsigned long expand_vma_in_place(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma = vrm->vma; VMA_ITERATOR(vmi, mm, vma->vm_end); if (!vrm_charge(vrm)) return -ENOMEM; /* * Function vma_merge_extend() is called on the * extension we are adding to the already existing vma, * vma_merge_extend() will merge this extension with the * already existing vma (expand operation itself) and * possibly also with the next vma if it becomes * adjacent to the expanded vma and otherwise * compatible. */ vma = vma_merge_extend(&vmi, vma, vrm->delta); if (!vma) { vrm_uncharge(vrm); return -ENOMEM; } vrm->vma = vma; vrm_stat_account(vrm, vrm->delta); return 0; } static bool align_hugetlb(struct vma_remap_struct *vrm) { struct hstate *h __maybe_unused = hstate_vma(vrm->vma); vrm->old_len = ALIGN(vrm->old_len, huge_page_size(h)); vrm->new_len = ALIGN(vrm->new_len, huge_page_size(h)); /* addrs must be huge page aligned */ if (vrm->addr & ~huge_page_mask(h)) return false; if (vrm->new_addr & ~huge_page_mask(h)) return false; /* * Don't allow remap expansion, because the underlying hugetlb * reservation is not yet capable to handle split reservation. */ if (vrm->new_len > vrm->old_len) return false; vrm_set_delta(vrm); return true; } /* * We are mremap()'ing without specifying a fixed address to move to, but are * requesting that the VMA's size be increased. * * Try to do so in-place, if this fails, then move the VMA to a new location to * action the change. */ static unsigned long expand_vma(struct vma_remap_struct *vrm) { unsigned long err; unsigned long addr = vrm->addr; err = resize_is_valid(vrm); if (err) return err; /* * [addr, old_len) spans precisely to the end of the VMA, so try to * expand it in-place. */ if (vrm_can_expand_in_place(vrm)) { err = expand_vma_in_place(vrm); if (err) return err; /* * We want to populate the newly expanded portion of the VMA to * satisfy the expectation that mlock()'ing a VMA maintains all * of its pages in memory. */ if (vrm->mlocked) vrm->new_addr = addr; /* OK we're done! */ return addr; } /* * We weren't able to just expand or shrink the area, * we need to create a new one and move it. */ /* We're not allowed to move the VMA, so error out. */ if (!(vrm->flags & MREMAP_MAYMOVE)) return -ENOMEM; /* Find a new location to move the VMA to. */ err = vrm_set_new_addr(vrm); if (err) return err; return move_vma(vrm); } /* * Attempt to resize the VMA in-place, if we cannot, then move the VMA to the * first available address to perform the operation. */ static unsigned long mremap_at(struct vma_remap_struct *vrm) { unsigned long res; switch (vrm->remap_type) { case MREMAP_INVALID: break; case MREMAP_NO_RESIZE: /* NO-OP CASE - resizing to the same size. */ return vrm->addr; case MREMAP_SHRINK: /* * SHRINK CASE. Can always be done in-place. * * Simply unmap the shrunken portion of the VMA. This does all * the needed commit accounting, and we indicate that the mmap * lock should be dropped. */ res = shrink_vma(vrm, /* drop_lock= */true); if (res) return res; return vrm->addr; case MREMAP_EXPAND: return expand_vma(vrm); } BUG(); } static unsigned long do_mremap(struct vma_remap_struct *vrm) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long ret; ret = check_mremap_params(vrm); if (ret) return ret; vrm->old_len = PAGE_ALIGN(vrm->old_len); vrm->new_len = PAGE_ALIGN(vrm->new_len); vrm_set_delta(vrm); if (mmap_write_lock_killable(mm)) return -EINTR; vrm->mmap_locked = true; vma = vrm->vma = vma_lookup(mm, vrm->addr); if (!vma) { ret = -EFAULT; goto out; } /* If mseal()'d, mremap() is prohibited. */ if (!can_modify_vma(vma)) { ret = -EPERM; goto out; } /* Align to hugetlb page size, if required. */ if (is_vm_hugetlb_page(vma) && !align_hugetlb(vrm)) { ret = -EINVAL; goto out; } vrm->remap_type = vrm_remap_type(vrm); /* Actually execute mremap. */ ret = vrm_implies_new_addr(vrm) ? mremap_to(vrm) : mremap_at(vrm); out: if (vrm->mmap_locked) { mmap_write_unlock(mm); vrm->mmap_locked = false; if (!offset_in_page(ret) && vrm->mlocked && vrm->new_len > vrm->old_len) mm_populate(vrm->new_addr + vrm->old_len, vrm->delta); } userfaultfd_unmap_complete(mm, vrm->uf_unmap_early); mremap_userfaultfd_complete(vrm->uf, vrm->addr, ret, vrm->old_len); userfaultfd_unmap_complete(mm, vrm->uf_unmap); return ret; } /* * Expand (or shrink) an existing mapping, potentially moving it at the * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) * * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise * This option implies MREMAP_MAYMOVE. */ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, unsigned long, new_addr) { struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX; LIST_HEAD(uf_unmap_early); LIST_HEAD(uf_unmap); /* * There is a deliberate asymmetry here: we strip the pointer tag * from the old address but leave the new address alone. This is * for consistency with mmap(), where we prevent the creation of * aliasing mappings in userspace by leaving the tag bits of the * mapping address intact. A non-zero tag will cause the subsequent * range checks to reject the address as invalid. * * See Documentation/arch/arm64/tagged-address-abi.rst for more * information. */ struct vma_remap_struct vrm = { .addr = untagged_addr(addr), .old_len = old_len, .new_len = new_len, .flags = flags, .new_addr = new_addr, .uf = &uf, .uf_unmap_early = &uf_unmap_early, .uf_unmap = &uf_unmap, .remap_type = MREMAP_INVALID, /* We set later. */ }; return do_mremap(&vrm); } |
| 14 17 14 17 11 12 10 12 12 4 8 12 12 12 12 12 12 12 11 12 12 12 11 12 12 12 12 12 11 12 12 11 12 11 12 12 12 12 12 12 12 10 11 12 11 12 12 12 11 12 12 12 12 12 12 12 12 12 12 12 10 12 12 12 12 12 12 12 12 11 12 12 12 12 12 4 8 8 4 11 12 12 12 12 12 4 7 12 11 12 1 1 1 12 12 12 12 12 12 11 6 10 10 12 1 10 10 10 10 3 3 10 12 9 9 9 8 9 9 9 5 3 1 1 1 1 1 1 1 11 12 12 12 12 12 12 12 12 12 12 11 12 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 12 12 1 1 1 12 11 12 11 12 12 12 12 11 11 12 12 12 11 12 12 12 12 12 12 11 12 12 12 12 12 12 12 12 12 12 12 12 12 12 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 7 7 7 6 7 13 12 12 12 3 1 11 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Digital Audio (PCM) abstract layer * Copyright (c) by Jaroslav Kysela <perex@perex.cz> */ #include <linux/compat.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/pm_qos.h> #include <linux/io.h> #include <linux/dma-mapping.h> #include <linux/vmalloc.h> #include <sound/core.h> #include <sound/control.h> #include <sound/info.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include <sound/timer.h> #include <sound/minors.h> #include <linux/uio.h> #include <linux/delay.h> #include "pcm_local.h" #ifdef CONFIG_SND_DEBUG #define CREATE_TRACE_POINTS #include "pcm_param_trace.h" #else #define trace_hw_mask_param_enabled() 0 #define trace_hw_interval_param_enabled() 0 #define trace_hw_mask_param(substream, type, index, prev, curr) #define trace_hw_interval_param(substream, type, index, prev, curr) #endif /* * Compatibility */ struct snd_pcm_hw_params_old { unsigned int flags; unsigned int masks[SNDRV_PCM_HW_PARAM_SUBFORMAT - SNDRV_PCM_HW_PARAM_ACCESS + 1]; struct snd_interval intervals[SNDRV_PCM_HW_PARAM_TICK_TIME - SNDRV_PCM_HW_PARAM_SAMPLE_BITS + 1]; unsigned int rmask; unsigned int cmask; unsigned int info; unsigned int msbits; unsigned int rate_num; unsigned int rate_den; snd_pcm_uframes_t fifo_size; unsigned char reserved[64]; }; #ifdef CONFIG_SND_SUPPORT_OLD_API #define SNDRV_PCM_IOCTL_HW_REFINE_OLD _IOWR('A', 0x10, struct snd_pcm_hw_params_old) #define SNDRV_PCM_IOCTL_HW_PARAMS_OLD _IOWR('A', 0x11, struct snd_pcm_hw_params_old) static int snd_pcm_hw_refine_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams); static int snd_pcm_hw_params_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams); #endif static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream); /* * */ static DECLARE_RWSEM(snd_pcm_link_rwsem); void snd_pcm_group_init(struct snd_pcm_group *group) { spin_lock_init(&group->lock); mutex_init(&group->mutex); INIT_LIST_HEAD(&group->substreams); refcount_set(&group->refs, 1); } /* define group lock helpers */ #define DEFINE_PCM_GROUP_LOCK(action, mutex_action) \ static void snd_pcm_group_ ## action(struct snd_pcm_group *group, bool nonatomic) \ { \ if (nonatomic) \ mutex_ ## mutex_action(&group->mutex); \ else \ spin_ ## action(&group->lock); \ } DEFINE_PCM_GROUP_LOCK(lock, lock); DEFINE_PCM_GROUP_LOCK(unlock, unlock); DEFINE_PCM_GROUP_LOCK(lock_irq, lock); DEFINE_PCM_GROUP_LOCK(unlock_irq, unlock); /** * snd_pcm_stream_lock - Lock the PCM stream * @substream: PCM substream * * This locks the PCM stream's spinlock or mutex depending on the nonatomic * flag of the given substream. This also takes the global link rw lock * (or rw sem), too, for avoiding the race with linked streams. */ void snd_pcm_stream_lock(struct snd_pcm_substream *substream) { snd_pcm_group_lock(&substream->self_group, substream->pcm->nonatomic); } EXPORT_SYMBOL_GPL(snd_pcm_stream_lock); /** * snd_pcm_stream_unlock - Unlock the PCM stream * @substream: PCM substream * * This unlocks the PCM stream that has been locked via snd_pcm_stream_lock(). */ void snd_pcm_stream_unlock(struct snd_pcm_substream *substream) { snd_pcm_group_unlock(&substream->self_group, substream->pcm->nonatomic); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock); /** * snd_pcm_stream_lock_irq - Lock the PCM stream * @substream: PCM substream * * This locks the PCM stream like snd_pcm_stream_lock() and disables the local * IRQ (only when nonatomic is false). In nonatomic case, this is identical * as snd_pcm_stream_lock(). */ void snd_pcm_stream_lock_irq(struct snd_pcm_substream *substream) { snd_pcm_group_lock_irq(&substream->self_group, substream->pcm->nonatomic); } EXPORT_SYMBOL_GPL(snd_pcm_stream_lock_irq); static void snd_pcm_stream_lock_nested(struct snd_pcm_substream *substream) { struct snd_pcm_group *group = &substream->self_group; if (substream->pcm->nonatomic) mutex_lock_nested(&group->mutex, SINGLE_DEPTH_NESTING); else spin_lock_nested(&group->lock, SINGLE_DEPTH_NESTING); } /** * snd_pcm_stream_unlock_irq - Unlock the PCM stream * @substream: PCM substream * * This is a counter-part of snd_pcm_stream_lock_irq(). */ void snd_pcm_stream_unlock_irq(struct snd_pcm_substream *substream) { snd_pcm_group_unlock_irq(&substream->self_group, substream->pcm->nonatomic); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irq); unsigned long _snd_pcm_stream_lock_irqsave(struct snd_pcm_substream *substream) { unsigned long flags = 0; if (substream->pcm->nonatomic) mutex_lock(&substream->self_group.mutex); else spin_lock_irqsave(&substream->self_group.lock, flags); return flags; } EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave); unsigned long _snd_pcm_stream_lock_irqsave_nested(struct snd_pcm_substream *substream) { unsigned long flags = 0; if (substream->pcm->nonatomic) mutex_lock_nested(&substream->self_group.mutex, SINGLE_DEPTH_NESTING); else spin_lock_irqsave_nested(&substream->self_group.lock, flags, SINGLE_DEPTH_NESTING); return flags; } EXPORT_SYMBOL_GPL(_snd_pcm_stream_lock_irqsave_nested); /** * snd_pcm_stream_unlock_irqrestore - Unlock the PCM stream * @substream: PCM substream * @flags: irq flags * * This is a counter-part of snd_pcm_stream_lock_irqsave(). */ void snd_pcm_stream_unlock_irqrestore(struct snd_pcm_substream *substream, unsigned long flags) { if (substream->pcm->nonatomic) mutex_unlock(&substream->self_group.mutex); else spin_unlock_irqrestore(&substream->self_group.lock, flags); } EXPORT_SYMBOL_GPL(snd_pcm_stream_unlock_irqrestore); /* Run PCM ioctl ops */ static int snd_pcm_ops_ioctl(struct snd_pcm_substream *substream, unsigned cmd, void *arg) { if (substream->ops->ioctl) return substream->ops->ioctl(substream, cmd, arg); else return snd_pcm_lib_ioctl(substream, cmd, arg); } int snd_pcm_info(struct snd_pcm_substream *substream, struct snd_pcm_info *info) { struct snd_pcm *pcm = substream->pcm; struct snd_pcm_str *pstr = substream->pstr; memset(info, 0, sizeof(*info)); info->card = pcm->card->number; info->device = pcm->device; info->stream = substream->stream; info->subdevice = substream->number; strscpy(info->id, pcm->id, sizeof(info->id)); strscpy(info->name, pcm->name, sizeof(info->name)); info->dev_class = pcm->dev_class; info->dev_subclass = pcm->dev_subclass; info->subdevices_count = pstr->substream_count; info->subdevices_avail = pstr->substream_count - pstr->substream_opened; strscpy(info->subname, substream->name, sizeof(info->subname)); return 0; } int snd_pcm_info_user(struct snd_pcm_substream *substream, struct snd_pcm_info __user * _info) { struct snd_pcm_info *info __free(kfree) = NULL; int err; info = kmalloc(sizeof(*info), GFP_KERNEL); if (! info) return -ENOMEM; err = snd_pcm_info(substream, info); if (err >= 0) { if (copy_to_user(_info, info, sizeof(*info))) err = -EFAULT; } return err; } /* macro for simplified cast */ #define PARAM_MASK_BIT(b) (1U << (__force int)(b)) static bool hw_support_mmap(struct snd_pcm_substream *substream) { struct snd_dma_buffer *dmabuf; if (!(substream->runtime->hw.info & SNDRV_PCM_INFO_MMAP)) return false; if (substream->ops->mmap || substream->ops->page) return true; dmabuf = snd_pcm_get_dma_buf(substream); if (!dmabuf) dmabuf = &substream->dma_buffer; switch (dmabuf->dev.type) { case SNDRV_DMA_TYPE_UNKNOWN: /* we can't know the device, so just assume that the driver does * everything right */ return true; case SNDRV_DMA_TYPE_CONTINUOUS: case SNDRV_DMA_TYPE_VMALLOC: return true; default: return dma_can_mmap(dmabuf->dev.dev); } } static int constrain_mask_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_hw_constraints *constrs = &substream->runtime->hw_constraints; struct snd_mask *m; unsigned int k; struct snd_mask old_mask __maybe_unused; int changed; for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) { m = hw_param_mask(params, k); if (snd_mask_empty(m)) return -EINVAL; /* This parameter is not requested to change by a caller. */ if (!(params->rmask & PARAM_MASK_BIT(k))) continue; if (trace_hw_mask_param_enabled()) old_mask = *m; changed = snd_mask_refine(m, constrs_mask(constrs, k)); if (changed < 0) return changed; if (changed == 0) continue; /* Set corresponding flag so that the caller gets it. */ trace_hw_mask_param(substream, k, 0, &old_mask, m); params->cmask |= PARAM_MASK_BIT(k); } return 0; } static int constrain_interval_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_hw_constraints *constrs = &substream->runtime->hw_constraints; struct snd_interval *i; unsigned int k; struct snd_interval old_interval __maybe_unused; int changed; for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) { i = hw_param_interval(params, k); if (snd_interval_empty(i)) return -EINVAL; /* This parameter is not requested to change by a caller. */ if (!(params->rmask & PARAM_MASK_BIT(k))) continue; if (trace_hw_interval_param_enabled()) old_interval = *i; changed = snd_interval_refine(i, constrs_interval(constrs, k)); if (changed < 0) return changed; if (changed == 0) continue; /* Set corresponding flag so that the caller gets it. */ trace_hw_interval_param(substream, k, 0, &old_interval, i); params->cmask |= PARAM_MASK_BIT(k); } return 0; } static int constrain_params_by_rules(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_hw_constraints *constrs = &substream->runtime->hw_constraints; unsigned int k; unsigned int *rstamps __free(kfree) = NULL; unsigned int vstamps[SNDRV_PCM_HW_PARAM_LAST_INTERVAL + 1]; unsigned int stamp; struct snd_pcm_hw_rule *r; unsigned int d; struct snd_mask old_mask __maybe_unused; struct snd_interval old_interval __maybe_unused; bool again; int changed, err = 0; /* * Each application of rule has own sequence number. * * Each member of 'rstamps' array represents the sequence number of * recent application of corresponding rule. */ rstamps = kcalloc(constrs->rules_num, sizeof(unsigned int), GFP_KERNEL); if (!rstamps) return -ENOMEM; /* * Each member of 'vstamps' array represents the sequence number of * recent application of rule in which corresponding parameters were * changed. * * In initial state, elements corresponding to parameters requested by * a caller is 1. For unrequested parameters, corresponding members * have 0 so that the parameters are never changed anymore. */ for (k = 0; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) vstamps[k] = (params->rmask & PARAM_MASK_BIT(k)) ? 1 : 0; /* Due to the above design, actual sequence number starts at 2. */ stamp = 2; retry: /* Apply all rules in order. */ again = false; for (k = 0; k < constrs->rules_num; k++) { r = &constrs->rules[k]; /* * Check condition bits of this rule. When the rule has * some condition bits, parameter without the bits is * never processed. SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP * is an example of the condition bits. */ if (r->cond && !(r->cond & params->flags)) continue; /* * The 'deps' array includes maximum four dependencies * to SNDRV_PCM_HW_PARAM_XXXs for this rule. The fifth * member of this array is a sentinel and should be * negative value. * * This rule should be processed in this time when dependent * parameters were changed at former applications of the other * rules. */ for (d = 0; r->deps[d] >= 0; d++) { if (vstamps[r->deps[d]] > rstamps[k]) break; } if (r->deps[d] < 0) continue; if (trace_hw_mask_param_enabled()) { if (hw_is_mask(r->var)) old_mask = *hw_param_mask(params, r->var); } if (trace_hw_interval_param_enabled()) { if (hw_is_interval(r->var)) old_interval = *hw_param_interval(params, r->var); } changed = r->func(params, r); if (changed < 0) return changed; /* * When the parameter is changed, notify it to the caller * by corresponding returned bit, then preparing for next * iteration. */ if (changed && r->var >= 0) { if (hw_is_mask(r->var)) { trace_hw_mask_param(substream, r->var, k + 1, &old_mask, hw_param_mask(params, r->var)); } if (hw_is_interval(r->var)) { trace_hw_interval_param(substream, r->var, k + 1, &old_interval, hw_param_interval(params, r->var)); } params->cmask |= PARAM_MASK_BIT(r->var); vstamps[r->var] = stamp; again = true; } rstamps[k] = stamp++; } /* Iterate to evaluate all rules till no parameters are changed. */ if (again) goto retry; return err; } static int fixup_unreferenced_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { const struct snd_interval *i; const struct snd_mask *m; struct snd_mask *m_rw; int err; if (!params->msbits) { i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_SAMPLE_BITS); if (snd_interval_single(i)) params->msbits = snd_interval_value(i); m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT); if (snd_mask_single(m)) { snd_pcm_format_t format = (__force snd_pcm_format_t)snd_mask_min(m); params->msbits = snd_pcm_format_width(format); } } if (params->msbits) { m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT); if (snd_mask_single(m)) { snd_pcm_format_t format = (__force snd_pcm_format_t)snd_mask_min(m); if (snd_pcm_format_linear(format) && snd_pcm_format_width(format) != params->msbits) { m_rw = hw_param_mask(params, SNDRV_PCM_HW_PARAM_SUBFORMAT); snd_mask_reset(m_rw, (__force unsigned)SNDRV_PCM_SUBFORMAT_MSBITS_MAX); if (snd_mask_empty(m_rw)) return -EINVAL; } } } if (!params->rate_den) { i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_RATE); if (snd_interval_single(i)) { params->rate_num = snd_interval_value(i); params->rate_den = 1; } } if (!params->fifo_size) { m = hw_param_mask_c(params, SNDRV_PCM_HW_PARAM_FORMAT); i = hw_param_interval_c(params, SNDRV_PCM_HW_PARAM_CHANNELS); if (snd_mask_single(m) && snd_interval_single(i)) { err = snd_pcm_ops_ioctl(substream, SNDRV_PCM_IOCTL1_FIFO_SIZE, params); if (err < 0) return err; } } if (!params->info) { params->info = substream->runtime->hw.info; params->info &= ~(SNDRV_PCM_INFO_FIFO_IN_FRAMES | SNDRV_PCM_INFO_DRAIN_TRIGGER); if (!hw_support_mmap(substream)) params->info &= ~(SNDRV_PCM_INFO_MMAP | SNDRV_PCM_INFO_MMAP_VALID); } err = snd_pcm_ops_ioctl(substream, SNDRV_PCM_IOCTL1_SYNC_ID, params); if (err < 0) return err; return 0; } int snd_pcm_hw_refine(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { int err; params->info = 0; params->fifo_size = 0; if (params->rmask & PARAM_MASK_BIT(SNDRV_PCM_HW_PARAM_SAMPLE_BITS)) params->msbits = 0; if (params->rmask & PARAM_MASK_BIT(SNDRV_PCM_HW_PARAM_RATE)) { params->rate_num = 0; params->rate_den = 0; } err = constrain_mask_params(substream, params); if (err < 0) return err; err = constrain_interval_params(substream, params); if (err < 0) return err; err = constrain_params_by_rules(substream, params); if (err < 0) return err; params->rmask = 0; return 0; } EXPORT_SYMBOL(snd_pcm_hw_refine); static int snd_pcm_hw_refine_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params __user * _params) { struct snd_pcm_hw_params *params __free(kfree) = NULL; int err; params = memdup_user(_params, sizeof(*params)); if (IS_ERR(params)) return PTR_ERR(params); err = snd_pcm_hw_refine(substream, params); if (err < 0) return err; err = fixup_unreferenced_params(substream, params); if (err < 0) return err; if (copy_to_user(_params, params, sizeof(*params))) return -EFAULT; return 0; } static int period_to_usecs(struct snd_pcm_runtime *runtime) { int usecs; if (! runtime->rate) return -1; /* invalid */ /* take 75% of period time as the deadline */ usecs = (750000 / runtime->rate) * runtime->period_size; usecs += ((750000 % runtime->rate) * runtime->period_size) / runtime->rate; return usecs; } static void snd_pcm_set_state(struct snd_pcm_substream *substream, snd_pcm_state_t state) { guard(pcm_stream_lock_irq)(substream); if (substream->runtime->state != SNDRV_PCM_STATE_DISCONNECTED) __snd_pcm_set_state(substream->runtime, state); } static inline void snd_pcm_timer_notify(struct snd_pcm_substream *substream, int event) { #ifdef CONFIG_SND_PCM_TIMER if (substream->timer) snd_timer_notify(substream->timer, event, &substream->runtime->trigger_tstamp); #endif } void snd_pcm_sync_stop(struct snd_pcm_substream *substream, bool sync_irq) { if (substream->runtime && substream->runtime->stop_operating) { substream->runtime->stop_operating = false; if (substream->ops && substream->ops->sync_stop) substream->ops->sync_stop(substream); else if (sync_irq && substream->pcm->card->sync_irq > 0) synchronize_irq(substream->pcm->card->sync_irq); } } /** * snd_pcm_hw_params_choose - choose a configuration defined by @params * @pcm: PCM instance * @params: the hw_params instance * * Choose one configuration from configuration space defined by @params. * The configuration chosen is that obtained fixing in this order: * first access, first format, first subformat, min channels, * min rate, min period time, max buffer size, min tick time * * Return: Zero if successful, or a negative error code on failure. */ static int snd_pcm_hw_params_choose(struct snd_pcm_substream *pcm, struct snd_pcm_hw_params *params) { static const int vars[] = { SNDRV_PCM_HW_PARAM_ACCESS, SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SUBFORMAT, SNDRV_PCM_HW_PARAM_CHANNELS, SNDRV_PCM_HW_PARAM_RATE, SNDRV_PCM_HW_PARAM_PERIOD_TIME, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_TICK_TIME, -1 }; const int *v; struct snd_mask old_mask __maybe_unused; struct snd_interval old_interval __maybe_unused; int changed; for (v = vars; *v != -1; v++) { /* Keep old parameter to trace. */ if (trace_hw_mask_param_enabled()) { if (hw_is_mask(*v)) old_mask = *hw_param_mask(params, *v); } if (trace_hw_interval_param_enabled()) { if (hw_is_interval(*v)) old_interval = *hw_param_interval(params, *v); } if (*v != SNDRV_PCM_HW_PARAM_BUFFER_SIZE) changed = snd_pcm_hw_param_first(pcm, params, *v, NULL); else changed = snd_pcm_hw_param_last(pcm, params, *v, NULL); if (changed < 0) return changed; if (changed == 0) continue; /* Trace the changed parameter. */ if (hw_is_mask(*v)) { trace_hw_mask_param(pcm, *v, 0, &old_mask, hw_param_mask(params, *v)); } if (hw_is_interval(*v)) { trace_hw_interval_param(pcm, *v, 0, &old_interval, hw_param_interval(params, *v)); } } return 0; } /* acquire buffer_mutex; if it's in r/w operation, return -EBUSY, otherwise * block the further r/w operations */ static int snd_pcm_buffer_access_lock(struct snd_pcm_runtime *runtime) { if (!atomic_dec_unless_positive(&runtime->buffer_accessing)) return -EBUSY; mutex_lock(&runtime->buffer_mutex); return 0; /* keep buffer_mutex, unlocked by below */ } /* release buffer_mutex and clear r/w access flag */ static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime) { mutex_unlock(&runtime->buffer_mutex); atomic_inc(&runtime->buffer_accessing); } #if IS_ENABLED(CONFIG_SND_PCM_OSS) #define is_oss_stream(substream) ((substream)->oss.oss) #else #define is_oss_stream(substream) false #endif static int snd_pcm_hw_params(struct snd_pcm_substream *substream, struct snd_pcm_hw_params *params) { struct snd_pcm_runtime *runtime; int err, usecs; unsigned int bits; snd_pcm_uframes_t frames; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; err = snd_pcm_buffer_access_lock(runtime); if (err < 0) return err; scoped_guard(pcm_stream_lock_irq, substream) { switch (runtime->state) { case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_PREPARED: if (!is_oss_stream(substream) && atomic_read(&substream->mmap_count)) err = -EBADFD; break; default: err = -EBADFD; break; } } if (err) goto unlock; snd_pcm_sync_stop(substream, true); params->rmask = ~0U; err = snd_pcm_hw_refine(substream, params); if (err < 0) goto _error; err = snd_pcm_hw_params_choose(substream, params); if (err < 0) goto _error; err = fixup_unreferenced_params(substream, params); if (err < 0) goto _error; if (substream->managed_buffer_alloc) { err = snd_pcm_lib_malloc_pages(substream, params_buffer_bytes(params)); if (err < 0) goto _error; runtime->buffer_changed = err > 0; } if (substream->ops->hw_params != NULL) { err = substream->ops->hw_params(substream, params); if (err < 0) goto _error; } runtime->access = params_access(params); runtime->format = params_format(params); runtime->subformat = params_subformat(params); runtime->channels = params_channels(params); runtime->rate = params_rate(params); runtime->period_size = params_period_size(params); runtime->periods = params_periods(params); runtime->buffer_size = params_buffer_size(params); runtime->info = params->info; runtime->rate_num = params->rate_num; runtime->rate_den = params->rate_den; runtime->no_period_wakeup = (params->info & SNDRV_PCM_INFO_NO_PERIOD_WAKEUP) && (params->flags & SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP); bits = snd_pcm_format_physical_width(runtime->format); runtime->sample_bits = bits; bits *= runtime->channels; runtime->frame_bits = bits; frames = 1; while (bits % 8 != 0) { bits *= 2; frames *= 2; } runtime->byte_align = bits / 8; runtime->min_align = frames; /* Default sw params */ runtime->tstamp_mode = SNDRV_PCM_TSTAMP_NONE; runtime->period_step = 1; runtime->control->avail_min = runtime->period_size; runtime->start_threshold = 1; runtime->stop_threshold = runtime->buffer_size; runtime->silence_threshold = 0; runtime->silence_size = 0; runtime->boundary = runtime->buffer_size; while (runtime->boundary * 2 <= LONG_MAX - runtime->buffer_size) runtime->boundary *= 2; /* clear the buffer for avoiding possible kernel info leaks */ if (runtime->dma_area && !substream->ops->copy) { size_t size = runtime->dma_bytes; if (runtime->info & SNDRV_PCM_INFO_MMAP) size = PAGE_ALIGN(size); memset(runtime->dma_area, 0, size); } snd_pcm_timer_resolution_change(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_SETUP); if (cpu_latency_qos_request_active(&substream->latency_pm_qos_req)) cpu_latency_qos_remove_request(&substream->latency_pm_qos_req); usecs = period_to_usecs(runtime); if (usecs >= 0) cpu_latency_qos_add_request(&substream->latency_pm_qos_req, usecs); err = 0; _error: if (err) { /* hardware might be unusable from this time, * so we force application to retry to set * the correct hardware parameter settings */ snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); if (substream->ops->hw_free != NULL) substream->ops->hw_free(substream); if (substream->managed_buffer_alloc) snd_pcm_lib_free_pages(substream); } unlock: snd_pcm_buffer_access_unlock(runtime); return err; } static int snd_pcm_hw_params_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params __user * _params) { struct snd_pcm_hw_params *params __free(kfree) = NULL; int err; params = memdup_user(_params, sizeof(*params)); if (IS_ERR(params)) return PTR_ERR(params); err = snd_pcm_hw_params(substream, params); if (err < 0) return err; if (copy_to_user(_params, params, sizeof(*params))) return -EFAULT; return err; } static int do_hw_free(struct snd_pcm_substream *substream) { int result = 0; snd_pcm_sync_stop(substream, true); if (substream->ops->hw_free) result = substream->ops->hw_free(substream); if (substream->managed_buffer_alloc) snd_pcm_lib_free_pages(substream); return result; } static int snd_pcm_hw_free(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; int result = 0; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; result = snd_pcm_buffer_access_lock(runtime); if (result < 0) return result; scoped_guard(pcm_stream_lock_irq, substream) { switch (runtime->state) { case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_PREPARED: if (atomic_read(&substream->mmap_count)) result = -EBADFD; break; default: result = -EBADFD; break; } } if (result) goto unlock; result = do_hw_free(substream); snd_pcm_set_state(substream, SNDRV_PCM_STATE_OPEN); cpu_latency_qos_remove_request(&substream->latency_pm_qos_req); unlock: snd_pcm_buffer_access_unlock(runtime); return result; } static int snd_pcm_sw_params(struct snd_pcm_substream *substream, struct snd_pcm_sw_params *params) { struct snd_pcm_runtime *runtime; int err; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; scoped_guard(pcm_stream_lock_irq, substream) { if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; } if (params->tstamp_mode < 0 || params->tstamp_mode > SNDRV_PCM_TSTAMP_LAST) return -EINVAL; if (params->proto >= SNDRV_PROTOCOL_VERSION(2, 0, 12) && params->tstamp_type > SNDRV_PCM_TSTAMP_TYPE_LAST) return -EINVAL; if (params->avail_min == 0) return -EINVAL; if (params->silence_size >= runtime->boundary) { if (params->silence_threshold != 0) return -EINVAL; } else { if (params->silence_size > params->silence_threshold) return -EINVAL; if (params->silence_threshold > runtime->buffer_size) return -EINVAL; } err = 0; scoped_guard(pcm_stream_lock_irq, substream) { runtime->tstamp_mode = params->tstamp_mode; if (params->proto >= SNDRV_PROTOCOL_VERSION(2, 0, 12)) runtime->tstamp_type = params->tstamp_type; runtime->period_step = params->period_step; runtime->control->avail_min = params->avail_min; runtime->start_threshold = params->start_threshold; runtime->stop_threshold = params->stop_threshold; runtime->silence_threshold = params->silence_threshold; runtime->silence_size = params->silence_size; params->boundary = runtime->boundary; if (snd_pcm_running(substream)) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, ULONG_MAX); err = snd_pcm_update_state(substream, runtime); } } return err; } static int snd_pcm_sw_params_user(struct snd_pcm_substream *substream, struct snd_pcm_sw_params __user * _params) { struct snd_pcm_sw_params params; int err; if (copy_from_user(¶ms, _params, sizeof(params))) return -EFAULT; err = snd_pcm_sw_params(substream, ¶ms); if (copy_to_user(_params, ¶ms, sizeof(params))) return -EFAULT; return err; } static inline snd_pcm_uframes_t snd_pcm_calc_delay(struct snd_pcm_substream *substream) { snd_pcm_uframes_t delay; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) delay = snd_pcm_playback_hw_avail(substream->runtime); else delay = snd_pcm_capture_avail(substream->runtime); return delay + substream->runtime->delay; } int snd_pcm_status64(struct snd_pcm_substream *substream, struct snd_pcm_status64 *status) { struct snd_pcm_runtime *runtime = substream->runtime; guard(pcm_stream_lock_irq)(substream); snd_pcm_unpack_audio_tstamp_config(status->audio_tstamp_data, &runtime->audio_tstamp_config); /* backwards compatible behavior */ if (runtime->audio_tstamp_config.type_requested == SNDRV_PCM_AUDIO_TSTAMP_TYPE_COMPAT) { if (runtime->hw.info & SNDRV_PCM_INFO_HAS_WALL_CLOCK) runtime->audio_tstamp_config.type_requested = SNDRV_PCM_AUDIO_TSTAMP_TYPE_LINK; else runtime->audio_tstamp_config.type_requested = SNDRV_PCM_AUDIO_TSTAMP_TYPE_DEFAULT; runtime->audio_tstamp_report.valid = 0; } else runtime->audio_tstamp_report.valid = 1; status->state = runtime->state; status->suspended_state = runtime->suspended_state; if (status->state == SNDRV_PCM_STATE_OPEN) return 0; status->trigger_tstamp_sec = runtime->trigger_tstamp.tv_sec; status->trigger_tstamp_nsec = runtime->trigger_tstamp.tv_nsec; if (snd_pcm_running(substream)) { snd_pcm_update_hw_ptr(substream); if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) { status->tstamp_sec = runtime->status->tstamp.tv_sec; status->tstamp_nsec = runtime->status->tstamp.tv_nsec; status->driver_tstamp_sec = runtime->driver_tstamp.tv_sec; status->driver_tstamp_nsec = runtime->driver_tstamp.tv_nsec; status->audio_tstamp_sec = runtime->status->audio_tstamp.tv_sec; status->audio_tstamp_nsec = runtime->status->audio_tstamp.tv_nsec; if (runtime->audio_tstamp_report.valid == 1) /* backwards compatibility, no report provided in COMPAT mode */ snd_pcm_pack_audio_tstamp_report(&status->audio_tstamp_data, &status->audio_tstamp_accuracy, &runtime->audio_tstamp_report); goto _tstamp_end; } } else { /* get tstamp only in fallback mode and only if enabled */ if (runtime->tstamp_mode == SNDRV_PCM_TSTAMP_ENABLE) { struct timespec64 tstamp; snd_pcm_gettime(runtime, &tstamp); status->tstamp_sec = tstamp.tv_sec; status->tstamp_nsec = tstamp.tv_nsec; } } _tstamp_end: status->appl_ptr = runtime->control->appl_ptr; status->hw_ptr = runtime->status->hw_ptr; status->avail = snd_pcm_avail(substream); status->delay = snd_pcm_running(substream) ? snd_pcm_calc_delay(substream) : 0; status->avail_max = runtime->avail_max; status->overrange = runtime->overrange; runtime->avail_max = 0; runtime->overrange = 0; return 0; } static int snd_pcm_status_user64(struct snd_pcm_substream *substream, struct snd_pcm_status64 __user * _status, bool ext) { struct snd_pcm_status64 status; int res; memset(&status, 0, sizeof(status)); /* * with extension, parameters are read/write, * get audio_tstamp_data from user, * ignore rest of status structure */ if (ext && get_user(status.audio_tstamp_data, (u32 __user *)(&_status->audio_tstamp_data))) return -EFAULT; res = snd_pcm_status64(substream, &status); if (res < 0) return res; if (copy_to_user(_status, &status, sizeof(status))) return -EFAULT; return 0; } static int snd_pcm_status_user32(struct snd_pcm_substream *substream, struct snd_pcm_status32 __user * _status, bool ext) { struct snd_pcm_status64 status64; struct snd_pcm_status32 status32; int res; memset(&status64, 0, sizeof(status64)); memset(&status32, 0, sizeof(status32)); /* * with extension, parameters are read/write, * get audio_tstamp_data from user, * ignore rest of status structure */ if (ext && get_user(status64.audio_tstamp_data, (u32 __user *)(&_status->audio_tstamp_data))) return -EFAULT; res = snd_pcm_status64(substream, &status64); if (res < 0) return res; status32 = (struct snd_pcm_status32) { .state = status64.state, .trigger_tstamp_sec = status64.trigger_tstamp_sec, .trigger_tstamp_nsec = status64.trigger_tstamp_nsec, .tstamp_sec = status64.tstamp_sec, .tstamp_nsec = status64.tstamp_nsec, .appl_ptr = status64.appl_ptr, .hw_ptr = status64.hw_ptr, .delay = status64.delay, .avail = status64.avail, .avail_max = status64.avail_max, .overrange = status64.overrange, .suspended_state = status64.suspended_state, .audio_tstamp_data = status64.audio_tstamp_data, .audio_tstamp_sec = status64.audio_tstamp_sec, .audio_tstamp_nsec = status64.audio_tstamp_nsec, .driver_tstamp_sec = status64.audio_tstamp_sec, .driver_tstamp_nsec = status64.audio_tstamp_nsec, .audio_tstamp_accuracy = status64.audio_tstamp_accuracy, }; if (copy_to_user(_status, &status32, sizeof(status32))) return -EFAULT; return 0; } static int snd_pcm_channel_info(struct snd_pcm_substream *substream, struct snd_pcm_channel_info * info) { struct snd_pcm_runtime *runtime; unsigned int channel; channel = info->channel; runtime = substream->runtime; scoped_guard(pcm_stream_lock_irq, substream) { if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; } if (channel >= runtime->channels) return -EINVAL; memset(info, 0, sizeof(*info)); info->channel = channel; return snd_pcm_ops_ioctl(substream, SNDRV_PCM_IOCTL1_CHANNEL_INFO, info); } static int snd_pcm_channel_info_user(struct snd_pcm_substream *substream, struct snd_pcm_channel_info __user * _info) { struct snd_pcm_channel_info info; int res; if (copy_from_user(&info, _info, sizeof(info))) return -EFAULT; res = snd_pcm_channel_info(substream, &info); if (res < 0) return res; if (copy_to_user(_info, &info, sizeof(info))) return -EFAULT; return 0; } static void snd_pcm_trigger_tstamp(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->trigger_master == NULL) return; if (runtime->trigger_master == substream) { if (!runtime->trigger_tstamp_latched) snd_pcm_gettime(runtime, &runtime->trigger_tstamp); } else { snd_pcm_trigger_tstamp(runtime->trigger_master); runtime->trigger_tstamp = runtime->trigger_master->runtime->trigger_tstamp; } runtime->trigger_master = NULL; } #define ACTION_ARG_IGNORE (__force snd_pcm_state_t)0 struct action_ops { int (*pre_action)(struct snd_pcm_substream *substream, snd_pcm_state_t state); int (*do_action)(struct snd_pcm_substream *substream, snd_pcm_state_t state); void (*undo_action)(struct snd_pcm_substream *substream, snd_pcm_state_t state); void (*post_action)(struct snd_pcm_substream *substream, snd_pcm_state_t state); }; /* * this functions is core for handling of linked stream * Note: the stream state might be changed also on failure * Note2: call with calling stream lock + link lock */ static int snd_pcm_action_group(const struct action_ops *ops, struct snd_pcm_substream *substream, snd_pcm_state_t state, bool stream_lock) { struct snd_pcm_substream *s = NULL; struct snd_pcm_substream *s1; int res = 0, depth = 1; snd_pcm_group_for_each_entry(s, substream) { if (s != substream) { if (!stream_lock) mutex_lock_nested(&s->runtime->buffer_mutex, depth); else if (s->pcm->nonatomic) mutex_lock_nested(&s->self_group.mutex, depth); else spin_lock_nested(&s->self_group.lock, depth); depth++; } res = ops->pre_action(s, state); if (res < 0) goto _unlock; } snd_pcm_group_for_each_entry(s, substream) { res = ops->do_action(s, state); if (res < 0) { if (ops->undo_action) { snd_pcm_group_for_each_entry(s1, substream) { if (s1 == s) /* failed stream */ break; ops->undo_action(s1, state); } } s = NULL; /* unlock all */ goto _unlock; } } snd_pcm_group_for_each_entry(s, substream) { ops->post_action(s, state); } _unlock: /* unlock streams */ snd_pcm_group_for_each_entry(s1, substream) { if (s1 != substream) { if (!stream_lock) mutex_unlock(&s1->runtime->buffer_mutex); else if (s1->pcm->nonatomic) mutex_unlock(&s1->self_group.mutex); else spin_unlock(&s1->self_group.lock); } if (s1 == s) /* end */ break; } return res; } /* * Note: call with stream lock */ static int snd_pcm_action_single(const struct action_ops *ops, struct snd_pcm_substream *substream, snd_pcm_state_t state) { int res; res = ops->pre_action(substream, state); if (res < 0) return res; res = ops->do_action(substream, state); if (res == 0) ops->post_action(substream, state); else if (ops->undo_action) ops->undo_action(substream, state); return res; } static void snd_pcm_group_assign(struct snd_pcm_substream *substream, struct snd_pcm_group *new_group) { substream->group = new_group; list_move(&substream->link_list, &new_group->substreams); } /* * Unref and unlock the group, but keep the stream lock; * when the group becomes empty and no longer referred, destroy itself */ static void snd_pcm_group_unref(struct snd_pcm_group *group, struct snd_pcm_substream *substream) { bool do_free; if (!group) return; do_free = refcount_dec_and_test(&group->refs); snd_pcm_group_unlock(group, substream->pcm->nonatomic); if (do_free) kfree(group); } /* * Lock the group inside a stream lock and reference it; * return the locked group object, or NULL if not linked */ static struct snd_pcm_group * snd_pcm_stream_group_ref(struct snd_pcm_substream *substream) { bool nonatomic = substream->pcm->nonatomic; struct snd_pcm_group *group; bool trylock; for (;;) { if (!snd_pcm_stream_linked(substream)) return NULL; group = substream->group; /* block freeing the group object */ refcount_inc(&group->refs); trylock = nonatomic ? mutex_trylock(&group->mutex) : spin_trylock(&group->lock); if (trylock) break; /* OK */ /* re-lock for avoiding ABBA deadlock */ snd_pcm_stream_unlock(substream); snd_pcm_group_lock(group, nonatomic); snd_pcm_stream_lock(substream); /* check the group again; the above opens a small race window */ if (substream->group == group) break; /* OK */ /* group changed, try again */ snd_pcm_group_unref(group, substream); } return group; } /* * Note: call with stream lock */ static int snd_pcm_action(const struct action_ops *ops, struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_group *group; int res; group = snd_pcm_stream_group_ref(substream); if (group) res = snd_pcm_action_group(ops, substream, state, true); else res = snd_pcm_action_single(ops, substream, state); snd_pcm_group_unref(group, substream); return res; } /* * Note: don't use any locks before */ static int snd_pcm_action_lock_irq(const struct action_ops *ops, struct snd_pcm_substream *substream, snd_pcm_state_t state) { guard(pcm_stream_lock_irq)(substream); return snd_pcm_action(ops, substream, state); } /* */ static int snd_pcm_action_nonatomic(const struct action_ops *ops, struct snd_pcm_substream *substream, snd_pcm_state_t state) { int res; /* Guarantee the group members won't change during non-atomic action */ guard(rwsem_read)(&snd_pcm_link_rwsem); res = snd_pcm_buffer_access_lock(substream->runtime); if (res < 0) return res; if (snd_pcm_stream_linked(substream)) res = snd_pcm_action_group(ops, substream, state, false); else res = snd_pcm_action_single(ops, substream, state); snd_pcm_buffer_access_unlock(substream->runtime); return res; } /* * start callbacks */ static int snd_pcm_pre_start(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->state != SNDRV_PCM_STATE_PREPARED) return -EBADFD; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && !snd_pcm_playback_data(substream)) return -EPIPE; runtime->trigger_tstamp_latched = false; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_start(struct snd_pcm_substream *substream, snd_pcm_state_t state) { int err; if (substream->runtime->trigger_master != substream) return 0; err = substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_START); /* XRUN happened during the start */ if (err == -EPIPE) __snd_pcm_set_state(substream->runtime, SNDRV_PCM_STATE_XRUN); return err; } static void snd_pcm_undo_start(struct snd_pcm_substream *substream, snd_pcm_state_t state) { if (substream->runtime->trigger_master == substream) { substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_STOP); substream->runtime->stop_operating = true; } } static void snd_pcm_post_start(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); runtime->hw_ptr_jiffies = jiffies; runtime->hw_ptr_buffer_jiffies = (runtime->buffer_size * HZ) / runtime->rate; __snd_pcm_set_state(runtime, state); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, ULONG_MAX); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTART); } static const struct action_ops snd_pcm_action_start = { .pre_action = snd_pcm_pre_start, .do_action = snd_pcm_do_start, .undo_action = snd_pcm_undo_start, .post_action = snd_pcm_post_start }; /** * snd_pcm_start - start all linked streams * @substream: the PCM substream instance * * Return: Zero if successful, or a negative error code. * The stream lock must be acquired before calling this function. */ int snd_pcm_start(struct snd_pcm_substream *substream) { return snd_pcm_action(&snd_pcm_action_start, substream, SNDRV_PCM_STATE_RUNNING); } /* take the stream lock and start the streams */ static int snd_pcm_start_lock_irq(struct snd_pcm_substream *substream) { return snd_pcm_action_lock_irq(&snd_pcm_action_start, substream, SNDRV_PCM_STATE_RUNNING); } /* * stop callbacks */ static int snd_pcm_pre_stop(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_stop(struct snd_pcm_substream *substream, snd_pcm_state_t state) { if (substream->runtime->trigger_master == substream && snd_pcm_running(substream)) { substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_STOP); substream->runtime->stop_operating = true; } return 0; /* unconditionally stop all substreams */ } static void snd_pcm_post_stop(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->state != state) { snd_pcm_trigger_tstamp(substream); __snd_pcm_set_state(runtime, state); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSTOP); } wake_up(&runtime->sleep); wake_up(&runtime->tsleep); } static const struct action_ops snd_pcm_action_stop = { .pre_action = snd_pcm_pre_stop, .do_action = snd_pcm_do_stop, .post_action = snd_pcm_post_stop }; /** * snd_pcm_stop - try to stop all running streams in the substream group * @substream: the PCM substream instance * @state: PCM state after stopping the stream * * The state of each stream is then changed to the given state unconditionally. * * Return: Zero if successful, or a negative error code. */ int snd_pcm_stop(struct snd_pcm_substream *substream, snd_pcm_state_t state) { return snd_pcm_action(&snd_pcm_action_stop, substream, state); } EXPORT_SYMBOL(snd_pcm_stop); /** * snd_pcm_drain_done - stop the DMA only when the given stream is playback * @substream: the PCM substream * * After stopping, the state is changed to SETUP. * Unlike snd_pcm_stop(), this affects only the given stream. * * Return: Zero if successful, or a negative error code. */ int snd_pcm_drain_done(struct snd_pcm_substream *substream) { return snd_pcm_action_single(&snd_pcm_action_stop, substream, SNDRV_PCM_STATE_SETUP); } /** * snd_pcm_stop_xrun - stop the running streams as XRUN * @substream: the PCM substream instance * * This stops the given running substream (and all linked substreams) as XRUN. * Unlike snd_pcm_stop(), this function takes the substream lock by itself. * * Return: Zero if successful, or a negative error code. */ int snd_pcm_stop_xrun(struct snd_pcm_substream *substream) { guard(pcm_stream_lock_irqsave)(substream); if (substream->runtime && snd_pcm_running(substream)) __snd_pcm_xrun(substream); return 0; } EXPORT_SYMBOL_GPL(snd_pcm_stop_xrun); /* * pause callbacks: pass boolean (to start pause or resume) as state argument */ #define pause_pushed(state) (__force bool)(state) static int snd_pcm_pre_pause(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (!(runtime->info & SNDRV_PCM_INFO_PAUSE)) return -ENOSYS; if (pause_pushed(state)) { if (runtime->state != SNDRV_PCM_STATE_RUNNING) return -EBADFD; } else if (runtime->state != SNDRV_PCM_STATE_PAUSED) return -EBADFD; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_pause(struct snd_pcm_substream *substream, snd_pcm_state_t state) { if (substream->runtime->trigger_master != substream) return 0; /* The jiffies check in snd_pcm_update_hw_ptr*() is done by * a delta between the current jiffies, this gives a large enough * delta, effectively to skip the check once. */ substream->runtime->hw_ptr_jiffies = jiffies - HZ * 1000; return substream->ops->trigger(substream, pause_pushed(state) ? SNDRV_PCM_TRIGGER_PAUSE_PUSH : SNDRV_PCM_TRIGGER_PAUSE_RELEASE); } static void snd_pcm_undo_pause(struct snd_pcm_substream *substream, snd_pcm_state_t state) { if (substream->runtime->trigger_master == substream) substream->ops->trigger(substream, pause_pushed(state) ? SNDRV_PCM_TRIGGER_PAUSE_RELEASE : SNDRV_PCM_TRIGGER_PAUSE_PUSH); } static void snd_pcm_post_pause(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); if (pause_pushed(state)) { __snd_pcm_set_state(runtime, SNDRV_PCM_STATE_PAUSED); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MPAUSE); wake_up(&runtime->sleep); wake_up(&runtime->tsleep); } else { __snd_pcm_set_state(runtime, SNDRV_PCM_STATE_RUNNING); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MCONTINUE); } } static const struct action_ops snd_pcm_action_pause = { .pre_action = snd_pcm_pre_pause, .do_action = snd_pcm_do_pause, .undo_action = snd_pcm_undo_pause, .post_action = snd_pcm_post_pause }; /* * Push/release the pause for all linked streams. */ static int snd_pcm_pause(struct snd_pcm_substream *substream, bool push) { return snd_pcm_action(&snd_pcm_action_pause, substream, (__force snd_pcm_state_t)push); } static int snd_pcm_pause_lock_irq(struct snd_pcm_substream *substream, bool push) { return snd_pcm_action_lock_irq(&snd_pcm_action_pause, substream, (__force snd_pcm_state_t)push); } #ifdef CONFIG_PM /* suspend callback: state argument ignored */ static int snd_pcm_pre_suspend(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; switch (runtime->state) { case SNDRV_PCM_STATE_SUSPENDED: return -EBUSY; /* unresumable PCM state; return -EBUSY for skipping suspend */ case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_SETUP: case SNDRV_PCM_STATE_DISCONNECTED: return -EBUSY; } runtime->trigger_master = substream; return 0; } static int snd_pcm_do_suspend(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->trigger_master != substream) return 0; if (! snd_pcm_running(substream)) return 0; substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_SUSPEND); runtime->stop_operating = true; return 0; /* suspend unconditionally */ } static void snd_pcm_post_suspend(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); runtime->suspended_state = runtime->state; runtime->status->suspended_state = runtime->suspended_state; __snd_pcm_set_state(runtime, SNDRV_PCM_STATE_SUSPENDED); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MSUSPEND); wake_up(&runtime->sleep); wake_up(&runtime->tsleep); } static const struct action_ops snd_pcm_action_suspend = { .pre_action = snd_pcm_pre_suspend, .do_action = snd_pcm_do_suspend, .post_action = snd_pcm_post_suspend }; /* * snd_pcm_suspend - trigger SUSPEND to all linked streams * @substream: the PCM substream * * After this call, all streams are changed to SUSPENDED state. * * Return: Zero if successful, or a negative error code. */ static int snd_pcm_suspend(struct snd_pcm_substream *substream) { guard(pcm_stream_lock_irqsave)(substream); return snd_pcm_action(&snd_pcm_action_suspend, substream, ACTION_ARG_IGNORE); } /** * snd_pcm_suspend_all - trigger SUSPEND to all substreams in the given pcm * @pcm: the PCM instance * * After this call, all streams are changed to SUSPENDED state. * * Return: Zero if successful (or @pcm is %NULL), or a negative error code. */ int snd_pcm_suspend_all(struct snd_pcm *pcm) { struct snd_pcm_substream *substream; int stream, err = 0; if (! pcm) return 0; for_each_pcm_substream(pcm, stream, substream) { /* FIXME: the open/close code should lock this as well */ if (!substream->runtime) continue; /* * Skip BE dai link PCM's that are internal and may * not have their substream ops set. */ if (!substream->ops) continue; err = snd_pcm_suspend(substream); if (err < 0 && err != -EBUSY) return err; } for_each_pcm_substream(pcm, stream, substream) snd_pcm_sync_stop(substream, false); return 0; } EXPORT_SYMBOL(snd_pcm_suspend_all); /* resume callbacks: state argument ignored */ static int snd_pcm_pre_resume(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->state != SNDRV_PCM_STATE_SUSPENDED) return -EBADFD; if (!(runtime->info & SNDRV_PCM_INFO_RESUME)) return -ENOSYS; runtime->trigger_master = substream; return 0; } static int snd_pcm_do_resume(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (runtime->trigger_master != substream) return 0; /* DMA not running previously? */ if (runtime->suspended_state != SNDRV_PCM_STATE_RUNNING && (runtime->suspended_state != SNDRV_PCM_STATE_DRAINING || substream->stream != SNDRV_PCM_STREAM_PLAYBACK)) return 0; return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_RESUME); } static void snd_pcm_undo_resume(struct snd_pcm_substream *substream, snd_pcm_state_t state) { if (substream->runtime->trigger_master == substream && snd_pcm_running(substream)) substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_SUSPEND); } static void snd_pcm_post_resume(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_trigger_tstamp(substream); __snd_pcm_set_state(runtime, runtime->suspended_state); snd_pcm_timer_notify(substream, SNDRV_TIMER_EVENT_MRESUME); } static const struct action_ops snd_pcm_action_resume = { .pre_action = snd_pcm_pre_resume, .do_action = snd_pcm_do_resume, .undo_action = snd_pcm_undo_resume, .post_action = snd_pcm_post_resume }; static int snd_pcm_resume(struct snd_pcm_substream *substream) { return snd_pcm_action_lock_irq(&snd_pcm_action_resume, substream, ACTION_ARG_IGNORE); } #else static int snd_pcm_resume(struct snd_pcm_substream *substream) { return -ENOSYS; } #endif /* CONFIG_PM */ /* * xrun ioctl * * Change the RUNNING stream(s) to XRUN state. */ static int snd_pcm_xrun(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; guard(pcm_stream_lock_irq)(substream); switch (runtime->state) { case SNDRV_PCM_STATE_XRUN: return 0; /* already there */ case SNDRV_PCM_STATE_RUNNING: __snd_pcm_xrun(substream); return 0; default: return -EBADFD; } } /* * reset ioctl */ /* reset callbacks: state argument ignored */ static int snd_pcm_pre_reset(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; switch (runtime->state) { case SNDRV_PCM_STATE_RUNNING: case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_PAUSED: case SNDRV_PCM_STATE_SUSPENDED: return 0; default: return -EBADFD; } } static int snd_pcm_do_reset(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; int err = snd_pcm_ops_ioctl(substream, SNDRV_PCM_IOCTL1_RESET, NULL); if (err < 0) return err; guard(pcm_stream_lock_irq)(substream); runtime->hw_ptr_base = 0; runtime->hw_ptr_interrupt = runtime->status->hw_ptr - runtime->status->hw_ptr % runtime->period_size; runtime->silence_start = runtime->status->hw_ptr; runtime->silence_filled = 0; return 0; } static void snd_pcm_post_reset(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; guard(pcm_stream_lock_irq)(substream); runtime->control->appl_ptr = runtime->status->hw_ptr; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK && runtime->silence_size > 0) snd_pcm_playback_silence(substream, ULONG_MAX); } static const struct action_ops snd_pcm_action_reset = { .pre_action = snd_pcm_pre_reset, .do_action = snd_pcm_do_reset, .post_action = snd_pcm_post_reset }; static int snd_pcm_reset(struct snd_pcm_substream *substream) { return snd_pcm_action_nonatomic(&snd_pcm_action_reset, substream, ACTION_ARG_IGNORE); } /* * prepare ioctl */ /* pass f_flags as state argument */ static int snd_pcm_pre_prepare(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; int f_flags = (__force int)state; if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; if (snd_pcm_running(substream)) return -EBUSY; substream->f_flags = f_flags; return 0; } static int snd_pcm_do_prepare(struct snd_pcm_substream *substream, snd_pcm_state_t state) { int err; snd_pcm_sync_stop(substream, true); err = substream->ops->prepare(substream); if (err < 0) return err; return snd_pcm_do_reset(substream, state); } static void snd_pcm_post_prepare(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; runtime->control->appl_ptr = runtime->status->hw_ptr; snd_pcm_set_state(substream, SNDRV_PCM_STATE_PREPARED); } static const struct action_ops snd_pcm_action_prepare = { .pre_action = snd_pcm_pre_prepare, .do_action = snd_pcm_do_prepare, .post_action = snd_pcm_post_prepare }; /** * snd_pcm_prepare - prepare the PCM substream to be triggerable * @substream: the PCM substream instance * @file: file to refer f_flags * * Return: Zero if successful, or a negative error code. */ static int snd_pcm_prepare(struct snd_pcm_substream *substream, struct file *file) { int f_flags; if (file) f_flags = file->f_flags; else f_flags = substream->f_flags; scoped_guard(pcm_stream_lock_irq, substream) { switch (substream->runtime->state) { case SNDRV_PCM_STATE_PAUSED: snd_pcm_pause(substream, false); fallthrough; case SNDRV_PCM_STATE_SUSPENDED: snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); break; } } return snd_pcm_action_nonatomic(&snd_pcm_action_prepare, substream, (__force snd_pcm_state_t)f_flags); } /* * drain ioctl */ /* drain init callbacks: state argument ignored */ static int snd_pcm_pre_drain_init(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; switch (runtime->state) { case SNDRV_PCM_STATE_OPEN: case SNDRV_PCM_STATE_DISCONNECTED: case SNDRV_PCM_STATE_SUSPENDED: return -EBADFD; } runtime->trigger_master = substream; return 0; } static int snd_pcm_do_drain_init(struct snd_pcm_substream *substream, snd_pcm_state_t state) { struct snd_pcm_runtime *runtime = substream->runtime; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { switch (runtime->state) { case SNDRV_PCM_STATE_PREPARED: /* start playback stream if possible */ if (! snd_pcm_playback_empty(substream)) { snd_pcm_do_start(substream, SNDRV_PCM_STATE_DRAINING); snd_pcm_post_start(substream, SNDRV_PCM_STATE_DRAINING); } else { __snd_pcm_set_state(runtime, SNDRV_PCM_STATE_SETUP); } break; case SNDRV_PCM_STATE_RUNNING: __snd_pcm_set_state(runtime, SNDRV_PCM_STATE_DRAINING); break; case SNDRV_PCM_STATE_XRUN: __snd_pcm_set_state(runtime, SNDRV_PCM_STATE_SETUP); break; default: break; } } else { /* stop running stream */ if (runtime->state == SNDRV_PCM_STATE_RUNNING) { snd_pcm_state_t new_state; new_state = snd_pcm_capture_avail(runtime) > 0 ? SNDRV_PCM_STATE_DRAINING : SNDRV_PCM_STATE_SETUP; snd_pcm_do_stop(substream, new_state); snd_pcm_post_stop(substream, new_state); } } if (runtime->state == SNDRV_PCM_STATE_DRAINING && runtime->trigger_master == substream && (runtime->hw.info & SNDRV_PCM_INFO_DRAIN_TRIGGER)) return substream->ops->trigger(substream, SNDRV_PCM_TRIGGER_DRAIN); return 0; } static void snd_pcm_post_drain_init(struct snd_pcm_substream *substream, snd_pcm_state_t state) { } static const struct action_ops snd_pcm_action_drain_init = { .pre_action = snd_pcm_pre_drain_init, .do_action = snd_pcm_do_drain_init, .post_action = snd_pcm_post_drain_init }; /* * Drain the stream(s). * When the substream is linked, sync until the draining of all playback streams * is finished. * After this call, all streams are supposed to be either SETUP or DRAINING * (capture only) state. */ static int snd_pcm_drain(struct snd_pcm_substream *substream, struct file *file) { struct snd_card *card; struct snd_pcm_runtime *runtime; struct snd_pcm_substream *s; struct snd_pcm_group *group; wait_queue_entry_t wait; int result = 0; int nonblock = 0; card = substream->pcm->card; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (file) { if (file->f_flags & O_NONBLOCK) nonblock = 1; } else if (substream->f_flags & O_NONBLOCK) nonblock = 1; snd_pcm_stream_lock_irq(substream); /* resume pause */ if (runtime->state == SNDRV_PCM_STATE_PAUSED) snd_pcm_pause(substream, false); /* pre-start/stop - all running streams are changed to DRAINING state */ result = snd_pcm_action(&snd_pcm_action_drain_init, substream, ACTION_ARG_IGNORE); if (result < 0) goto unlock; /* in non-blocking, we don't wait in ioctl but let caller poll */ if (nonblock) { result = -EAGAIN; goto unlock; } for (;;) { long tout; struct snd_pcm_runtime *to_check; if (signal_pending(current)) { result = -ERESTARTSYS; break; } /* find a substream to drain */ to_check = NULL; group = snd_pcm_stream_group_ref(substream); snd_pcm_group_for_each_entry(s, substream) { if (s->stream != SNDRV_PCM_STREAM_PLAYBACK) continue; runtime = s->runtime; if (runtime->state == SNDRV_PCM_STATE_DRAINING) { to_check = runtime; break; } } snd_pcm_group_unref(group, substream); if (!to_check) break; /* all drained */ init_waitqueue_entry(&wait, current); set_current_state(TASK_INTERRUPTIBLE); add_wait_queue(&to_check->sleep, &wait); snd_pcm_stream_unlock_irq(substream); if (runtime->no_period_wakeup) tout = MAX_SCHEDULE_TIMEOUT; else { tout = 100; if (runtime->rate) { long t = runtime->buffer_size * 1100 / runtime->rate; tout = max(t, tout); } tout = msecs_to_jiffies(tout); } tout = schedule_timeout(tout); snd_pcm_stream_lock_irq(substream); group = snd_pcm_stream_group_ref(substream); snd_pcm_group_for_each_entry(s, substream) { if (s->runtime == to_check) { remove_wait_queue(&to_check->sleep, &wait); break; } } snd_pcm_group_unref(group, substream); if (card->shutdown) { result = -ENODEV; break; } if (tout == 0) { if (substream->runtime->state == SNDRV_PCM_STATE_SUSPENDED) result = -ESTRPIPE; else { dev_dbg(substream->pcm->card->dev, "playback drain timeout (DMA or IRQ trouble?)\n"); snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); result = -EIO; } break; } } unlock: snd_pcm_stream_unlock_irq(substream); return result; } /* * drop ioctl * * Immediately put all linked substreams into SETUP state. */ static int snd_pcm_drop(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime; int result = 0; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; guard(pcm_stream_lock_irq)(substream); /* resume pause */ if (runtime->state == SNDRV_PCM_STATE_PAUSED) snd_pcm_pause(substream, false); snd_pcm_stop(substream, SNDRV_PCM_STATE_SETUP); /* runtime->control->appl_ptr = runtime->status->hw_ptr; */ return result; } static bool is_pcm_file(struct file *file) { struct inode *inode = file_inode(file); struct snd_pcm *pcm; unsigned int minor; if (!S_ISCHR(inode->i_mode) || imajor(inode) != snd_major) return false; minor = iminor(inode); pcm = snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_PLAYBACK); if (!pcm) pcm = snd_lookup_minor_data(minor, SNDRV_DEVICE_TYPE_PCM_CAPTURE); if (!pcm) return false; snd_card_unref(pcm->card); return true; } /* * PCM link handling */ static int snd_pcm_link(struct snd_pcm_substream *substream, int fd) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream1; struct snd_pcm_group *group __free(kfree) = NULL; struct snd_pcm_group *target_group; bool nonatomic = substream->pcm->nonatomic; CLASS(fd, f)(fd); if (fd_empty(f)) return -EBADFD; if (!is_pcm_file(fd_file(f))) return -EBADFD; pcm_file = fd_file(f)->private_data; substream1 = pcm_file->substream; if (substream == substream1) return -EINVAL; group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) return -ENOMEM; snd_pcm_group_init(group); guard(rwsem_write)(&snd_pcm_link_rwsem); if (substream->runtime->state == SNDRV_PCM_STATE_OPEN || substream->runtime->state != substream1->runtime->state || substream->pcm->nonatomic != substream1->pcm->nonatomic) return -EBADFD; if (snd_pcm_stream_linked(substream1)) return -EALREADY; scoped_guard(pcm_stream_lock_irq, substream) { if (!snd_pcm_stream_linked(substream)) { snd_pcm_group_assign(substream, group); group = NULL; /* assigned, don't free this one below */ } target_group = substream->group; } snd_pcm_group_lock_irq(target_group, nonatomic); snd_pcm_stream_lock_nested(substream1); snd_pcm_group_assign(substream1, target_group); refcount_inc(&target_group->refs); snd_pcm_stream_unlock(substream1); snd_pcm_group_unlock_irq(target_group, nonatomic); return 0; } static void relink_to_local(struct snd_pcm_substream *substream) { snd_pcm_stream_lock_nested(substream); snd_pcm_group_assign(substream, &substream->self_group); snd_pcm_stream_unlock(substream); } static int snd_pcm_unlink(struct snd_pcm_substream *substream) { struct snd_pcm_group *group; bool nonatomic = substream->pcm->nonatomic; bool do_free = false; guard(rwsem_write)(&snd_pcm_link_rwsem); if (!snd_pcm_stream_linked(substream)) return -EALREADY; group = substream->group; snd_pcm_group_lock_irq(group, nonatomic); relink_to_local(substream); refcount_dec(&group->refs); /* detach the last stream, too */ if (list_is_singular(&group->substreams)) { relink_to_local(list_first_entry(&group->substreams, struct snd_pcm_substream, link_list)); do_free = refcount_dec_and_test(&group->refs); } snd_pcm_group_unlock_irq(group, nonatomic); if (do_free) kfree(group); return 0; } /* * hw configurator */ static int snd_pcm_hw_rule_mul(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_mul(hw_param_interval_c(params, rule->deps[0]), hw_param_interval_c(params, rule->deps[1]), &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_div(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_div(hw_param_interval_c(params, rule->deps[0]), hw_param_interval_c(params, rule->deps[1]), &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_muldivk(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_muldivk(hw_param_interval_c(params, rule->deps[0]), hw_param_interval_c(params, rule->deps[1]), (unsigned long) rule->private, &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_mulkdiv(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_interval_mulkdiv(hw_param_interval_c(params, rule->deps[0]), (unsigned long) rule->private, hw_param_interval_c(params, rule->deps[1]), &t); return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_format(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { snd_pcm_format_t k; const struct snd_interval *i = hw_param_interval_c(params, rule->deps[0]); struct snd_mask m; struct snd_mask *mask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); snd_mask_any(&m); pcm_for_each_format(k) { int bits; if (!snd_mask_test_format(mask, k)) continue; bits = snd_pcm_format_physical_width(k); if (bits <= 0) continue; /* ignore invalid formats */ if ((unsigned)bits < i->min || (unsigned)bits > i->max) snd_mask_reset(&m, (__force unsigned)k); } return snd_mask_refine(mask, &m); } static int snd_pcm_hw_rule_sample_bits(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; snd_pcm_format_t k; t.min = UINT_MAX; t.max = 0; t.openmin = 0; t.openmax = 0; pcm_for_each_format(k) { int bits; if (!snd_mask_test_format(hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT), k)) continue; bits = snd_pcm_format_physical_width(k); if (bits <= 0) continue; /* ignore invalid formats */ if (t.min > (unsigned)bits) t.min = bits; if (t.max < (unsigned)bits) t.max = bits; } t.integer = 1; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } #if SNDRV_PCM_RATE_5512 != 1 << 0 || SNDRV_PCM_RATE_192000 != 1 << 12 ||\ SNDRV_PCM_RATE_128000 != 1 << 19 #error "Change this table" #endif /* NOTE: the list is unsorted! */ static const unsigned int rates[] = { 5512, 8000, 11025, 16000, 22050, 32000, 44100, 48000, 64000, 88200, 96000, 176400, 192000, 352800, 384000, 705600, 768000, /* extended */ 12000, 24000, 128000 }; const struct snd_pcm_hw_constraint_list snd_pcm_known_rates = { .count = ARRAY_SIZE(rates), .list = rates, }; static int snd_pcm_hw_rule_rate(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_pcm_hardware *hw = rule->private; return snd_interval_list(hw_param_interval(params, rule->var), snd_pcm_known_rates.count, snd_pcm_known_rates.list, hw->rates); } static int snd_pcm_hw_rule_buffer_bytes_max(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_interval t; struct snd_pcm_substream *substream = rule->private; t.min = 0; t.max = substream->buffer_bytes_max; t.openmin = 0; t.openmax = 0; t.integer = 1; return snd_interval_refine(hw_param_interval(params, rule->var), &t); } static int snd_pcm_hw_rule_subformats(struct snd_pcm_hw_params *params, struct snd_pcm_hw_rule *rule) { struct snd_mask *sfmask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_SUBFORMAT); struct snd_mask *fmask = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); u32 *subformats = rule->private; snd_pcm_format_t f; struct snd_mask m; snd_mask_none(&m); /* All PCMs support at least the default STD subformat. */ snd_mask_set(&m, (__force unsigned)SNDRV_PCM_SUBFORMAT_STD); pcm_for_each_format(f) { if (!snd_mask_test(fmask, (__force unsigned)f)) continue; if (f == SNDRV_PCM_FORMAT_S32_LE && *subformats) m.bits[0] |= *subformats; else if (snd_pcm_format_linear(f)) snd_mask_set(&m, (__force unsigned)SNDRV_PCM_SUBFORMAT_MSBITS_MAX); } return snd_mask_refine(sfmask, &m); } static int snd_pcm_hw_constraint_subformats(struct snd_pcm_runtime *runtime, unsigned int cond, u32 *subformats) { return snd_pcm_hw_rule_add(runtime, cond, -1, snd_pcm_hw_rule_subformats, (void *)subformats, SNDRV_PCM_HW_PARAM_SUBFORMAT, SNDRV_PCM_HW_PARAM_FORMAT, -1); } static int snd_pcm_hw_constraints_init(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hw_constraints *constrs = &runtime->hw_constraints; int k, err; for (k = SNDRV_PCM_HW_PARAM_FIRST_MASK; k <= SNDRV_PCM_HW_PARAM_LAST_MASK; k++) { snd_mask_any(constrs_mask(constrs, k)); } for (k = SNDRV_PCM_HW_PARAM_FIRST_INTERVAL; k <= SNDRV_PCM_HW_PARAM_LAST_INTERVAL; k++) { snd_interval_any(constrs_interval(constrs, k)); } snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_CHANNELS)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_BUFFER_SIZE)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_BUFFER_BYTES)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_SAMPLE_BITS)); snd_interval_setinteger(constrs_interval(constrs, SNDRV_PCM_HW_PARAM_FRAME_BITS)); err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FORMAT, snd_pcm_hw_rule_format, NULL, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, snd_pcm_hw_rule_sample_bits, NULL, SNDRV_PCM_HW_PARAM_FORMAT, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_FRAME_BITS, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, snd_pcm_hw_rule_mul, NULL, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, SNDRV_PCM_HW_PARAM_CHANNELS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_FRAME_BITS, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_CHANNELS, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_FRAME_BITS, SNDRV_PCM_HW_PARAM_SAMPLE_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_PERIOD_TIME, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_BUFFER_TIME, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIODS, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, snd_pcm_hw_rule_div, NULL, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_PERIODS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, snd_pcm_hw_rule_muldivk, (void*) 1000000, SNDRV_PCM_HW_PARAM_PERIOD_TIME, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, snd_pcm_hw_rule_mul, NULL, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_PERIODS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, snd_pcm_hw_rule_mulkdiv, (void*) 8, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, snd_pcm_hw_rule_muldivk, (void*) 1000000, SNDRV_PCM_HW_PARAM_BUFFER_TIME, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, snd_pcm_hw_rule_muldivk, (void*) 8, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, snd_pcm_hw_rule_muldivk, (void*) 8, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_FRAME_BITS, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_PERIOD_TIME, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_PERIOD_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_TIME, snd_pcm_hw_rule_mulkdiv, (void*) 1000000, SNDRV_PCM_HW_PARAM_BUFFER_SIZE, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; return 0; } static int snd_pcm_hw_constraints_complete(struct snd_pcm_substream *substream) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_hardware *hw = &runtime->hw; int err; unsigned int mask = 0; if (hw->info & SNDRV_PCM_INFO_INTERLEAVED) mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_RW_INTERLEAVED); if (hw->info & SNDRV_PCM_INFO_NONINTERLEAVED) mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); if (hw_support_mmap(substream)) { if (hw->info & SNDRV_PCM_INFO_INTERLEAVED) mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_MMAP_INTERLEAVED); if (hw->info & SNDRV_PCM_INFO_NONINTERLEAVED) mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_MMAP_NONINTERLEAVED); if (hw->info & SNDRV_PCM_INFO_COMPLEX) mask |= PARAM_MASK_BIT(SNDRV_PCM_ACCESS_MMAP_COMPLEX); } err = snd_pcm_hw_constraint_mask(runtime, SNDRV_PCM_HW_PARAM_ACCESS, mask); if (err < 0) return err; err = snd_pcm_hw_constraint_mask64(runtime, SNDRV_PCM_HW_PARAM_FORMAT, hw->formats); if (err < 0) return err; err = snd_pcm_hw_constraint_subformats(runtime, 0, &hw->subformats); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_CHANNELS, hw->channels_min, hw->channels_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_RATE, hw->rate_min, hw->rate_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIOD_BYTES, hw->period_bytes_min, hw->period_bytes_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_PERIODS, hw->periods_min, hw->periods_max); if (err < 0) return err; err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, hw->period_bytes_min, hw->buffer_bytes_max); if (err < 0) return err; err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, snd_pcm_hw_rule_buffer_bytes_max, substream, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, -1); if (err < 0) return err; /* FIXME: remove */ if (runtime->dma_bytes) { err = snd_pcm_hw_constraint_minmax(runtime, SNDRV_PCM_HW_PARAM_BUFFER_BYTES, 0, runtime->dma_bytes); if (err < 0) return err; } if (!(hw->rates & (SNDRV_PCM_RATE_KNOT | SNDRV_PCM_RATE_CONTINUOUS))) { err = snd_pcm_hw_rule_add(runtime, 0, SNDRV_PCM_HW_PARAM_RATE, snd_pcm_hw_rule_rate, hw, SNDRV_PCM_HW_PARAM_RATE, -1); if (err < 0) return err; } /* FIXME: this belong to lowlevel */ snd_pcm_hw_constraint_integer(runtime, SNDRV_PCM_HW_PARAM_PERIOD_SIZE); return 0; } static void pcm_release_private(struct snd_pcm_substream *substream) { if (snd_pcm_stream_linked(substream)) snd_pcm_unlink(substream); } void snd_pcm_release_substream(struct snd_pcm_substream *substream) { substream->ref_count--; if (substream->ref_count > 0) return; snd_pcm_drop(substream); if (substream->hw_opened) { if (substream->runtime->state != SNDRV_PCM_STATE_OPEN) do_hw_free(substream); substream->ops->close(substream); substream->hw_opened = 0; } if (cpu_latency_qos_request_active(&substream->latency_pm_qos_req)) cpu_latency_qos_remove_request(&substream->latency_pm_qos_req); if (substream->pcm_release) { substream->pcm_release(substream); substream->pcm_release = NULL; } snd_pcm_detach_substream(substream); } EXPORT_SYMBOL(snd_pcm_release_substream); int snd_pcm_open_substream(struct snd_pcm *pcm, int stream, struct file *file, struct snd_pcm_substream **rsubstream) { struct snd_pcm_substream *substream; int err; err = snd_pcm_attach_substream(pcm, stream, file, &substream); if (err < 0) return err; if (substream->ref_count > 1) { *rsubstream = substream; return 0; } err = snd_pcm_hw_constraints_init(substream); if (err < 0) { pcm_dbg(pcm, "snd_pcm_hw_constraints_init failed\n"); goto error; } err = substream->ops->open(substream); if (err < 0) goto error; substream->hw_opened = 1; err = snd_pcm_hw_constraints_complete(substream); if (err < 0) { pcm_dbg(pcm, "snd_pcm_hw_constraints_complete failed\n"); goto error; } /* automatically set EXPLICIT_SYNC flag in the managed mode whenever * the DMA buffer requires it */ if (substream->managed_buffer_alloc && substream->dma_buffer.dev.need_sync) substream->runtime->hw.info |= SNDRV_PCM_INFO_EXPLICIT_SYNC; *rsubstream = substream; return 0; error: snd_pcm_release_substream(substream); return err; } EXPORT_SYMBOL(snd_pcm_open_substream); static int snd_pcm_open_file(struct file *file, struct snd_pcm *pcm, int stream) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; int err; err = snd_pcm_open_substream(pcm, stream, file, &substream); if (err < 0) return err; pcm_file = kzalloc(sizeof(*pcm_file), GFP_KERNEL); if (pcm_file == NULL) { snd_pcm_release_substream(substream); return -ENOMEM; } pcm_file->substream = substream; if (substream->ref_count == 1) substream->pcm_release = pcm_release_private; file->private_data = pcm_file; return 0; } static int snd_pcm_playback_open(struct inode *inode, struct file *file) { struct snd_pcm *pcm; int err = nonseekable_open(inode, file); if (err < 0) return err; pcm = snd_lookup_minor_data(iminor(inode), SNDRV_DEVICE_TYPE_PCM_PLAYBACK); err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_PLAYBACK); if (pcm) snd_card_unref(pcm->card); return err; } static int snd_pcm_capture_open(struct inode *inode, struct file *file) { struct snd_pcm *pcm; int err = nonseekable_open(inode, file); if (err < 0) return err; pcm = snd_lookup_minor_data(iminor(inode), SNDRV_DEVICE_TYPE_PCM_CAPTURE); err = snd_pcm_open(file, pcm, SNDRV_PCM_STREAM_CAPTURE); if (pcm) snd_card_unref(pcm->card); return err; } static int snd_pcm_open(struct file *file, struct snd_pcm *pcm, int stream) { int err; wait_queue_entry_t wait; if (pcm == NULL) { err = -ENODEV; goto __error1; } err = snd_card_file_add(pcm->card, file); if (err < 0) goto __error1; if (!try_module_get(pcm->card->module)) { err = -EFAULT; goto __error2; } init_waitqueue_entry(&wait, current); add_wait_queue(&pcm->open_wait, &wait); mutex_lock(&pcm->open_mutex); while (1) { err = snd_pcm_open_file(file, pcm, stream); if (err >= 0) break; if (err == -EAGAIN) { if (file->f_flags & O_NONBLOCK) { err = -EBUSY; break; } } else break; set_current_state(TASK_INTERRUPTIBLE); mutex_unlock(&pcm->open_mutex); schedule(); mutex_lock(&pcm->open_mutex); if (pcm->card->shutdown) { err = -ENODEV; break; } if (signal_pending(current)) { err = -ERESTARTSYS; break; } } remove_wait_queue(&pcm->open_wait, &wait); mutex_unlock(&pcm->open_mutex); if (err < 0) goto __error; return err; __error: module_put(pcm->card->module); __error2: snd_card_file_remove(pcm->card, file); __error1: return err; } static int snd_pcm_release(struct inode *inode, struct file *file) { struct snd_pcm *pcm; struct snd_pcm_substream *substream; struct snd_pcm_file *pcm_file; pcm_file = file->private_data; substream = pcm_file->substream; if (snd_BUG_ON(!substream)) return -ENXIO; pcm = substream->pcm; /* block until the device gets woken up as it may touch the hardware */ snd_power_wait(pcm->card); scoped_guard(mutex, &pcm->open_mutex) { snd_pcm_release_substream(substream); kfree(pcm_file); } wake_up(&pcm->open_wait); module_put(pcm->card->module); snd_card_file_remove(pcm->card, file); return 0; } /* check and update PCM state; return 0 or a negative error * call this inside PCM lock */ static int do_pcm_hwsync(struct snd_pcm_substream *substream) { switch (substream->runtime->state) { case SNDRV_PCM_STATE_DRAINING: if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) return -EBADFD; fallthrough; case SNDRV_PCM_STATE_RUNNING: return snd_pcm_update_hw_ptr(substream); case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_PAUSED: return 0; case SNDRV_PCM_STATE_SUSPENDED: return -ESTRPIPE; case SNDRV_PCM_STATE_XRUN: return -EPIPE; default: return -EBADFD; } } /* increase the appl_ptr; returns the processed frames or a negative error */ static snd_pcm_sframes_t forward_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames, snd_pcm_sframes_t avail) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t appl_ptr; int ret; if (avail <= 0) return 0; if (frames > (snd_pcm_uframes_t)avail) frames = avail; appl_ptr = runtime->control->appl_ptr + frames; if (appl_ptr >= (snd_pcm_sframes_t)runtime->boundary) appl_ptr -= runtime->boundary; ret = pcm_lib_apply_appl_ptr(substream, appl_ptr); return ret < 0 ? ret : frames; } /* decrease the appl_ptr; returns the processed frames or zero for error */ static snd_pcm_sframes_t rewind_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames, snd_pcm_sframes_t avail) { struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t appl_ptr; int ret; if (avail <= 0) return 0; if (frames > (snd_pcm_uframes_t)avail) frames = avail; appl_ptr = runtime->control->appl_ptr - frames; if (appl_ptr < 0) appl_ptr += runtime->boundary; ret = pcm_lib_apply_appl_ptr(substream, appl_ptr); /* NOTE: we return zero for errors because PulseAudio gets depressed * upon receiving an error from rewind ioctl and stops processing * any longer. Returning zero means that no rewind is done, so * it's not absolutely wrong to answer like that. */ return ret < 0 ? 0 : frames; } static snd_pcm_sframes_t snd_pcm_rewind(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames) { snd_pcm_sframes_t ret; if (frames == 0) return 0; scoped_guard(pcm_stream_lock_irq, substream) { ret = do_pcm_hwsync(substream); if (!ret) ret = rewind_appl_ptr(substream, frames, snd_pcm_hw_avail(substream)); } if (ret >= 0) snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE); return ret; } static snd_pcm_sframes_t snd_pcm_forward(struct snd_pcm_substream *substream, snd_pcm_uframes_t frames) { snd_pcm_sframes_t ret; if (frames == 0) return 0; scoped_guard(pcm_stream_lock_irq, substream) { ret = do_pcm_hwsync(substream); if (!ret) ret = forward_appl_ptr(substream, frames, snd_pcm_avail(substream)); } if (ret >= 0) snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE); return ret; } static int snd_pcm_delay(struct snd_pcm_substream *substream, snd_pcm_sframes_t *delay) { int err; scoped_guard(pcm_stream_lock_irq, substream) { err = do_pcm_hwsync(substream); if (delay && !err) *delay = snd_pcm_calc_delay(substream); } snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_CPU); return err; } static inline int snd_pcm_hwsync(struct snd_pcm_substream *substream) { return snd_pcm_delay(substream, NULL); } static int snd_pcm_sync_ptr(struct snd_pcm_substream *substream, struct snd_pcm_sync_ptr __user *_sync_ptr) { struct snd_pcm_runtime *runtime = substream->runtime; struct snd_pcm_sync_ptr sync_ptr; volatile struct snd_pcm_mmap_status *status; volatile struct snd_pcm_mmap_control *control; int err; memset(&sync_ptr, 0, sizeof(sync_ptr)); if (get_user(sync_ptr.flags, (unsigned __user *)&(_sync_ptr->flags))) return -EFAULT; if (copy_from_user(&sync_ptr.c.control, &(_sync_ptr->c.control), sizeof(struct snd_pcm_mmap_control))) return -EFAULT; status = runtime->status; control = runtime->control; if (sync_ptr.flags & SNDRV_PCM_SYNC_PTR_HWSYNC) { err = snd_pcm_hwsync(substream); if (err < 0) return err; } scoped_guard(pcm_stream_lock_irq, substream) { if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL)) { err = pcm_lib_apply_appl_ptr(substream, sync_ptr.c.control.appl_ptr); if (err < 0) return err; } else { sync_ptr.c.control.appl_ptr = control->appl_ptr; } if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN)) control->avail_min = sync_ptr.c.control.avail_min; else sync_ptr.c.control.avail_min = control->avail_min; sync_ptr.s.status.state = status->state; sync_ptr.s.status.hw_ptr = status->hw_ptr; sync_ptr.s.status.tstamp = status->tstamp; sync_ptr.s.status.suspended_state = status->suspended_state; sync_ptr.s.status.audio_tstamp = status->audio_tstamp; } if (!(sync_ptr.flags & SNDRV_PCM_SYNC_PTR_APPL)) snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE); if (copy_to_user(_sync_ptr, &sync_ptr, sizeof(sync_ptr))) return -EFAULT; return 0; } struct snd_pcm_mmap_status32 { snd_pcm_state_t state; s32 pad1; u32 hw_ptr; s32 tstamp_sec; s32 tstamp_nsec; snd_pcm_state_t suspended_state; s32 audio_tstamp_sec; s32 audio_tstamp_nsec; } __packed; struct snd_pcm_mmap_control32 { u32 appl_ptr; u32 avail_min; }; struct snd_pcm_sync_ptr32 { u32 flags; union { struct snd_pcm_mmap_status32 status; unsigned char reserved[64]; } s; union { struct snd_pcm_mmap_control32 control; unsigned char reserved[64]; } c; } __packed; /* recalculate the boundary within 32bit */ static snd_pcm_uframes_t recalculate_boundary(struct snd_pcm_runtime *runtime) { snd_pcm_uframes_t boundary; if (! runtime->buffer_size) return 0; boundary = runtime->buffer_size; while (boundary * 2 <= 0x7fffffffUL - runtime->buffer_size) boundary *= 2; return boundary; } static int snd_pcm_ioctl_sync_ptr_compat(struct snd_pcm_substream *substream, struct snd_pcm_sync_ptr32 __user *src) { struct snd_pcm_runtime *runtime = substream->runtime; volatile struct snd_pcm_mmap_status *status; volatile struct snd_pcm_mmap_control *control; u32 sflags; struct snd_pcm_mmap_control scontrol; struct snd_pcm_mmap_status sstatus; snd_pcm_uframes_t boundary; int err; if (snd_BUG_ON(!runtime)) return -EINVAL; if (get_user(sflags, &src->flags) || get_user(scontrol.appl_ptr, &src->c.control.appl_ptr) || get_user(scontrol.avail_min, &src->c.control.avail_min)) return -EFAULT; if (sflags & SNDRV_PCM_SYNC_PTR_HWSYNC) { err = snd_pcm_hwsync(substream); if (err < 0) return err; } status = runtime->status; control = runtime->control; boundary = recalculate_boundary(runtime); if (! boundary) boundary = 0x7fffffff; scoped_guard(pcm_stream_lock_irq, substream) { /* FIXME: we should consider the boundary for the sync from app */ if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL)) { err = pcm_lib_apply_appl_ptr(substream, scontrol.appl_ptr); if (err < 0) return err; } else scontrol.appl_ptr = control->appl_ptr % boundary; if (!(sflags & SNDRV_PCM_SYNC_PTR_AVAIL_MIN)) control->avail_min = scontrol.avail_min; else scontrol.avail_min = control->avail_min; sstatus.state = status->state; sstatus.hw_ptr = status->hw_ptr % boundary; sstatus.tstamp = status->tstamp; sstatus.suspended_state = status->suspended_state; sstatus.audio_tstamp = status->audio_tstamp; } if (!(sflags & SNDRV_PCM_SYNC_PTR_APPL)) snd_pcm_dma_buffer_sync(substream, SNDRV_DMA_SYNC_DEVICE); if (put_user(sstatus.state, &src->s.status.state) || put_user(sstatus.hw_ptr, &src->s.status.hw_ptr) || put_user(sstatus.tstamp.tv_sec, &src->s.status.tstamp_sec) || put_user(sstatus.tstamp.tv_nsec, &src->s.status.tstamp_nsec) || put_user(sstatus.suspended_state, &src->s.status.suspended_state) || put_user(sstatus.audio_tstamp.tv_sec, &src->s.status.audio_tstamp_sec) || put_user(sstatus.audio_tstamp.tv_nsec, &src->s.status.audio_tstamp_nsec) || put_user(scontrol.appl_ptr, &src->c.control.appl_ptr) || put_user(scontrol.avail_min, &src->c.control.avail_min)) return -EFAULT; return 0; } #define __SNDRV_PCM_IOCTL_SYNC_PTR32 _IOWR('A', 0x23, struct snd_pcm_sync_ptr32) static int snd_pcm_tstamp(struct snd_pcm_substream *substream, int __user *_arg) { struct snd_pcm_runtime *runtime = substream->runtime; int arg; if (get_user(arg, _arg)) return -EFAULT; if (arg < 0 || arg > SNDRV_PCM_TSTAMP_TYPE_LAST) return -EINVAL; runtime->tstamp_type = arg; return 0; } static int snd_pcm_xferi_frames_ioctl(struct snd_pcm_substream *substream, struct snd_xferi __user *_xferi) { struct snd_xferi xferi; struct snd_pcm_runtime *runtime = substream->runtime; snd_pcm_sframes_t result; if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (put_user(0, &_xferi->result)) return -EFAULT; if (copy_from_user(&xferi, _xferi, sizeof(xferi))) return -EFAULT; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) result = snd_pcm_lib_write(substream, xferi.buf, xferi.frames); else result = snd_pcm_lib_read(substream, xferi.buf, xferi.frames); if (put_user(result, &_xferi->result)) return -EFAULT; return result < 0 ? result : 0; } static int snd_pcm_xfern_frames_ioctl(struct snd_pcm_substream *substream, struct snd_xfern __user *_xfern) { struct snd_xfern xfern; struct snd_pcm_runtime *runtime = substream->runtime; void *bufs __free(kfree) = NULL; snd_pcm_sframes_t result; if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (runtime->channels > 128) return -EINVAL; if (put_user(0, &_xfern->result)) return -EFAULT; if (copy_from_user(&xfern, _xfern, sizeof(xfern))) return -EFAULT; bufs = memdup_array_user(xfern.bufs, runtime->channels, sizeof(void *)); if (IS_ERR(bufs)) return PTR_ERR(bufs); if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) result = snd_pcm_lib_writev(substream, bufs, xfern.frames); else result = snd_pcm_lib_readv(substream, bufs, xfern.frames); if (put_user(result, &_xfern->result)) return -EFAULT; return result < 0 ? result : 0; } static int snd_pcm_rewind_ioctl(struct snd_pcm_substream *substream, snd_pcm_uframes_t __user *_frames) { snd_pcm_uframes_t frames; snd_pcm_sframes_t result; if (get_user(frames, _frames)) return -EFAULT; if (put_user(0, _frames)) return -EFAULT; result = snd_pcm_rewind(substream, frames); if (put_user(result, _frames)) return -EFAULT; return result < 0 ? result : 0; } static int snd_pcm_forward_ioctl(struct snd_pcm_substream *substream, snd_pcm_uframes_t __user *_frames) { snd_pcm_uframes_t frames; snd_pcm_sframes_t result; if (get_user(frames, _frames)) return -EFAULT; if (put_user(0, _frames)) return -EFAULT; result = snd_pcm_forward(substream, frames); if (put_user(result, _frames)) return -EFAULT; return result < 0 ? result : 0; } static int snd_pcm_common_ioctl(struct file *file, struct snd_pcm_substream *substream, unsigned int cmd, void __user *arg) { struct snd_pcm_file *pcm_file = file->private_data; int res; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; res = snd_power_wait(substream->pcm->card); if (res < 0) return res; switch (cmd) { case SNDRV_PCM_IOCTL_PVERSION: return put_user(SNDRV_PCM_VERSION, (int __user *)arg) ? -EFAULT : 0; case SNDRV_PCM_IOCTL_INFO: return snd_pcm_info_user(substream, arg); case SNDRV_PCM_IOCTL_TSTAMP: /* just for compatibility */ return 0; case SNDRV_PCM_IOCTL_TTSTAMP: return snd_pcm_tstamp(substream, arg); case SNDRV_PCM_IOCTL_USER_PVERSION: if (get_user(pcm_file->user_pversion, (unsigned int __user *)arg)) return -EFAULT; return 0; case SNDRV_PCM_IOCTL_HW_REFINE: return snd_pcm_hw_refine_user(substream, arg); case SNDRV_PCM_IOCTL_HW_PARAMS: return snd_pcm_hw_params_user(substream, arg); case SNDRV_PCM_IOCTL_HW_FREE: return snd_pcm_hw_free(substream); case SNDRV_PCM_IOCTL_SW_PARAMS: return snd_pcm_sw_params_user(substream, arg); case SNDRV_PCM_IOCTL_STATUS32: return snd_pcm_status_user32(substream, arg, false); case SNDRV_PCM_IOCTL_STATUS_EXT32: return snd_pcm_status_user32(substream, arg, true); case SNDRV_PCM_IOCTL_STATUS64: return snd_pcm_status_user64(substream, arg, false); case SNDRV_PCM_IOCTL_STATUS_EXT64: return snd_pcm_status_user64(substream, arg, true); case SNDRV_PCM_IOCTL_CHANNEL_INFO: return snd_pcm_channel_info_user(substream, arg); case SNDRV_PCM_IOCTL_PREPARE: return snd_pcm_prepare(substream, file); case SNDRV_PCM_IOCTL_RESET: return snd_pcm_reset(substream); case SNDRV_PCM_IOCTL_START: return snd_pcm_start_lock_irq(substream); case SNDRV_PCM_IOCTL_LINK: return snd_pcm_link(substream, (int)(unsigned long) arg); case SNDRV_PCM_IOCTL_UNLINK: return snd_pcm_unlink(substream); case SNDRV_PCM_IOCTL_RESUME: return snd_pcm_resume(substream); case SNDRV_PCM_IOCTL_XRUN: return snd_pcm_xrun(substream); case SNDRV_PCM_IOCTL_HWSYNC: return snd_pcm_hwsync(substream); case SNDRV_PCM_IOCTL_DELAY: { snd_pcm_sframes_t delay = 0; snd_pcm_sframes_t __user *res = arg; int err; err = snd_pcm_delay(substream, &delay); if (err) return err; if (put_user(delay, res)) return -EFAULT; return 0; } case __SNDRV_PCM_IOCTL_SYNC_PTR32: return snd_pcm_ioctl_sync_ptr_compat(substream, arg); case __SNDRV_PCM_IOCTL_SYNC_PTR64: return snd_pcm_sync_ptr(substream, arg); #ifdef CONFIG_SND_SUPPORT_OLD_API case SNDRV_PCM_IOCTL_HW_REFINE_OLD: return snd_pcm_hw_refine_old_user(substream, arg); case SNDRV_PCM_IOCTL_HW_PARAMS_OLD: return snd_pcm_hw_params_old_user(substream, arg); #endif case SNDRV_PCM_IOCTL_DRAIN: return snd_pcm_drain(substream, file); case SNDRV_PCM_IOCTL_DROP: return snd_pcm_drop(substream); case SNDRV_PCM_IOCTL_PAUSE: return snd_pcm_pause_lock_irq(substream, (unsigned long)arg); case SNDRV_PCM_IOCTL_WRITEI_FRAMES: case SNDRV_PCM_IOCTL_READI_FRAMES: return snd_pcm_xferi_frames_ioctl(substream, arg); case SNDRV_PCM_IOCTL_WRITEN_FRAMES: case SNDRV_PCM_IOCTL_READN_FRAMES: return snd_pcm_xfern_frames_ioctl(substream, arg); case SNDRV_PCM_IOCTL_REWIND: return snd_pcm_rewind_ioctl(substream, arg); case SNDRV_PCM_IOCTL_FORWARD: return snd_pcm_forward_ioctl(substream, arg); } pcm_dbg(substream->pcm, "unknown ioctl = 0x%x\n", cmd); return -ENOTTY; } static long snd_pcm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct snd_pcm_file *pcm_file; pcm_file = file->private_data; if (((cmd >> 8) & 0xff) != 'A') return -ENOTTY; return snd_pcm_common_ioctl(file, pcm_file->substream, cmd, (void __user *)arg); } /** * snd_pcm_kernel_ioctl - Execute PCM ioctl in the kernel-space * @substream: PCM substream * @cmd: IOCTL cmd * @arg: IOCTL argument * * The function is provided primarily for OSS layer and USB gadget drivers, * and it allows only the limited set of ioctls (hw_params, sw_params, * prepare, start, drain, drop, forward). * * Return: zero if successful, or a negative error code */ int snd_pcm_kernel_ioctl(struct snd_pcm_substream *substream, unsigned int cmd, void *arg) { snd_pcm_uframes_t *frames = arg; snd_pcm_sframes_t result; if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; switch (cmd) { case SNDRV_PCM_IOCTL_FORWARD: { /* provided only for OSS; capture-only and no value returned */ if (substream->stream != SNDRV_PCM_STREAM_CAPTURE) return -EINVAL; result = snd_pcm_forward(substream, *frames); return result < 0 ? result : 0; } case SNDRV_PCM_IOCTL_HW_PARAMS: return snd_pcm_hw_params(substream, arg); case SNDRV_PCM_IOCTL_SW_PARAMS: return snd_pcm_sw_params(substream, arg); case SNDRV_PCM_IOCTL_PREPARE: return snd_pcm_prepare(substream, NULL); case SNDRV_PCM_IOCTL_START: return snd_pcm_start_lock_irq(substream); case SNDRV_PCM_IOCTL_DRAIN: return snd_pcm_drain(substream, NULL); case SNDRV_PCM_IOCTL_DROP: return snd_pcm_drop(substream); case SNDRV_PCM_IOCTL_DELAY: return snd_pcm_delay(substream, frames); default: return -EINVAL; } } EXPORT_SYMBOL(snd_pcm_kernel_ioctl); static ssize_t snd_pcm_read(struct file *file, char __user *buf, size_t count, loff_t * offset) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; if (!frame_aligned(runtime, count)) return -EINVAL; count = bytes_to_frames(runtime, count); result = snd_pcm_lib_read(substream, buf, count); if (result > 0) result = frames_to_bytes(runtime, result); return result; } static ssize_t snd_pcm_write(struct file *file, const char __user *buf, size_t count, loff_t * offset) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; if (!frame_aligned(runtime, count)) return -EINVAL; count = bytes_to_frames(runtime, count); result = snd_pcm_lib_write(substream, buf, count); if (result > 0) result = frames_to_bytes(runtime, result); return result; } static ssize_t snd_pcm_readv(struct kiocb *iocb, struct iov_iter *to) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; unsigned long i; void __user **bufs __free(kfree) = NULL; snd_pcm_uframes_t frames; const struct iovec *iov = iter_iov(to); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; if (!user_backed_iter(to)) return -EINVAL; if (to->nr_segs > 1024 || to->nr_segs != runtime->channels) return -EINVAL; if (!frame_aligned(runtime, iov->iov_len)) return -EINVAL; frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(to->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; for (i = 0; i < to->nr_segs; ++i) { bufs[i] = iov->iov_base; iov++; } result = snd_pcm_lib_readv(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); return result; } static ssize_t snd_pcm_writev(struct kiocb *iocb, struct iov_iter *from) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; snd_pcm_sframes_t result; unsigned long i; void __user **bufs __free(kfree) = NULL; snd_pcm_uframes_t frames; const struct iovec *iov = iter_iov(from); pcm_file = iocb->ki_filp->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN || runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; if (!user_backed_iter(from)) return -EINVAL; if (from->nr_segs > 128 || from->nr_segs != runtime->channels || !frame_aligned(runtime, iov->iov_len)) return -EINVAL; frames = bytes_to_samples(runtime, iov->iov_len); bufs = kmalloc_array(from->nr_segs, sizeof(void *), GFP_KERNEL); if (bufs == NULL) return -ENOMEM; for (i = 0; i < from->nr_segs; ++i) { bufs[i] = iov->iov_base; iov++; } result = snd_pcm_lib_writev(substream, bufs, frames); if (result > 0) result = frames_to_bytes(runtime, result); return result; } static __poll_t snd_pcm_poll(struct file *file, poll_table *wait) { struct snd_pcm_file *pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; __poll_t mask, ok; snd_pcm_uframes_t avail; pcm_file = file->private_data; substream = pcm_file->substream; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) ok = EPOLLOUT | EPOLLWRNORM; else ok = EPOLLIN | EPOLLRDNORM; if (PCM_RUNTIME_CHECK(substream)) return ok | EPOLLERR; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return ok | EPOLLERR; poll_wait(file, &runtime->sleep, wait); mask = 0; guard(pcm_stream_lock_irq)(substream); avail = snd_pcm_avail(substream); switch (runtime->state) { case SNDRV_PCM_STATE_RUNNING: case SNDRV_PCM_STATE_PREPARED: case SNDRV_PCM_STATE_PAUSED: if (avail >= runtime->control->avail_min) mask = ok; break; case SNDRV_PCM_STATE_DRAINING: if (substream->stream == SNDRV_PCM_STREAM_CAPTURE) { mask = ok; if (!avail) mask |= EPOLLERR; } break; default: mask = ok | EPOLLERR; break; } return mask; } /* * mmap support */ /* * Only on coherent architectures, we can mmap the status and the control records * for effcient data transfer. On others, we have to use HWSYNC ioctl... */ #if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_ALPHA) /* * mmap status record */ static vm_fault_t snd_pcm_mmap_status_fault(struct vm_fault *vmf) { struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) return VM_FAULT_SIGBUS; runtime = substream->runtime; vmf->page = virt_to_page(runtime->status); get_page(vmf->page); return 0; } static const struct vm_operations_struct snd_pcm_vm_ops_status = { .fault = snd_pcm_mmap_status_fault, }; static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { long size; if (!(area->vm_flags & VM_READ)) return -EINVAL; size = area->vm_end - area->vm_start; if (size != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_status))) return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_status; area->vm_private_data = substream; vm_flags_mod(area, VM_DONTEXPAND | VM_DONTDUMP, VM_WRITE | VM_MAYWRITE); return 0; } /* * mmap control record */ static vm_fault_t snd_pcm_mmap_control_fault(struct vm_fault *vmf) { struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; if (substream == NULL) return VM_FAULT_SIGBUS; runtime = substream->runtime; vmf->page = virt_to_page(runtime->control); get_page(vmf->page); return 0; } static const struct vm_operations_struct snd_pcm_vm_ops_control = { .fault = snd_pcm_mmap_control_fault, }; static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { long size; if (!(area->vm_flags & VM_READ)) return -EINVAL; size = area->vm_end - area->vm_start; if (size != PAGE_ALIGN(sizeof(struct snd_pcm_mmap_control))) return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_control; area->vm_private_data = substream; vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); return 0; } static bool pcm_status_mmap_allowed(struct snd_pcm_file *pcm_file) { /* If drivers require the explicit sync (typically for non-coherent * pages), we have to disable the mmap of status and control data * to enforce the control via SYNC_PTR ioctl. */ if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_EXPLICIT_SYNC) return false; /* See pcm_control_mmap_allowed() below. * Since older alsa-lib requires both status and control mmaps to be * coupled, we have to disable the status mmap for old alsa-lib, too. */ if (pcm_file->user_pversion < SNDRV_PROTOCOL_VERSION(2, 0, 14) && (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_SYNC_APPLPTR)) return false; return true; } static bool pcm_control_mmap_allowed(struct snd_pcm_file *pcm_file) { if (pcm_file->no_compat_mmap) return false; /* see above */ if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_EXPLICIT_SYNC) return false; /* Disallow the control mmap when SYNC_APPLPTR flag is set; * it enforces the user-space to fall back to snd_pcm_sync_ptr(), * thus it effectively assures the manual update of appl_ptr. */ if (pcm_file->substream->runtime->hw.info & SNDRV_PCM_INFO_SYNC_APPLPTR) return false; return true; } #else /* ! coherent mmap */ /* * don't support mmap for status and control records. */ #define pcm_status_mmap_allowed(pcm_file) false #define pcm_control_mmap_allowed(pcm_file) false static int snd_pcm_mmap_status(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { return -ENXIO; } static int snd_pcm_mmap_control(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { return -ENXIO; } #endif /* coherent mmap */ /* * snd_pcm_mmap_data_open - increase the mmap counter */ static void snd_pcm_mmap_data_open(struct vm_area_struct *area) { struct snd_pcm_substream *substream = area->vm_private_data; atomic_inc(&substream->mmap_count); } /* * snd_pcm_mmap_data_close - decrease the mmap counter */ static void snd_pcm_mmap_data_close(struct vm_area_struct *area) { struct snd_pcm_substream *substream = area->vm_private_data; atomic_dec(&substream->mmap_count); } /* * fault callback for mmapping a RAM page */ static vm_fault_t snd_pcm_mmap_data_fault(struct vm_fault *vmf) { struct snd_pcm_substream *substream = vmf->vma->vm_private_data; struct snd_pcm_runtime *runtime; unsigned long offset; struct page * page; size_t dma_bytes; if (substream == NULL) return VM_FAULT_SIGBUS; runtime = substream->runtime; offset = vmf->pgoff << PAGE_SHIFT; dma_bytes = PAGE_ALIGN(runtime->dma_bytes); if (offset > dma_bytes - PAGE_SIZE) return VM_FAULT_SIGBUS; if (substream->ops->page) page = substream->ops->page(substream, offset); else if (!snd_pcm_get_dma_buf(substream)) { if (WARN_ON_ONCE(!runtime->dma_area)) return VM_FAULT_SIGBUS; page = virt_to_page(runtime->dma_area + offset); } else page = snd_sgbuf_get_page(snd_pcm_get_dma_buf(substream), offset); if (!page) return VM_FAULT_SIGBUS; get_page(page); vmf->page = page; return 0; } static const struct vm_operations_struct snd_pcm_vm_ops_data = { .open = snd_pcm_mmap_data_open, .close = snd_pcm_mmap_data_close, }; static const struct vm_operations_struct snd_pcm_vm_ops_data_fault = { .open = snd_pcm_mmap_data_open, .close = snd_pcm_mmap_data_close, .fault = snd_pcm_mmap_data_fault, }; /* * mmap the DMA buffer on RAM */ /** * snd_pcm_lib_default_mmap - Default PCM data mmap function * @substream: PCM substream * @area: VMA * * This is the default mmap handler for PCM data. When mmap pcm_ops is NULL, * this function is invoked implicitly. * * Return: zero if successful, or a negative error code */ int snd_pcm_lib_default_mmap(struct snd_pcm_substream *substream, struct vm_area_struct *area) { vm_flags_set(area, VM_DONTEXPAND | VM_DONTDUMP); if (!substream->ops->page && !snd_dma_buffer_mmap(snd_pcm_get_dma_buf(substream), area)) return 0; /* mmap with fault handler */ area->vm_ops = &snd_pcm_vm_ops_data_fault; return 0; } EXPORT_SYMBOL_GPL(snd_pcm_lib_default_mmap); /* * mmap the DMA buffer on I/O memory area */ #if SNDRV_PCM_INFO_MMAP_IOMEM /** * snd_pcm_lib_mmap_iomem - Default PCM data mmap function for I/O mem * @substream: PCM substream * @area: VMA * * When your hardware uses the iomapped pages as the hardware buffer and * wants to mmap it, pass this function as mmap pcm_ops. Note that this * is supposed to work only on limited architectures. * * Return: zero if successful, or a negative error code */ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_struct *area) { struct snd_pcm_runtime *runtime = substream->runtime; area->vm_page_prot = pgprot_noncached(area->vm_page_prot); return vm_iomap_memory(area, runtime->dma_addr, runtime->dma_bytes); } EXPORT_SYMBOL(snd_pcm_lib_mmap_iomem); #endif /* SNDRV_PCM_INFO_MMAP */ /* * mmap DMA buffer */ int snd_pcm_mmap_data(struct snd_pcm_substream *substream, struct file *file, struct vm_area_struct *area) { struct snd_pcm_runtime *runtime; long size; unsigned long offset; size_t dma_bytes; int err; if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) { if (!(area->vm_flags & (VM_WRITE|VM_READ))) return -EINVAL; } else { if (!(area->vm_flags & VM_READ)) return -EINVAL; } runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_OPEN) return -EBADFD; if (!(runtime->info & SNDRV_PCM_INFO_MMAP)) return -ENXIO; if (runtime->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED || runtime->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) return -EINVAL; size = area->vm_end - area->vm_start; offset = area->vm_pgoff << PAGE_SHIFT; dma_bytes = PAGE_ALIGN(runtime->dma_bytes); if ((size_t)size > dma_bytes) return -EINVAL; if (offset > dma_bytes - size) return -EINVAL; area->vm_ops = &snd_pcm_vm_ops_data; area->vm_private_data = substream; if (substream->ops->mmap) err = substream->ops->mmap(substream, area); else err = snd_pcm_lib_default_mmap(substream, area); if (!err) atomic_inc(&substream->mmap_count); return err; } EXPORT_SYMBOL(snd_pcm_mmap_data); static int snd_pcm_mmap(struct file *file, struct vm_area_struct *area) { struct snd_pcm_file * pcm_file; struct snd_pcm_substream *substream; unsigned long offset; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; if (substream->runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; offset = area->vm_pgoff << PAGE_SHIFT; switch (offset) { case SNDRV_PCM_MMAP_OFFSET_STATUS_OLD: if (pcm_file->no_compat_mmap || !IS_ENABLED(CONFIG_64BIT)) return -ENXIO; fallthrough; case SNDRV_PCM_MMAP_OFFSET_STATUS_NEW: if (!pcm_status_mmap_allowed(pcm_file)) return -ENXIO; return snd_pcm_mmap_status(substream, file, area); case SNDRV_PCM_MMAP_OFFSET_CONTROL_OLD: if (pcm_file->no_compat_mmap || !IS_ENABLED(CONFIG_64BIT)) return -ENXIO; fallthrough; case SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW: if (!pcm_control_mmap_allowed(pcm_file)) return -ENXIO; return snd_pcm_mmap_control(substream, file, area); default: return snd_pcm_mmap_data(substream, file, area); } return 0; } static int snd_pcm_fasync(int fd, struct file * file, int on) { struct snd_pcm_file * pcm_file; struct snd_pcm_substream *substream; struct snd_pcm_runtime *runtime; pcm_file = file->private_data; substream = pcm_file->substream; if (PCM_RUNTIME_CHECK(substream)) return -ENXIO; runtime = substream->runtime; if (runtime->state == SNDRV_PCM_STATE_DISCONNECTED) return -EBADFD; return snd_fasync_helper(fd, file, on, &runtime->fasync); } /* * ioctl32 compat */ #ifdef CONFIG_COMPAT #include "pcm_compat.c" #else #define snd_pcm_ioctl_compat NULL #endif /* * To be removed helpers to keep binary compatibility */ #ifdef CONFIG_SND_SUPPORT_OLD_API #define __OLD_TO_NEW_MASK(x) ((x&7)|((x&0x07fffff8)<<5)) #define __NEW_TO_OLD_MASK(x) ((x&7)|((x&0xffffff00)>>5)) static void snd_pcm_hw_convert_from_old_params(struct snd_pcm_hw_params *params, struct snd_pcm_hw_params_old *oparams) { unsigned int i; memset(params, 0, sizeof(*params)); params->flags = oparams->flags; for (i = 0; i < ARRAY_SIZE(oparams->masks); i++) params->masks[i].bits[0] = oparams->masks[i]; memcpy(params->intervals, oparams->intervals, sizeof(oparams->intervals)); params->rmask = __OLD_TO_NEW_MASK(oparams->rmask); params->cmask = __OLD_TO_NEW_MASK(oparams->cmask); params->info = oparams->info; params->msbits = oparams->msbits; params->rate_num = oparams->rate_num; params->rate_den = oparams->rate_den; params->fifo_size = oparams->fifo_size; } static void snd_pcm_hw_convert_to_old_params(struct snd_pcm_hw_params_old *oparams, struct snd_pcm_hw_params *params) { unsigned int i; memset(oparams, 0, sizeof(*oparams)); oparams->flags = params->flags; for (i = 0; i < ARRAY_SIZE(oparams->masks); i++) oparams->masks[i] = params->masks[i].bits[0]; memcpy(oparams->intervals, params->intervals, sizeof(oparams->intervals)); oparams->rmask = __NEW_TO_OLD_MASK(params->rmask); oparams->cmask = __NEW_TO_OLD_MASK(params->cmask); oparams->info = params->info; oparams->msbits = params->msbits; oparams->rate_num = params->rate_num; oparams->rate_den = params->rate_den; oparams->fifo_size = params->fifo_size; } static int snd_pcm_hw_refine_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams) { struct snd_pcm_hw_params *params __free(kfree) = NULL; struct snd_pcm_hw_params_old *oparams __free(kfree) = NULL; int err; params = kmalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; oparams = memdup_user(_oparams, sizeof(*oparams)); if (IS_ERR(oparams)) return PTR_ERR(oparams); snd_pcm_hw_convert_from_old_params(params, oparams); err = snd_pcm_hw_refine(substream, params); if (err < 0) return err; err = fixup_unreferenced_params(substream, params); if (err < 0) return err; snd_pcm_hw_convert_to_old_params(oparams, params); if (copy_to_user(_oparams, oparams, sizeof(*oparams))) return -EFAULT; return 0; } static int snd_pcm_hw_params_old_user(struct snd_pcm_substream *substream, struct snd_pcm_hw_params_old __user * _oparams) { struct snd_pcm_hw_params *params __free(kfree) = NULL; struct snd_pcm_hw_params_old *oparams __free(kfree) = NULL; int err; params = kmalloc(sizeof(*params), GFP_KERNEL); if (!params) return -ENOMEM; oparams = memdup_user(_oparams, sizeof(*oparams)); if (IS_ERR(oparams)) return PTR_ERR(oparams); snd_pcm_hw_convert_from_old_params(params, oparams); err = snd_pcm_hw_params(substream, params); if (err < 0) return err; snd_pcm_hw_convert_to_old_params(oparams, params); if (copy_to_user(_oparams, oparams, sizeof(*oparams))) return -EFAULT; return 0; } #endif /* CONFIG_SND_SUPPORT_OLD_API */ #ifndef CONFIG_MMU static unsigned long snd_pcm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct snd_pcm_file *pcm_file = file->private_data; struct snd_pcm_substream *substream = pcm_file->substream; struct snd_pcm_runtime *runtime = substream->runtime; unsigned long offset = pgoff << PAGE_SHIFT; switch (offset) { case SNDRV_PCM_MMAP_OFFSET_STATUS_NEW: return (unsigned long)runtime->status; case SNDRV_PCM_MMAP_OFFSET_CONTROL_NEW: return (unsigned long)runtime->control; default: return (unsigned long)runtime->dma_area + offset; } } #else # define snd_pcm_get_unmapped_area NULL #endif /* * Register section */ const struct file_operations snd_pcm_f_ops[2] = { { .owner = THIS_MODULE, .write = snd_pcm_write, .write_iter = snd_pcm_writev, .open = snd_pcm_playback_open, .release = snd_pcm_release, .poll = snd_pcm_poll, .unlocked_ioctl = snd_pcm_ioctl, .compat_ioctl = snd_pcm_ioctl_compat, .mmap = snd_pcm_mmap, .fasync = snd_pcm_fasync, .get_unmapped_area = snd_pcm_get_unmapped_area, }, { .owner = THIS_MODULE, .read = snd_pcm_read, .read_iter = snd_pcm_readv, .open = snd_pcm_capture_open, .release = snd_pcm_release, .poll = snd_pcm_poll, .unlocked_ioctl = snd_pcm_ioctl, .compat_ioctl = snd_pcm_ioctl_compat, .mmap = snd_pcm_mmap, .fasync = snd_pcm_fasync, .get_unmapped_area = snd_pcm_get_unmapped_area, } }; |
| 40 222 13 2 10 16 2 9 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 | /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM io_uring #if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IO_URING_H #include <linux/tracepoint.h> #include <uapi/linux/io_uring.h> #include <linux/io_uring_types.h> #include <linux/io_uring.h> struct io_wq_work; /** * io_uring_create - called after a new io_uring context was prepared * * @fd: corresponding file descriptor * @ctx: pointer to a ring context structure * @sq_entries: actual SQ size * @cq_entries: actual CQ size * @flags: SQ ring flags, provided to io_uring_setup(2) * * Allows to trace io_uring creation and provide pointer to a context, that can * be used later to find correlated events. */ TRACE_EVENT(io_uring_create, TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( __field( int, fd ) __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; __entry->flags = flags; ), TP_printk("ring %p, fd %d sq size %d, cq size %d, flags 0x%x", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); /** * io_uring_register - called after a buffer/file/eventfd was successfully * registered for a ring * * @ctx: pointer to a ring context structure * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers * @ret: return code * * Allows to trace fixed files/buffers, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. */ TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, unsigned nr_bufs, long ret), TP_ARGS(ctx, opcode, nr_files, nr_bufs, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files) __field( unsigned, nr_bufs ) __field( long, ret ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " "ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, __entry->nr_bufs, __entry->ret) ); /** * io_uring_file_get - called before getting references to an SQE file * * @req: pointer to a submitted request * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can * help figuring out if it makes sense to use fixed files, or check that fixed * files are used correctly. */ TRACE_EVENT(io_uring_file_get, TP_PROTO(struct io_kiocb *req, int fd), TP_ARGS(req, fd), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, fd ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->fd = fd; ), TP_printk("ring %p, req %p, user_data 0x%llx, fd %d", __entry->ctx, __entry->req, __entry->user_data, __entry->fd) ); /** * io_uring_queue_async_work - called before submitting a new async work * * @req: pointer to a submitted request * @rw: type of workqueue, hashed or normal * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(struct io_kiocb *req, int rw), TP_ARGS(req, rw), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( struct io_wq_work *, work ) __field( int, rw ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->flags = (__force unsigned long long) req->flags; __entry->opcode = req->opcode; __entry->work = &req->work; __entry->rw = rw; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, flags 0x%llx, %s queue, work %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** * io_uring_defer - called when an io_uring request is deferred * * @req: pointer to a deferred request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, data ) __field( u8, opcode ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->data = req->cqe.user_data; __entry->opcode = req->opcode; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s", __entry->ctx, __entry->req, __entry->data, __get_str(op_str)) ); /** * io_uring_link - called before the io_uring request added into link_list of * another request * * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * * Allows to track linked requests, to understand dependencies between requests * and how does it influence their execution flow. */ TRACE_EVENT(io_uring_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *target_req), TP_ARGS(req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( void *, target_req ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->target_req = target_req; ), TP_printk("ring %p, request %p linked after %p", __entry->ctx, __entry->req, __entry->target_req) ); /** * io_uring_cqring_wait - called before start waiting for an available CQE * * @ctx: pointer to a ring context structure * @min_events: minimal number of events to wait for * * Allows to track waiting for CQE, so that we can e.g. troubleshoot * situations, when an application wants to wait for an event, that never * comes. */ TRACE_EVENT(io_uring_cqring_wait, TP_PROTO(void *ctx, int min_events), TP_ARGS(ctx, min_events), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, min_events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->min_events = min_events; ), TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) ); /** * io_uring_fail_link - called before failing a linked request * * @req: request, which links were cancelled * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work * was cancelled, but also which request was the reason. */ TRACE_EVENT(io_uring_fail_link, TP_PROTO(struct io_kiocb *req, struct io_kiocb *link), TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( void *, link ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->link = link; __assign_str(op_str); ), TP_printk("ring %p, request %p, user_data 0x%llx, opcode %s, link %p", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure * @req: (optional) pointer to a submitted request * @cqe: pointer to the filled in CQE being posted */ TRACE_EVENT(io_uring_complete, TP_PROTO(struct io_ring_ctx *ctx, void *req, struct io_uring_cqe *cqe), TP_ARGS(ctx, req, cqe), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( u64, user_data ) __field( int, res ) __field( unsigned, cflags ) __field( u64, extra1 ) __field( u64, extra2 ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->user_data = cqe->user_data; __entry->res = cqe->res; __entry->cflags = cqe->flags; __entry->extra1 = io_ctx_cqe32(ctx) ? cqe->big_cqe[0] : 0; __entry->extra2 = io_ctx_cqe32(ctx) ? cqe->big_cqe[1] : 0; ), TP_printk("ring %p, req %p, user_data 0x%llx, result %d, cflags 0x%x " "extra1 %llu extra2 %llu ", __entry->ctx, __entry->req, __entry->user_data, __entry->res, __entry->cflags, (unsigned long long) __entry->extra1, (unsigned long long) __entry->extra2) ); /** * io_uring_submit_req - called before submitting a request * * @req: pointer to a submitted request * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_req, TP_PROTO(struct io_kiocb *req), TP_ARGS(req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( unsigned long long, flags ) __field( bool, sq_thread ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->flags = (__force unsigned long long) req->flags; __entry->sq_thread = req->ctx->flags & IORING_SETUP_SQPOLL; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, flags 0x%llx, " "sq_thread %d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->sq_thread) ); /* * io_uring_poll_arm - called after arming a poll wait if successful * * @req: pointer to the armed request * @mask: request poll events mask * @events: registered events of interest * * Allows to track which fds are waiting for and what are the events of * interest. */ TRACE_EVENT(io_uring_poll_arm, TP_PROTO(struct io_kiocb *req, int mask, int events), TP_ARGS(req, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __field( int, events ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __entry->events = events; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask 0x%x, events 0x%x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask, __entry->events) ); /* * io_uring_task_add - called after adding a task * * @req: pointer to request * @mask: request poll events mask * */ TRACE_EVENT(io_uring_task_add, TP_PROTO(struct io_kiocb *req, int mask), TP_ARGS(req, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( int, mask ) __string( op_str, io_uring_get_opcode(req->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = req->cqe.user_data; __entry->opcode = req->opcode; __entry->mask = mask; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, opcode %s, mask %x", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->mask) ); /* * io_uring_req_failed - called when an sqe is errored dring submission * * @sqe: pointer to the io_uring_sqe that failed * @req: pointer to request * @error: error it failed with * * Allows easier diagnosing of malformed requests in production systems. */ TRACE_EVENT(io_uring_req_failed, TP_PROTO(const struct io_uring_sqe *sqe, struct io_kiocb *req, int error), TP_ARGS(sqe, req, error), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, user_data ) __field( u8, opcode ) __field( u8, flags ) __field( u8, ioprio ) __field( u64, off ) __field( u64, addr ) __field( u32, len ) __field( u32, op_flags ) __field( u16, buf_index ) __field( u16, personality ) __field( u32, file_index ) __field( u64, pad1 ) __field( u64, addr3 ) __field( int, error ) __string( op_str, io_uring_get_opcode(sqe->opcode) ) ), TP_fast_assign( __entry->ctx = req->ctx; __entry->req = req; __entry->user_data = sqe->user_data; __entry->opcode = sqe->opcode; __entry->flags = sqe->flags; __entry->ioprio = sqe->ioprio; __entry->off = sqe->off; __entry->addr = sqe->addr; __entry->len = sqe->len; __entry->op_flags = sqe->poll32_events; __entry->buf_index = sqe->buf_index; __entry->personality = sqe->personality; __entry->file_index = sqe->file_index; __entry->pad1 = sqe->__pad2[0]; __entry->addr3 = sqe->addr3; __entry->error = error; __assign_str(op_str); ), TP_printk("ring %p, req %p, user_data 0x%llx, " "opcode %s, flags 0x%x, prio=%d, off=%llu, addr=%llu, " "len=%u, rw_flags=0x%x, buf_index=%d, " "personality=%d, file_index=%d, pad=0x%llx, addr3=%llx, " "error=%d", __entry->ctx, __entry->req, __entry->user_data, __get_str(op_str), __entry->flags, __entry->ioprio, (unsigned long long)__entry->off, (unsigned long long) __entry->addr, __entry->len, __entry->op_flags, __entry->buf_index, __entry->personality, __entry->file_index, (unsigned long long) __entry->pad1, (unsigned long long) __entry->addr3, __entry->error) ); /* * io_uring_cqe_overflow - a CQE overflowed * * @ctx: pointer to a ring context structure * @user_data: user data associated with the request * @res: CQE result * @cflags: CQE flags * @ocqe: pointer to the overflow cqe (if available) * */ TRACE_EVENT(io_uring_cqe_overflow, TP_PROTO(void *ctx, unsigned long long user_data, s32 res, u32 cflags, void *ocqe), TP_ARGS(ctx, user_data, res, cflags, ocqe), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned long long, user_data ) __field( s32, res ) __field( u32, cflags ) __field( void *, ocqe ) ), TP_fast_assign( __entry->ctx = ctx; __entry->user_data = user_data; __entry->res = res; __entry->cflags = cflags; __entry->ocqe = ocqe; ), TP_printk("ring %p, user_data 0x%llx, res %d, cflags 0x%x, " "overflow_cqe %p", __entry->ctx, __entry->user_data, __entry->res, __entry->cflags, __entry->ocqe) ); /* * io_uring_task_work_run - ran task work * * @tctx: pointer to a io_uring_task * @count: how many functions it ran * */ TRACE_EVENT(io_uring_task_work_run, TP_PROTO(void *tctx, unsigned int count), TP_ARGS(tctx, count), TP_STRUCT__entry ( __field( void *, tctx ) __field( unsigned int, count ) ), TP_fast_assign( __entry->tctx = tctx; __entry->count = count; ), TP_printk("tctx %p, count %u", __entry->tctx, __entry->count) ); TRACE_EVENT(io_uring_short_write, TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got), TP_ARGS(ctx, fpos, wanted, got), TP_STRUCT__entry( __field(void *, ctx) __field(u64, fpos) __field(u64, wanted) __field(u64, got) ), TP_fast_assign( __entry->ctx = ctx; __entry->fpos = fpos; __entry->wanted = wanted; __entry->got = got; ), TP_printk("ring %p, fpos %lld, wanted %lld, got %lld", __entry->ctx, __entry->fpos, __entry->wanted, __entry->got) ); /* * io_uring_local_work_run - ran ring local task work * * @tctx: pointer to a io_uring_ctx * @count: how many functions it ran * @loops: how many loops it ran * */ TRACE_EVENT(io_uring_local_work_run, TP_PROTO(void *ctx, int count, unsigned int loops), TP_ARGS(ctx, count, loops), TP_STRUCT__entry ( __field(void *, ctx ) __field(int, count ) __field(unsigned int, loops ) ), TP_fast_assign( __entry->ctx = ctx; __entry->count = count; __entry->loops = loops; ), TP_printk("ring %p, count %d, loops %u", __entry->ctx, __entry->count, __entry->loops) ); #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ #include <trace/define_trace.h> |
| 4836 54 53 47 8 53 8 47 54 2 2 2 2 47 46 47 47 47 47 47 46 47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 | // SPDX-License-Identifier: GPL-2.0-only /* * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/list.h> #include <linux/slab.h> #include <linux/sort.h> #include <linux/err.h> #include <linux/static_key.h> #include <linux/jump_label_ratelimit.h> #include <linux/bug.h> #include <linux/cpu.h> #include <asm/sections.h> /* mutex to protect coming/going of the jump_label table */ static DEFINE_MUTEX(jump_label_mutex); void jump_label_lock(void) { mutex_lock(&jump_label_mutex); } void jump_label_unlock(void) { mutex_unlock(&jump_label_mutex); } static int jump_label_cmp(const void *a, const void *b) { const struct jump_entry *jea = a; const struct jump_entry *jeb = b; /* * Entrires are sorted by key. */ if (jump_entry_key(jea) < jump_entry_key(jeb)) return -1; if (jump_entry_key(jea) > jump_entry_key(jeb)) return 1; /* * In the batching mode, entries should also be sorted by the code * inside the already sorted list of entries, enabling a bsearch in * the vector. */ if (jump_entry_code(jea) < jump_entry_code(jeb)) return -1; if (jump_entry_code(jea) > jump_entry_code(jeb)) return 1; return 0; } static void jump_label_swap(void *a, void *b, int size) { long delta = (unsigned long)a - (unsigned long)b; struct jump_entry *jea = a; struct jump_entry *jeb = b; struct jump_entry tmp = *jea; jea->code = jeb->code - delta; jea->target = jeb->target - delta; jea->key = jeb->key - delta; jeb->code = tmp.code + delta; jeb->target = tmp.target + delta; jeb->key = tmp.key + delta; } static void jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) { unsigned long size; void *swapfn = NULL; if (IS_ENABLED(CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE)) swapfn = jump_label_swap; size = (((unsigned long)stop - (unsigned long)start) / sizeof(struct jump_entry)); sort(start, size, sizeof(struct jump_entry), jump_label_cmp, swapfn); } static void jump_label_update(struct static_key *key); /* * There are similar definitions for the !CONFIG_JUMP_LABEL case in jump_label.h. * The use of 'atomic_read()' requires atomic.h and its problematic for some * kernel headers such as kernel.h and others. Since static_key_count() is not * used in the branch statements as it is for the !CONFIG_JUMP_LABEL case its ok * to have it be a function here. Similarly, for 'static_key_enable()' and * 'static_key_disable()', which require bug.h. This should allow jump_label.h * to be included from most/all places for CONFIG_JUMP_LABEL. */ int static_key_count(struct static_key *key) { /* * -1 means the first static_key_slow_inc() is in progress. * static_key_enabled() must return true, so return 1 here. */ int n = atomic_read(&key->enabled); return n >= 0 ? n : 1; } EXPORT_SYMBOL_GPL(static_key_count); /* * static_key_fast_inc_not_disabled - adds a user for a static key * @key: static key that must be already enabled * * The caller must make sure that the static key can't get disabled while * in this function. It doesn't patch jump labels, only adds a user to * an already enabled static key. * * Returns true if the increment was done. Unlike refcount_t the ref counter * is not saturated, but will fail to increment on overflow. */ bool static_key_fast_inc_not_disabled(struct static_key *key) { int v; STATIC_KEY_CHECK_USE(key); /* * Negative key->enabled has a special meaning: it sends * static_key_slow_inc/dec() down the slow path, and it is non-zero * so it counts as "enabled" in jump_label_update(). * * The INT_MAX overflow condition is either used by the networking * code to reset or detected in the slow path of * static_key_slow_inc_cpuslocked(). */ v = atomic_read(&key->enabled); do { if (v <= 0 || v == INT_MAX) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v + 1))); return true; } EXPORT_SYMBOL_GPL(static_key_fast_inc_not_disabled); bool static_key_slow_inc_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); /* * Careful if we get concurrent static_key_slow_inc/dec() calls; * later calls must wait for the first one to _finish_ the * jump_label_update() process. At the same time, however, * the jump_label_update() call below wants to see * static_key_enabled(&key) for jumps to be updated properly. */ if (static_key_fast_inc_not_disabled(key)) return true; guard(mutex)(&jump_label_mutex); /* Try to mark it as 'enabling in progress. */ if (!atomic_cmpxchg(&key->enabled, 0, -1)) { jump_label_update(key); /* * Ensure that when static_key_fast_inc_not_disabled() or * static_key_dec_not_one() observe the positive value, * they must also observe all the text changes. */ atomic_set_release(&key->enabled, 1); } else { /* * While holding the mutex this should never observe * anything else than a value >= 1 and succeed */ if (WARN_ON_ONCE(!static_key_fast_inc_not_disabled(key))) return false; } return true; } bool static_key_slow_inc(struct static_key *key) { bool ret; cpus_read_lock(); ret = static_key_slow_inc_cpuslocked(key); cpus_read_unlock(); return ret; } EXPORT_SYMBOL_GPL(static_key_slow_inc); void static_key_enable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) > 0) { WARN_ON_ONCE(atomic_read(&key->enabled) != 1); return; } jump_label_lock(); if (atomic_read(&key->enabled) == 0) { atomic_set(&key->enabled, -1); jump_label_update(key); /* * See static_key_slow_inc(). */ atomic_set_release(&key->enabled, 1); } jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable_cpuslocked); void static_key_enable(struct static_key *key) { cpus_read_lock(); static_key_enable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_enable); void static_key_disable_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); lockdep_assert_cpus_held(); if (atomic_read(&key->enabled) != 1) { WARN_ON_ONCE(atomic_read(&key->enabled) != 0); return; } jump_label_lock(); if (atomic_cmpxchg(&key->enabled, 1, 0) == 1) jump_label_update(key); jump_label_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable_cpuslocked); void static_key_disable(struct static_key *key) { cpus_read_lock(); static_key_disable_cpuslocked(key); cpus_read_unlock(); } EXPORT_SYMBOL_GPL(static_key_disable); static bool static_key_dec_not_one(struct static_key *key) { int v; /* * Go into the slow path if key::enabled is less than or equal than * one. One is valid to shut down the key, anything less than one * is an imbalance, which is handled at the call site. * * That includes the special case of '-1' which is set in * static_key_slow_inc_cpuslocked(), but that's harmless as it is * fully serialized in the slow path below. By the time this task * acquires the jump label lock the value is back to one and the * retry under the lock must succeed. */ v = atomic_read(&key->enabled); do { /* * Warn about the '-1' case though; since that means a * decrement is concurrent with a first (0->1) increment. IOW * people are trying to disable something that wasn't yet fully * enabled. This suggests an ordering problem on the user side. */ WARN_ON_ONCE(v < 0); /* * Warn about underflow, and lie about success in an attempt to * not make things worse. */ if (WARN_ON_ONCE(v == 0)) return true; if (v <= 1) return false; } while (!likely(atomic_try_cmpxchg(&key->enabled, &v, v - 1))); return true; } static void __static_key_slow_dec_cpuslocked(struct static_key *key) { lockdep_assert_cpus_held(); int val; if (static_key_dec_not_one(key)) return; guard(mutex)(&jump_label_mutex); val = atomic_read(&key->enabled); /* * It should be impossible to observe -1 with jump_label_mutex held, * see static_key_slow_inc_cpuslocked(). */ if (WARN_ON_ONCE(val == -1)) return; /* * Cannot already be 0, something went sideways. */ if (WARN_ON_ONCE(val == 0)) return; if (atomic_dec_and_test(&key->enabled)) jump_label_update(key); } static void __static_key_slow_dec(struct static_key *key) { cpus_read_lock(); __static_key_slow_dec_cpuslocked(key); cpus_read_unlock(); } void jump_label_update_timeout(struct work_struct *work) { struct static_key_deferred *key = container_of(work, struct static_key_deferred, work.work); __static_key_slow_dec(&key->key); } EXPORT_SYMBOL_GPL(jump_label_update_timeout); void static_key_slow_dec(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec(key); } EXPORT_SYMBOL_GPL(static_key_slow_dec); void static_key_slow_dec_cpuslocked(struct static_key *key) { STATIC_KEY_CHECK_USE(key); __static_key_slow_dec_cpuslocked(key); } void __static_key_slow_dec_deferred(struct static_key *key, struct delayed_work *work, unsigned long timeout) { STATIC_KEY_CHECK_USE(key); if (static_key_dec_not_one(key)) return; schedule_delayed_work(work, timeout); } EXPORT_SYMBOL_GPL(__static_key_slow_dec_deferred); void __static_key_deferred_flush(void *key, struct delayed_work *work) { STATIC_KEY_CHECK_USE(key); flush_delayed_work(work); } EXPORT_SYMBOL_GPL(__static_key_deferred_flush); void jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl) { STATIC_KEY_CHECK_USE(key); key->timeout = rl; INIT_DELAYED_WORK(&key->work, jump_label_update_timeout); } EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; } static int __jump_label_text_reserved(struct jump_entry *iter_start, struct jump_entry *iter_stop, void *start, void *end, bool init) { struct jump_entry *iter; iter = iter_start; while (iter < iter_stop) { if (init || !jump_entry_is_init(iter)) { if (addr_conflict(iter, start, end)) return 1; } iter++; } return 0; } #ifndef arch_jump_label_transform_static static void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { /* nothing to do on most architectures */ } #endif static inline struct jump_entry *static_key_entries(struct static_key *key) { WARN_ON_ONCE(key->type & JUMP_TYPE_LINKED); return (struct jump_entry *)(key->type & ~JUMP_TYPE_MASK); } static inline bool static_key_type(struct static_key *key) { return key->type & JUMP_TYPE_TRUE; } static inline bool static_key_linked(struct static_key *key) { return key->type & JUMP_TYPE_LINKED; } static inline void static_key_clear_linked(struct static_key *key) { key->type &= ~JUMP_TYPE_LINKED; } static inline void static_key_set_linked(struct static_key *key) { key->type |= JUMP_TYPE_LINKED; } /*** * A 'struct static_key' uses a union such that it either points directly * to a table of 'struct jump_entry' or to a linked list of modules which in * turn point to 'struct jump_entry' tables. * * The two lower bits of the pointer are used to keep track of which pointer * type is in use and to store the initial branch direction, we use an access * function which preserves these bits. */ static void static_key_set_entries(struct static_key *key, struct jump_entry *entries) { unsigned long type; WARN_ON_ONCE((unsigned long)entries & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->entries = entries; key->type |= type; } static enum jump_label_type jump_label_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool enabled = static_key_enabled(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return enabled ^ branch; } static bool jump_label_can_update(struct jump_entry *entry, bool init) { /* * Cannot update code that was in an init text area. */ if (!init && jump_entry_is_init(entry)) return false; if (!kernel_text_address(jump_entry_code(entry))) { /* * This skips patching built-in __exit, which * is part of init_section_contains() but is * not part of kernel_text_address(). * * Skipping built-in __exit is fine since it * will never be executed. */ WARN_ONCE(!jump_entry_is_init(entry), "can't patch jump_label at %pS", (void *)jump_entry_code(entry)); return false; } return true; } #ifndef HAVE_JUMP_LABEL_BATCH static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (jump_label_can_update(entry, init)) arch_jump_label_transform(entry, jump_label_type(entry)); } } #else static void __jump_label_update(struct static_key *key, struct jump_entry *entry, struct jump_entry *stop, bool init) { for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) { if (!jump_label_can_update(entry, init)) continue; if (!arch_jump_label_transform_queue(entry, jump_label_type(entry))) { /* * Queue is full: Apply the current queue and try again. */ arch_jump_label_transform_apply(); BUG_ON(!arch_jump_label_transform_queue(entry, jump_label_type(entry))); } } arch_jump_label_transform_apply(); } #endif void __init jump_label_init(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct static_key *key = NULL; struct jump_entry *iter; /* * Since we are initializing the static_key.enabled field with * with the 'raw' int values (to avoid pulling in atomic.h) in * jump_label.h, let's make sure that is safe. There are only two * cases to check since we initialize to 0 or 1. */ BUILD_BUG_ON((int)ATOMIC_INIT(0) != 0); BUILD_BUG_ON((int)ATOMIC_INIT(1) != 1); if (static_key_initialized) return; cpus_read_lock(); jump_label_lock(); jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); in_init = init_section_contains((void *)jump_entry_code(iter), 1); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; static_key_set_entries(key, iter); } static_key_initialized = true; jump_label_unlock(); cpus_read_unlock(); } static inline bool static_key_sealed(struct static_key *key) { return (key->type & JUMP_TYPE_LINKED) && !(key->type & ~JUMP_TYPE_MASK); } static inline void static_key_seal(struct static_key *key) { unsigned long type = key->type & JUMP_TYPE_TRUE; key->type = JUMP_TYPE_LINKED | type; } void jump_label_init_ro(void) { struct jump_entry *iter_start = __start___jump_table; struct jump_entry *iter_stop = __stop___jump_table; struct jump_entry *iter; if (WARN_ON_ONCE(!static_key_initialized)) return; cpus_read_lock(); jump_label_lock(); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk = jump_entry_key(iter); if (!is_kernel_ro_after_init((unsigned long)iterk)) continue; if (static_key_sealed(iterk)) continue; static_key_seal(iterk); } jump_label_unlock(); cpus_read_unlock(); } #ifdef CONFIG_MODULES enum jump_label_type jump_label_init_type(struct jump_entry *entry) { struct static_key *key = jump_entry_key(entry); bool type = static_key_type(key); bool branch = jump_entry_is_branch(entry); /* See the comment in linux/jump_label.h */ return type ^ branch; } struct static_key_mod { struct static_key_mod *next; struct jump_entry *entries; struct module *mod; }; static inline struct static_key_mod *static_key_mod(struct static_key *key) { WARN_ON_ONCE(!static_key_linked(key)); return (struct static_key_mod *)(key->type & ~JUMP_TYPE_MASK); } /*** * key->type and key->next are the same via union. * This sets key->next and preserves the type bits. * * See additional comments above static_key_set_entries(). */ static void static_key_set_mod(struct static_key *key, struct static_key_mod *mod) { unsigned long type; WARN_ON_ONCE((unsigned long)mod & JUMP_TYPE_MASK); type = key->type & JUMP_TYPE_MASK; key->next = mod; key->type |= type; } static int __jump_label_mod_text_reserved(void *start, void *end) { struct module *mod; int ret; scoped_guard(rcu) { mod = __module_text_address((unsigned long)start); WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); if (!try_module_get(mod)) mod = NULL; } if (!mod) return 0; ret = __jump_label_text_reserved(mod->jump_entries, mod->jump_entries + mod->num_jump_entries, start, end, mod->state == MODULE_STATE_COMING); module_put(mod); return ret; } static void __jump_label_mod_update(struct static_key *key) { struct static_key_mod *mod; for (mod = static_key_mod(key); mod; mod = mod->next) { struct jump_entry *stop; struct module *m; /* * NULL if the static_key is defined in a module * that does not use it */ if (!mod->entries) continue; m = mod->mod; if (!m) stop = __stop___jump_table; else stop = m->jump_entries + m->num_jump_entries; __jump_label_update(key, mod->entries, stop, m && m->state == MODULE_STATE_COMING); } } static int jump_label_add_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, *jlm2; /* if the module doesn't have jump label entries, just return */ if (iter_start == iter_stop) return 0; jump_label_sort_entries(iter_start, iter_stop); for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; bool in_init; in_init = within_module_init(jump_entry_code(iter), mod); jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) continue; key = iterk; if (within_module((unsigned long)key, mod)) { static_key_set_entries(key, iter); continue; } /* * If the key was sealed at init, then there's no need to keep a * reference to its module entries - just patch them now and be * done with it. */ if (static_key_sealed(key)) goto do_poke; jlm = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm) return -ENOMEM; if (!static_key_linked(key)) { jlm2 = kzalloc(sizeof(struct static_key_mod), GFP_KERNEL); if (!jlm2) { kfree(jlm); return -ENOMEM; } scoped_guard(rcu) jlm2->mod = __module_address((unsigned long)key); jlm2->entries = static_key_entries(key); jlm2->next = NULL; static_key_set_mod(key, jlm2); static_key_set_linked(key); } jlm->mod = mod; jlm->entries = iter; jlm->next = static_key_mod(key); static_key_set_mod(key, jlm); static_key_set_linked(key); /* Only update if we've changed from our initial state */ do_poke: if (jump_label_type(iter) != jump_label_init_type(iter)) __jump_label_update(key, iter, iter_stop, true); } return 0; } static void jump_label_del_module(struct module *mod) { struct jump_entry *iter_start = mod->jump_entries; struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; struct jump_entry *iter; struct static_key *key = NULL; struct static_key_mod *jlm, **prev; for (iter = iter_start; iter < iter_stop; iter++) { if (jump_entry_key(iter) == key) continue; key = jump_entry_key(iter); if (within_module((unsigned long)key, mod)) continue; /* No @jlm allocated because key was sealed at init. */ if (static_key_sealed(key)) continue; /* No memory during module load */ if (WARN_ON(!static_key_linked(key))) continue; prev = &key->next; jlm = static_key_mod(key); while (jlm && jlm->mod != mod) { prev = &jlm->next; jlm = jlm->next; } /* No memory during module load */ if (WARN_ON(!jlm)) continue; if (prev == &key->next) static_key_set_mod(key, jlm->next); else *prev = jlm->next; kfree(jlm); jlm = static_key_mod(key); /* if only one etry is left, fold it back into the static_key */ if (jlm->next == NULL) { static_key_set_entries(key, jlm->entries); static_key_clear_linked(key); kfree(jlm); } } } static int jump_label_module_notify(struct notifier_block *self, unsigned long val, void *data) { struct module *mod = data; int ret = 0; cpus_read_lock(); jump_label_lock(); switch (val) { case MODULE_STATE_COMING: ret = jump_label_add_module(mod); if (ret) { WARN(1, "Failed to allocate memory: jump_label may not work properly.\n"); jump_label_del_module(mod); } break; case MODULE_STATE_GOING: jump_label_del_module(mod); break; } jump_label_unlock(); cpus_read_unlock(); return notifier_from_errno(ret); } static struct notifier_block jump_label_module_nb = { .notifier_call = jump_label_module_notify, .priority = 1, /* higher than tracepoints */ }; static __init int jump_label_init_module(void) { return register_module_notifier(&jump_label_module_nb); } early_initcall(jump_label_init_module); #endif /* CONFIG_MODULES */ /*** * jump_label_text_reserved - check if addr range is reserved * @start: start text addr * @end: end text addr * * checks if the text addr located between @start and @end * overlaps with any of the jump label patch addresses. Code * that wants to modify kernel text should first verify that * it does not overlap with any of the jump label addresses. * Caller must hold jump_label_mutex. * * returns 1 if there is an overlap, 0 otherwise */ int jump_label_text_reserved(void *start, void *end) { bool init = system_state < SYSTEM_RUNNING; int ret = __jump_label_text_reserved(__start___jump_table, __stop___jump_table, start, end, init); if (ret) return ret; #ifdef CONFIG_MODULES ret = __jump_label_mod_text_reserved(start, end); #endif return ret; } static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; if (static_key_linked(key)) { __jump_label_mod_update(key); return; } scoped_guard(rcu) { mod = __module_address((unsigned long)key); if (mod) { stop = mod->jump_entries + mod->num_jump_entries; init = mod->state == MODULE_STATE_COMING; } } #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST static DEFINE_STATIC_KEY_TRUE(sk_true); static DEFINE_STATIC_KEY_FALSE(sk_false); static __init int jump_label_test(void) { int i; for (i = 0; i < 2; i++) { WARN_ON(static_key_enabled(&sk_true.key) != true); WARN_ON(static_key_enabled(&sk_false.key) != false); WARN_ON(!static_branch_likely(&sk_true)); WARN_ON(!static_branch_unlikely(&sk_true)); WARN_ON(static_branch_likely(&sk_false)); WARN_ON(static_branch_unlikely(&sk_false)); static_branch_disable(&sk_true); static_branch_enable(&sk_false); WARN_ON(static_key_enabled(&sk_true.key) == true); WARN_ON(static_key_enabled(&sk_false.key) == false); WARN_ON(static_branch_likely(&sk_true)); WARN_ON(static_branch_unlikely(&sk_true)); WARN_ON(!static_branch_likely(&sk_false)); WARN_ON(!static_branch_unlikely(&sk_false)); static_branch_enable(&sk_true); static_branch_disable(&sk_false); } return 0; } early_initcall(jump_label_test); #endif /* STATIC_KEYS_SELFTEST */ |
| 808 2 40 2 1 240 794 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 | // SPDX-License-Identifier: GPL-2.0 /* * device.h - generic, centralized driver model * * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2009 Novell Inc. * * See Documentation/driver-api/driver-model/ for more information. */ #ifndef _DEVICE_H_ #define _DEVICE_H_ #include <linux/dev_printk.h> #include <linux/energy_model.h> #include <linux/ioport.h> #include <linux/kobject.h> #include <linux/klist.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/compiler.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/gfp.h> #include <linux/device/bus.h> #include <linux/device/class.h> #include <linux/device/devres.h> #include <linux/device/driver.h> #include <linux/cleanup.h> #include <asm/device.h> struct device; struct device_private; struct device_driver; struct driver_private; struct module; struct class; struct subsys_private; struct device_node; struct fwnode_handle; struct iommu_group; struct dev_pin_info; struct dev_iommu; struct msi_device_data; /** * struct subsys_interface - interfaces to device functions * @name: name of the device function * @subsys: subsystem of the devices to attach to * @node: the list of functions registered at the subsystem * @add_dev: device hookup to device function handler * @remove_dev: device hookup to device function handler * * Simple interfaces attached to a subsystem. Multiple interfaces can * attach to a subsystem and its devices. Unlike drivers, they do not * exclusively claim or control devices. Interfaces usually represent * a specific functionality of a subsystem/class of devices. */ struct subsys_interface { const char *name; const struct bus_type *subsys; struct list_head node; int (*add_dev)(struct device *dev, struct subsys_interface *sif); void (*remove_dev)(struct device *dev, struct subsys_interface *sif); }; int subsys_interface_register(struct subsys_interface *sif); void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(const struct bus_type *subsys, const struct attribute_group **groups); int subsys_virtual_register(const struct bus_type *subsys, const struct attribute_group **groups); /* * The type of device, "struct device" is embedded in. A class * or bus can contain devices of different types * like "partitions" and "disks", "mouse" and "event". * This identifies the device type and carries type-specific * information, equivalent to the kobj_type of a kobject. * If "name" is specified, the uevent will contain it in * the DEVTYPE variable. */ struct device_type { const char *name; const struct attribute_group **groups; int (*uevent)(const struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid); void (*release)(struct device *dev); const struct dev_pm_ops *pm; }; /** * struct device_attribute - Interface for exporting device attributes. * @attr: sysfs attribute definition. * @show: Show handler. * @store: Store handler. */ struct device_attribute { struct attribute attr; ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); ssize_t (*store)(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); }; /** * struct dev_ext_attribute - Exported device attribute with extra context. * @attr: Exported device attribute. * @var: Pointer to context. */ struct dev_ext_attribute { struct device_attribute attr; void *var; }; ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_int(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_int(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_bool(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_string(struct device *dev, struct device_attribute *attr, char *buf); /** * DEVICE_ATTR - Define a device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Convenience macro for defining a struct device_attribute. * * For example, ``DEVICE_ATTR(foo, 0644, foo_show, foo_store);`` expands to: * * .. code-block:: c * * struct device_attribute dev_attr_foo = { * .attr = { .name = "foo", .mode = 0644 }, * .show = foo_show, * .store = foo_store, * }; */ #define DEVICE_ATTR(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) /** * DEVICE_ATTR_PREALLOC - Define a preallocated device attribute. * @_name: Attribute name. * @_mode: File mode. * @_show: Show handler. Optional, but mandatory if attribute is readable. * @_store: Store handler. Optional, but mandatory if attribute is writable. * * Like DEVICE_ATTR(), but ``SYSFS_PREALLOC`` is set on @_mode. */ #define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_PREALLOC(_name, _mode, _show, _store) /** * DEVICE_ATTR_RW - Define a read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0644, @_show is <_name>_show, * and @_store is <_name>_store. */ #define DEVICE_ATTR_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW(_name) /** * DEVICE_ATTR_ADMIN_RW - Define an admin-only read-write device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RW(), but @_mode is 0600. */ #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) /** * DEVICE_ATTR_RO - Define a readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0444 and @_show is <_name>_show. */ #define DEVICE_ATTR_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO(_name) /** * DEVICE_ATTR_ADMIN_RO - Define an admin-only readable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR_RO(), but @_mode is 0400. */ #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) /** * DEVICE_ATTR_WO - Define an admin-only writable device attribute. * @_name: Attribute name. * * Like DEVICE_ATTR(), but @_mode is 0200 and @_store is <_name>_store. */ #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) /** * DEVICE_ULONG_ATTR - Define a device attribute backed by an unsigned long. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of unsigned long. * * Like DEVICE_ATTR(), but @_show and @_store are automatically provided * such that reads and writes to the attribute from userspace affect @_var. */ #define DEVICE_ULONG_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } /** * DEVICE_INT_ATTR - Define a device attribute backed by an int. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of int. * * Like DEVICE_ULONG_ATTR(), but @_var is an int. */ #define DEVICE_INT_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } /** * DEVICE_BOOL_ATTR - Define a device attribute backed by a bool. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of bool. * * Like DEVICE_ULONG_ATTR(), but @_var is a bool. */ #define DEVICE_BOOL_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } /** * DEVICE_STRING_ATTR_RO - Define a device attribute backed by a r/o string. * @_name: Attribute name. * @_mode: File mode. * @_var: Identifier of string. * * Like DEVICE_ULONG_ATTR(), but @_var is a string. Because the length of the * string allocation is unknown, the attribute must be read-only. */ #define DEVICE_STRING_ATTR_RO(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, (_mode) & ~0222, device_show_string, NULL), (_var) } #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) int device_create_file(struct device *device, const struct device_attribute *entry); void device_remove_file(struct device *dev, const struct device_attribute *attr); bool device_remove_file_self(struct device *dev, const struct device_attribute *attr); int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); /* allows to add/remove a custom action to devres stack */ int devm_remove_action_nowarn(struct device *dev, void (*action)(void *), void *data); /** * devm_remove_action() - removes previously added custom action * @dev: Device that owns the action * @action: Function implementing the action * @data: Pointer to data passed to @action implementation * * Removes instance of @action previously added by devm_add_action(). * Both action and data should match one of the existing entries. */ static inline void devm_remove_action(struct device *dev, void (*action)(void *), void *data) { WARN_ON(devm_remove_action_nowarn(dev, action, data)); } void devm_release_action(struct device *dev, void (*action)(void *), void *data); int __devm_add_action(struct device *dev, void (*action)(void *), void *data, const char *name); #define devm_add_action(dev, action, data) \ __devm_add_action(dev, action, data, #action) static inline int __devm_add_action_or_reset(struct device *dev, void (*action)(void *), void *data, const char *name) { int ret; ret = __devm_add_action(dev, action, data, name); if (ret) action(data); return ret; } #define devm_add_action_or_reset(dev, action, data) \ __devm_add_action_or_reset(dev, action, data, #action) /** * devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @type: Type to allocate per-cpu memory for * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ #define devm_alloc_percpu(dev, type) \ ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \ __alignof__(type))) void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align); void devm_free_percpu(struct device *dev, void __percpu *pdata); struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about * sg limitations. */ unsigned int max_segment_size; unsigned int min_align_mask; unsigned long segment_boundary_mask; }; /** * enum device_link_state - Device link states. * @DL_STATE_NONE: The presence of the drivers is not being tracked. * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present. * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not. * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present). * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present. * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding. */ enum device_link_state { DL_STATE_NONE = -1, DL_STATE_DORMANT = 0, DL_STATE_AVAILABLE, DL_STATE_CONSUMER_PROBE, DL_STATE_ACTIVE, DL_STATE_SUPPLIER_UNBIND, }; /* * Device link flags. * * STATELESS: The core will not remove this link automatically. * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind. * PM_RUNTIME: If set, the runtime PM framework will use this link. * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation. * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind. * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds. * MANAGED: The core tracks presence of supplier/consumer drivers (internal). * SYNC_STATE_ONLY: Link only affects sync_state() behavior. * INFERRED: Inferred from data (eg: firmware) and not from driver actions. */ #define DL_FLAG_STATELESS BIT(0) #define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1) #define DL_FLAG_PM_RUNTIME BIT(2) #define DL_FLAG_RPM_ACTIVE BIT(3) #define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4) #define DL_FLAG_AUTOPROBE_CONSUMER BIT(5) #define DL_FLAG_MANAGED BIT(6) #define DL_FLAG_SYNC_STATE_ONLY BIT(7) #define DL_FLAG_INFERRED BIT(8) #define DL_FLAG_CYCLE BIT(9) /** * enum dl_dev_state - Device driver presence tracking information. * @DL_DEV_NO_DRIVER: There is no driver attached to the device. * @DL_DEV_PROBING: A driver is probing. * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device. * @DL_DEV_UNBINDING: The driver is unbinding from the device. */ enum dl_dev_state { DL_DEV_NO_DRIVER = 0, DL_DEV_PROBING, DL_DEV_DRIVER_BOUND, DL_DEV_UNBINDING, }; /** * enum device_removable - Whether the device is removable. The criteria for a * device to be classified as removable is determined by its subsystem or bus. * @DEVICE_REMOVABLE_NOT_SUPPORTED: This attribute is not supported for this * device (default). * @DEVICE_REMOVABLE_UNKNOWN: Device location is Unknown. * @DEVICE_FIXED: Device is not removable by the user. * @DEVICE_REMOVABLE: Device is removable by the user. */ enum device_removable { DEVICE_REMOVABLE_NOT_SUPPORTED = 0, /* must be 0 */ DEVICE_REMOVABLE_UNKNOWN, DEVICE_FIXED, DEVICE_REMOVABLE, }; /** * struct dev_links_info - Device data related to device links. * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @defer_sync: Hook to global list of devices that have deferred sync_state. * @status: Driver status information. */ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head defer_sync; enum dl_dev_state status; }; /** * struct dev_msi_info - Device data related to MSI * @domain: The MSI interrupt domain associated to the device * @data: Pointer to MSI device data */ struct dev_msi_info { #ifdef CONFIG_GENERIC_MSI_IRQ struct irq_domain *domain; struct msi_device_data *data; #endif }; /** * enum device_physical_location_panel - Describes which panel surface of the * system's housing the device connection point resides on. * @DEVICE_PANEL_TOP: Device connection point is on the top panel. * @DEVICE_PANEL_BOTTOM: Device connection point is on the bottom panel. * @DEVICE_PANEL_LEFT: Device connection point is on the left panel. * @DEVICE_PANEL_RIGHT: Device connection point is on the right panel. * @DEVICE_PANEL_FRONT: Device connection point is on the front panel. * @DEVICE_PANEL_BACK: Device connection point is on the back panel. * @DEVICE_PANEL_UNKNOWN: The panel with device connection point is unknown. */ enum device_physical_location_panel { DEVICE_PANEL_TOP, DEVICE_PANEL_BOTTOM, DEVICE_PANEL_LEFT, DEVICE_PANEL_RIGHT, DEVICE_PANEL_FRONT, DEVICE_PANEL_BACK, DEVICE_PANEL_UNKNOWN, }; /** * enum device_physical_location_vertical_position - Describes vertical * position of the device connection point on the panel surface. * @DEVICE_VERT_POS_UPPER: Device connection point is at upper part of panel. * @DEVICE_VERT_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_VERT_POS_LOWER: Device connection point is at lower part of panel. */ enum device_physical_location_vertical_position { DEVICE_VERT_POS_UPPER, DEVICE_VERT_POS_CENTER, DEVICE_VERT_POS_LOWER, }; /** * enum device_physical_location_horizontal_position - Describes horizontal * position of the device connection point on the panel surface. * @DEVICE_HORI_POS_LEFT: Device connection point is at left part of panel. * @DEVICE_HORI_POS_CENTER: Device connection point is at center part of panel. * @DEVICE_HORI_POS_RIGHT: Device connection point is at right part of panel. */ enum device_physical_location_horizontal_position { DEVICE_HORI_POS_LEFT, DEVICE_HORI_POS_CENTER, DEVICE_HORI_POS_RIGHT, }; /** * struct device_physical_location - Device data related to physical location * of the device connection point. * @panel: Panel surface of the system's housing that the device connection * point resides on. * @vertical_position: Vertical position of the device connection point within * the panel. * @horizontal_position: Horizontal position of the device connection point * within the panel. * @dock: Set if the device connection point resides in a docking station or * port replicator. * @lid: Set if this device connection point resides on the lid of laptop * system. */ struct device_physical_location { enum device_physical_location_panel panel; enum device_physical_location_vertical_position vertical_position; enum device_physical_location_horizontal_position horizontal_position; bool dock; bool lid; }; /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. * In most cases, a parent device is some sort of bus or host * controller. If parent is NULL, the device, is a top-level device, * which is not usually what you want. * @p: Holds the private data of the driver core portions of the device. * See the comment of the struct device_private for detail. * @kobj: A top-level, abstract class from which other classes are derived. * @init_name: Initial name of the device. * @type: The type of device. * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. * Example: For devices on custom boards, as typical of embedded * and SOC based hardware, Linux often uses platform_data to point * to board-specific structures describing devices and how they * are wired. That can include what ports are available, chip * variants, which GPIO pins act in what additional roles, and so * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pin-control.rst for details. * @msi: MSI related data * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller * DMA limit than the device itself supports. * @dma_range_map: map for DMA memory ranges relative to that of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations * @dma_io_tlb_mem: Software IO TLB allocator. Not for driver use. * @dma_io_tlb_pools: List of transient swiotlb memory pools. * @dma_io_tlb_lock: Protects changes to the list of active pools. * @dma_uses_io_tlb: %true if device has used the software IO TLB. * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. * @devt: For creating the sysfs "dev". * @id: device instance * @devres_lock: Spinlock to protect the resource of the device. * @devres_head: The resources list of the device. * @class: The class of the device. * @groups: Optional attribute groups. * @release: Callback to free the device after all references have * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu: Per device generic IOMMU runtime data * @physical_location: Describes physical location of the device connection * point in the system housing. * @removable: Whether the device can be removed from the system. This * should be set by the subsystem / bus driver that discovered * the device. * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). * @of_node_reused: Set if the device-tree node is shared with an ancestor * device. * @state_synced: The hardware state of this device has been synced to match * the software state of this device by calling the driver/bus * sync_state() callback. * @can_match: The device has matched with a driver at least once or it is in * a bus (like AMBA) which can't check for matching drivers until * other devices probe successfully. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. * @dma_skip_sync: DMA sync operations can be skipped for coherent buffers. * @dma_iommu: Device is using default IOMMU implementation for DMA and * doesn't rely on dma_ops structure. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information * that the device model core needs to model the system. Most subsystems, * however, track additional information about the devices they host. As a * result, it is rare for devices to be represented by bare device structures; * instead, that structure, like kobject structures, is usually embedded within * a higher-level representation of the device. */ struct device { struct kobject kobj; struct device *parent; struct device_private *p; const char *init_name; /* initial name of the device */ const struct device_type *type; const struct bus_type *bus; /* type of bus device is on */ struct device_driver *driver; /* which driver has allocated this device */ void *platform_data; /* Platform specific data, device core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ struct mutex mutex; /* mutex to synchronize calls to * its driver. */ struct dev_links_info links; struct dev_pm_info power; struct dev_pm_domain *pm_domain; #ifdef CONFIG_ENERGY_MODEL struct em_perf_domain *em_pd; #endif #ifdef CONFIG_PINCTRL struct dev_pin_info *pins; #endif struct dev_msi_info msi; #ifdef CONFIG_ARCH_HAS_DMA_OPS const struct dma_map_ops *dma_ops; #endif u64 *dma_mask; /* dma mask (if dma'able device) */ u64 coherent_dma_mask;/* Like dma_mask, but for alloc_coherent mappings as not all hardware supports 64 bit addresses for consistent allocations such descriptors. */ u64 bus_dma_limit; /* upstream dma constraint */ const struct bus_dma_region *dma_range_map; struct device_dma_parameters *dma_parms; struct list_head dma_pools; /* dma pools (if dma'ble) */ #ifdef CONFIG_DMA_DECLARE_COHERENT struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ #endif #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ #endif #ifdef CONFIG_SWIOTLB struct io_tlb_mem *dma_io_tlb_mem; #endif #ifdef CONFIG_SWIOTLB_DYNAMIC struct list_head dma_io_tlb_pools; spinlock_t dma_io_tlb_lock; bool dma_uses_io_tlb; #endif /* arch specific additions */ struct dev_archdata archdata; struct device_node *of_node; /* associated device tree node */ struct fwnode_handle *fwnode; /* firmware device node */ #ifdef CONFIG_NUMA int numa_node; /* NUMA node this device is close to */ #endif dev_t devt; /* dev_t, creates the sysfs "dev" */ u32 id; /* device instance */ spinlock_t devres_lock; struct list_head devres_head; const struct class *class; const struct attribute_group **groups; /* optional groups */ void (*release)(struct device *dev); struct iommu_group *iommu_group; struct dev_iommu *iommu; struct device_physical_location *physical_location; enum device_removable removable; bool offline_disabled:1; bool offline:1; bool of_node_reused:1; bool state_synced:1; bool can_match:1; #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif #ifdef CONFIG_DMA_NEED_SYNC bool dma_skip_sync:1; #endif #ifdef CONFIG_IOMMU_DMA bool dma_iommu:1; #endif }; /** * struct device_link - Device link representation. * @supplier: The device on the supplier end of the link. * @s_node: Hook to the supplier device's list of links to consumers. * @consumer: The device on the consumer end of the link. * @c_node: Hook to the consumer device's list of links to suppliers. * @link_dev: device used to expose link details in sysfs * @status: The state of the link (with respect to the presence of drivers). * @flags: Link flags. * @rpm_active: Whether or not the consumer device is runtime-PM-active. * @kref: Count repeated addition of the same link. * @rm_work: Work structure used for removing the link. * @supplier_preactivated: Supplier has been made active before consumer probe. */ struct device_link { struct device *supplier; struct list_head s_node; struct device *consumer; struct list_head c_node; struct device link_dev; enum device_link_state status; u32 flags; refcount_t rpm_active; struct kref kref; struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ }; #define kobj_to_dev(__kobj) container_of_const(__kobj, struct device, kobj) /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU * @dev: Device to perform the check on */ static inline bool device_iommu_mapped(struct device *dev) { return (dev->iommu_group != NULL); } /* Get the wakeup routines, which depend on struct device */ #include <linux/pm_wakeup.h> /** * dev_name - Return a device's name. * @dev: Device with name to get. * Return: The kobject name of the device, or its initial name if unavailable. */ static inline const char *dev_name(const struct device *dev) { /* Use the init name until the kobject becomes available */ if (dev->init_name) return dev->init_name; return kobject_name(&dev->kobj); } /** * dev_bus_name - Return a device's bus/class name, if at all possible * @dev: struct device to get the bus/class name of * * Will return the name of the bus/class the device is attached to. If it is * not attached to a bus/class, an empty string will be returned. */ static inline const char *dev_bus_name(const struct device *dev) { return dev->bus ? dev->bus->name : (dev->class ? dev->class->name : ""); } __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { return dev->numa_node; } static inline void set_dev_node(struct device *dev, int node) { dev->numa_node = node; } #else static inline int dev_to_node(struct device *dev) { return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { } #endif static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { #ifdef CONFIG_GENERIC_MSI_IRQ return dev->msi.domain; #else return NULL; #endif } static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { #ifdef CONFIG_GENERIC_MSI_IRQ dev->msi.domain = d; #endif } static inline void *dev_get_drvdata(const struct device *dev) { return dev->driver_data; } static inline void dev_set_drvdata(struct device *dev, void *data) { dev->driver_data = data; } static inline struct pm_subsys_data *dev_to_psd(struct device *dev) { return dev ? dev->power.subsys_data : NULL; } static inline unsigned int dev_get_uevent_suppress(const struct device *dev) { return dev->kobj.uevent_suppress; } static inline void dev_set_uevent_suppress(struct device *dev, int val) { dev->kobj.uevent_suppress = val; } static inline int device_is_registered(struct device *dev) { return dev->kobj.state_in_sysfs; } static inline void device_enable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = true; } static inline void device_disable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = false; } static inline bool device_async_suspend_enabled(struct device *dev) { return !!dev->power.async_suspend; } static inline bool device_pm_not_required(struct device *dev) { return dev->power.no_pm; } static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; } static inline void dev_pm_syscore_device(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.syscore = val; #endif } static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags) { dev->power.driver_flags = flags; } static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) { return !!(dev->power.driver_flags & flags); } static inline bool dev_pm_smart_suspend(struct device *dev) { #ifdef CONFIG_PM_SLEEP return dev->power.smart_suspend; #else return false; #endif } static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); } static inline int device_lock_interruptible(struct device *dev) { return mutex_lock_interruptible(&dev->mutex); } static inline int device_trylock(struct device *dev) { return mutex_trylock(&dev->mutex); } static inline void device_unlock(struct device *dev) { mutex_unlock(&dev->mutex); } DEFINE_GUARD(device, struct device *, device_lock(_T), device_unlock(_T)) static inline void device_lock_assert(struct device *dev) { lockdep_assert_held(&dev->mutex); } static inline bool dev_has_sync_state(struct device *dev) { if (!dev) return false; if (dev->driver && dev->driver->sync_state) return true; if (dev->bus && dev->bus->sync_state) return true; return false; } static inline void dev_set_removable(struct device *dev, enum device_removable removable) { dev->removable = removable; } static inline bool dev_is_removable(struct device *dev) { return dev->removable == DEVICE_REMOVABLE; } static inline bool dev_removable_is_valid(struct device *dev) { return dev->removable != DEVICE_REMOVABLE_NOT_SUPPORTED; } /* * High level routines for use by the bus drivers */ int __must_check device_register(struct device *dev); void device_unregister(struct device *dev); void device_initialize(struct device *dev); int __must_check device_add(struct device *dev); void device_del(struct device *dev); DEFINE_FREE(device_del, struct device *, if (_T) device_del(_T)) int device_for_each_child(struct device *parent, void *data, device_iter_t fn); int device_for_each_child_reverse(struct device *parent, void *data, device_iter_t fn); int device_for_each_child_reverse_from(struct device *parent, struct device *from, void *data, device_iter_t fn); struct device *device_find_child(struct device *parent, const void *data, device_match_t match); /** * device_find_child_by_name - device iterator for locating a child device. * @parent: parent struct device * @name: name of the child device * * This is similar to the device_find_child() function above, but it * returns a reference to a device that has the name @name. * * NOTE: you will need to drop the reference with put_device() after use. */ static inline struct device *device_find_child_by_name(struct device *parent, const char *name) { return device_find_child(parent, name, device_match_name); } /** * device_find_any_child - device iterator for locating a child device, if any. * @parent: parent struct device * * This is similar to the device_find_child() function above, but it * returns a reference to a child device, if any. * * NOTE: you will need to drop the reference with put_device() after use. */ static inline struct device *device_find_any_child(struct device *parent) { return device_find_child(parent, NULL, device_match_any); } int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); static inline bool device_supports_offline(struct device *dev) { return dev->bus && dev->bus->offline && dev->bus->online; } #define __device_lock_set_class(dev, name, key) \ do { \ struct device *__d2 __maybe_unused = dev; \ lock_set_class(&__d2->mutex.dep_map, name, key, 0, _THIS_IP_); \ } while (0) /** * device_lock_set_class - Specify a temporary lock class while a device * is attached to a driver * @dev: device to modify * @key: lock class key data * * This must be called with the device_lock() already held, for example * from driver ->probe(). Take care to only override the default * lockdep_no_validate class. */ #ifdef CONFIG_LOCKDEP #define device_lock_set_class(dev, key) \ do { \ struct device *__d = dev; \ dev_WARN_ONCE(__d, !lockdep_match_class(&__d->mutex, \ &__lockdep_no_validate__), \ "overriding existing custom lock class\n"); \ __device_lock_set_class(__d, #key, key); \ } while (0) #else #define device_lock_set_class(dev, key) __device_lock_set_class(dev, #key, key) #endif /** * device_lock_reset_class - Return a device to the default lockdep novalidate state * @dev: device to modify * * This must be called with the device_lock() already held, for example * from driver ->remove(). */ #define device_lock_reset_class(dev) \ do { \ struct device *__d __maybe_unused = dev; \ lock_set_novalidate_class(&__d->mutex.dep_map, "&dev->mutex", \ _THIS_IP_); \ } while (0) void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_node(struct device *dev, struct fwnode_handle *fwnode); int device_add_of_node(struct device *dev, struct device_node *of_node); void device_remove_of_node(struct device *dev); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); static inline struct device_node *dev_of_node(struct device *dev) { if (!IS_ENABLED(CONFIG_OF) || !dev) return NULL; return dev->of_node; } static inline int dev_num_vf(struct device *dev) { if (dev->bus && dev->bus->num_vf) return dev->bus->num_vf(dev); return 0; } /* * Root device objects for grouping under /sys/devices */ struct device *__root_device_register(const char *name, struct module *owner); /* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) void root_device_unregister(struct device *root); static inline void *dev_get_platdata(const struct device *dev) { return dev->platform_data; } /* * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. */ int __must_check device_driver_attach(const struct device_driver *drv, struct device *dev); int __must_check device_bind_driver(struct device *dev); void device_release_driver(struct device *dev); int __must_check device_attach(struct device *dev); int __must_check driver_attach(const struct device_driver *drv); void device_initial_probe(struct device *dev); int __must_check device_reprobe(struct device *dev); bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ __printf(5, 6) struct device * device_create(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...); __printf(6, 7) struct device * device_create_with_groups(const struct class *cls, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); void device_destroy(const struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, const struct attribute_group **groups); void device_remove_groups(struct device *dev, const struct attribute_group **groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_add_groups(dev, groups); } static inline void device_remove_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; device_remove_groups(dev, groups); } int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); /* * get_device - atomically increment the reference count for the device. * */ struct device *get_device(struct device *dev); void put_device(struct device *dev); DEFINE_FREE(put_device, struct device *, if (_T) put_device(_T)) bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_mount(void); #else static inline int devtmpfs_mount(void) { return 0; } #endif /* drivers/base/power/shutdown.c */ void device_shutdown(void); /* debugging and troubleshooting/diagnostic helpers. */ const char *dev_driver_string(const struct device *dev); /* Device links interface. */ struct device_link *device_link_add(struct device *consumer, struct device *supplier, u32 flags); void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); void device_link_wait_removal(void); /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_CHARDEV_MAJOR(major) \ MODULE_ALIAS("char-major-" __stringify(major) "-*") #endif /* _DEVICE_H_ */ |
| 2 2 2 1 1 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 | // SPDX-License-Identifier: GPL-2.0 /* * RTC subsystem, interface functions * * Copyright (C) 2005 Tower Technologies * Author: Alessandro Zummo <a.zummo@towertech.it> * * based on arch/arm/common/rtctime.c */ #include <linux/rtc.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/log2.h> #include <linux/workqueue.h> #define CREATE_TRACE_POINTS #include <trace/events/rtc.h> static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer); static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer); static void rtc_add_offset(struct rtc_device *rtc, struct rtc_time *tm) { time64_t secs; if (!rtc->offset_secs) return; secs = rtc_tm_to_time64(tm); /* * Since the reading time values from RTC device are always in the RTC * original valid range, but we need to skip the overlapped region * between expanded range and original range, which is no need to add * the offset. */ if ((rtc->start_secs > rtc->range_min && secs >= rtc->start_secs) || (rtc->start_secs < rtc->range_min && secs <= (rtc->start_secs + rtc->range_max - rtc->range_min))) return; rtc_time64_to_tm(secs + rtc->offset_secs, tm); } static void rtc_subtract_offset(struct rtc_device *rtc, struct rtc_time *tm) { time64_t secs; if (!rtc->offset_secs) return; secs = rtc_tm_to_time64(tm); /* * If the setting time values are in the valid range of RTC hardware * device, then no need to subtract the offset when setting time to RTC * device. Otherwise we need to subtract the offset to make the time * values are valid for RTC hardware device. */ if (secs >= rtc->range_min && secs <= rtc->range_max) return; rtc_time64_to_tm(secs - rtc->offset_secs, tm); } static int rtc_valid_range(struct rtc_device *rtc, struct rtc_time *tm) { if (rtc->range_min != rtc->range_max) { time64_t time = rtc_tm_to_time64(tm); time64_t range_min = rtc->set_start_time ? rtc->start_secs : rtc->range_min; timeu64_t range_max = rtc->set_start_time ? (rtc->start_secs + rtc->range_max - rtc->range_min) : rtc->range_max; if (time < range_min || time > range_max) return -ERANGE; } return 0; } static int __rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; if (!rtc->ops) { err = -ENODEV; } else if (!rtc->ops->read_time) { err = -EINVAL; } else { memset(tm, 0, sizeof(struct rtc_time)); err = rtc->ops->read_time(rtc->dev.parent, tm); if (err < 0) { dev_dbg(&rtc->dev, "read_time: fail to read: %d\n", err); return err; } rtc_add_offset(rtc, tm); err = rtc_valid_tm(tm); if (err < 0) dev_dbg(&rtc->dev, "read_time: rtc_time isn't valid\n"); } return err; } int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; err = __rtc_read_time(rtc, tm); mutex_unlock(&rtc->ops_lock); trace_rtc_read_time(rtc_tm_to_time64(tm), err); return err; } EXPORT_SYMBOL_GPL(rtc_read_time); int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm) { int err, uie; err = rtc_valid_tm(tm); if (err != 0) return err; err = rtc_valid_range(rtc, tm); if (err) return err; rtc_subtract_offset(rtc, tm); #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL uie = rtc->uie_rtctimer.enabled || rtc->uie_irq_active; #else uie = rtc->uie_rtctimer.enabled; #endif if (uie) { err = rtc_update_irq_enable(rtc, 0); if (err) return err; } err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (!rtc->ops) err = -ENODEV; else if (rtc->ops->set_time) err = rtc->ops->set_time(rtc->dev.parent, tm); else err = -EINVAL; pm_stay_awake(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); /* A timer might have just expired */ schedule_work(&rtc->irqwork); if (uie) { err = rtc_update_irq_enable(rtc, 1); if (err) return err; } trace_rtc_set_time(rtc_tm_to_time64(tm), err); return err; } EXPORT_SYMBOL_GPL(rtc_set_time); static int rtc_read_alarm_internal(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (!rtc->ops) { err = -ENODEV; } else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->read_alarm) { err = -EINVAL; } else { alarm->enabled = 0; alarm->pending = 0; alarm->time.tm_sec = -1; alarm->time.tm_min = -1; alarm->time.tm_hour = -1; alarm->time.tm_mday = -1; alarm->time.tm_mon = -1; alarm->time.tm_year = -1; alarm->time.tm_wday = -1; alarm->time.tm_yday = -1; alarm->time.tm_isdst = -1; err = rtc->ops->read_alarm(rtc->dev.parent, alarm); } mutex_unlock(&rtc->ops_lock); trace_rtc_read_alarm(rtc_tm_to_time64(&alarm->time), err); return err; } int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; struct rtc_time before, now; int first_time = 1; time64_t t_now, t_alm; enum { none, day, month, year } missing = none; unsigned int days; /* The lower level RTC driver may return -1 in some fields, * creating invalid alarm->time values, for reasons like: * * - The hardware may not be capable of filling them in; * many alarms match only on time-of-day fields, not * day/month/year calendar data. * * - Some hardware uses illegal values as "wildcard" match * values, which non-Linux firmware (like a BIOS) may try * to set up as e.g. "alarm 15 minutes after each hour". * Linux uses only oneshot alarms. * * When we see that here, we deal with it by using values from * a current RTC timestamp for any missing (-1) values. The * RTC driver prevents "periodic alarm" modes. * * But this can be racey, because some fields of the RTC timestamp * may have wrapped in the interval since we read the RTC alarm, * which would lead to us inserting inconsistent values in place * of the -1 fields. * * Reading the alarm and timestamp in the reverse sequence * would have the same race condition, and not solve the issue. * * So, we must first read the RTC timestamp, * then read the RTC alarm value, * and then read a second RTC timestamp. * * If any fields of the second timestamp have changed * when compared with the first timestamp, then we know * our timestamp may be inconsistent with that used by * the low-level rtc_read_alarm_internal() function. * * So, when the two timestamps disagree, we just loop and do * the process again to get a fully consistent set of values. * * This could all instead be done in the lower level driver, * but since more than one lower level RTC implementation needs it, * then it's probably best to do it here instead of there.. */ /* Get the "before" timestamp */ err = rtc_read_time(rtc, &before); if (err < 0) return err; do { if (!first_time) memcpy(&before, &now, sizeof(struct rtc_time)); first_time = 0; /* get the RTC alarm values, which may be incomplete */ err = rtc_read_alarm_internal(rtc, alarm); if (err) return err; /* full-function RTCs won't have such missing fields */ err = rtc_valid_tm(&alarm->time); if (!err) goto done; /* get the "after" timestamp, to detect wrapped fields */ err = rtc_read_time(rtc, &now); if (err < 0) return err; /* note that tm_sec is a "don't care" value here: */ } while (before.tm_min != now.tm_min || before.tm_hour != now.tm_hour || before.tm_mon != now.tm_mon || before.tm_year != now.tm_year); /* Fill in the missing alarm fields using the timestamp; we * know there's at least one since alarm->time is invalid. */ if (alarm->time.tm_sec == -1) alarm->time.tm_sec = now.tm_sec; if (alarm->time.tm_min == -1) alarm->time.tm_min = now.tm_min; if (alarm->time.tm_hour == -1) alarm->time.tm_hour = now.tm_hour; /* For simplicity, only support date rollover for now */ if (alarm->time.tm_mday < 1 || alarm->time.tm_mday > 31) { alarm->time.tm_mday = now.tm_mday; missing = day; } if ((unsigned int)alarm->time.tm_mon >= 12) { alarm->time.tm_mon = now.tm_mon; if (missing == none) missing = month; } if (alarm->time.tm_year == -1) { alarm->time.tm_year = now.tm_year; if (missing == none) missing = year; } /* Can't proceed if alarm is still invalid after replacing * missing fields. */ err = rtc_valid_tm(&alarm->time); if (err) goto done; /* with luck, no rollover is needed */ t_now = rtc_tm_to_time64(&now); t_alm = rtc_tm_to_time64(&alarm->time); if (t_now < t_alm) goto done; switch (missing) { /* 24 hour rollover ... if it's now 10am Monday, an alarm that * that will trigger at 5am will do so at 5am Tuesday, which * could also be in the next month or year. This is a common * case, especially for PCs. */ case day: dev_dbg(&rtc->dev, "alarm rollover: %s\n", "day"); t_alm += 24 * 60 * 60; rtc_time64_to_tm(t_alm, &alarm->time); break; /* Month rollover ... if it's the 31th, an alarm on the 3rd will * be next month. An alarm matching on the 30th, 29th, or 28th * may end up in the month after that! Many newer PCs support * this type of alarm. */ case month: dev_dbg(&rtc->dev, "alarm rollover: %s\n", "month"); do { if (alarm->time.tm_mon < 11) { alarm->time.tm_mon++; } else { alarm->time.tm_mon = 0; alarm->time.tm_year++; } days = rtc_month_days(alarm->time.tm_mon, alarm->time.tm_year); } while (days < alarm->time.tm_mday); break; /* Year rollover ... easy except for leap years! */ case year: dev_dbg(&rtc->dev, "alarm rollover: %s\n", "year"); do { alarm->time.tm_year++; } while (!is_leap_year(alarm->time.tm_year + 1900) && rtc_valid_tm(&alarm->time) != 0); break; default: dev_warn(&rtc->dev, "alarm rollover not handled\n"); } err = rtc_valid_tm(&alarm->time); done: if (err && alarm->enabled) dev_warn(&rtc->dev, "invalid alarm value: %ptR\n", &alarm->time); else rtc_add_offset(rtc, &alarm->time); return err; } int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (!rtc->ops) { err = -ENODEV; } else if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) { err = -EINVAL; } else { memset(alarm, 0, sizeof(struct rtc_wkalrm)); alarm->enabled = rtc->aie_timer.enabled; alarm->time = rtc_ktime_to_tm(rtc->aie_timer.node.expires); } mutex_unlock(&rtc->ops_lock); trace_rtc_read_alarm(rtc_tm_to_time64(&alarm->time), err); return err; } EXPORT_SYMBOL_GPL(rtc_read_alarm); static int __rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { struct rtc_time tm; time64_t now, scheduled; int err; err = rtc_valid_tm(&alarm->time); if (err) return err; scheduled = rtc_tm_to_time64(&alarm->time); /* Make sure we're not setting alarms in the past */ err = __rtc_read_time(rtc, &tm); if (err) return err; now = rtc_tm_to_time64(&tm); if (scheduled <= now) return -ETIME; /* * XXX - We just checked to make sure the alarm time is not * in the past, but there is still a race window where if * the is alarm set for the next second and the second ticks * over right here, before we set the alarm. */ rtc_subtract_offset(rtc, &alarm->time); if (!rtc->ops) err = -ENODEV; else if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) err = -EINVAL; else err = rtc->ops->set_alarm(rtc->dev.parent, alarm); trace_rtc_set_alarm(rtc_tm_to_time64(&alarm->time), err); return err; } int rtc_set_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { ktime_t alarm_time; int err; if (!rtc->ops) return -ENODEV; else if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) return -EINVAL; err = rtc_valid_tm(&alarm->time); if (err != 0) return err; err = rtc_valid_range(rtc, &alarm->time); if (err) return err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (rtc->aie_timer.enabled) rtc_timer_remove(rtc, &rtc->aie_timer); alarm_time = rtc_tm_to_ktime(alarm->time); /* * Round down so we never miss a deadline, checking for past deadline is * done in __rtc_set_alarm */ if (test_bit(RTC_FEATURE_ALARM_RES_MINUTE, rtc->features)) alarm_time = ktime_sub_ns(alarm_time, (u64)alarm->time.tm_sec * NSEC_PER_SEC); rtc->aie_timer.node.expires = alarm_time; rtc->aie_timer.period = 0; if (alarm->enabled) err = rtc_timer_enqueue(rtc, &rtc->aie_timer); mutex_unlock(&rtc->ops_lock); return err; } EXPORT_SYMBOL_GPL(rtc_set_alarm); /* Called once per device from rtc_device_register */ int rtc_initialize_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm) { int err; struct rtc_time now; err = rtc_valid_tm(&alarm->time); if (err != 0) return err; err = rtc_read_time(rtc, &now); if (err) return err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; rtc->aie_timer.node.expires = rtc_tm_to_ktime(alarm->time); rtc->aie_timer.period = 0; /* Alarm has to be enabled & in the future for us to enqueue it */ if (alarm->enabled && (rtc_tm_to_ktime(now) < rtc->aie_timer.node.expires)) { rtc->aie_timer.enabled = 1; timerqueue_add(&rtc->timerqueue, &rtc->aie_timer.node); trace_rtc_timer_enqueue(&rtc->aie_timer); } mutex_unlock(&rtc->ops_lock); return err; } EXPORT_SYMBOL_GPL(rtc_initialize_alarm); int rtc_alarm_irq_enable(struct rtc_device *rtc, unsigned int enabled) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; if (rtc->aie_timer.enabled != enabled) { if (enabled) err = rtc_timer_enqueue(rtc, &rtc->aie_timer); else rtc_timer_remove(rtc, &rtc->aie_timer); } if (err) /* nothing */; else if (!rtc->ops) err = -ENODEV; else if (!test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->alarm_irq_enable) err = -EINVAL; else err = rtc->ops->alarm_irq_enable(rtc->dev.parent, enabled); mutex_unlock(&rtc->ops_lock); trace_rtc_alarm_irq_enable(enabled, err); return err; } EXPORT_SYMBOL_GPL(rtc_alarm_irq_enable); int rtc_update_irq_enable(struct rtc_device *rtc, unsigned int enabled) { int err; err = mutex_lock_interruptible(&rtc->ops_lock); if (err) return err; #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL if (enabled == 0 && rtc->uie_irq_active) { mutex_unlock(&rtc->ops_lock); return rtc_dev_update_irq_enable_emul(rtc, 0); } #endif /* make sure we're changing state */ if (rtc->uie_rtctimer.enabled == enabled) goto out; if (!test_bit(RTC_FEATURE_UPDATE_INTERRUPT, rtc->features) || !test_bit(RTC_FEATURE_ALARM, rtc->features)) { mutex_unlock(&rtc->ops_lock); #ifdef CONFIG_RTC_INTF_DEV_UIE_EMUL return rtc_dev_update_irq_enable_emul(rtc, enabled); #else return -EINVAL; #endif } if (enabled) { struct rtc_time tm; ktime_t now, onesec; err = __rtc_read_time(rtc, &tm); if (err) goto out; onesec = ktime_set(1, 0); now = rtc_tm_to_ktime(tm); rtc->uie_rtctimer.node.expires = ktime_add(now, onesec); rtc->uie_rtctimer.period = ktime_set(1, 0); err = rtc_timer_enqueue(rtc, &rtc->uie_rtctimer); } else { rtc_timer_remove(rtc, &rtc->uie_rtctimer); } out: mutex_unlock(&rtc->ops_lock); return err; } EXPORT_SYMBOL_GPL(rtc_update_irq_enable); /** * rtc_handle_legacy_irq - AIE, UIE and PIE event hook * @rtc: pointer to the rtc device * @num: number of occurence of the event * @mode: type of the event, RTC_AF, RTC_UF of RTC_PF * * This function is called when an AIE, UIE or PIE mode interrupt * has occurred (or been emulated). * */ void rtc_handle_legacy_irq(struct rtc_device *rtc, int num, int mode) { unsigned long flags; /* mark one irq of the appropriate mode */ spin_lock_irqsave(&rtc->irq_lock, flags); rtc->irq_data = (rtc->irq_data + (num << 8)) | (RTC_IRQF | mode); spin_unlock_irqrestore(&rtc->irq_lock, flags); wake_up_interruptible(&rtc->irq_queue); kill_fasync(&rtc->async_queue, SIGIO, POLL_IN); } /** * rtc_aie_update_irq - AIE mode rtctimer hook * @rtc: pointer to the rtc_device * * This functions is called when the aie_timer expires. */ void rtc_aie_update_irq(struct rtc_device *rtc) { rtc_handle_legacy_irq(rtc, 1, RTC_AF); } /** * rtc_uie_update_irq - UIE mode rtctimer hook * @rtc: pointer to the rtc_device * * This functions is called when the uie_timer expires. */ void rtc_uie_update_irq(struct rtc_device *rtc) { rtc_handle_legacy_irq(rtc, 1, RTC_UF); } /** * rtc_pie_update_irq - PIE mode hrtimer hook * @timer: pointer to the pie mode hrtimer * * This function is used to emulate PIE mode interrupts * using an hrtimer. This function is called when the periodic * hrtimer expires. */ enum hrtimer_restart rtc_pie_update_irq(struct hrtimer *timer) { struct rtc_device *rtc; ktime_t period; u64 count; rtc = container_of(timer, struct rtc_device, pie_timer); period = NSEC_PER_SEC / rtc->irq_freq; count = hrtimer_forward_now(timer, period); rtc_handle_legacy_irq(rtc, count, RTC_PF); return HRTIMER_RESTART; } /** * rtc_update_irq - Triggered when a RTC interrupt occurs. * @rtc: the rtc device * @num: how many irqs are being reported (usually one) * @events: mask of RTC_IRQF with one or more of RTC_PF, RTC_AF, RTC_UF * Context: any */ void rtc_update_irq(struct rtc_device *rtc, unsigned long num, unsigned long events) { if (IS_ERR_OR_NULL(rtc)) return; pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } EXPORT_SYMBOL_GPL(rtc_update_irq); struct rtc_device *rtc_class_open(const char *name) { struct device *dev; struct rtc_device *rtc = NULL; dev = class_find_device_by_name(&rtc_class, name); if (dev) rtc = to_rtc_device(dev); if (rtc) { if (!try_module_get(rtc->owner)) { put_device(dev); rtc = NULL; } } return rtc; } EXPORT_SYMBOL_GPL(rtc_class_open); void rtc_class_close(struct rtc_device *rtc) { module_put(rtc->owner); put_device(&rtc->dev); } EXPORT_SYMBOL_GPL(rtc_class_close); static int rtc_update_hrtimer(struct rtc_device *rtc, int enabled) { /* * We always cancel the timer here first, because otherwise * we could run into BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); * when we manage to start the timer before the callback * returns HRTIMER_RESTART. * * We cannot use hrtimer_cancel() here as a running callback * could be blocked on rtc->irq_task_lock and hrtimer_cancel() * would spin forever. */ if (hrtimer_try_to_cancel(&rtc->pie_timer) < 0) return -1; if (enabled) { ktime_t period = NSEC_PER_SEC / rtc->irq_freq; hrtimer_start(&rtc->pie_timer, period, HRTIMER_MODE_REL); } return 0; } /** * rtc_irq_set_state - enable/disable 2^N Hz periodic IRQs * @rtc: the rtc device * @enabled: true to enable periodic IRQs * Context: any * * Note that rtc_irq_set_freq() should previously have been used to * specify the desired frequency of periodic IRQ. */ int rtc_irq_set_state(struct rtc_device *rtc, int enabled) { int err = 0; while (rtc_update_hrtimer(rtc, enabled) < 0) cpu_relax(); rtc->pie_enabled = enabled; trace_rtc_irq_set_state(enabled, err); return err; } /** * rtc_irq_set_freq - set 2^N Hz periodic IRQ frequency for IRQ * @rtc: the rtc device * @freq: positive frequency * Context: any * * Note that rtc_irq_set_state() is used to enable or disable the * periodic IRQs. */ int rtc_irq_set_freq(struct rtc_device *rtc, int freq) { int err = 0; if (freq <= 0 || freq > RTC_MAX_FREQ) return -EINVAL; rtc->irq_freq = freq; while (rtc->pie_enabled && rtc_update_hrtimer(rtc, 1) < 0) cpu_relax(); trace_rtc_irq_set_freq(freq, err); return err; } /** * rtc_timer_enqueue - Adds a rtc_timer to the rtc_device timerqueue * @rtc: rtc device * @timer: timer being added. * * Enqueues a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * * Sets the enabled bit on the added timer. * * Must hold ops_lock for proper serialization of timerqueue */ static int rtc_timer_enqueue(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); struct rtc_time tm; ktime_t now; int err; err = __rtc_read_time(rtc, &tm); if (err) return err; timer->enabled = 1; now = rtc_tm_to_ktime(tm); /* Skip over expired timers */ while (next) { if (next->expires >= now) break; next = timerqueue_iterate_next(next); } timerqueue_add(&rtc->timerqueue, &timer->node); trace_rtc_timer_enqueue(timer); if (!next || ktime_before(timer->node.expires, next->expires)) { struct rtc_wkalrm alarm; alarm.time = rtc_ktime_to_tm(timer->node.expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) { pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } else if (err) { timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; return err; } } return 0; } static void rtc_alarm_disable(struct rtc_device *rtc) { if (!rtc->ops || !test_bit(RTC_FEATURE_ALARM, rtc->features) || !rtc->ops->alarm_irq_enable) return; rtc->ops->alarm_irq_enable(rtc->dev.parent, false); trace_rtc_alarm_irq_enable(0, 0); } /** * rtc_timer_remove - Removes a rtc_timer from the rtc_device timerqueue * @rtc: rtc device * @timer: timer being removed. * * Removes a timer onto the rtc devices timerqueue and sets * the next alarm event appropriately. * * Clears the enabled bit on the removed timer. * * Must hold ops_lock for proper serialization of timerqueue */ static void rtc_timer_remove(struct rtc_device *rtc, struct rtc_timer *timer) { struct timerqueue_node *next = timerqueue_getnext(&rtc->timerqueue); timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; if (next == &timer->node) { struct rtc_wkalrm alarm; int err; next = timerqueue_getnext(&rtc->timerqueue); if (!next) { rtc_alarm_disable(rtc); return; } alarm.time = rtc_ktime_to_tm(next->expires); alarm.enabled = 1; err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) { pm_stay_awake(rtc->dev.parent); schedule_work(&rtc->irqwork); } } } /** * rtc_timer_do_work - Expires rtc timers * @work: work item * * Expires rtc timers. Reprograms next alarm event if needed. * Called via worktask. * * Serializes access to timerqueue via ops_lock mutex */ void rtc_timer_do_work(struct work_struct *work) { struct rtc_timer *timer; struct timerqueue_node *next; ktime_t now; struct rtc_time tm; int err; struct rtc_device *rtc = container_of(work, struct rtc_device, irqwork); mutex_lock(&rtc->ops_lock); again: err = __rtc_read_time(rtc, &tm); if (err) { mutex_unlock(&rtc->ops_lock); return; } now = rtc_tm_to_ktime(tm); while ((next = timerqueue_getnext(&rtc->timerqueue))) { if (next->expires > now) break; /* expire timer */ timer = container_of(next, struct rtc_timer, node); timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; if (timer->func) timer->func(timer->rtc); trace_rtc_timer_fired(timer); /* Re-add/fwd periodic timers */ if (ktime_to_ns(timer->period)) { timer->node.expires = ktime_add(timer->node.expires, timer->period); timer->enabled = 1; timerqueue_add(&rtc->timerqueue, &timer->node); trace_rtc_timer_enqueue(timer); } } /* Set next alarm */ if (next) { struct rtc_wkalrm alarm; int err; int retry = 3; alarm.time = rtc_ktime_to_tm(next->expires); alarm.enabled = 1; reprogram: err = __rtc_set_alarm(rtc, &alarm); if (err == -ETIME) { goto again; } else if (err) { if (retry-- > 0) goto reprogram; timer = container_of(next, struct rtc_timer, node); timerqueue_del(&rtc->timerqueue, &timer->node); trace_rtc_timer_dequeue(timer); timer->enabled = 0; dev_err(&rtc->dev, "__rtc_set_alarm: err=%d\n", err); goto again; } } else { rtc_alarm_disable(rtc); } pm_relax(rtc->dev.parent); mutex_unlock(&rtc->ops_lock); } /* rtc_timer_init - Initializes an rtc_timer * @timer: timer to be intiialized * @f: function pointer to be called when timer fires * @rtc: pointer to the rtc_device * * Kernel interface to initializing an rtc_timer. */ void rtc_timer_init(struct rtc_timer *timer, void (*f)(struct rtc_device *r), struct rtc_device *rtc) { timerqueue_init(&timer->node); timer->enabled = 0; timer->func = f; timer->rtc = rtc; } /* rtc_timer_start - Sets an rtc_timer to fire in the future * @ rtc: rtc device to be used * @ timer: timer being set * @ expires: time at which to expire the timer * @ period: period that the timer will recur * * Kernel interface to set an rtc_timer */ int rtc_timer_start(struct rtc_device *rtc, struct rtc_timer *timer, ktime_t expires, ktime_t period) { int ret = 0; mutex_lock(&rtc->ops_lock); if (timer->enabled) rtc_timer_remove(rtc, timer); timer->node.expires = expires; timer->period = period; ret = rtc_timer_enqueue(rtc, timer); mutex_unlock(&rtc->ops_lock); return ret; } /* rtc_timer_cancel - Stops an rtc_timer * @ rtc: rtc device to be used * @ timer: timer being set * * Kernel interface to cancel an rtc_timer */ void rtc_timer_cancel(struct rtc_device *rtc, struct rtc_timer *timer) { mutex_lock(&rtc->ops_lock); if (timer->enabled) rtc_timer_remove(rtc, timer); mutex_unlock(&rtc->ops_lock); } /** * rtc_read_offset - Read the amount of rtc offset in parts per billion * @rtc: rtc device to be used * @offset: the offset in parts per billion * * see below for details. * * Kernel interface to read rtc clock offset * Returns 0 on success, or a negative number on error. * If read_offset() is not implemented for the rtc, return -EINVAL */ int rtc_read_offset(struct rtc_device *rtc, long *offset) { int ret; if (!rtc->ops) return -ENODEV; if (!rtc->ops->read_offset) return -EINVAL; mutex_lock(&rtc->ops_lock); ret = rtc->ops->read_offset(rtc->dev.parent, offset); mutex_unlock(&rtc->ops_lock); trace_rtc_read_offset(*offset, ret); return ret; } /** * rtc_set_offset - Adjusts the duration of the average second * @rtc: rtc device to be used * @offset: the offset in parts per billion * * Some rtc's allow an adjustment to the average duration of a second * to compensate for differences in the actual clock rate due to temperature, * the crystal, capacitor, etc. * * The adjustment applied is as follows: * t = t0 * (1 + offset * 1e-9) * where t0 is the measured length of 1 RTC second with offset = 0 * * Kernel interface to adjust an rtc clock offset. * Return 0 on success, or a negative number on error. * If the rtc offset is not setable (or not implemented), return -EINVAL */ int rtc_set_offset(struct rtc_device *rtc, long offset) { int ret; if (!rtc->ops) return -ENODEV; if (!rtc->ops->set_offset) return -EINVAL; mutex_lock(&rtc->ops_lock); ret = rtc->ops->set_offset(rtc->dev.parent, offset); mutex_unlock(&rtc->ops_lock); trace_rtc_set_offset(offset, ret); return ret; } |
| 72 72 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | // SPDX-License-Identifier: GPL-2.0 /* * fs/partitions/osf.c * * Code extracted from drivers/block/genhd.c * * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King */ #include "check.h" #define MAX_OSF_PARTITIONS 18 #define DISKLABELMAGIC (0x82564557UL) int osf_partition(struct parsed_partitions *state) { int i; int slot = 1; unsigned int npartitions; Sector sect; unsigned char *data; struct disklabel { __le32 d_magic; __le16 d_type,d_subtype; u8 d_typename[16]; u8 d_packname[16]; __le32 d_secsize; __le32 d_nsectors; __le32 d_ntracks; __le32 d_ncylinders; __le32 d_secpercyl; __le32 d_secprtunit; __le16 d_sparespertrack; __le16 d_sparespercyl; __le32 d_acylinders; __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; __le32 d_headswitch, d_trkseek, d_flags; __le32 d_drivedata[5]; __le32 d_spare[5]; __le32 d_magic2; __le16 d_checksum; __le16 d_npartitions; __le32 d_bbsize, d_sbsize; struct d_partition { __le32 p_size; __le32 p_offset; __le32 p_fsize; u8 p_fstype; u8 p_frag; __le16 p_cpg; } d_partitions[MAX_OSF_PARTITIONS]; } * label; struct d_partition * partition; data = read_part_sector(state, 0, §); if (!data) return -1; label = (struct disklabel *) (data+64); partition = label->d_partitions; if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { put_dev_sector(sect); return 0; } if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { put_dev_sector(sect); return 0; } npartitions = le16_to_cpu(label->d_npartitions); if (npartitions > MAX_OSF_PARTITIONS) { put_dev_sector(sect); return 0; } for (i = 0 ; i < npartitions; i++, partition++) { if (slot == state->limit) break; if (le32_to_cpu(partition->p_size)) put_partition(state, slot, le32_to_cpu(partition->p_offset), le32_to_cpu(partition->p_size)); slot++; } strlcat(state->pp_buf, "\n", PAGE_SIZE); put_dev_sector(sect); return 1; } |
| 200 4 12 8 8 8 12 12 12 12 12 12 12 4 1 9 9 7 1 8 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 8 7 7 7 7 7 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 7 5 6 12 10 2 12 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 | /* inflate.c -- zlib decompression * Copyright (C) 1995-2005 Mark Adler * For conditions of distribution and use, see copyright notice in zlib.h * * Based on zlib 1.2.3 but modified for the Linux Kernel by * Richard Purdie <richard@openedhand.com> * * Changes mainly for static instead of dynamic memory allocation * */ #include <linux/zutil.h> #include "inftrees.h" #include "inflate.h" #include "inffast.h" #include "infutil.h" /* architecture-specific bits */ #ifdef CONFIG_ZLIB_DFLTCC # include "../zlib_dfltcc/dfltcc_inflate.h" #else #define INFLATE_RESET_HOOK(strm) do {} while (0) #define INFLATE_TYPEDO_HOOK(strm, flush) do {} while (0) #define INFLATE_NEED_UPDATEWINDOW(strm) 1 #define INFLATE_NEED_CHECKSUM(strm) 1 #endif int zlib_inflate_workspacesize(void) { return sizeof(struct inflate_workspace); } int zlib_inflateReset(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; strm->total_in = strm->total_out = state->total = 0; strm->msg = NULL; strm->adler = 1; /* to support ill-conceived Java test suite */ state->mode = HEAD; state->last = 0; state->havedict = 0; state->dmax = 32768U; state->hold = 0; state->bits = 0; state->lencode = state->distcode = state->next = state->codes; /* Initialise Window */ state->wsize = 1U << state->wbits; state->write = 0; state->whave = 0; INFLATE_RESET_HOOK(strm); return Z_OK; } int zlib_inflateInit2(z_streamp strm, int windowBits) { struct inflate_state *state; if (strm == NULL) return Z_STREAM_ERROR; strm->msg = NULL; /* in case we return an error */ state = &WS(strm)->inflate_state; strm->state = (struct internal_state *)state; if (windowBits < 0) { state->wrap = 0; windowBits = -windowBits; } else { state->wrap = (windowBits >> 4) + 1; } if (windowBits < 8 || windowBits > 15) { return Z_STREAM_ERROR; } state->wbits = (unsigned)windowBits; #ifdef CONFIG_ZLIB_DFLTCC /* * DFLTCC requires the window to be page aligned. * Thus, we overallocate and take the aligned portion of the buffer. */ state->window = PTR_ALIGN(&WS(strm)->working_window[0], PAGE_SIZE); #else state->window = &WS(strm)->working_window[0]; #endif return zlib_inflateReset(strm); } /* Return state with length and distance decoding tables and index sizes set to fixed code decoding. This returns fixed tables from inffixed.h. */ static void zlib_fixedtables(struct inflate_state *state) { # include "inffixed.h" state->lencode = lenfix; state->lenbits = 9; state->distcode = distfix; state->distbits = 5; } /* Update the window with the last wsize (normally 32K) bytes written before returning. This is only called when a window is already in use, or when output has been written during this inflate call, but the end of the deflate stream has not been reached yet. It is also called to window dictionary data when a dictionary is loaded. Providing output buffers larger than 32K to inflate() should provide a speed advantage, since only the last 32K of output is copied to the sliding window upon return from inflate(), and since all distances after the first 32K of output will fall in the output data, making match copies simpler and faster. The advantage may be dependent on the size of the processor's data caches. */ static void zlib_updatewindow(z_streamp strm, unsigned out) { struct inflate_state *state; unsigned copy, dist; state = (struct inflate_state *)strm->state; /* copy state->wsize or less output bytes into the circular window */ copy = out - strm->avail_out; if (copy >= state->wsize) { memcpy(state->window, strm->next_out - state->wsize, state->wsize); state->write = 0; state->whave = state->wsize; } else { dist = state->wsize - state->write; if (dist > copy) dist = copy; memcpy(state->window + state->write, strm->next_out - copy, dist); copy -= dist; if (copy) { memcpy(state->window, strm->next_out - copy, copy); state->write = copy; state->whave = state->wsize; } else { state->write += dist; if (state->write == state->wsize) state->write = 0; if (state->whave < state->wsize) state->whave += dist; } } } /* * At the end of a Deflate-compressed PPP packet, we expect to have seen * a `stored' block type value but not the (zero) length bytes. */ /* Returns true if inflate is currently at the end of a block generated by Z_SYNC_FLUSH or Z_FULL_FLUSH. This function is used by one PPP implementation to provide an additional safety check. PPP uses Z_SYNC_FLUSH but removes the length bytes of the resulting empty stored block. When decompressing, PPP checks that at the end of input packet, inflate is waiting for these length bytes. */ static int zlib_inflateSyncPacket(z_streamp strm) { struct inflate_state *state; if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == STORED && state->bits == 0) { state->mode = TYPE; return Z_OK; } return Z_DATA_ERROR; } /* Macros for inflate(): */ /* check function to use adler32() for zlib or crc32() for gzip */ #define UPDATE(check, buf, len) zlib_adler32(check, buf, len) /* Load registers with state in inflate() for speed */ #define LOAD() \ do { \ put = strm->next_out; \ left = strm->avail_out; \ next = strm->next_in; \ have = strm->avail_in; \ hold = state->hold; \ bits = state->bits; \ } while (0) /* Restore state from registers in inflate() */ #define RESTORE() \ do { \ strm->next_out = put; \ strm->avail_out = left; \ strm->next_in = next; \ strm->avail_in = have; \ state->hold = hold; \ state->bits = bits; \ } while (0) /* Clear the input bit accumulator */ #define INITBITS() \ do { \ hold = 0; \ bits = 0; \ } while (0) /* Get a byte of input into the bit accumulator, or return from inflate() if there is no input available. */ #define PULLBYTE() \ do { \ if (have == 0) goto inf_leave; \ have--; \ hold += (unsigned long)(*next++) << bits; \ bits += 8; \ } while (0) /* Assure that there are at least n bits in the bit accumulator. If there is not enough available input to do that, then return from inflate(). */ #define NEEDBITS(n) \ do { \ while (bits < (unsigned)(n)) \ PULLBYTE(); \ } while (0) /* Return the low n bits of the bit accumulator (n < 16) */ #define BITS(n) \ ((unsigned)hold & ((1U << (n)) - 1)) /* Remove n bits from the bit accumulator */ #define DROPBITS(n) \ do { \ hold >>= (n); \ bits -= (unsigned)(n); \ } while (0) /* Remove zero to seven bits as needed to go to a byte boundary */ #define BYTEBITS() \ do { \ hold >>= bits & 7; \ bits -= bits & 7; \ } while (0) /* inflate() uses a state machine to process as much input data and generate as much output data as possible before returning. The state machine is structured roughly as follows: for (;;) switch (state) { ... case STATEn: if (not enough input data or output space to make progress) return; ... make progress ... state = STATEm; break; ... } so when inflate() is called again, the same case is attempted again, and if the appropriate resources are provided, the machine proceeds to the next state. The NEEDBITS() macro is usually the way the state evaluates whether it can proceed or should return. NEEDBITS() does the return if the requested bits are not available. The typical use of the BITS macros is: NEEDBITS(n); ... do something with BITS(n) ... DROPBITS(n); where NEEDBITS(n) either returns from inflate() if there isn't enough input left to load n bits into the accumulator, or it continues. BITS(n) gives the low n bits in the accumulator. When done, DROPBITS(n) drops the low n bits off the accumulator. INITBITS() clears the accumulator and sets the number of available bits to zero. BYTEBITS() discards just enough bits to put the accumulator on a byte boundary. After BYTEBITS() and a NEEDBITS(8), then BITS(8) would return the next byte in the stream. NEEDBITS(n) uses PULLBYTE() to get an available byte of input, or to return if there is no input available. The decoding of variable length codes uses PULLBYTE() directly in order to pull just enough bytes to decode the next code, and no more. Some states loop until they get enough input, making sure that enough state information is maintained to continue the loop where it left off if NEEDBITS() returns in the loop. For example, want, need, and keep would all have to actually be part of the saved state in case NEEDBITS() returns: case STATEw: while (want < need) { NEEDBITS(n); keep[want++] = BITS(n); DROPBITS(n); } state = STATEx; case STATEx: As shown above, if the next state is also the next case, then the break is omitted. A state may also return if there is not enough output space available to complete that state. Those states are copying stored data, writing a literal byte, and copying a matching string. When returning, a "goto inf_leave" is used to update the total counters, update the check value, and determine whether any progress has been made during that inflate() call in order to return the proper return code. Progress is defined as a change in either strm->avail_in or strm->avail_out. When there is a window, goto inf_leave will update the window with the last output written. If a goto inf_leave occurs in the middle of decompression and there is no window currently, goto inf_leave will create one and copy output to the window for the next call of inflate(). In this implementation, the flush parameter of inflate() only affects the return code (per zlib.h). inflate() always writes as much as possible to strm->next_out, given the space available and the provided input--the effect documented in zlib.h of Z_SYNC_FLUSH. Furthermore, inflate() always defers the allocation of and copying into a sliding window until necessary, which provides the effect documented in zlib.h for Z_FINISH when the entire input stream available. So the only thing the flush parameter actually does is: when flush is set to Z_FINISH, inflate() cannot return Z_OK. Instead it will return Z_BUF_ERROR if it has not reached the end of the stream. */ int zlib_inflate(z_streamp strm, int flush) { struct inflate_state *state; const unsigned char *next; /* next input */ unsigned char *put; /* next output */ unsigned have, left; /* available input and output */ unsigned long hold; /* bit buffer */ unsigned bits; /* bits in bit buffer */ unsigned in, out; /* save starting available input and output */ unsigned copy; /* number of stored or match bytes to copy */ unsigned char *from; /* where to copy match bytes from */ code this; /* current decoding table entry */ code last; /* parent table entry */ unsigned len; /* length to copy for repeats, bits to drop */ int ret; /* return code */ static const unsigned short order[19] = /* permutation of code lengths */ {16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; /* Do not check for strm->next_out == NULL here as ppc zImage inflates to strm->next_out = 0 */ if (strm == NULL || strm->state == NULL || (strm->next_in == NULL && strm->avail_in != 0)) return Z_STREAM_ERROR; state = (struct inflate_state *)strm->state; if (state->mode == TYPE) state->mode = TYPEDO; /* skip check */ LOAD(); in = have; out = left; ret = Z_OK; for (;;) switch (state->mode) { case HEAD: if (state->wrap == 0) { state->mode = TYPEDO; break; } NEEDBITS(16); if ( ((BITS(8) << 8) + (hold >> 8)) % 31) { strm->msg = (char *)"incorrect header check"; state->mode = BAD; break; } if (BITS(4) != Z_DEFLATED) { strm->msg = (char *)"unknown compression method"; state->mode = BAD; break; } DROPBITS(4); len = BITS(4) + 8; if (len > state->wbits) { strm->msg = (char *)"invalid window size"; state->mode = BAD; break; } state->dmax = 1U << len; strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = hold & 0x200 ? DICTID : TYPE; INITBITS(); break; case DICTID: NEEDBITS(32); strm->adler = state->check = REVERSE(hold); INITBITS(); state->mode = DICT; fallthrough; case DICT: if (state->havedict == 0) { RESTORE(); return Z_NEED_DICT; } strm->adler = state->check = zlib_adler32(0L, NULL, 0); state->mode = TYPE; fallthrough; case TYPE: if (flush == Z_BLOCK) goto inf_leave; fallthrough; case TYPEDO: INFLATE_TYPEDO_HOOK(strm, flush); if (state->last) { BYTEBITS(); state->mode = CHECK; break; } NEEDBITS(3); state->last = BITS(1); DROPBITS(1); switch (BITS(2)) { case 0: /* stored block */ state->mode = STORED; break; case 1: /* fixed block */ zlib_fixedtables(state); state->mode = LEN; /* decode codes */ break; case 2: /* dynamic block */ state->mode = TABLE; break; case 3: strm->msg = (char *)"invalid block type"; state->mode = BAD; } DROPBITS(2); break; case STORED: BYTEBITS(); /* go to byte boundary */ NEEDBITS(32); if ((hold & 0xffff) != ((hold >> 16) ^ 0xffff)) { strm->msg = (char *)"invalid stored block lengths"; state->mode = BAD; break; } state->length = (unsigned)hold & 0xffff; INITBITS(); state->mode = COPY; fallthrough; case COPY: copy = state->length; if (copy) { if (copy > have) copy = have; if (copy > left) copy = left; if (copy == 0) goto inf_leave; memcpy(put, next, copy); have -= copy; next += copy; left -= copy; put += copy; state->length -= copy; break; } state->mode = TYPE; break; case TABLE: NEEDBITS(14); state->nlen = BITS(5) + 257; DROPBITS(5); state->ndist = BITS(5) + 1; DROPBITS(5); state->ncode = BITS(4) + 4; DROPBITS(4); #ifndef PKZIP_BUG_WORKAROUND if (state->nlen > 286 || state->ndist > 30) { strm->msg = (char *)"too many length or distance symbols"; state->mode = BAD; break; } #endif state->have = 0; state->mode = LENLENS; fallthrough; case LENLENS: while (state->have < state->ncode) { NEEDBITS(3); state->lens[order[state->have++]] = (unsigned short)BITS(3); DROPBITS(3); } while (state->have < 19) state->lens[order[state->have++]] = 0; state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 7; ret = zlib_inflate_table(CODES, state->lens, 19, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid code lengths set"; state->mode = BAD; break; } state->have = 0; state->mode = CODELENS; fallthrough; case CODELENS: while (state->have < state->nlen + state->ndist) { for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.val < 16) { NEEDBITS(this.bits); DROPBITS(this.bits); state->lens[state->have++] = this.val; } else { if (this.val == 16) { NEEDBITS(this.bits + 2); DROPBITS(this.bits); if (state->have == 0) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } len = state->lens[state->have - 1]; copy = 3 + BITS(2); DROPBITS(2); } else if (this.val == 17) { NEEDBITS(this.bits + 3); DROPBITS(this.bits); len = 0; copy = 3 + BITS(3); DROPBITS(3); } else { NEEDBITS(this.bits + 7); DROPBITS(this.bits); len = 0; copy = 11 + BITS(7); DROPBITS(7); } if (state->have + copy > state->nlen + state->ndist) { strm->msg = (char *)"invalid bit length repeat"; state->mode = BAD; break; } while (copy--) state->lens[state->have++] = (unsigned short)len; } } /* handle error breaks in while */ if (state->mode == BAD) break; /* build code tables */ state->next = state->codes; state->lencode = (code const *)(state->next); state->lenbits = 9; ret = zlib_inflate_table(LENS, state->lens, state->nlen, &(state->next), &(state->lenbits), state->work); if (ret) { strm->msg = (char *)"invalid literal/lengths set"; state->mode = BAD; break; } state->distcode = (code const *)(state->next); state->distbits = 6; ret = zlib_inflate_table(DISTS, state->lens + state->nlen, state->ndist, &(state->next), &(state->distbits), state->work); if (ret) { strm->msg = (char *)"invalid distances set"; state->mode = BAD; break; } state->mode = LEN; fallthrough; case LEN: if (have >= 6 && left >= 258) { RESTORE(); inflate_fast(strm, out); LOAD(); break; } for (;;) { this = state->lencode[BITS(state->lenbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if (this.op && (this.op & 0xf0) == 0) { last = this; for (;;) { this = state->lencode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); state->length = (unsigned)this.val; if ((int)(this.op) == 0) { state->mode = LIT; break; } if (this.op & 32) { state->mode = TYPE; break; } if (this.op & 64) { strm->msg = (char *)"invalid literal/length code"; state->mode = BAD; break; } state->extra = (unsigned)(this.op) & 15; state->mode = LENEXT; fallthrough; case LENEXT: if (state->extra) { NEEDBITS(state->extra); state->length += BITS(state->extra); DROPBITS(state->extra); } state->mode = DIST; fallthrough; case DIST: for (;;) { this = state->distcode[BITS(state->distbits)]; if ((unsigned)(this.bits) <= bits) break; PULLBYTE(); } if ((this.op & 0xf0) == 0) { last = this; for (;;) { this = state->distcode[last.val + (BITS(last.bits + last.op) >> last.bits)]; if ((unsigned)(last.bits + this.bits) <= bits) break; PULLBYTE(); } DROPBITS(last.bits); } DROPBITS(this.bits); if (this.op & 64) { strm->msg = (char *)"invalid distance code"; state->mode = BAD; break; } state->offset = (unsigned)this.val; state->extra = (unsigned)(this.op) & 15; state->mode = DISTEXT; fallthrough; case DISTEXT: if (state->extra) { NEEDBITS(state->extra); state->offset += BITS(state->extra); DROPBITS(state->extra); } #ifdef INFLATE_STRICT if (state->offset > state->dmax) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } #endif if (state->offset > state->whave + out - left) { strm->msg = (char *)"invalid distance too far back"; state->mode = BAD; break; } state->mode = MATCH; fallthrough; case MATCH: if (left == 0) goto inf_leave; copy = out - left; if (state->offset > copy) { /* copy from window */ copy = state->offset - copy; if (copy > state->write) { copy -= state->write; from = state->window + (state->wsize - copy); } else from = state->window + (state->write - copy); if (copy > state->length) copy = state->length; } else { /* copy from output */ from = put - state->offset; copy = state->length; } if (copy > left) copy = left; left -= copy; state->length -= copy; do { *put++ = *from++; } while (--copy); if (state->length == 0) state->mode = LEN; break; case LIT: if (left == 0) goto inf_leave; *put++ = (unsigned char)(state->length); left--; state->mode = LEN; break; case CHECK: if (state->wrap) { NEEDBITS(32); out -= left; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && out) strm->adler = state->check = UPDATE(state->check, put - out, out); out = left; if (( REVERSE(hold)) != state->check) { strm->msg = (char *)"incorrect data check"; state->mode = BAD; break; } INITBITS(); } state->mode = DONE; fallthrough; case DONE: ret = Z_STREAM_END; goto inf_leave; case BAD: ret = Z_DATA_ERROR; goto inf_leave; case MEM: return Z_MEM_ERROR; case SYNC: default: return Z_STREAM_ERROR; } /* Return from inflate(), updating the total counts and the check value. If there was no progress during the inflate() call, return a buffer error. Call zlib_updatewindow() to create and/or update the window state. */ inf_leave: RESTORE(); if (INFLATE_NEED_UPDATEWINDOW(strm) && (state->wsize || (state->mode < CHECK && out != strm->avail_out))) zlib_updatewindow(strm, out); in -= strm->avail_in; out -= strm->avail_out; strm->total_in += in; strm->total_out += out; state->total += out; if (INFLATE_NEED_CHECKSUM(strm) && state->wrap && out) strm->adler = state->check = UPDATE(state->check, strm->next_out - out, out); strm->data_type = state->bits + (state->last ? 64 : 0) + (state->mode == TYPE ? 128 : 0); if (flush == Z_PACKET_FLUSH && ret == Z_OK && strm->avail_out != 0 && strm->avail_in == 0) return zlib_inflateSyncPacket(strm); if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK) ret = Z_BUF_ERROR; return ret; } int zlib_inflateEnd(z_streamp strm) { if (strm == NULL || strm->state == NULL) return Z_STREAM_ERROR; return Z_OK; } /* * This subroutine adds the data at next_in/avail_in to the output history * without performing any output. The output buffer must be "caught up"; * i.e. no pending output but this should always be the case. The state must * be waiting on the start of a block (i.e. mode == TYPE or HEAD). On exit, * the output will also be caught up, and the checksum will have been updated * if need be. */ int zlib_inflateIncomp(z_stream *z) { struct inflate_state *state = (struct inflate_state *)z->state; Byte *saved_no = z->next_out; uInt saved_ao = z->avail_out; if (state->mode != TYPE && state->mode != HEAD) return Z_DATA_ERROR; /* Setup some variables to allow misuse of updateWindow */ z->avail_out = 0; z->next_out = (unsigned char*)z->next_in + z->avail_in; zlib_updatewindow(z, z->avail_in); /* Restore saved variables */ z->avail_out = saved_ao; z->next_out = saved_no; z->adler = state->check = UPDATE(state->check, z->next_in, z->avail_in); z->total_out += z->avail_in; z->total_in += z->avail_in; z->next_in += z->avail_in; state->total += z->avail_in; z->avail_in = 0; return Z_OK; } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __X86_KERNEL_FPU_LEGACY_H #define __X86_KERNEL_FPU_LEGACY_H #include <asm/fpu/types.h> extern unsigned int mxcsr_feature_mask; static inline void ldmxcsr(u32 mxcsr) { asm volatile("ldmxcsr %0" :: "m" (mxcsr)); } /* * Returns 0 on success or the trap number when the operation raises an * exception. */ #define user_insn(insn, output, input...) \ ({ \ int err; \ \ might_fault(); \ \ asm volatile(ASM_STAC "\n" \ "1: " #insn "\n" \ "2: " ASM_CLAC "\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FAULT_MCE_SAFE) \ : [err] "=a" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn_err(insn, output, input...) \ ({ \ int err; \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE_REG(1b, 2b, EX_TYPE_EFAULT_REG, %[err]) \ : [err] "=r" (err), output \ : "0"(0), input); \ err; \ }) #define kernel_insn(insn, output, input...) \ asm volatile("1:" #insn "\n\t" \ "2:\n" \ _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_FPU_RESTORE) \ : output : input) static inline int fnsave_to_user_sigframe(struct fregs_state __user *fx) { return user_insn(fnsave %[fx]; fwait, [fx] "=m" (*fx), "m" (*fx)); } static inline int fxsave_to_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); else return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); } static inline void fxrstor(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) kernel_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else kernel_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_safe(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) return kernel_insn_err(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return kernel_insn_err(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int fxrstor_from_user_sigframe(struct fxregs_state __user *fx) { if (IS_ENABLED(CONFIG_X86_32)) return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); else return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void frstor(struct fregs_state *fx) { kernel_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_safe(struct fregs_state *fx) { return kernel_insn_err(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline int frstor_from_user_sigframe(struct fregs_state __user *fx) { return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); } static inline void fxsave(struct fxregs_state *fx) { if (IS_ENABLED(CONFIG_X86_32)) asm volatile( "fxsave %[fx]" : [fx] "=m" (*fx)); else asm volatile("fxsaveq %[fx]" : [fx] "=m" (*fx)); } #endif |
| 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 | // SPDX-License-Identifier: GPL-2.0-only /* (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> */ #include <linux/types.h> #include <linux/export.h> #include <linux/init.h> #include <linux/udp.h> #include <linux/tcp.h> #include <linux/icmp.h> #include <linux/icmpv6.h> #include <linux/dccp.h> #include <linux/sctp.h> #include <net/sctp/checksum.h> #include <linux/netfilter.h> #include <net/netfilter/nf_nat.h> #include <linux/ipv6.h> #include <linux/netfilter_ipv6.h> #include <net/checksum.h> #include <net/ip6_checksum.h> #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> #include <linux/netfilter/nfnetlink_conntrack.h> static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype); static void __udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, struct udphdr *hdr, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, bool do_csum) { __be16 *portptr, newport; if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.udp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.udp.port; portptr = &hdr->dest; } if (do_csum) { nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, *portptr, newport, false); if (!hdr->check) hdr->check = CSUM_MANGLED_0; } *portptr = newport; } static bool udp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, !!hdr->check); return true; } static bool udplite_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_UDPLITE struct udphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct udphdr *)(skb->data + hdroff); __udp_manip_pkt(skb, iphdroff, hdr, tuple, maniptype, true); #endif return true; } static bool sctp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_SCTP struct sctphdr *hdr; int hdrsize = 8; /* This could be an inner header returned in imcp packet; in such * cases we cannot update the checksum field since it is outside * of the 8 bytes of transport layer headers we are guaranteed. */ if (skb->len >= hdroff + sizeof(*hdr)) hdrsize = sizeof(*hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct sctphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ hdr->source = tuple->src.u.sctp.port; } else { /* Get rid of dst port */ hdr->dest = tuple->dst.u.sctp.port; } if (hdrsize < sizeof(*hdr)) return true; if (skb->ip_summed != CHECKSUM_PARTIAL) { hdr->checksum = sctp_compute_cksum(skb, hdroff); skb->ip_summed = CHECKSUM_NONE; } #endif return true; } static bool tcp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct tcphdr *hdr; __be16 *portptr, newport, oldport; int hdrsize = 8; /* TCP connection tracking guarantees this much */ /* this could be a inner header returned in icmp packet; in such cases we cannot update the checksum field since it is outside of the 8 bytes of transport layer headers we are guaranteed */ if (skb->len >= hdroff + sizeof(struct tcphdr)) hdrsize = sizeof(struct tcphdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct tcphdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { /* Get rid of src port */ newport = tuple->src.u.tcp.port; portptr = &hdr->source; } else { /* Get rid of dst port */ newport = tuple->dst.u.tcp.port; portptr = &hdr->dest; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->check, tuple, maniptype); inet_proto_csum_replace2(&hdr->check, skb, oldport, newport, false); return true; } static bool dccp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #ifdef CONFIG_NF_CT_PROTO_DCCP struct dccp_hdr *hdr; __be16 *portptr, oldport, newport; int hdrsize = 8; /* DCCP connection tracking guarantees this much */ if (skb->len >= hdroff + sizeof(struct dccp_hdr)) hdrsize = sizeof(struct dccp_hdr); if (skb_ensure_writable(skb, hdroff + hdrsize)) return false; hdr = (struct dccp_hdr *)(skb->data + hdroff); if (maniptype == NF_NAT_MANIP_SRC) { newport = tuple->src.u.dccp.port; portptr = &hdr->dccph_sport; } else { newport = tuple->dst.u.dccp.port; portptr = &hdr->dccph_dport; } oldport = *portptr; *portptr = newport; if (hdrsize < sizeof(*hdr)) return true; nf_csum_update(skb, iphdroff, &hdr->dccph_checksum, tuple, maniptype); inet_proto_csum_replace2(&hdr->dccph_checksum, skb, oldport, newport, false); #endif return true; } static bool icmp_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmphdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmphdr *)(skb->data + hdroff); switch (hdr->type) { case ICMP_ECHO: case ICMP_ECHOREPLY: case ICMP_TIMESTAMP: case ICMP_TIMESTAMPREPLY: case ICMP_INFO_REQUEST: case ICMP_INFO_REPLY: case ICMP_ADDRESS: case ICMP_ADDRESSREPLY: break; default: return true; } inet_proto_csum_replace2(&hdr->checksum, skb, hdr->un.echo.id, tuple->src.u.icmp.id, false); hdr->un.echo.id = tuple->src.u.icmp.id; return true; } static bool icmpv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { struct icmp6hdr *hdr; if (skb_ensure_writable(skb, hdroff + sizeof(*hdr))) return false; hdr = (struct icmp6hdr *)(skb->data + hdroff); nf_csum_update(skb, iphdroff, &hdr->icmp6_cksum, tuple, maniptype); if (hdr->icmp6_type == ICMPV6_ECHO_REQUEST || hdr->icmp6_type == ICMPV6_ECHO_REPLY) { inet_proto_csum_replace2(&hdr->icmp6_cksum, skb, hdr->icmp6_identifier, tuple->src.u.icmp.id, false); hdr->icmp6_identifier = tuple->src.u.icmp.id; } return true; } /* manipulate a GRE packet according to maniptype */ static bool gre_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) const struct gre_base_hdr *greh; struct pptp_gre_header *pgreh; /* pgreh includes two optional 32bit fields which are not required * to be there. That's where the magic '8' comes from */ if (skb_ensure_writable(skb, hdroff + sizeof(*pgreh) - 8)) return false; greh = (void *)skb->data + hdroff; pgreh = (struct pptp_gre_header *)greh; /* we only have destination manip of a packet, since 'source key' * is not present in the packet itself */ if (maniptype != NF_NAT_MANIP_DST) return true; switch (greh->flags & GRE_VERSION) { case GRE_VERSION_0: /* We do not currently NAT any GREv0 packets. * Try to behave like "nf_nat_proto_unknown" */ break; case GRE_VERSION_1: pr_debug("call_id -> 0x%04x\n", ntohs(tuple->dst.u.gre.key)); pgreh->call_id = tuple->dst.u.gre.key; break; default: pr_debug("can't nat unknown GRE version\n"); return false; } #endif return true; } static bool l4proto_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, unsigned int hdroff, const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype) { switch (tuple->dst.protonum) { case IPPROTO_TCP: return tcp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDP: return udp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_UDPLITE: return udplite_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_SCTP: return sctp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMP: return icmp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_ICMPV6: return icmpv6_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_DCCP: return dccp_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); case IPPROTO_GRE: return gre_manip_pkt(skb, iphdroff, hdroff, tuple, maniptype); } /* If we don't know protocol -- no error, pass it unmodified. */ return true; } static bool nf_nat_ipv4_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { struct iphdr *iph; unsigned int hdroff; if (skb_ensure_writable(skb, iphdroff + sizeof(*iph))) return false; iph = (void *)skb->data + iphdroff; hdroff = iphdroff + iph->ihl * 4; if (!l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; iph = (void *)skb->data + iphdroff; if (maniptype == NF_NAT_MANIP_SRC) { csum_replace4(&iph->check, iph->saddr, target->src.u3.ip); iph->saddr = target->src.u3.ip; } else { csum_replace4(&iph->check, iph->daddr, target->dst.u3.ip); iph->daddr = target->dst.u3.ip; } return true; } static bool nf_nat_ipv6_manip_pkt(struct sk_buff *skb, unsigned int iphdroff, const struct nf_conntrack_tuple *target, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) struct ipv6hdr *ipv6h; __be16 frag_off; int hdroff; u8 nexthdr; if (skb_ensure_writable(skb, iphdroff + sizeof(*ipv6h))) return false; ipv6h = (void *)skb->data + iphdroff; nexthdr = ipv6h->nexthdr; hdroff = ipv6_skip_exthdr(skb, iphdroff + sizeof(*ipv6h), &nexthdr, &frag_off); if (hdroff < 0) goto manip_addr; if ((frag_off & htons(~0x7)) == 0 && !l4proto_manip_pkt(skb, iphdroff, hdroff, target, maniptype)) return false; /* must reload, offset might have changed */ ipv6h = (void *)skb->data + iphdroff; manip_addr: if (maniptype == NF_NAT_MANIP_SRC) ipv6h->saddr = target->src.u3.in6; else ipv6h->daddr = target->dst.u3.in6; #endif return true; } unsigned int nf_nat_manip_pkt(struct sk_buff *skb, struct nf_conn *ct, enum nf_nat_manip_type mtype, enum ip_conntrack_dir dir) { struct nf_conntrack_tuple target; /* We are aiming to look like inverse of other direction. */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); switch (target.src.l3num) { case NFPROTO_IPV6: if (nf_nat_ipv6_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; case NFPROTO_IPV4: if (nf_nat_ipv4_manip_pkt(skb, 0, &target, mtype)) return NF_ACCEPT; break; default: WARN_ON_ONCE(1); break; } return NF_DROP; } static void nf_nat_ipv4_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { struct iphdr *iph = (struct iphdr *)(skb->data + iphdroff); __be32 oldip, newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = iph->saddr; newip = t->src.u3.ip; } else { oldip = iph->daddr; newip = t->dst.u3.ip; } inet_proto_csum_replace4(check, skb, oldip, newip, true); } static void nf_nat_ipv6_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { #if IS_ENABLED(CONFIG_IPV6) const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + iphdroff); const struct in6_addr *oldip, *newip; if (maniptype == NF_NAT_MANIP_SRC) { oldip = &ipv6h->saddr; newip = &t->src.u3.in6; } else { oldip = &ipv6h->daddr; newip = &t->dst.u3.in6; } inet_proto_csum_replace16(check, skb, oldip->s6_addr32, newip->s6_addr32, true); #endif } static void nf_csum_update(struct sk_buff *skb, unsigned int iphdroff, __sum16 *check, const struct nf_conntrack_tuple *t, enum nf_nat_manip_type maniptype) { switch (t->src.l3num) { case NFPROTO_IPV4: nf_nat_ipv4_csum_update(skb, iphdroff, check, t, maniptype); return; case NFPROTO_IPV6: nf_nat_ipv6_csum_update(skb, iphdroff, check, t, maniptype); return; } } static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct iphdr *iph = ip_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + ip_hdrlen(skb); skb->csum_offset = (void *)check - data; *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #if IS_ENABLED(CONFIG_IPV6) static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { if (skb->ip_summed != CHECKSUM_PARTIAL) { const struct ipv6hdr *ipv6h = ipv6_hdr(skb); skb->ip_summed = CHECKSUM_PARTIAL; skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) + (data - (void *)skb->data); skb->csum_offset = (void *)check - data; *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, datalen, proto, 0); } else { inet_proto_csum_replace2(check, skb, htons(oldlen), htons(datalen), true); } } #endif void nf_nat_csum_recalc(struct sk_buff *skb, u8 nfproto, u8 proto, void *data, __sum16 *check, int datalen, int oldlen) { switch (nfproto) { case NFPROTO_IPV4: nf_nat_ipv4_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #if IS_ENABLED(CONFIG_IPV6) case NFPROTO_IPV6: nf_nat_ipv6_csum_recalc(skb, proto, data, check, datalen, oldlen); return; #endif } WARN_ON_ONCE(1); } int nf_nat_icmp_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum) { struct { struct icmphdr icmp; struct iphdr ip; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); unsigned int hdrlen = ip_hdrlen(skb); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip_checksum(skb, hooknum, hdrlen, IPPROTO_ICMP)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp.type == ICMP_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv4_manip_pkt(skb, hdrlen + sizeof(inside->icmp), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { /* Reloading "inside" here since manip_pkt may reallocate */ inside = (void *)skb->data + hdrlen; inside->icmp.checksum = 0; inside->icmp.checksum = csum_fold(skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } /* Change outer to look like the reply to an incoming packet */ nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMP; if (!nf_nat_ipv4_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmp_reply_translation); static unsigned int nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; ct = nf_ct_get(skb, &ctinfo); if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { if (ip_hdr(skb)->protocol == IPPROTO_ICMP) { if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, state->hook)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv4_pre_routing(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret; __be32 daddr = ip_hdr(skb)->daddr; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret == NF_ACCEPT && daddr != ip_hdr(skb)->daddr) skb_dst_drop(skb); return ret; } #ifdef CONFIG_XFRM static int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) { struct sock *sk = skb->sk; struct dst_entry *dst; unsigned int hh_len; struct flowi fl; int err; err = xfrm_decode_session(net, skb, &fl, family); if (err < 0) return err; dst = skb_dst(skb); if (dst->xfrm) dst = ((struct xfrm_dst *)dst)->route; if (!dst_hold_safe(dst)) return -EHOSTUNREACH; if (sk && !net_eq(net, sock_net(sk))) sk = NULL; dst = xfrm_lookup(net, dst, &fl, sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); skb_dst_drop(skb); skb_dst_set(skb, dst); /* Change in oif may mean change in hh_len. */ hh_len = skb_dst(skb)->dev->hard_header_len; if (skb_headroom(skb) < hh_len && pskb_expand_head(skb, hh_len - skb_headroom(skb), 0, GFP_ATOMIC)) return -ENOMEM; return 0; } #endif static bool nf_nat_inet_port_was_mangled(const struct sk_buff *skb, __be16 sport) { enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); if (!ct) return false; switch (nf_ct_protonum(ct)) { case IPPROTO_TCP: case IPPROTO_UDP: break; default: return false; } dir = CTINFO2DIR(ctinfo); if (dir != IP_CT_DIR_ORIGINAL) return false; return ct->tuplehash[!dir].tuple.dst.u.all != sport; } static unsigned int nf_nat_ipv4_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { __be32 saddr = ip_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* skb has a socket assigned via tcp edemux. We need to check * if nf_nat_ipv4_fn() has mangled the packet in a way that * edemux would not have found this socket. * * This includes both changes to the source address and changes * to the source port, which are both handled by the * nf_nat_ipv4_fn() call above -- long after tcp/udp early demux * might have found a socket for the old (pre-snat) address. */ if (saddr != ip_hdr(skb)->saddr || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); /* TCP edemux obtained wrong socket */ return ret; } static unsigned int nf_nat_ipv4_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv4_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv4_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv4_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (ct->tuplehash[dir].tuple.dst.u3.ip != ct->tuplehash[!dir].tuple.src.u3.ip) { err = ip_route_me_harder(state->net, state->sk, skb, RTN_UNSPEC); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMP && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv4_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_pre_routing, .pf = NFPROTO_IPV4, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_out, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv4_local_fn, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv4_local_in, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_NAT_SRC, }, }; int nf_nat_ipv4_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_register_fn); void nf_nat_ipv4_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv4_unregister_fn); #if IS_ENABLED(CONFIG_IPV6) int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, unsigned int hdrlen) { struct { struct icmp6hdr icmp6; struct ipv6hdr ip6; } *inside; enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); enum nf_nat_manip_type manip = HOOK2MANIP(hooknum); struct nf_conntrack_tuple target; unsigned long statusbit; WARN_ON(ctinfo != IP_CT_RELATED && ctinfo != IP_CT_RELATED_REPLY); if (skb_ensure_writable(skb, hdrlen + sizeof(*inside))) return 0; if (nf_ip6_checksum(skb, hooknum, hdrlen, IPPROTO_ICMPV6)) return 0; inside = (void *)skb->data + hdrlen; if (inside->icmp6.icmp6_type == NDISC_REDIRECT) { if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) return 0; if (ct->status & IPS_NAT_MASK) return 0; } if (manip == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply direction */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; if (!(ct->status & statusbit)) return 1; if (!nf_nat_ipv6_manip_pkt(skb, hdrlen + sizeof(inside->icmp6), &ct->tuplehash[!dir].tuple, !manip)) return 0; if (skb->ip_summed != CHECKSUM_PARTIAL) { struct ipv6hdr *ipv6h = ipv6_hdr(skb); inside = (void *)skb->data + hdrlen; inside->icmp6.icmp6_cksum = 0; inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple); target.dst.protonum = IPPROTO_ICMPV6; if (!nf_nat_ipv6_manip_pkt(skb, 0, &target, manip)) return 0; return 1; } EXPORT_SYMBOL_GPL(nf_nat_icmpv6_reply_translation); static unsigned int nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; __be16 frag_off; int hdrlen; u8 nexthdr; ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct) return NF_ACCEPT; if (ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY) { nexthdr = ipv6_hdr(skb)->nexthdr; hdrlen = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, &frag_off); if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { if (!nf_nat_icmpv6_reply_translation(skb, ct, ctinfo, state->hook, hdrlen)) return NF_DROP; else return NF_ACCEPT; } } return nf_nat_inet_fn(priv, skb, state); } static unsigned int nf_nat_ipv6_local_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct in6_addr saddr = ipv6_hdr(skb)->saddr; struct sock *sk = skb->sk; unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT || !sk || inet_sk_transparent(sk)) return ret; /* see nf_nat_ipv4_local_in */ if (ipv6_addr_cmp(&saddr, &ipv6_hdr(skb)->saddr) || nf_nat_inet_port_was_mangled(skb, sk->sk_dport)) skb_orphan(skb); return ret; } static unsigned int nf_nat_ipv6_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { unsigned int ret, verdict; struct in6_addr daddr = ipv6_hdr(skb)->daddr; ret = nf_nat_ipv6_fn(priv, skb, state); verdict = ret & NF_VERDICT_MASK; if (verdict != NF_DROP && verdict != NF_STOLEN && ipv6_addr_cmp(&daddr, &ipv6_hdr(skb)->daddr)) skb_dst_drop(skb); return ret; } static unsigned int nf_nat_ipv6_out(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; int err; #endif unsigned int ret; ret = nf_nat_ipv6_fn(priv, skb, state); #ifdef CONFIG_XFRM if (ret != NF_ACCEPT) return ret; if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.src.u3, &ct->tuplehash[!dir].tuple.dst.u3) || (ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all)) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } } #endif return ret; } static unsigned int nf_nat_ipv6_local_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; unsigned int ret; int err; ret = nf_nat_ipv6_fn(priv, skb, state); if (ret != NF_ACCEPT) return ret; ct = nf_ct_get(skb, &ctinfo); if (ct) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if (!nf_inet_addr_cmp(&ct->tuplehash[dir].tuple.dst.u3, &ct->tuplehash[!dir].tuple.src.u3)) { err = nf_ip6_route_me_harder(state->net, state->sk, skb); if (err < 0) ret = NF_DROP_ERR(err); } #ifdef CONFIG_XFRM else if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && ct->tuplehash[dir].tuple.dst.protonum != IPPROTO_ICMPV6 && ct->tuplehash[dir].tuple.dst.u.all != ct->tuplehash[!dir].tuple.src.u.all) { err = nf_xfrm_me_harder(state->net, skb, AF_INET6); if (err < 0) ret = NF_DROP_ERR(err); } #endif } return ret; } static const struct nf_hook_ops nf_nat_ipv6_ops[] = { /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_PRE_ROUTING, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_out, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_NAT_SRC, }, /* Before packet filtering, change destination */ { .hook = nf_nat_ipv6_local_fn, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_OUT, .priority = NF_IP6_PRI_NAT_DST, }, /* After packet filtering, change source */ { .hook = nf_nat_ipv6_local_in, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_NAT_SRC, }, }; int nf_nat_ipv6_register_fn(struct net *net, const struct nf_hook_ops *ops) { return nf_nat_register_fn(net, ops->pf, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_register_fn); void nf_nat_ipv6_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, ops->pf, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_ipv6_unregister_fn); #endif /* CONFIG_IPV6 */ #if defined(CONFIG_NF_TABLES_INET) && IS_ENABLED(CONFIG_NFT_NAT) int nf_nat_inet_register_fn(struct net *net, const struct nf_hook_ops *ops) { int ret; if (WARN_ON_ONCE(ops->pf != NFPROTO_INET)) return -EINVAL; ret = nf_nat_register_fn(net, NFPROTO_IPV6, ops, nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops)); if (ret) return ret; ret = nf_nat_register_fn(net, NFPROTO_IPV4, ops, nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops)); if (ret) nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); return ret; } EXPORT_SYMBOL_GPL(nf_nat_inet_register_fn); void nf_nat_inet_unregister_fn(struct net *net, const struct nf_hook_ops *ops) { nf_nat_unregister_fn(net, NFPROTO_IPV4, ops, ARRAY_SIZE(nf_nat_ipv4_ops)); nf_nat_unregister_fn(net, NFPROTO_IPV6, ops, ARRAY_SIZE(nf_nat_ipv6_ops)); } EXPORT_SYMBOL_GPL(nf_nat_inet_unregister_fn); #endif /* NFT INET NAT */ |
| 1 27 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 | /* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask_types.h> #include <linux/rcupdate.h> #include <linux/workqueue_types.h> /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum work_bits { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_INACTIVE_BIT, /* work item is inactive */ WORK_STRUCT_PWQ_BIT, /* data points to pwq */ WORK_STRUCT_LINKED_BIT, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT, /* static initializer (debugobjects) */ #endif WORK_STRUCT_FLAG_BITS, /* color for workqueue flushing */ WORK_STRUCT_COLOR_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_STRUCT_COLOR_BITS = 4, /* * When WORK_STRUCT_PWQ is set, reserve 8 bits off of pwq pointer w/ * debugobjects turned off. This makes pwqs aligned to 256 bytes (512 * bytes w/ DEBUG_OBJECTS_WORK) and allows 16 workqueue flush colors. * * MSB * [ pwq pointer ] [ flush color ] [ STRUCT flags ] * 4 bits 4 or 5 bits */ WORK_STRUCT_PWQ_SHIFT = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* * data contains off-queue information when !WORK_STRUCT_PWQ. * * MSB * [ pool ID ] [ disable depth ] [ OFFQ flags ] [ STRUCT flags ] * 16 bits 1 bit 4 or 5 bits */ WORK_OFFQ_FLAG_SHIFT = WORK_STRUCT_FLAG_BITS, WORK_OFFQ_BH_BIT = WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_FLAG_END, WORK_OFFQ_FLAG_BITS = WORK_OFFQ_FLAG_END - WORK_OFFQ_FLAG_SHIFT, WORK_OFFQ_DISABLE_SHIFT = WORK_OFFQ_FLAG_SHIFT + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_DISABLE_BITS = 16, /* * When a work item is off queue, the high bits encode off-queue flags * and the last pool it was on. Cap pool ID to 31 bits and use the * highest number to indicate that no pool is associated. */ WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_DISABLE_SHIFT + WORK_OFFQ_DISABLE_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, }; enum work_flags { WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_INACTIVE = 1 << WORK_STRUCT_INACTIVE_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif }; enum wq_misc_consts { WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS), /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 32, }; /* Convenience constants - of type 'unsigned long', not 'enum'! */ #define WORK_OFFQ_BH (1ul << WORK_OFFQ_BH_BIT) #define WORK_OFFQ_FLAG_MASK (((1ul << WORK_OFFQ_FLAG_BITS) - 1) << WORK_OFFQ_FLAG_SHIFT) #define WORK_OFFQ_DISABLE_MASK (((1ul << WORK_OFFQ_DISABLE_BITS) - 1) << WORK_OFFQ_DISABLE_SHIFT) #define WORK_OFFQ_POOL_NONE ((1ul << WORK_OFFQ_POOL_BITS) - 1) #define WORK_STRUCT_NO_POOL (WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT) #define WORK_STRUCT_PWQ_MASK (~((1ul << WORK_STRUCT_PWQ_SHIFT) - 1)) #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; enum wq_affn_scope { WQ_AFFN_DFL, /* use system default */ WQ_AFFN_CPU, /* one pod per CPU */ WQ_AFFN_SMT, /* one pod poer SMT */ WQ_AFFN_CACHE, /* one pod per LLC */ WQ_AFFN_NUMA, /* one pod per NUMA node */ WQ_AFFN_SYSTEM, /* one pod across the whole system */ WQ_AFFN_NR_TYPES, }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs * * Work items in this workqueue are affine to these CPUs and not allowed * to execute on other CPUs. A pool serving a workqueue must have the * same @cpumask. */ cpumask_var_t cpumask; /** * @__pod_cpumask: internal attribute used to create per-pod pools * * Internal use only. * * Per-pod unbound worker pools are used to improve locality. Always a * subset of ->cpumask. A workqueue can be associated with multiple * worker pools with disjoint @__pod_cpumask's. Whether the enforcement * of a pool's @__pod_cpumask is strict depends on @affn_strict. */ cpumask_var_t __pod_cpumask; /** * @affn_strict: affinity scope is strict * * If clear, workqueue will make a best-effort attempt at starting the * worker inside @__pod_cpumask but the scheduler is free to migrate it * outside. * * If set, workers are only allowed to run inside @__pod_cpumask. */ bool affn_strict; /* * Below fields aren't properties of a worker_pool. They only modify how * :c:func:`apply_workqueue_attrs` select pools and thus don't * participate in pool hash calculations or equality comparisons. * * If @affn_strict is set, @cpumask isn't a property of a worker_pool * either. */ /** * @affn_scope: unbound CPU affinity scope * * CPU pods are used to improve execution locality of unbound work * items. There are multiple pod types, one for each wq_affn_scope, and * every CPU in the system belongs to one pod in every pod type. CPUs * that belong to the same pod share the worker pool. For example, * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker * pool for each NUMA node. */ enum wq_affn_scope affn_scope; /** * @ordered: work items must be executed one by one in queueing order */ bool ordered; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ #ifdef CONFIG_LOCKDEP #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, (_key), 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK_KEY(_work, _func, _onstack, _key) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define __INIT_WORK(_work, _func, _onstack) \ do { \ static __maybe_unused struct lock_class_key __key; \ \ __INIT_WORK_KEY(_work, _func, _onstack, &__key); \ } while (0) #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define INIT_WORK_ONSTACK_KEY(_work, _func, _key) \ __INIT_WORK_KEY((_work), (_func), 1, _key) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __init_timer(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __init_timer_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum wq_flags { WQ_BH = 1 << 0, /* execute in bottom half (softirq) context */ WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see workqueue_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * execute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DESTROYING = 1 << 15, /* internal: workqueue is destroying */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ /* BH wq only allows the following flags */ __WQ_BH_ALLOWS = WQ_BH | WQ_HIGHPRI, }; enum wq_consts { WQ_MAX_ACTIVE = 2048, /* I like 2048, better ideas? */ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, /* * Per-node default cap on min_active. Unless explicitly set, min_active * is set to min(max_active, WQ_DFL_MIN_ACTIVE). For more details, see * workqueue_struct->min_active definition. */ WQ_DFL_MIN_ACTIVE = 8, }; /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. * * system_bh[_highpri]_wq are convenience interface to softirq. BH work items * are executed in the queueing CPU's BH context in the queueing order. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; extern struct workqueue_struct *system_bh_wq; extern struct workqueue_struct *system_bh_highpri_wq; void workqueue_softirq_action(bool highpri); void workqueue_softirq_dead(unsigned int cpu); /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @...: args for @fmt * * For a per-cpu workqueue, @max_active limits the number of in-flight work * items for each CPU. e.g. @max_active of 1 indicates that each CPU can be * executing at most one work item for the workqueue. * * For unbound workqueues, @max_active limits the number of in-flight work items * for the whole system. e.g. @max_active of 16 indicates that that there can be * at most 16 work items executing for the workqueue in the whole system. * * As sharing the same active counter for an unbound workqueue across multiple * NUMA nodes can be expensive, @max_active is distributed to each NUMA node * according to the proportion of the number of online CPUs and enforced * independently. * * Depending on online CPU distribution, a node may end up with per-node * max_active which is significantly lower than @max_active, which can lead to * deadlocks if the per-node concurrency limit is lower than the maximum number * of interdependent work items for the workqueue. * * To guarantee forward progress regardless of online CPU distribution, the * concurrency limit on every node is guaranteed to be equal to or greater than * min_active which is set to min(@max_active, %WQ_DFL_MIN_ACTIVE). This means * that the sum of per-node max_active's may be larger than @max_active. * * For detailed information on %WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 4) struct workqueue_struct * alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); #ifdef CONFIG_LOCKDEP /** * alloc_workqueue_lockdep_map - allocate a workqueue with user-defined lockdep_map * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * @lockdep_map: user-defined lockdep_map * @...: args for @fmt * * Same as alloc_workqueue but with the a user-define lockdep_map. Useful for * workqueues created with the same purpose and to avoid leaking a lockdep_map * on each workqueue creation. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ __printf(1, 5) struct workqueue_struct * alloc_workqueue_lockdep_map(const char *fmt, unsigned int flags, int max_active, struct lockdep_map *lockdep_map, ...); /** * alloc_ordered_workqueue_lockdep_map - allocate an ordered workqueue with * user-defined lockdep_map * * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @lockdep_map: user-defined lockdep_map * @args: args for @fmt * * Same as alloc_ordered_workqueue but with the a user-define lockdep_map. * Useful for workqueues created with the same purpose and to avoid leaking a * lockdep_map on each workqueue creation. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue_lockdep_map(fmt, flags, lockdep_map, args...) \ alloc_workqueue_lockdep_map(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), \ 1, lockdep_map, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) #define from_work(var, callback_work, work_fieldname) \ container_of(callback_work, typeof(*var), work_fieldname) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); extern int workqueue_unbound_exclude_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void __flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool disable_work(struct work_struct *work); extern bool disable_work_sync(struct work_struct *work); extern bool enable_work(struct work_struct *work); extern bool disable_delayed_work(struct delayed_work *dwork); extern bool disable_delayed_work_sync(struct delayed_work *dwork); extern bool enable_delayed_work(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern void workqueue_set_min_active(struct workqueue_struct *wq, int min_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_all_workqueues(void); extern void show_freezable_workqueues(void); extern void show_one_workqueue(struct workqueue_struct *wq); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. * * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * enable_and_queue_work - Enable and queue a work item on a specific workqueue * @wq: The target workqueue * @work: The work item to be enabled and queued * * This function combines the operations of enable_work() and queue_work(), * providing a convenient way to enable and queue a work item in a single call. * It invokes enable_work() on @work and then queues it if the disable depth * reached 0. Returns %true if the disable depth reached 0 and @work is queued, * and %false otherwise. * * Note that @work is always queued when disable depth reaches zero. If the * desired behavior is queueing only if certain events took place while @work is * disabled, the user should implement the necessary state tracking and perform * explicit conditional queueing after enable_work(). */ static inline bool enable_and_queue_work(struct workqueue_struct *wq, struct work_struct *work) { if (enable_work(work)) { queue_work(wq, work); return true; } return false; } /* * Detect attempt to flush system-wide workqueues at compile time when possible. * Warn attempt to flush system-wide workqueues at runtime. * * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp * for reasons and steps for converting system-wide workqueues into local workqueues. */ extern void __warn_flushing_systemwide_wq(void) __compiletime_warning("Please avoid flushing system-wide workqueues."); /* Please stop using this function, for this function will be removed in near future. */ #define flush_scheduled_work() \ ({ \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(system_wq); \ }) #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ \ if ((__builtin_constant_p(_wq == system_wq) && \ _wq == system_wq) || \ (__builtin_constant_p(_wq == system_highpri_wq) && \ _wq == system_highpri_wq) || \ (__builtin_constant_p(_wq == system_long_wq) && \ _wq == system_long_wq) || \ (__builtin_constant_p(_wq == system_unbound_wq) && \ _wq == system_unbound_wq) || \ (__builtin_constant_p(_wq == system_freezable_wq) && \ _wq == system_freezable_wq) || \ (__builtin_constant_p(_wq == system_power_efficient_wq) && \ _wq == system_power_efficient_wq) || \ (__builtin_constant_p(_wq == system_freezable_power_efficient_wq) && \ _wq == system_freezable_power_efficient_wq)) \ __warn_flushing_systemwide_wq(); \ __flush_workqueue(_wq); \ }) /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_key(_cpu, _fn, _arg, &__key); \ }) long work_on_cpu_safe_key(int cpu, long (*fn)(void *), void *arg, struct lock_class_key *key); /* * A new key is defined for each caller to make sure the work * associated with the function doesn't share its locking class. */ #define work_on_cpu_safe(_cpu, _fn, _arg) \ ({ \ static struct lock_class_key __key; \ \ work_on_cpu_safe_key(_cpu, _fn, _arg, &__key); \ }) #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); void __init workqueue_init_topology(void); #endif |
| 7 7 26 6 30 24 16 16 16 285 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 | /* SPDX-License-Identifier: GPL-2.0 */ /* * fscrypt_private.h * * Copyright (C) 2015, Google, Inc. * * Originally written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar. * Heavily modified since then. */ #ifndef _FSCRYPT_PRIVATE_H #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> #define CONST_STRLEN(str) (sizeof(str) - 1) #define FSCRYPT_FILE_NONCE_SIZE 16 /* * Minimum size of an fscrypt master key. Note: a longer key will be required * if ciphers with a 256-bit security strength are used. This is just the * absolute minimum, which applies when only 128-bit encryption is used. */ #define FSCRYPT_MIN_KEY_SIZE 16 #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 /* Keep this in sync with include/uapi/linux/fscrypt.h */ #define FSCRYPT_MODE_MAX FSCRYPT_MODE_AES_256_HCTR2 struct fscrypt_context_v1 { u8 version; /* FSCRYPT_CONTEXT_V1 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 master_key_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; struct fscrypt_context_v2 { u8 version; /* FSCRYPT_CONTEXT_V2 */ u8 contents_encryption_mode; u8 filenames_encryption_mode; u8 flags; u8 log2_data_unit_size; u8 __reserved[3]; u8 master_key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]; u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; /* * fscrypt_context - the encryption context of an inode * * This is the on-disk equivalent of an fscrypt_policy, stored alongside each * encrypted file usually in a hidden extended attribute. It contains the * fields from the fscrypt_policy, in order to identify the encryption algorithm * and key with which the file is encrypted. It also contains a nonce that was * randomly generated by fscrypt itself; this is used as KDF input or as a tweak * to cause different files to be encrypted differently. */ union fscrypt_context { u8 version; struct fscrypt_context_v1 v1; struct fscrypt_context_v2 v2; }; /* * Return the size expected for the given fscrypt_context based on its version * number, or 0 if the context version is unrecognized. */ static inline int fscrypt_context_size(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: BUILD_BUG_ON(sizeof(ctx->v1) != 28); return sizeof(ctx->v1); case FSCRYPT_CONTEXT_V2: BUILD_BUG_ON(sizeof(ctx->v2) != 40); return sizeof(ctx->v2); } return 0; } /* Check whether an fscrypt_context has a recognized version number and size */ static inline bool fscrypt_context_is_valid(const union fscrypt_context *ctx, int ctx_size) { return ctx_size >= 1 && ctx_size == fscrypt_context_size(ctx); } /* Retrieve the context's nonce, assuming the context was already validated */ static inline const u8 *fscrypt_context_nonce(const union fscrypt_context *ctx) { switch (ctx->version) { case FSCRYPT_CONTEXT_V1: return ctx->v1.nonce; case FSCRYPT_CONTEXT_V2: return ctx->v2.nonce; } WARN_ON_ONCE(1); return NULL; } union fscrypt_policy { u8 version; struct fscrypt_policy_v1 v1; struct fscrypt_policy_v2 v2; }; /* * Return the size expected for the given fscrypt_policy based on its version * number, or 0 if the policy version is unrecognized. */ static inline int fscrypt_policy_size(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return sizeof(policy->v1); case FSCRYPT_POLICY_V2: return sizeof(policy->v2); } return 0; } /* Return the contents encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_contents_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.contents_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.contents_encryption_mode; } BUG(); } /* Return the filenames encryption mode of a valid encryption policy */ static inline u8 fscrypt_policy_fnames_mode(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.filenames_encryption_mode; case FSCRYPT_POLICY_V2: return policy->v2.filenames_encryption_mode; } BUG(); } /* Return the flags (FSCRYPT_POLICY_FLAG*) of a valid encryption policy */ static inline u8 fscrypt_policy_flags(const union fscrypt_policy *policy) { switch (policy->version) { case FSCRYPT_POLICY_V1: return policy->v1.flags; case FSCRYPT_POLICY_V2: return policy->v2.flags; } BUG(); } static inline int fscrypt_policy_v2_du_bits(const struct fscrypt_policy_v2 *policy, const struct inode *inode) { return policy->log2_data_unit_size ?: inode->i_blkbits; } static inline int fscrypt_policy_du_bits(const union fscrypt_policy *policy, const struct inode *inode) { switch (policy->version) { case FSCRYPT_POLICY_V1: return inode->i_blkbits; case FSCRYPT_POLICY_V2: return fscrypt_policy_v2_du_bits(&policy->v2, inode); } BUG(); } /* * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. */ struct fscrypt_symlink_data { __le16 len; char encrypted_path[]; } __packed; /** * struct fscrypt_prepared_key - a key prepared for actual encryption/decryption * @tfm: crypto API transform object * @blk_key: key for blk-crypto * * Normally only one of the fields will be non-NULL. */ struct fscrypt_prepared_key { struct crypto_skcipher *tfm; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT struct blk_crypto_key *blk_key; #endif }; /* * fscrypt_inode_info - the "encryption key" for an inode * * When an encrypted file's key is made available, an instance of this struct is * allocated and stored in ->i_crypt_info. Once created, it remains until the * inode is evicted. */ struct fscrypt_inode_info { /* The key in a form prepared for actual encryption/decryption */ struct fscrypt_prepared_key ci_enc_key; /* True if ci_enc_key should be freed when this struct is freed */ u8 ci_owns_key : 1; #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT /* * True if this inode will use inline encryption (blk-crypto) instead of * the traditional filesystem-layer encryption. */ u8 ci_inlinecrypt : 1; #endif /* True if ci_dirhash_key is initialized */ u8 ci_dirhash_key_initialized : 1; /* * log2 of the data unit size (granularity of contents encryption) of * this file. This is computable from ci_policy and ci_inode but is * cached here for efficiency. Only used for regular files. */ u8 ci_data_unit_bits; /* Cached value: log2 of number of data units per FS block */ u8 ci_data_units_per_block_bits; /* Hashed inode number. Only set for IV_INO_LBLK_32 */ u32 ci_hashed_ino; /* * Encryption mode used for this inode. It corresponds to either the * contents or filenames encryption mode, depending on the inode type. */ struct fscrypt_mode *ci_mode; /* Back-pointer to the inode */ struct inode *ci_inode; /* * The master key with which this inode was unlocked (decrypted). This * will be NULL if the master key was found in a process-subscribed * keyring rather than in the filesystem-level keyring. */ struct fscrypt_master_key *ci_master_key; /* * Link in list of inodes that were unlocked with the master key. * Only used when ->ci_master_key is set. */ struct list_head ci_master_key_link; /* * If non-NULL, then encryption is done using the master key directly * and ci_enc_key will equal ci_direct_key->dk_key. */ struct fscrypt_direct_key *ci_direct_key; /* * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 * key. This is only set for directories that use a keyed dirhash over * the plaintext filenames -- currently just casefolded directories. */ siphash_key_t ci_dirhash_key; /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; /* This inode's nonce, copied from the fscrypt_context */ u8 ci_nonce[FSCRYPT_FILE_NONCE_SIZE]; }; typedef enum { FS_DECRYPT = 0, FS_ENCRYPT, } fscrypt_direction_t; /* crypto.c */ extern struct kmem_cache *fscrypt_inode_info_cachep; int fscrypt_initialize(struct super_block *sb); int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, fscrypt_direction_t rw, u64 index, struct page *src_page, struct page *dest_page, unsigned int len, unsigned int offs, gfp_t gfp_flags); struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); #define fscrypt_warn(inode, fmt, ...) \ fscrypt_msg((inode), KERN_WARNING, fmt, ##__VA_ARGS__) #define fscrypt_err(inode, fmt, ...) \ fscrypt_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__) #define FSCRYPT_MAX_IV_SIZE 32 union fscrypt_iv { struct { /* zero-based index of data unit within the file */ __le64 index; /* per-file nonce; only set in DIRECT_KEY mode */ u8 nonce[FSCRYPT_FILE_NONCE_SIZE]; }; u8 raw[FSCRYPT_MAX_IV_SIZE]; __le64 dun[FSCRYPT_MAX_IV_SIZE / sizeof(__le64)]; }; void fscrypt_generate_iv(union fscrypt_iv *iv, u64 index, const struct fscrypt_inode_info *ci); /* * Return the number of bits used by the maximum file data unit index that is * possible on the given filesystem, using the given log2 data unit size. */ static inline int fscrypt_max_file_dun_bits(const struct super_block *sb, int du_bits) { return fls64(sb->s_maxbytes - 1) - du_bits; } /* fname.c */ bool __fscrypt_fname_encrypted_size(const union fscrypt_policy *policy, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); /* hkdf.c */ struct fscrypt_hkdf { struct crypto_shash *hmac_tfm; }; int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size); /* * The list of contexts in which fscrypt uses HKDF. These values are used as * the first byte of the HKDF application-specific info string to guarantee that * info strings are never repeated between contexts. This ensures that all HKDF * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return ci->ci_inlinecrypt; } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. */ static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { /* * The two smp_load_acquire()'s here pair with the smp_store_release()'s * in fscrypt_prepare_inline_crypt_key() and fscrypt_prepare_key(). * I.e., in some cases (namely, if this prep_key is a per-mode * encryption key) another task can publish blk_key or tfm concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ if (fscrypt_using_inline_encryption(ci)) return smp_load_acquire(&prep_key->blk_key) != NULL; return smp_load_acquire(&prep_key->tfm) != NULL; } #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) { return 0; } static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) { return false; } static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); return -EOPNOTSUPP; } static inline void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key) { } static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) { return smp_load_acquire(&prep_key->tfm) != NULL; } #endif /* !CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ /* keyring.c */ /* * fscrypt_master_key_secret - secret key material of an in-use master key */ struct fscrypt_master_key_secret { /* * For v2 policy keys: HKDF context keyed by this master key. * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). */ struct fscrypt_hkdf hkdf; /* * Size of the raw key in bytes. This remains set even if ->raw was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ u8 raw[FSCRYPT_MAX_KEY_SIZE]; } __randomize_layout; /* * fscrypt_master_key - an in-use master key * * This represents a master encryption key which has been added to the * filesystem. There are three high-level states that a key can be in: * * FSCRYPT_KEY_STATUS_PRESENT * Key is fully usable; it can be used to unlock inodes that are encrypted * with it (this includes being able to create new inodes). ->mk_present * indicates whether the key is in this state. ->mk_secret exists, the key * is in the keyring, and ->mk_active_refs > 0 due to ->mk_present. * * FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED * Removal of this key has been initiated, but some inodes that were * unlocked with it are still in-use. Like ABSENT, ->mk_secret is wiped, * and the key can no longer be used to unlock inodes. Unlike ABSENT, the * key is still in the keyring; ->mk_decrypted_inodes is nonempty; and * ->mk_active_refs > 0, being equal to the size of ->mk_decrypted_inodes. * * This state transitions to ABSENT if ->mk_decrypted_inodes becomes empty, * or to PRESENT if FS_IOC_ADD_ENCRYPTION_KEY is called again for this key. * * FSCRYPT_KEY_STATUS_ABSENT * Key is fully removed. The key is no longer in the keyring, * ->mk_decrypted_inodes is empty, ->mk_active_refs == 0, ->mk_secret is * wiped, and the key can no longer be used to unlock inodes. */ struct fscrypt_master_key { /* * Link in ->s_master_keys->key_hashtable. * Only valid if ->mk_active_refs > 0. */ struct hlist_node mk_node; /* Semaphore that protects ->mk_secret, ->mk_users, and ->mk_present */ struct rw_semaphore mk_sem; /* * Active and structural reference counts. An active ref guarantees * that the struct continues to exist, continues to be in the keyring * ->s_master_keys, and that any embedded subkeys (e.g. * ->mk_direct_keys) that have been prepared continue to exist. * A structural ref only guarantees that the struct continues to exist. * * There is one active ref associated with ->mk_present being true, and * one active ref for each inode in ->mk_decrypted_inodes. * * There is one structural ref associated with the active refcount being * nonzero. Finding a key in the keyring also takes a structural ref, * which is then held temporarily while the key is operated on. */ refcount_t mk_active_refs; refcount_t mk_struct_refs; struct rcu_head mk_rcu_head; /* * The secret key material. Wiped as soon as it is no longer needed; * for details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem. */ struct fscrypt_master_key_secret mk_secret; /* * For v1 policy keys: an arbitrary key descriptor which was assigned by * userspace (->descriptor). * * For v2 policy keys: a cryptographic hash of this key (->identifier). */ struct fscrypt_key_specifier mk_spec; /* * Keyring which contains a key of type 'key_type_fscrypt_user' for each * user who has added this key. Normally each key will be added by just * one user, but it's possible that multiple users share a key, and in * that case we need to keep track of those users so that one user can't * remove the key before the others want it removed too. * * This is NULL for v1 policy keys; those can only be added by root. * * Locking: protected by ->mk_sem. (We don't just rely on the keyrings * subsystem semaphore ->mk_users->sem, as we need support for atomic * search+insert along with proper synchronization with other fields.) */ struct key *mk_users; /* * List of inodes that were unlocked using this key. This allows the * inodes to be evicted efficiently if the key is removed. */ struct list_head mk_decrypted_inodes; spinlock_t mk_decrypted_inodes_lock; /* * Per-mode encryption keys for the various types of encryption policies * that use them. Allocated and derived on-demand. */ struct fscrypt_prepared_key mk_direct_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_64_keys[FSCRYPT_MODE_MAX + 1]; struct fscrypt_prepared_key mk_iv_ino_lblk_32_keys[FSCRYPT_MODE_MAX + 1]; /* Hash key for inode numbers. Initialized only when needed. */ siphash_key_t mk_ino_hash_key; bool mk_ino_hash_key_initialized; /* * Whether this key is in the "present" state, i.e. fully usable. For * details, see the fscrypt_master_key struct comment. * * Locking: protected by ->mk_sem, but can be read locklessly using * READ_ONCE(). Writers must use WRITE_ONCE() when concurrent readers * are possible. */ bool mk_present; } __randomize_layout; static inline const char *master_key_spec_type( const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return "descriptor"; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return "identifier"; } return "[unknown]"; } static inline int master_key_spec_len(const struct fscrypt_key_specifier *spec) { switch (spec->type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: return FSCRYPT_KEY_DESCRIPTOR_SIZE; case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER: return FSCRYPT_KEY_IDENTIFIER_SIZE; } return 0; } void fscrypt_put_master_key(struct fscrypt_master_key *mk); void fscrypt_put_master_key_activeref(struct super_block *sb, struct fscrypt_master_key *mk); struct fscrypt_master_key * fscrypt_find_master_key(struct super_block *sb, const struct fscrypt_key_specifier *mk_spec); int fscrypt_get_test_dummy_key_identifier( u8 key_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int fscrypt_add_test_dummy_key(struct super_block *sb, struct fscrypt_key_specifier *key_spec); int fscrypt_verify_key_added(struct super_block *sb, const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE]); int __init fscrypt_init_keyring(void); /* keysetup.c */ struct fscrypt_mode { const char *friendly_name; const char *cipher_str; int keysize; /* key size in bytes */ int security_strength; /* security strength in bytes */ int ivsize; /* IV size in bytes */ int logged_cryptoapi_impl; int logged_blk_crypto_native; int logged_blk_crypto_fallback; enum blk_crypto_mode_num blk_crypto_mode; }; extern struct fscrypt_mode fscrypt_modes[]; int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, const u8 *raw_key, const struct fscrypt_inode_info *ci); void fscrypt_destroy_prepared_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); int fscrypt_set_per_file_enc_key(struct fscrypt_inode_info *ci, const u8 *raw_key); int fscrypt_derive_dirhash_key(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); void fscrypt_hash_inode_number(struct fscrypt_inode_info *ci, const struct fscrypt_master_key *mk); int fscrypt_get_encryption_info(struct inode *inode, bool allow_unsupported); /** * fscrypt_require_key() - require an inode's encryption key * @inode: the inode we need the key for * * If the inode is encrypted, set up its encryption key if not already done. * Then require that the key be present and return -ENOKEY otherwise. * * No locks are needed, and the key will live as long as the struct inode --- so * it won't go away from under you. * * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code * if a problem occurred while setting up the encryption key. */ static inline int fscrypt_require_key(struct inode *inode) { if (IS_ENCRYPTED(inode)) { int err = fscrypt_get_encryption_info(inode, false); if (err) return err; if (!fscrypt_has_encryption_key(inode)) return -ENOKEY; } return 0; } /* keysetup_v1.c */ void fscrypt_put_direct_key(struct fscrypt_direct_key *dk); int fscrypt_setup_v1_file_key(struct fscrypt_inode_info *ci, const u8 *raw_master_key); int fscrypt_setup_v1_file_key_via_subscribed_keyrings( struct fscrypt_inode_info *ci); /* policy.c */ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, const union fscrypt_policy *policy2); int fscrypt_policy_to_key_spec(const union fscrypt_policy *policy, struct fscrypt_key_specifier *key_spec); const union fscrypt_policy *fscrypt_get_dummy_policy(struct super_block *sb); bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode); int fscrypt_policy_from_context(union fscrypt_policy *policy_u, const union fscrypt_context *ctx_u, int ctx_size); const union fscrypt_policy *fscrypt_policy_to_inherit(struct inode *dir); #endif /* _FSCRYPT_PRIVATE_H */ |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright (c) 1999-2002 Vojtech Pavlik */ #ifndef _INPUT_H #define _INPUT_H #include <linux/time.h> #include <linux/list.h> #include <uapi/linux/input.h> /* Implementation details, userspace should not care about these */ #define ABS_MT_FIRST ABS_MT_TOUCH_MAJOR #define ABS_MT_LAST ABS_MT_TOOL_Y /* * In-kernel definitions. */ #include <linux/device.h> #include <linux/fs.h> #include <linux/timer.h> #include <linux/mod_devicetable.h> struct input_dev_poller; /** * struct input_value - input value representation * @type: type of value (EV_KEY, EV_ABS, etc) * @code: the value code * @value: the value */ struct input_value { __u16 type; __u16 code; __s32 value; }; enum input_clock_type { INPUT_CLK_REAL = 0, INPUT_CLK_MONO, INPUT_CLK_BOOT, INPUT_CLK_MAX }; /** * struct input_dev - represents an input device * @name: name of the device * @phys: physical path to the device in the system hierarchy * @uniq: unique identification code for the device (if device has it) * @id: id of the device (struct input_id) * @propbit: bitmap of device properties and quirks * @evbit: bitmap of types of events supported by the device (EV_KEY, * EV_REL, etc.) * @keybit: bitmap of keys/buttons this device has * @relbit: bitmap of relative axes for the device * @absbit: bitmap of absolute axes for the device * @mscbit: bitmap of miscellaneous events supported by the device * @ledbit: bitmap of leds present on the device * @sndbit: bitmap of sound effects supported by the device * @ffbit: bitmap of force feedback effects supported by the device * @swbit: bitmap of switches present on the device * @hint_events_per_packet: average number of events generated by the * device in a packet (between EV_SYN/SYN_REPORT events). Used by * event handlers to estimate size of the buffer needed to hold * events. * @keycodemax: size of keycode table * @keycodesize: size of elements in keycode table * @keycode: map of scancodes to keycodes for this device * @getkeycode: optional legacy method to retrieve current keymap. * @setkeycode: optional method to alter current keymap, used to implement * sparse keymaps. If not supplied default mechanism will be used. * The method is being called while holding event_lock and thus must * not sleep * @ff: force feedback structure associated with the device if device * supports force feedback effects * @poller: poller structure associated with the device if device is * set up to use polling mode * @repeat_key: stores key code of the last key pressed; used to implement * software autorepeat * @timer: timer for software autorepeat * @rep: current values for autorepeat parameters (delay, rate) * @mt: pointer to multitouch state * @absinfo: array of &struct input_absinfo elements holding information * about absolute axes (current value, min, max, flat, fuzz, * resolution) * @key: reflects current state of device's keys/buttons * @led: reflects current state of device's LEDs * @snd: reflects current state of sound effects * @sw: reflects current state of device's switches * @open: this method is called when the very first user calls * input_open_device(). The driver must prepare the device * to start generating events (start polling thread, * request an IRQ, submit URB, etc.). The meaning of open() is * to start providing events to the input core. * @close: this method is called when the very last user calls * input_close_device(). The meaning of close() is to stop * providing events to the input core. * @flush: purges the device. Most commonly used to get rid of force * feedback effects loaded into the device when disconnecting * from it * @event: event handler for events sent _to_ the device, like EV_LED * or EV_SND. The device is expected to carry out the requested * action (turn on a LED, play sound, etc.) The call is protected * by @event_lock and must not sleep * @grab: input handle that currently has the device grabbed (via * EVIOCGRAB ioctl). When a handle grabs a device it becomes sole * recipient for all input events coming from the device * @event_lock: this spinlock is taken when input core receives * and processes a new event for the device (in input_event()). * Code that accesses and/or modifies parameters of a device * (such as keymap or absmin, absmax, absfuzz, etc.) after device * has been registered with input core must take this lock. * @mutex: serializes calls to open(), close() and flush() methods * @users: stores number of users (input handlers) that opened this * device. It is used by input_open_device() and input_close_device() * to make sure that dev->open() is only called when the first * user opens device and dev->close() is called when the very * last user closes the device * @going_away: marks devices that are in a middle of unregistering and * causes input_open_device*() fail with -ENODEV. * @dev: driver model's view of this device * @h_list: list of input handles associated with the device. When * accessing the list dev->mutex must be held * @node: used to place the device onto input_dev_list * @num_vals: number of values queued in the current frame * @max_vals: maximum number of values queued in a frame * @vals: array of values queued in the current frame * @devres_managed: indicates that devices is managed with devres framework * and needs not be explicitly unregistered or freed. * @timestamp: storage for a timestamp set by input_set_timestamp called * by a driver * @inhibited: indicates that the input device is inhibited. If that is * the case then input core ignores any events generated by the device. * Device's close() is called when it is being inhibited and its open() * is called when it is being uninhibited. */ struct input_dev { const char *name; const char *phys; const char *uniq; struct input_id id; unsigned long propbit[BITS_TO_LONGS(INPUT_PROP_CNT)]; unsigned long evbit[BITS_TO_LONGS(EV_CNT)]; unsigned long keybit[BITS_TO_LONGS(KEY_CNT)]; unsigned long relbit[BITS_TO_LONGS(REL_CNT)]; unsigned long absbit[BITS_TO_LONGS(ABS_CNT)]; unsigned long mscbit[BITS_TO_LONGS(MSC_CNT)]; unsigned long ledbit[BITS_TO_LONGS(LED_CNT)]; unsigned long sndbit[BITS_TO_LONGS(SND_CNT)]; unsigned long ffbit[BITS_TO_LONGS(FF_CNT)]; unsigned long swbit[BITS_TO_LONGS(SW_CNT)]; unsigned int hint_events_per_packet; unsigned int keycodemax; unsigned int keycodesize; void *keycode; int (*setkeycode)(struct input_dev *dev, const struct input_keymap_entry *ke, unsigned int *old_keycode); int (*getkeycode)(struct input_dev *dev, struct input_keymap_entry *ke); struct ff_device *ff; struct input_dev_poller *poller; unsigned int repeat_key; struct timer_list timer; int rep[REP_CNT]; struct input_mt *mt; struct input_absinfo *absinfo; unsigned long key[BITS_TO_LONGS(KEY_CNT)]; unsigned long led[BITS_TO_LONGS(LED_CNT)]; unsigned long snd[BITS_TO_LONGS(SND_CNT)]; unsigned long sw[BITS_TO_LONGS(SW_CNT)]; int (*open)(struct input_dev *dev); void (*close)(struct input_dev *dev); int (*flush)(struct input_dev *dev, struct file *file); int (*event)(struct input_dev *dev, unsigned int type, unsigned int code, int value); struct input_handle __rcu *grab; spinlock_t event_lock; struct mutex mutex; unsigned int users; bool going_away; struct device dev; struct list_head h_list; struct list_head node; unsigned int num_vals; unsigned int max_vals; struct input_value *vals; bool devres_managed; ktime_t timestamp[INPUT_CLK_MAX]; bool inhibited; }; #define to_input_dev(d) container_of(d, struct input_dev, dev) /* * Verify that we are in sync with input_device_id mod_devicetable.h #defines */ #if EV_MAX != INPUT_DEVICE_ID_EV_MAX #error "EV_MAX and INPUT_DEVICE_ID_EV_MAX do not match" #endif #if KEY_MIN_INTERESTING != INPUT_DEVICE_ID_KEY_MIN_INTERESTING #error "KEY_MIN_INTERESTING and INPUT_DEVICE_ID_KEY_MIN_INTERESTING do not match" #endif #if KEY_MAX != INPUT_DEVICE_ID_KEY_MAX #error "KEY_MAX and INPUT_DEVICE_ID_KEY_MAX do not match" #endif #if REL_MAX != INPUT_DEVICE_ID_REL_MAX #error "REL_MAX and INPUT_DEVICE_ID_REL_MAX do not match" #endif #if ABS_MAX != INPUT_DEVICE_ID_ABS_MAX #error "ABS_MAX and INPUT_DEVICE_ID_ABS_MAX do not match" #endif #if MSC_MAX != INPUT_DEVICE_ID_MSC_MAX #error "MSC_MAX and INPUT_DEVICE_ID_MSC_MAX do not match" #endif #if LED_MAX != INPUT_DEVICE_ID_LED_MAX #error "LED_MAX and INPUT_DEVICE_ID_LED_MAX do not match" #endif #if SND_MAX != INPUT_DEVICE_ID_SND_MAX #error "SND_MAX and INPUT_DEVICE_ID_SND_MAX do not match" #endif #if FF_MAX != INPUT_DEVICE_ID_FF_MAX #error "FF_MAX and INPUT_DEVICE_ID_FF_MAX do not match" #endif #if SW_MAX != INPUT_DEVICE_ID_SW_MAX #error "SW_MAX and INPUT_DEVICE_ID_SW_MAX do not match" #endif #if INPUT_PROP_MAX != INPUT_DEVICE_ID_PROP_MAX #error "INPUT_PROP_MAX and INPUT_DEVICE_ID_PROP_MAX do not match" #endif #define INPUT_DEVICE_ID_MATCH_DEVICE \ (INPUT_DEVICE_ID_MATCH_BUS | INPUT_DEVICE_ID_MATCH_VENDOR | INPUT_DEVICE_ID_MATCH_PRODUCT) #define INPUT_DEVICE_ID_MATCH_DEVICE_AND_VERSION \ (INPUT_DEVICE_ID_MATCH_DEVICE | INPUT_DEVICE_ID_MATCH_VERSION) struct input_handle; /** * struct input_handler - implements one of interfaces for input devices * @private: driver-specific data * @event: event handler. This method is being called by input core with * interrupts disabled and dev->event_lock spinlock held and so * it may not sleep * @events: event sequence handler. This method is being called by * input core with interrupts disabled and dev->event_lock * spinlock held and so it may not sleep. The method must return * number of events passed to it. * @filter: similar to @event; separates normal event handlers from * "filters". * @match: called after comparing device's id with handler's id_table * to perform fine-grained matching between device and handler * @connect: called when attaching a handler to an input device * @disconnect: disconnects a handler from input device * @start: starts handler for given handle. This function is called by * input core right after connect() method and also when a process * that "grabbed" a device releases it * @passive_observer: set to %true by drivers only interested in observing * data stream from devices if there are other users present. Such * drivers will not result in starting underlying hardware device * when input_open_device() is called for their handles * @legacy_minors: set to %true by drivers using legacy minor ranges * @minor: beginning of range of 32 legacy minors for devices this driver * can provide * @name: name of the handler, to be shown in /proc/bus/input/handlers * @id_table: pointer to a table of input_device_ids this driver can * handle * @h_list: list of input handles associated with the handler * @node: for placing the driver onto input_handler_list * * Input handlers attach to input devices and create input handles. There * are likely several handlers attached to any given input device at the * same time. All of them will get their copy of input event generated by * the device. * * The very same structure is used to implement input filters. Input core * allows filters to run first and will not pass event to regular handlers * if any of the filters indicate that the event should be filtered (by * returning %true from their filter() method). * * Note that input core serializes calls to connect() and disconnect() * methods. */ struct input_handler { void *private; void (*event)(struct input_handle *handle, unsigned int type, unsigned int code, int value); unsigned int (*events)(struct input_handle *handle, struct input_value *vals, unsigned int count); bool (*filter)(struct input_handle *handle, unsigned int type, unsigned int code, int value); bool (*match)(struct input_handler *handler, struct input_dev *dev); int (*connect)(struct input_handler *handler, struct input_dev *dev, const struct input_device_id *id); void (*disconnect)(struct input_handle *handle); void (*start)(struct input_handle *handle); bool passive_observer; bool legacy_minors; int minor; const char *name; const struct input_device_id *id_table; struct list_head h_list; struct list_head node; }; /** * struct input_handle - links input device with an input handler * @private: handler-specific data * @open: counter showing whether the handle is 'open', i.e. should deliver * events from its device * @name: name given to the handle by handler that created it * @dev: input device the handle is attached to * @handler: handler that works with the device through this handle * @handle_events: event sequence handler. It is set up by the input core * according to event handling method specified in the @handler. See * input_handle_setup_event_handler(). * This method is being called by the input core with interrupts disabled * and dev->event_lock spinlock held and so it may not sleep. * @d_node: used to put the handle on device's list of attached handles * @h_node: used to put the handle on handler's list of handles from which * it gets events */ struct input_handle { void *private; int open; const char *name; struct input_dev *dev; struct input_handler *handler; unsigned int (*handle_events)(struct input_handle *handle, struct input_value *vals, unsigned int count); struct list_head d_node; struct list_head h_node; }; struct input_dev __must_check *input_allocate_device(void); struct input_dev __must_check *devm_input_allocate_device(struct device *); void input_free_device(struct input_dev *dev); static inline struct input_dev *input_get_device(struct input_dev *dev) { return dev ? to_input_dev(get_device(&dev->dev)) : NULL; } static inline void input_put_device(struct input_dev *dev) { if (dev) put_device(&dev->dev); } static inline void *input_get_drvdata(struct input_dev *dev) { return dev_get_drvdata(&dev->dev); } static inline void input_set_drvdata(struct input_dev *dev, void *data) { dev_set_drvdata(&dev->dev, data); } int __must_check input_register_device(struct input_dev *); void input_unregister_device(struct input_dev *); void input_reset_device(struct input_dev *); int input_setup_polling(struct input_dev *dev, void (*poll_fn)(struct input_dev *dev)); void input_set_poll_interval(struct input_dev *dev, unsigned int interval); void input_set_min_poll_interval(struct input_dev *dev, unsigned int interval); void input_set_max_poll_interval(struct input_dev *dev, unsigned int interval); int input_get_poll_interval(struct input_dev *dev); int __must_check input_register_handler(struct input_handler *); void input_unregister_handler(struct input_handler *); int __must_check input_get_new_minor(int legacy_base, unsigned int legacy_num, bool allow_dynamic); void input_free_minor(unsigned int minor); int input_handler_for_each_handle(struct input_handler *, void *data, int (*fn)(struct input_handle *, void *)); int input_register_handle(struct input_handle *); void input_unregister_handle(struct input_handle *); int input_grab_device(struct input_handle *); void input_release_device(struct input_handle *); int input_open_device(struct input_handle *); void input_close_device(struct input_handle *); int input_flush_device(struct input_handle *handle, struct file *file); void input_set_timestamp(struct input_dev *dev, ktime_t timestamp); ktime_t *input_get_timestamp(struct input_dev *dev); void input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value); void input_inject_event(struct input_handle *handle, unsigned int type, unsigned int code, int value); static inline void input_report_key(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_KEY, code, !!value); } static inline void input_report_rel(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_REL, code, value); } static inline void input_report_abs(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_ABS, code, value); } static inline void input_report_ff_status(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_FF_STATUS, code, value); } static inline void input_report_switch(struct input_dev *dev, unsigned int code, int value) { input_event(dev, EV_SW, code, !!value); } static inline void input_sync(struct input_dev *dev) { input_event(dev, EV_SYN, SYN_REPORT, 0); } static inline void input_mt_sync(struct input_dev *dev) { input_event(dev, EV_SYN, SYN_MT_REPORT, 0); } void input_set_capability(struct input_dev *dev, unsigned int type, unsigned int code); /** * input_set_events_per_packet - tell handlers about the driver event rate * @dev: the input device used by the driver * @n_events: the average number of events between calls to input_sync() * * If the event rate sent from a device is unusually large, use this * function to set the expected event rate. This will allow handlers * to set up an appropriate buffer size for the event stream, in order * to minimize information loss. */ static inline void input_set_events_per_packet(struct input_dev *dev, int n_events) { dev->hint_events_per_packet = n_events; } void input_alloc_absinfo(struct input_dev *dev); void input_set_abs_params(struct input_dev *dev, unsigned int axis, int min, int max, int fuzz, int flat); void input_copy_abs(struct input_dev *dst, unsigned int dst_axis, const struct input_dev *src, unsigned int src_axis); #define INPUT_GENERATE_ABS_ACCESSORS(_suffix, _item) \ static inline int input_abs_get_##_suffix(struct input_dev *dev, \ unsigned int axis) \ { \ return dev->absinfo ? dev->absinfo[axis]._item : 0; \ } \ \ static inline void input_abs_set_##_suffix(struct input_dev *dev, \ unsigned int axis, int val) \ { \ input_alloc_absinfo(dev); \ if (dev->absinfo) \ dev->absinfo[axis]._item = val; \ } INPUT_GENERATE_ABS_ACCESSORS(val, value) INPUT_GENERATE_ABS_ACCESSORS(min, minimum) INPUT_GENERATE_ABS_ACCESSORS(max, maximum) INPUT_GENERATE_ABS_ACCESSORS(fuzz, fuzz) INPUT_GENERATE_ABS_ACCESSORS(flat, flat) INPUT_GENERATE_ABS_ACCESSORS(res, resolution) int input_scancode_to_scalar(const struct input_keymap_entry *ke, unsigned int *scancode); int input_get_keycode(struct input_dev *dev, struct input_keymap_entry *ke); int input_set_keycode(struct input_dev *dev, const struct input_keymap_entry *ke); bool input_match_device_id(const struct input_dev *dev, const struct input_device_id *id); void input_enable_softrepeat(struct input_dev *dev, int delay, int period); bool input_device_enabled(struct input_dev *dev); extern const struct class input_class; /** * struct ff_device - force-feedback part of an input device * @upload: Called to upload an new effect into device * @erase: Called to erase an effect from device * @playback: Called to request device to start playing specified effect * @set_gain: Called to set specified gain * @set_autocenter: Called to auto-center device * @destroy: called by input core when parent input device is being * destroyed * @private: driver-specific data, will be freed automatically * @ffbit: bitmap of force feedback capabilities truly supported by * device (not emulated like ones in input_dev->ffbit) * @mutex: mutex for serializing access to the device * @max_effects: maximum number of effects supported by device * @effects: pointer to an array of effects currently loaded into device * @effect_owners: array of effect owners; when file handle owning * an effect gets closed the effect is automatically erased * * Every force-feedback device must implement upload() and playback() * methods; erase() is optional. set_gain() and set_autocenter() need * only be implemented if driver sets up FF_GAIN and FF_AUTOCENTER * bits. * * Note that playback(), set_gain() and set_autocenter() are called with * dev->event_lock spinlock held and interrupts off and thus may not * sleep. */ struct ff_device { int (*upload)(struct input_dev *dev, struct ff_effect *effect, struct ff_effect *old); int (*erase)(struct input_dev *dev, int effect_id); int (*playback)(struct input_dev *dev, int effect_id, int value); void (*set_gain)(struct input_dev *dev, u16 gain); void (*set_autocenter)(struct input_dev *dev, u16 magnitude); void (*destroy)(struct ff_device *); void *private; unsigned long ffbit[BITS_TO_LONGS(FF_CNT)]; struct mutex mutex; int max_effects; struct ff_effect *effects; struct file *effect_owners[] __counted_by(max_effects); }; int input_ff_create(struct input_dev *dev, unsigned int max_effects); void input_ff_destroy(struct input_dev *dev); int input_ff_event(struct input_dev *dev, unsigned int type, unsigned int code, int value); int input_ff_upload(struct input_dev *dev, struct ff_effect *effect, struct file *file); int input_ff_erase(struct input_dev *dev, int effect_id, struct file *file); int input_ff_flush(struct input_dev *dev, struct file *file); int input_ff_create_memless(struct input_dev *dev, void *data, int (*play_effect)(struct input_dev *, void *, struct ff_effect *)); #endif |
| 14 2 22 14 10 11 6 3 1 2 2 2 2 2 2 1 3 3 2 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 2 1 1 3 2 1 1 3 3 1 1 3 3 3 2 1 1 1 2 2 1 1 2 1 1 1 3 1 2 2 1 1 3 1 2 1 1 2 3 1 2 2 2 1 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 1 2 49 2 1 1 2 42 3 3 2 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 | /* * Copyright (c) 2005-2006 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU * General Public License (GPL) Version 2, available from the file * COPYING in the main directory of this source tree, or the * OpenIB.org BSD license below: * * Redistribution and use in source and binary forms, with or * without modification, are permitted provided that the following * conditions are met: * * - Redistributions of source code must retain the above * copyright notice, this list of conditions and the following * disclaimer. * * - Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following * disclaimer in the documentation and/or other materials * provided with the distribution. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ #include <linux/completion.h> #include <linux/file.h> #include <linux/mutex.h> #include <linux/poll.h> #include <linux/sched.h> #include <linux/idr.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/miscdevice.h> #include <linux/slab.h> #include <linux/sysctl.h> #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/nospec.h> #include <rdma/rdma_user_cm.h> #include <rdma/ib_marshall.h> #include <rdma/rdma_cm.h> #include <rdma/rdma_cm_ib.h> #include <rdma/ib_addr.h> #include <rdma/ib.h> #include <rdma/ib_cm.h> #include <rdma/rdma_netlink.h> #include "core_priv.h" MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access"); MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, .maxlen = sizeof max_backlog, .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_INT_MAX, }, }; struct ucma_file { struct mutex mut; struct file *filp; struct list_head ctx_list; struct list_head event_list; wait_queue_head_t poll_wait; }; struct ucma_context { u32 id; struct completion comp; refcount_t ref; int events_reported; atomic_t backlog; struct ucma_file *file; struct rdma_cm_id *cm_id; struct mutex mutex; u64 uid; struct list_head list; struct list_head mc_list; struct work_struct close_work; }; struct ucma_multicast { struct ucma_context *ctx; u32 id; int events_reported; u64 uid; u8 join_state; struct list_head list; struct sockaddr_storage addr; }; struct ucma_event { struct ucma_context *ctx; struct ucma_context *conn_req_ctx; struct ucma_multicast *mc; struct list_head list; struct rdma_ucm_event_resp resp; }; static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) { struct ucma_context *ctx; ctx = xa_load(&ctx_table, id); if (!ctx) ctx = ERR_PTR(-ENOENT); else if (ctx->file != file) ctx = ERR_PTR(-EINVAL); return ctx; } static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id) { struct ucma_context *ctx; xa_lock(&ctx_table); ctx = _ucma_find_context(id, file); if (!IS_ERR(ctx)) if (!refcount_inc_not_zero(&ctx->ref)) ctx = ERR_PTR(-ENXIO); xa_unlock(&ctx_table); return ctx; } static void ucma_put_ctx(struct ucma_context *ctx) { if (refcount_dec_and_test(&ctx->ref)) complete(&ctx->comp); } /* * Same as ucm_get_ctx but requires that ->cm_id->device is valid, eg that the * CM_ID is bound. */ static struct ucma_context *ucma_get_ctx_dev(struct ucma_file *file, int id) { struct ucma_context *ctx = ucma_get_ctx(file, id); if (IS_ERR(ctx)) return ctx; if (!ctx->cm_id->device) { ucma_put_ctx(ctx); return ERR_PTR(-EINVAL); } return ctx; } static void ucma_close_id(struct work_struct *work) { struct ucma_context *ctx = container_of(work, struct ucma_context, close_work); /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); /* Reading the cm_id without holding a positive ref is not allowed */ ctx->cm_id = NULL; } static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) { struct ucma_context *ctx; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); init_completion(&ctx->comp); INIT_LIST_HEAD(&ctx->mc_list); /* So list_del() will work if we don't do ucma_finish_ctx() */ INIT_LIST_HEAD(&ctx->list); ctx->file = file; mutex_init(&ctx->mutex); if (xa_alloc(&ctx_table, &ctx->id, NULL, xa_limit_32b, GFP_KERNEL)) { kfree(ctx); return NULL; } return ctx; } static void ucma_set_ctx_cm_id(struct ucma_context *ctx, struct rdma_cm_id *cm_id) { refcount_set(&ctx->ref, 1); ctx->cm_id = cm_id; } static void ucma_finish_ctx(struct ucma_context *ctx) { lockdep_assert_held(&ctx->file->mut); list_add_tail(&ctx->list, &ctx->file->ctx_list); xa_store(&ctx_table, ctx->id, ctx, GFP_KERNEL); } static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst, struct rdma_conn_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num; } static void ucma_copy_ud_event(struct ib_device *device, struct rdma_ucm_ud_param *dst, struct rdma_ud_param *src) { if (src->private_data_len) memcpy(dst->private_data, src->private_data, src->private_data_len); dst->private_data_len = src->private_data_len; ib_copy_ah_attr_to_user(device, &dst->ah_attr, &src->ah_attr); dst->qp_num = src->qp_num; dst->qkey = src->qkey; } static struct ucma_event *ucma_create_uevent(struct ucma_context *ctx, struct rdma_cm_event *event) { struct ucma_event *uevent; uevent = kzalloc(sizeof(*uevent), GFP_KERNEL); if (!uevent) return NULL; uevent->ctx = ctx; switch (event->event) { case RDMA_CM_EVENT_MULTICAST_JOIN: case RDMA_CM_EVENT_MULTICAST_ERROR: uevent->mc = (struct ucma_multicast *) event->param.ud.private_data; uevent->resp.uid = uevent->mc->uid; uevent->resp.id = uevent->mc->id; break; default: uevent->resp.uid = ctx->uid; uevent->resp.id = ctx->id; break; } uevent->resp.event = event->event; uevent->resp.status = event->status; if (ctx->cm_id->qp_type == IB_QPT_UD) ucma_copy_ud_event(ctx->cm_id->device, &uevent->resp.param.ud, &event->param.ud); else ucma_copy_conn_event(&uevent->resp.param.conn, &event->param.conn); uevent->resp.ece.vendor_id = event->ece.vendor_id; uevent->resp.ece.attr_mod = event->ece.attr_mod; return uevent; } static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_context *listen_ctx = cm_id->context; struct ucma_context *ctx; struct ucma_event *uevent; if (!atomic_add_unless(&listen_ctx->backlog, -1, 0)) return -ENOMEM; ctx = ucma_alloc_ctx(listen_ctx->file); if (!ctx) goto err_backlog; ucma_set_ctx_cm_id(ctx, cm_id); uevent = ucma_create_uevent(listen_ctx, event); if (!uevent) goto err_alloc; uevent->conn_req_ctx = ctx; uevent->resp.id = ctx->id; ctx->cm_id->context = ctx; mutex_lock(&ctx->file->mut); ucma_finish_ctx(ctx); list_add_tail(&uevent->list, &ctx->file->event_list); mutex_unlock(&ctx->file->mut); wake_up_interruptible(&ctx->file->poll_wait); return 0; err_alloc: ucma_destroy_private_ctx(ctx); err_backlog: atomic_inc(&listen_ctx->backlog); /* Returning error causes the new ID to be destroyed */ return -ENOMEM; } static int ucma_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { struct ucma_event *uevent; struct ucma_context *ctx = cm_id->context; if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) return ucma_connect_event_handler(cm_id, event); /* * We ignore events for new connections until userspace has set their * context. This can only happen if an error occurs on a new connection * before the user accepts it. This is okay, since the accept will just * fail later. However, we do need to release the underlying HW * resources in case of a device removal event. */ if (ctx->uid) { uevent = ucma_create_uevent(ctx, event); if (!uevent) return 0; mutex_lock(&ctx->file->mut); list_add_tail(&uevent->list, &ctx->file->event_list); mutex_unlock(&ctx->file->mut); wake_up_interruptible(&ctx->file->poll_wait); } if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { xa_lock(&ctx_table); if (xa_load(&ctx_table, ctx->id) == ctx) queue_work(system_unbound_wq, &ctx->close_work); xa_unlock(&ctx_table); } return 0; } static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_get_event cmd; struct ucma_event *uevent; /* * Old 32 bit user space does not send the 4 byte padding in the * reserved field. We don't care, allow it to keep working. */ if (out_len < sizeof(uevent->resp) - sizeof(uevent->resp.reserved) - sizeof(uevent->resp.ece)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; mutex_lock(&file->mut); while (list_empty(&file->event_list)) { mutex_unlock(&file->mut); if (file->filp->f_flags & O_NONBLOCK) return -EAGAIN; if (wait_event_interruptible(file->poll_wait, !list_empty(&file->event_list))) return -ERESTARTSYS; mutex_lock(&file->mut); } uevent = list_first_entry(&file->event_list, struct ucma_event, list); if (copy_to_user(u64_to_user_ptr(cmd.response), &uevent->resp, min_t(size_t, out_len, sizeof(uevent->resp)))) { mutex_unlock(&file->mut); return -EFAULT; } list_del(&uevent->list); uevent->ctx->events_reported++; if (uevent->mc) uevent->mc->events_reported++; if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) atomic_inc(&uevent->ctx->backlog); mutex_unlock(&file->mut); kfree(uevent); return 0; } static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type) { switch (cmd->ps) { case RDMA_PS_TCP: *qp_type = IB_QPT_RC; return 0; case RDMA_PS_UDP: case RDMA_PS_IPOIB: *qp_type = IB_QPT_UD; return 0; case RDMA_PS_IB: *qp_type = cmd->qp_type; return 0; default: return -EINVAL; } } static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_create_id cmd; struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct rdma_cm_id *cm_id; enum ib_qp_type qp_type; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ret = ucma_get_qp_type(&cmd, &qp_type); if (ret) return ret; ctx = ucma_alloc_ctx(file); if (!ctx) return -ENOMEM; ctx->uid = cmd.uid; cm_id = rdma_create_user_id(ucma_event_handler, ctx, cmd.ps, qp_type); if (IS_ERR(cm_id)) { ret = PTR_ERR(cm_id); goto err1; } ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { ret = -EFAULT; goto err1; } mutex_lock(&file->mut); ucma_finish_ctx(ctx); mutex_unlock(&file->mut); return 0; err1: ucma_destroy_private_ctx(ctx); return ret; } static void ucma_cleanup_multicast(struct ucma_context *ctx) { struct ucma_multicast *mc, *tmp; xa_lock(&multicast_table); list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) { list_del(&mc->list); /* * At this point mc->ctx->ref is 0 so the mc cannot leave the * lock on the reader and this is enough serialization */ __xa_erase(&multicast_table, mc->id); kfree(mc); } xa_unlock(&multicast_table); } static void ucma_cleanup_mc_events(struct ucma_multicast *mc) { struct ucma_event *uevent, *tmp; rdma_lock_handler(mc->ctx->cm_id); mutex_lock(&mc->ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) { if (uevent->mc != mc) continue; list_del(&uevent->list); kfree(uevent); } mutex_unlock(&mc->ctx->file->mut); rdma_unlock_handler(mc->ctx->cm_id); } static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { if (uevent->ctx != ctx) continue; if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, uevent->conn_req_ctx, XA_ZERO_ENTRY, GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); continue; } list_del(&uevent->list); kfree(uevent); } list_del(&ctx->list); events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); /* * If this was a listening ID then any connections spawned from it that * have not been delivered to userspace are cleaned up too. Must be done * outside any locks. */ list_for_each_entry_safe(uevent, tmp, &list, list) { ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } return events_reported; } /* * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie * the ctx is not public to the user). This either because: * - ucma_finish_ctx() hasn't been called * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) */ static int ucma_destroy_private_ctx(struct ucma_context *ctx) { int events_reported; /* * Destroy the underlying cm_id. New work queuing is prevented now by * the removal from the xarray. Once the work is cancled ref will either * be 0 because the work ran to completion and consumed the ref from the * xarray, or it will be positive because we still have the ref from the * xarray. This can also be 0 in cases where cm_id was never set */ cancel_work_sync(&ctx->close_work); if (refcount_read(&ctx->ref)) ucma_close_id(&ctx->close_work); events_reported = ucma_cleanup_ctx_events(ctx); ucma_cleanup_multicast(ctx); WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, GFP_KERNEL) != NULL); mutex_destroy(&ctx->mutex); kfree(ctx); return events_reported; } static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_context *ctx; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); if (!IS_ERR(ctx)) { if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, GFP_KERNEL) != ctx) ctx = ERR_PTR(-ENOENT); } xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; return ret; } static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!rdma_addr_size_in6(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_bind cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.reserved || !cmd.addr_size || cmd.addr_size != rdma_addr_size_kss(&cmd.addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_ip(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_ip cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if ((cmd.src_addr.sin6_family && !rdma_addr_size_in6(&cmd.src_addr)) || !rdma_addr_size_in6(&cmd.dst_addr)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_addr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_addr cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size_kss(&cmd.src_addr))) || !cmd.dst_size || (cmd.dst_size != rdma_addr_size_kss(&cmd.dst_addr))) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr, (struct sockaddr *) &cmd.dst_addr, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_resolve_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_resolve_route cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; resp->num_paths = route->num_pri_alt_paths; switch (route->num_pri_alt_paths) { case 0: dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr)); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { resp->num_paths = route->num_pri_alt_paths; switch (route->num_pri_alt_paths) { case 0: rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr, (union ib_gid *)&resp->ib_route[0].dgid); rdma_ip2gid((struct sockaddr *)&route->addr.src_addr, (union ib_gid *)&resp->ib_route[0].sgid); resp->ib_route[0].pkey = cpu_to_be16(0xffff); break; case 2: ib_copy_path_rec_to_user(&resp->ib_route[1], &route->path_rec[1]); fallthrough; case 1: ib_copy_path_rec_to_user(&resp->ib_route[0], &route->path_rec[0]); break; default: break; } } static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp, struct rdma_route *route) { struct rdma_dev_addr *dev_addr; dev_addr = &route->addr.dev_addr; rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid); rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid); } static ssize_t ucma_query_route(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_query cmd; struct rdma_ucm_query_route_resp resp; struct ucma_context *ctx; struct sockaddr *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_route_resp, ibdev_index)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6)); if (!ctx->cm_id->device) goto out; resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid; resp.ibdev_index = ctx->cm_id->device->index; resp.port_num = ctx->cm_id->port_num; if (rdma_cap_ib_sa(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_ib_route(&resp, &ctx->cm_id->route); else if (rdma_protocol_roce(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iboe_route(&resp, &ctx->cm_id->route); else if (rdma_protocol_iwarp(ctx->cm_id->device, ctx->cm_id->port_num)) ucma_copy_iw_route(&resp, &ctx->cm_id->route); out: mutex_unlock(&ctx->mutex); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; ucma_put_ctx(ctx); return ret; } static void ucma_query_device_addr(struct rdma_cm_id *cm_id, struct rdma_ucm_query_addr_resp *resp) { if (!cm_id->device) return; resp->node_guid = (__force __u64) cm_id->device->node_guid; resp->ibdev_index = cm_id->device->index; resp->port_num = cm_id->port_num; resp->pkey = (__force __u16) cpu_to_be16( ib_addr_get_pkey(&cm_id->route.addr.dev_addr)); } static ssize_t ucma_query_addr(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_addr_resp resp; struct sockaddr *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr; resp.src_size = rdma_addr_size(addr); memcpy(&resp.src_addr, addr, resp.src_size); addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr; resp.dst_size = rdma_addr_size(addr); memcpy(&resp.dst_addr, addr, resp.dst_size); ucma_query_device_addr(ctx->cm_id, &resp); if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } static ssize_t ucma_query_path(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_path_resp *resp; int i, ret = 0; if (out_len < sizeof(*resp)) return -ENOSPC; resp = kzalloc(out_len, GFP_KERNEL); if (!resp) return -ENOMEM; resp->num_paths = ctx->cm_id->route.num_pri_alt_paths; for (i = 0, out_len -= sizeof(*resp); i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data); i++, out_len -= sizeof(struct ib_path_rec_data)) { struct sa_path_rec *rec = &ctx->cm_id->route.path_rec[i]; resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL; if (rec->rec_type == SA_PATH_REC_TYPE_OPA) { struct sa_path_rec ib; sa_convert_path_opa_to_ib(&ib, rec); ib_sa_pack_path(&ib, &resp->path_data[i].path_rec); } else { ib_sa_pack_path(rec, &resp->path_data[i].path_rec); } } if (copy_to_user(response, resp, struct_size(resp, path_data, i))) ret = -EFAULT; kfree(resp); return ret; } static ssize_t ucma_query_gid(struct ucma_context *ctx, void __user *response, int out_len) { struct rdma_ucm_query_addr_resp resp; struct sockaddr_ib *addr; int ret = 0; if (out_len < offsetof(struct rdma_ucm_query_addr_resp, ibdev_index)) return -ENOSPC; memset(&resp, 0, sizeof resp); ucma_query_device_addr(ctx->cm_id, &resp); addr = (struct sockaddr_ib *) &resp.src_addr; resp.src_size = sizeof(*addr); if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) { memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size); } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; rdma_read_gids(ctx->cm_id, (union ib_gid *)&addr->sib_addr, NULL); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.src_addr); } addr = (struct sockaddr_ib *) &resp.dst_addr; resp.dst_size = sizeof(*addr); if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) { memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size); } else { addr->sib_family = AF_IB; addr->sib_pkey = (__force __be16) resp.pkey; rdma_read_gids(ctx->cm_id, NULL, (union ib_gid *)&addr->sib_addr); addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr); } if (copy_to_user(response, &resp, min_t(size_t, out_len, sizeof(resp)))) ret = -EFAULT; return ret; } static ssize_t ucma_query(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_query cmd; struct ucma_context *ctx; void __user *response; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; response = u64_to_user_ptr(cmd.response); ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); switch (cmd.option) { case RDMA_USER_CM_QUERY_ADDR: ret = ucma_query_addr(ctx, response, out_len); break; case RDMA_USER_CM_QUERY_PATH: ret = ucma_query_path(ctx, response, out_len); break; case RDMA_USER_CM_QUERY_GID: ret = ucma_query_gid(ctx, response, out_len); break; default: ret = -ENOSYS; break; } mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static void ucma_copy_conn_param(struct rdma_cm_id *id, struct rdma_conn_param *dst, struct rdma_ucm_conn_param *src) { dst->private_data = src->private_data; dst->private_data_len = src->private_data_len; dst->responder_resources = src->responder_resources; dst->initiator_depth = src->initiator_depth; dst->flow_control = src->flow_control; dst->retry_count = src->retry_count; dst->rnr_retry_count = src->rnr_retry_count; dst->srq = src->srq; dst->qp_num = src->qp_num & 0xFFFFFF; dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0; } static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_conn_param conn_param; struct rdma_ucm_ece ece = {}; struct rdma_ucm_connect cmd; struct ucma_context *ctx; size_t in_size; int ret; if (in_len < offsetofend(typeof(cmd), reserved)) return -EINVAL; in_size = min_t(size_t, in_len, sizeof(cmd)); if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; if (!cmd.conn_param.valid) return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); if (offsetofend(typeof(cmd), ece) <= in_size) { ece.vendor_id = cmd.ece.vendor_id; ece.attr_mod = cmd.ece.attr_mod; } mutex_lock(&ctx->mutex); ret = rdma_connect_ece(ctx->cm_id, &conn_param, &ece); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_listen cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (cmd.backlog <= 0 || cmd.backlog > max_backlog) cmd.backlog = max_backlog; atomic_set(&ctx->backlog, cmd.backlog); mutex_lock(&ctx->mutex); ret = rdma_listen(ctx->cm_id, cmd.backlog); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_accept cmd; struct rdma_conn_param conn_param; struct rdma_ucm_ece ece = {}; struct ucma_context *ctx; size_t in_size; int ret; if (in_len < offsetofend(typeof(cmd), reserved)) return -EINVAL; in_size = min_t(size_t, in_len, sizeof(cmd)); if (copy_from_user(&cmd, inbuf, in_size)) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); if (offsetofend(typeof(cmd), ece) <= in_size) { ece.vendor_id = cmd.ece.vendor_id; ece.attr_mod = cmd.ece.attr_mod; } if (cmd.conn_param.valid) { ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param); mutex_lock(&ctx->mutex); rdma_lock_handler(ctx->cm_id); ret = rdma_accept_ece(ctx->cm_id, &conn_param, &ece); if (!ret) { /* The uid must be set atomically with the handler */ ctx->uid = cmd.uid; } rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); rdma_lock_handler(ctx->cm_id); ret = rdma_accept_ece(ctx->cm_id, NULL, &ece); rdma_unlock_handler(ctx->cm_id); mutex_unlock(&ctx->mutex); } ucma_put_ctx(ctx); return ret; } static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_reject cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!cmd.reason) cmd.reason = IB_CM_REJ_CONSUMER_DEFINED; switch (cmd.reason) { case IB_CM_REJ_CONSUMER_DEFINED: case IB_CM_REJ_VENDOR_OPTION_NOT_SUPPORTED: break; default: return -EINVAL; } ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len, cmd.reason); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_disconnect cmd; struct ucma_context *ctx; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); ret = rdma_disconnect(ctx->cm_id); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_init_qp_attr(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_init_qp_attr cmd; struct ib_uverbs_qp_attr resp; struct ucma_context *ctx; struct ib_qp_attr qp_attr; int ret; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (cmd.qp_state > IB_QPS_ERR) return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); resp.qp_attr_mask = 0; memset(&qp_attr, 0, sizeof qp_attr); qp_attr.qp_state = cmd.qp_state; mutex_lock(&ctx->mutex); ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask); mutex_unlock(&ctx->mutex); if (ret) goto out; ib_copy_qp_attr_to_user(ctx->cm_id->device, &resp, &qp_attr); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: ucma_put_ctx(ctx); return ret; } static int ucma_set_option_id(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret = 0; switch (optname) { case RDMA_OPTION_ID_TOS: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } rdma_set_service_type(ctx->cm_id, *((u8 *) optval)); break; case RDMA_OPTION_ID_REUSEADDR: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_AFONLY: if (optlen != sizeof(int)) { ret = -EINVAL; break; } ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0); break; case RDMA_OPTION_ID_ACK_TIMEOUT: if (optlen != sizeof(u8)) { ret = -EINVAL; break; } ret = rdma_set_ack_timeout(ctx->cm_id, *((u8 *)optval)); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_ib_path(struct ucma_context *ctx, struct ib_path_rec_data *path_data, size_t optlen) { struct sa_path_rec sa_path; struct rdma_cm_event event; int ret; if (optlen % sizeof(*path_data)) return -EINVAL; for (; optlen; optlen -= sizeof(*path_data), path_data++) { if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_BIDIRECTIONAL)) break; } if (!optlen) return -EINVAL; if (!ctx->cm_id->device) return -EINVAL; memset(&sa_path, 0, sizeof(sa_path)); sa_path.rec_type = SA_PATH_REC_TYPE_IB; ib_sa_unpack_path(path_data->path_rec, &sa_path); if (rdma_cap_opa_ah(ctx->cm_id->device, ctx->cm_id->port_num)) { struct sa_path_rec opa; sa_convert_path_ib_to_opa(&opa, &sa_path); mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &opa); mutex_unlock(&ctx->mutex); } else { mutex_lock(&ctx->mutex); ret = rdma_set_ib_path(ctx->cm_id, &sa_path); mutex_unlock(&ctx->mutex); } if (ret) return ret; memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_ROUTE_RESOLVED; return ucma_event_handler(ctx->cm_id, &event); } static int ucma_set_option_ib(struct ucma_context *ctx, int optname, void *optval, size_t optlen) { int ret; switch (optname) { case RDMA_OPTION_IB_PATH: ret = ucma_set_ib_path(ctx, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static int ucma_set_option_level(struct ucma_context *ctx, int level, int optname, void *optval, size_t optlen) { int ret; switch (level) { case RDMA_OPTION_ID: mutex_lock(&ctx->mutex); ret = ucma_set_option_id(ctx, optname, optval, optlen); mutex_unlock(&ctx->mutex); break; case RDMA_OPTION_IB: ret = ucma_set_option_ib(ctx, optname, optval, optlen); break; default: ret = -ENOSYS; } return ret; } static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_set_option cmd; struct ucma_context *ctx; void *optval; int ret; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (unlikely(cmd.optlen > KMALLOC_MAX_SIZE)) return -EINVAL; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); optval = memdup_user(u64_to_user_ptr(cmd.optval), cmd.optlen); if (IS_ERR(optval)) { ret = PTR_ERR(optval); goto out; } ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval, cmd.optlen); kfree(optval); out: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_notify cmd; struct ucma_context *ctx; int ret = -EINVAL; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; ctx = ucma_get_ctx(file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mutex_lock(&ctx->mutex); if (ctx->cm_id->device) ret = rdma_notify(ctx->cm_id, (enum ib_event_type)cmd.event); mutex_unlock(&ctx->mutex); ucma_put_ctx(ctx); return ret; } static ssize_t ucma_process_join(struct ucma_file *file, struct rdma_ucm_join_mcast *cmd, int out_len) { struct rdma_ucm_create_id_resp resp; struct ucma_context *ctx; struct ucma_multicast *mc; struct sockaddr *addr; int ret; u8 join_state; if (out_len < sizeof(resp)) return -ENOSPC; addr = (struct sockaddr *) &cmd->addr; if (cmd->addr_size != rdma_addr_size(addr)) return -EINVAL; if (cmd->join_flags == RDMA_MC_JOIN_FLAG_FULLMEMBER) join_state = BIT(FULLMEMBER_JOIN); else if (cmd->join_flags == RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER) join_state = BIT(SENDONLY_FULLMEMBER_JOIN); else return -EINVAL; ctx = ucma_get_ctx_dev(file, cmd->id); if (IS_ERR(ctx)) return PTR_ERR(ctx); mc = kzalloc(sizeof(*mc), GFP_KERNEL); if (!mc) { ret = -ENOMEM; goto err_put_ctx; } mc->ctx = ctx; mc->join_state = join_state; mc->uid = cmd->uid; memcpy(&mc->addr, addr, cmd->addr_size); xa_lock(&multicast_table); if (__xa_alloc(&multicast_table, &mc->id, NULL, xa_limit_32b, GFP_KERNEL)) { ret = -ENOMEM; goto err_free_mc; } list_add_tail(&mc->list, &ctx->mc_list); xa_unlock(&multicast_table); mutex_lock(&ctx->mutex); ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *)&mc->addr, join_state, mc); mutex_unlock(&ctx->mutex); if (ret) goto err_xa_erase; resp.id = mc->id; if (copy_to_user(u64_to_user_ptr(cmd->response), &resp, sizeof(resp))) { ret = -EFAULT; goto err_leave_multicast; } xa_store(&multicast_table, mc->id, mc, 0); ucma_put_ctx(ctx); return 0; err_leave_multicast: mutex_lock(&ctx->mutex); rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&ctx->mutex); ucma_cleanup_mc_events(mc); err_xa_erase: xa_lock(&multicast_table); list_del(&mc->list); __xa_erase(&multicast_table, mc->id); err_free_mc: xa_unlock(&multicast_table); kfree(mc); err_put_ctx: ucma_put_ctx(ctx); return ret; } static ssize_t ucma_join_ip_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_join_ip_mcast cmd; struct rdma_ucm_join_mcast join_cmd; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; join_cmd.response = cmd.response; join_cmd.uid = cmd.uid; join_cmd.id = cmd.id; join_cmd.addr_size = rdma_addr_size_in6(&cmd.addr); if (!join_cmd.addr_size) return -EINVAL; join_cmd.join_flags = RDMA_MC_JOIN_FLAG_FULLMEMBER; memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size); return ucma_process_join(file, &join_cmd, out_len); } static ssize_t ucma_join_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_join_mcast cmd; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; if (!rdma_addr_size_kss(&cmd.addr)) return -EINVAL; return ucma_process_join(file, &cmd, out_len); } static ssize_t ucma_leave_multicast(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_destroy_id cmd; struct rdma_ucm_destroy_id_resp resp; struct ucma_multicast *mc; int ret = 0; if (out_len < sizeof(resp)) return -ENOSPC; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; xa_lock(&multicast_table); mc = xa_load(&multicast_table, cmd.id); if (!mc) mc = ERR_PTR(-ENOENT); else if (READ_ONCE(mc->ctx->file) != file) mc = ERR_PTR(-EINVAL); else if (!refcount_inc_not_zero(&mc->ctx->ref)) mc = ERR_PTR(-ENXIO); if (IS_ERR(mc)) { xa_unlock(&multicast_table); ret = PTR_ERR(mc); goto out; } list_del(&mc->list); __xa_erase(&multicast_table, mc->id); xa_unlock(&multicast_table); mutex_lock(&mc->ctx->mutex); rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr); mutex_unlock(&mc->ctx->mutex); ucma_cleanup_mc_events(mc); ucma_put_ctx(mc->ctx); resp.events_reported = mc->events_reported; kfree(mc); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; out: return ret; } static ssize_t ucma_migrate_id(struct ucma_file *new_file, const char __user *inbuf, int in_len, int out_len) { struct rdma_ucm_migrate_id cmd; struct rdma_ucm_migrate_resp resp; struct ucma_event *uevent, *tmp; struct ucma_context *ctx; LIST_HEAD(event_list); struct ucma_file *cur_file; int ret = 0; if (copy_from_user(&cmd, inbuf, sizeof(cmd))) return -EFAULT; /* Get current fd to protect against it being closed */ CLASS(fd, f)(cmd.fd); if (fd_empty(f)) return -ENOENT; if (fd_file(f)->f_op != &ucma_fops) return -EINVAL; cur_file = fd_file(f)->private_data; /* Validate current fd and prevent destruction of id. */ ctx = ucma_get_ctx(cur_file, cmd.id); if (IS_ERR(ctx)) return PTR_ERR(ctx); rdma_lock_handler(ctx->cm_id); /* * ctx->file can only be changed under the handler & xa_lock. xa_load() * must be checked again to ensure the ctx hasn't begun destruction * since the ucma_get_ctx(). */ xa_lock(&ctx_table); if (_ucma_find_context(cmd.id, cur_file) != ctx) { xa_unlock(&ctx_table); ret = -ENOENT; goto err_unlock; } ctx->file = new_file; xa_unlock(&ctx_table); mutex_lock(&cur_file->mut); list_del(&ctx->list); /* * At this point lock_handler() prevents addition of new uevents for * this ctx. */ list_for_each_entry_safe(uevent, tmp, &cur_file->event_list, list) if (uevent->ctx == ctx) list_move_tail(&uevent->list, &event_list); resp.events_reported = ctx->events_reported; mutex_unlock(&cur_file->mut); mutex_lock(&new_file->mut); list_add_tail(&ctx->list, &new_file->ctx_list); list_splice_tail(&event_list, &new_file->event_list); mutex_unlock(&new_file->mut); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; err_unlock: rdma_unlock_handler(ctx->cm_id); ucma_put_ctx(ctx); return ret; } static ssize_t (*ucma_cmd_table[])(struct ucma_file *file, const char __user *inbuf, int in_len, int out_len) = { [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id, [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id, [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip, [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip, [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route, [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route, [RDMA_USER_CM_CMD_CONNECT] = ucma_connect, [RDMA_USER_CM_CMD_LISTEN] = ucma_listen, [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept, [RDMA_USER_CM_CMD_REJECT] = ucma_reject, [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect, [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr, [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event, [RDMA_USER_CM_CMD_GET_OPTION] = NULL, [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option, [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify, [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast, [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast, [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id, [RDMA_USER_CM_CMD_QUERY] = ucma_query, [RDMA_USER_CM_CMD_BIND] = ucma_bind, [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr, [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast }; static ssize_t ucma_write(struct file *filp, const char __user *buf, size_t len, loff_t *pos) { struct ucma_file *file = filp->private_data; struct rdma_ucm_cmd_hdr hdr; ssize_t ret; if (!ib_safe_file_access(filp)) { pr_err_once("%s: process %d (%s) changed security contexts after opening file descriptor, this is not allowed.\n", __func__, task_tgid_vnr(current), current->comm); return -EACCES; } if (len < sizeof(hdr)) return -EINVAL; if (copy_from_user(&hdr, buf, sizeof(hdr))) return -EFAULT; if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table)) return -EINVAL; hdr.cmd = array_index_nospec(hdr.cmd, ARRAY_SIZE(ucma_cmd_table)); if (hdr.in + sizeof(hdr) > len) return -EINVAL; if (!ucma_cmd_table[hdr.cmd]) return -ENOSYS; ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out); if (!ret) ret = len; return ret; } static __poll_t ucma_poll(struct file *filp, struct poll_table_struct *wait) { struct ucma_file *file = filp->private_data; __poll_t mask = 0; poll_wait(filp, &file->poll_wait, wait); if (!list_empty(&file->event_list)) mask = EPOLLIN | EPOLLRDNORM; return mask; } /* * ucma_open() does not need the BKL: * * - no global state is referred to; * - there is no ioctl method to race against; * - no further module initialization is required for open to work * after the device is registered. */ static int ucma_open(struct inode *inode, struct file *filp) { struct ucma_file *file; file = kmalloc(sizeof *file, GFP_KERNEL); if (!file) return -ENOMEM; INIT_LIST_HEAD(&file->event_list); INIT_LIST_HEAD(&file->ctx_list); init_waitqueue_head(&file->poll_wait); mutex_init(&file->mut); filp->private_data = file; file->filp = filp; return stream_open(inode, filp); } static int ucma_close(struct inode *inode, struct file *filp) { struct ucma_file *file = filp->private_data; /* * All paths that touch ctx_list or ctx_list starting from write() are * prevented by this being a FD release function. The list_add_tail() in * ucma_connect_event_handler() can run concurrently, however it only * adds to the list *after* a listening ID. By only reading the first of * the list, and relying on ucma_destroy_private_ctx() to block * ucma_connect_event_handler(), no additional locking is needed. */ while (!list_empty(&file->ctx_list)) { struct ucma_context *ctx = list_first_entry( &file->ctx_list, struct ucma_context, list); WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, GFP_KERNEL) != ctx); ucma_destroy_private_ctx(ctx); } kfree(file); return 0; } static const struct file_operations ucma_fops = { .owner = THIS_MODULE, .open = ucma_open, .release = ucma_close, .write = ucma_write, .poll = ucma_poll, }; static struct miscdevice ucma_misc = { .minor = MISC_DYNAMIC_MINOR, .name = "rdma_cm", .nodename = "infiniband/rdma_cm", .mode = 0666, .fops = &ucma_fops, }; static int ucma_get_global_nl_info(struct ib_client_nl_info *res) { res->abi = RDMA_USER_CM_ABI_VERSION; res->cdev = ucma_misc.this_device; return 0; } static struct ib_client rdma_cma_client = { .name = "rdma_cm", .get_global_nl_info = ucma_get_global_nl_info, }; MODULE_ALIAS_RDMA_CLIENT("rdma_cm"); static ssize_t abi_version_show(struct device *dev, struct device_attribute *attr, char *buf) { return sysfs_emit(buf, "%d\n", RDMA_USER_CM_ABI_VERSION); } static DEVICE_ATTR_RO(abi_version); static int __init ucma_init(void) { int ret; ret = misc_register(&ucma_misc); if (ret) return ret; ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version); if (ret) { pr_err("rdma_ucm: couldn't create abi_version attr\n"); goto err1; } ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table); if (!ucma_ctl_table_hdr) { pr_err("rdma_ucm: couldn't register sysctl paths\n"); ret = -ENOMEM; goto err2; } ret = ib_register_client(&rdma_cma_client); if (ret) goto err3; return 0; err3: unregister_net_sysctl_table(ucma_ctl_table_hdr); err2: device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); err1: misc_deregister(&ucma_misc); return ret; } static void __exit ucma_cleanup(void) { ib_unregister_client(&rdma_cma_client); unregister_net_sysctl_table(ucma_ctl_table_hdr); device_remove_file(ucma_misc.this_device, &dev_attr_abi_version); misc_deregister(&ucma_misc); } module_init(ucma_init); module_exit(ucma_cleanup); |
| 186 186 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 | // SPDX-License-Identifier: GPL-2.0 /* * Assorted bcachefs debug code * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_foreground.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" #include "btree_iter.h" #include "btree_locking.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "error.h" #include "extents.h" #include "fsck.h" #include "inode.h" #include "journal_reclaim.h" #include "super.h" #include <linux/console.h> #include <linux/debugfs.h> #include <linux/module.h> #include <linux/random.h> #include <linux/seq_file.h> static struct dentry *bch_debug; static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct extent_ptr_decoded pick) { struct btree *v = c->verify_data; struct btree_node *n_ondisk = c->verify_ondisk; struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; struct bio *bio; bool failed = false, saw_error = false; struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); if (!ca) return false; bio = bio_alloc_bioset(ca->disk_sb.bdev, buf_pages(n_sorted, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_sorted, btree_buf_bytes(b)); submit_bio_wait(bio); bio_put(bio); percpu_ref_put(&ca->io_ref[READ]); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) return false; n_sorted = c->verify_data->data; sorted = &n_sorted->keys; if (inmemory->u64s != sorted->u64s || memcmp(inmemory->start, sorted->start, vstruct_end(inmemory) - (void *) inmemory->start)) { unsigned offset = 0, sectors; struct bset *i; unsigned j; console_lock(); printk(KERN_ERR "*** in memory:\n"); bch2_dump_bset(c, b, inmemory, 0); printk(KERN_ERR "*** read back in:\n"); bch2_dump_bset(c, v, sorted, 0); while (offset < v->written) { if (!offset) { i = &n_ondisk->keys; sectors = vstruct_blocks(n_ondisk, c->block_bits) << c->block_bits; } else { struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); i = &bne->keys; sectors = vstruct_blocks(bne, c->block_bits) << c->block_bits; } printk(KERN_ERR "*** on disk block %u:\n", offset); bch2_dump_bset(c, b, i, offset); offset += sectors; } for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) if (inmemory->_data[j] != sorted->_data[j]) break; console_unlock(); bch_err(c, "verify failed at key %u", j); failed = true; } if (v->written != b->written) { bch_err(c, "written wrong: expected %u, got %u", b->written, v->written); failed = true; } return failed; } void __bch2_btree_verify(struct bch_fs *c, struct btree *b) { struct bkey_ptrs_c ptrs; struct extent_ptr_decoded p; const union bch_extent_entry *entry; struct btree *v; struct bset *inmemory = &b->data->keys; struct bkey_packed *k; bool failed = false; if (c->opts.nochanges) return; bch2_btree_node_io_lock(b); mutex_lock(&c->verify_lock); if (!c->verify_ondisk) { c->verify_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!c->verify_ondisk) goto out; } if (!c->verify_data) { c->verify_data = __bch2_btree_node_mem_alloc(c); if (!c->verify_data) goto out; list_del_init(&c->verify_data->list); } BUG_ON(b->nsets != 1); for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) if (k->type == KEY_TYPE_btree_ptr_v2) ((struct bch_btree_ptr_v2 *) bkeyp_val(&b->format, k))->mem_ptr = 0; v = c->verify_data; bkey_copy(&v->key, &b->key); v->c.level = b->c.level; v->c.btree_id = b->c.btree_id; bch2_btree_keys_init(v); ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) failed |= bch2_btree_verify_replica(c, b, p); if (failed) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); bch2_fs_fatal_error(c, ": btree node verify failed for: %s\n", buf.buf); printbuf_exit(&buf); } out: mutex_unlock(&c->verify_lock); bch2_btree_node_io_unlock(b); } void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, const struct btree *b) { struct btree_node *n_ondisk = NULL; struct extent_ptr_decoded pick; struct bch_dev *ca; struct bio *bio = NULL; unsigned offset = 0; int ret; if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { prt_printf(out, "error getting device to read from: invalid device\n"); return; } ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; } n_ondisk = kvmalloc(btree_buf_bytes(b), GFP_KERNEL); if (!n_ondisk) { prt_printf(out, "memory allocation failure\n"); goto out; } bio = bio_alloc_bioset(ca->disk_sb.bdev, buf_pages(n_ondisk, btree_buf_bytes(b)), REQ_OP_READ|REQ_META, GFP_NOFS, &c->btree_bio); bio->bi_iter.bi_sector = pick.ptr.offset; bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b)); ret = submit_bio_wait(bio); if (ret) { prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); goto out; } while (offset < btree_sectors(c)) { struct bset *i; struct nonce nonce; struct bch_csum csum; struct bkey_packed *k; unsigned sectors; if (!offset) { i = &n_ondisk->keys; if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { prt_printf(out, "unknown checksum type at offset %u: %llu\n", offset, BSET_CSUM_TYPE(i)); goto out; } nonce = btree_nonce(i, offset << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); if (bch2_crc_cmp(csum, n_ondisk->csum)) { prt_printf(out, "invalid checksum\n"); goto out; } bset_encrypt(c, i, offset << 9); sectors = vstruct_sectors(n_ondisk, c->block_bits); } else { struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); i = &bne->keys; if (i->seq != n_ondisk->keys.seq) break; if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { prt_printf(out, "unknown checksum type at offset %u: %llu\n", offset, BSET_CSUM_TYPE(i)); goto out; } nonce = btree_nonce(i, offset << 9); csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); if (bch2_crc_cmp(csum, bne->csum)) { prt_printf(out, "invalid checksum"); goto out; } bset_encrypt(c, i, offset << 9); sectors = vstruct_sectors(bne, c->block_bits); } prt_printf(out, " offset %u version %u, journal seq %llu\n", offset, le16_to_cpu(i->version), le64_to_cpu(i->journal_seq)); offset += sectors; printbuf_indent_add(out, 4); for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { struct bkey u; bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); prt_newline(out); } printbuf_indent_sub(out, 4); } out: if (bio) bio_put(bio); kvfree(n_ondisk); percpu_ref_put(&ca->io_ref[READ]); } #ifdef CONFIG_DEBUG_FS /* XXX: bch_fs refcounting */ struct dump_iter { struct bch_fs *c; enum btree_id id; struct bpos from; struct bpos prev_node; u64 iter; struct printbuf buf; char __user *ubuf; /* destination user buffer */ size_t size; /* size of requested read */ ssize_t ret; /* bytes read so far */ }; static ssize_t flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); int copied = bytes - copy_to_user(i->ubuf, i->buf.buf, bytes); i->ret += copied; i->ubuf += copied; i->size -= copied; i->buf.pos -= copied; memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); if (copied != bytes) return -EFAULT; } return i->size ? 0 : i->ret; } static int bch2_dump_open(struct inode *inode, struct file *file) { struct btree_debug *bd = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); if (!i) return -ENOMEM; file->private_data = i; i->from = POS_MIN; i->iter = 0; i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); i->id = bd->id; i->buf = PRINTBUF; return 0; } static int bch2_dump_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; printbuf_exit(&i->buf); kfree(i); return 0; } static ssize_t bch2_read_btree(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; i->ubuf = buf; i->size = size; i->ret = 0; return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, ({ bch2_bkey_val_to_text(&i->buf, i->c, k); prt_newline(&i->buf); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); flush_buf(i); }))) ?: i->ret; } static const struct file_operations btree_debug_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_read_btree, }; static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; i->ubuf = buf; i->size = size; i->ret = 0; ssize_t ret = flush_buf(i); if (ret) return ret; if (bpos_eq(SPOS_MAX, i->from)) return i->ret; return bch2_trans_run(i->c, for_each_btree_node(trans, iter, i->id, i->from, 0, b, ({ bch2_btree_node_to_text(&i->buf, i->c, b); i->from = !bpos_eq(SPOS_MAX, b->key.k.p) ? bpos_successor(b->key.k.p) : b->key.k.p; drop_locks_do(trans, flush_buf(i)); }))) ?: i->ret; } static const struct file_operations btree_format_debug_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_read_btree_formats, }; static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; i->ubuf = buf; i->size = size; i->ret = 0; return flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots, k, ({ struct btree_path_level *l = &btree_iter_path(trans, &iter)->l[0]; struct bkey_packed *_k = bch2_btree_node_iter_peek(&l->iter, l->b); if (bpos_gt(l->b->key.k.p, i->prev_node)) { bch2_btree_node_to_text(&i->buf, i->c, l->b); i->prev_node = l->b->key.k.p; } bch2_bfloat_to_text(&i->buf, l->b, _k); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); flush_buf(i); }))) ?: i->ret; } static const struct file_operations bfloat_failed_debug_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_read_bfloat_failed, }; static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, struct btree *b) { if (!out->nr_tabstops) printbuf_tabstop_push(out, 32); prt_printf(out, "%px ", b); bch2_btree_id_level_to_text(out, b->c.btree_id, b->c.level); prt_printf(out, "\n"); printbuf_indent_add(out, 2); bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); prt_newline(out); prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_btree_node_flags, b->flags); prt_newline(out); prt_printf(out, "pcpu read locks:\t%u\n", b->c.lock.readers != NULL); prt_printf(out, "written:\t%u\n", b->written); prt_printf(out, "writes blocked:\t%u\n", !list_empty_careful(&b->write_blocked)); prt_printf(out, "will make reachable:\t%lx\n", b->will_make_reachable); prt_printf(out, "journal pin %px:\t%llu\n", &b->writes[0].journal, b->writes[0].journal.seq); prt_printf(out, "journal pin %px:\t%llu\n", &b->writes[1].journal, b->writes[1].journal.seq); printbuf_indent_sub(out, 2); } static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; bool done = false; ssize_t ret = 0; i->ubuf = buf; i->size = size; i->ret = 0; do { struct bucket_table *tbl; struct rhash_head *pos; struct btree *b; ret = flush_buf(i); if (ret) return ret; rcu_read_lock(); i->buf.atomic++; tbl = rht_dereference_rcu(c->btree_cache.table.tbl, &c->btree_cache.table); if (i->iter < tbl->size) { rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) bch2_cached_btree_node_to_text(&i->buf, c, b); i->iter++; } else { done = true; } --i->buf.atomic; rcu_read_unlock(); } while (!done); if (i->buf.allocation_failure) ret = -ENOMEM; if (!ret) ret = flush_buf(i); return ret ?: i->ret; } static const struct file_operations cached_btree_nodes_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_cached_btree_nodes_read, }; typedef int (*list_cmp_fn)(const struct list_head *l, const struct list_head *r); static void list_sort(struct list_head *head, list_cmp_fn cmp) { struct list_head *pos; list_for_each(pos, head) while (!list_is_last(pos, head) && cmp(pos, pos->next) > 0) { struct list_head *pos2, *next = pos->next; list_del(next); list_for_each(pos2, head) if (cmp(next, pos2) < 0) goto pos_found; BUG(); pos_found: list_add_tail(next, pos2); } } static int list_ptr_order_cmp(const struct list_head *l, const struct list_head *r) { return cmp_int(l, r); } static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; struct btree_trans *trans; ssize_t ret = 0; i->ubuf = buf; i->size = size; i->ret = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_sort(&c->btree_trans_list, list_ptr_order_cmp); list_for_each_entry(trans, &c->btree_trans_list, list) { if ((ulong) trans <= i->iter) continue; i->iter = (ulong) trans; if (!closure_get_not_zero(&trans->ref)) continue; u32 seq = seqmutex_unlock(&c->btree_trans_lock); bch2_btree_trans_to_text(&i->buf, trans); prt_printf(&i->buf, "backtrace:\n"); printbuf_indent_add(&i->buf, 2); bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task, 0, GFP_KERNEL); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); closure_put(&trans->ref); ret = flush_buf(i); if (ret) goto unlocked; if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); unlocked: if (i->buf.allocation_failure) ret = -ENOMEM; if (!ret) ret = flush_buf(i); return ret ?: i->ret; } static const struct file_operations btree_transactions_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_btree_transactions_read, }; static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; bool done = false; int err; i->ubuf = buf; i->size = size; i->ret = 0; while (1) { err = flush_buf(i); if (err) return err; if (!i->size) break; if (done) break; done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); i->iter++; } if (i->buf.allocation_failure) return -ENOMEM; return i->ret; } static const struct file_operations journal_pins_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_journal_pins_read, }; static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; int err; i->ubuf = buf; i->size = size; i->ret = 0; if (!i->iter) { bch2_btree_updates_to_text(&i->buf, c); i->iter++; } err = flush_buf(i); if (err) return err; if (i->buf.allocation_failure) return -ENOMEM; return i->ret; } static const struct file_operations btree_updates_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_btree_updates_read, }; static int btree_transaction_stats_open(struct inode *inode, struct file *file) { struct bch_fs *c = inode->i_private; struct dump_iter *i; i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); if (!i) return -ENOMEM; i->iter = 1; i->c = c; i->buf = PRINTBUF; file->private_data = i; return 0; } static int btree_transaction_stats_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; printbuf_exit(&i->buf); kfree(i); return 0; } static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; int err; i->ubuf = buf; i->size = size; i->ret = 0; while (1) { struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; err = flush_buf(i); if (err) return err; if (!i->size) break; if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || !bch2_btree_transaction_fns[i->iter]) break; prt_printf(&i->buf, "%s:\n", bch2_btree_transaction_fns[i->iter]); printbuf_indent_add(&i->buf, 2); mutex_lock(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->duration); printbuf_indent_sub(&i->buf, 2); if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { prt_printf(&i->buf, "Lock hold times:\n"); printbuf_indent_add(&i->buf, 2); bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); printbuf_indent_sub(&i->buf, 2); } if (s->max_paths_text) { prt_printf(&i->buf, "Maximum allocated btree paths (%u):\n", s->nr_max_paths); printbuf_indent_add(&i->buf, 2); prt_str_indented(&i->buf, s->max_paths_text); printbuf_indent_sub(&i->buf, 2); } mutex_unlock(&s->lock); printbuf_indent_sub(&i->buf, 2); prt_newline(&i->buf); i->iter++; } if (i->buf.allocation_failure) return -ENOMEM; return i->ret; } static const struct file_operations btree_transaction_stats_op = { .owner = THIS_MODULE, .open = btree_transaction_stats_open, .release = btree_transaction_stats_release, .read = btree_transaction_stats_read, }; /* walk btree transactions until we find a deadlock and print it */ static void btree_deadlock_to_text(struct printbuf *out, struct bch_fs *c) { struct btree_trans *trans; ulong iter = 0; restart: seqmutex_lock(&c->btree_trans_lock); list_sort(&c->btree_trans_list, list_ptr_order_cmp); list_for_each_entry(trans, &c->btree_trans_list, list) { if ((ulong) trans <= iter) continue; iter = (ulong) trans; if (!closure_get_not_zero(&trans->ref)) continue; u32 seq = seqmutex_unlock(&c->btree_trans_lock); bool found = bch2_check_for_deadlock(trans, out) != 0; closure_put(&trans->ref); if (found) return; if (!seqmutex_relock(&c->btree_trans_lock, seq)) goto restart; } seqmutex_unlock(&c->btree_trans_lock); } typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); static ssize_t bch2_simple_print(struct file *file, char __user *buf, size_t size, loff_t *ppos, fs_to_text_fn fn) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; ssize_t ret = 0; i->ubuf = buf; i->size = size; i->ret = 0; if (!i->iter) { fn(&i->buf, c); i->iter++; } if (i->buf.allocation_failure) ret = -ENOMEM; if (!ret) ret = flush_buf(i); return ret ?: i->ret; } static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); } static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_btree_deadlock_read, }; static ssize_t bch2_write_points_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); } static const struct file_operations write_points_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, .release = bch2_dump_release, .read = bch2_write_points_read, }; void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) debugfs_remove_recursive(c->fs_debug_dir); } static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd) { struct dentry *d; d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir); debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops); debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops); debugfs_create_file("bfloat-failed", 0400, d, bd, &bfloat_failed_debug_ops); } void bch2_fs_debug_init(struct bch_fs *c) { struct btree_debug *bd; char name[100]; if (IS_ERR_OR_NULL(bch_debug)) return; snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); c->fs_debug_dir = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->fs_debug_dir)) return; debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, c->btree_debug, &cached_btree_nodes_ops); debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, c->btree_debug, &btree_transactions_ops); debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, c->btree_debug, &journal_pins_ops); debugfs_create_file("btree_updates", 0400, c->fs_debug_dir, c->btree_debug, &btree_updates_ops); debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, c, &btree_transaction_stats_op); debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); debugfs_create_file("write_points", 0400, c->fs_debug_dir, c->btree_debug, &write_points_ops); c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; for (bd = c->btree_debug; bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); bd++) { bd->id = bd - c->btree_debug; bch2_fs_debug_btree_init(c, bd); } } #endif void bch2_debug_exit(void) { if (!IS_ERR_OR_NULL(bch_debug)) debugfs_remove_recursive(bch_debug); } int __init bch2_debug_init(void) { bch_debug = debugfs_create_dir("bcachefs", NULL); return 0; } |
| 41 41 41 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "bbpos.h" #include "alloc_background.h" #include "backpointers.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_update.h" #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "checksum.h" #include "disk_accounting.h" #include "error.h" #include "progress.h" #include <linux/mm.h> int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); int ret = 0; bkey_fsck_err_on(bp.v->level > BTREE_MAX_DEPTH, c, backpointer_level_bad, "backpointer level bad: %u >= %u", bp.v->level, BTREE_MAX_DEPTH); bkey_fsck_err_on(bp.k->p.inode == BCH_SB_MEMBER_INVALID, c, backpointer_dev_bad, "backpointer for BCH_SB_MEMBER_INVALID"); fsck_err: return ret; } void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); if (ca) { u32 bucket_offset; struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); rcu_read_unlock(); prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); } else { rcu_read_unlock(); prt_printf(out, "sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); prt_str(out, " data_type="); bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, bp.v->bucket_gen); bch2_bpos_to_text(out, bp.v->pos); } void bch2_backpointer_swab(struct bkey_s k) { struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); bp.v->bucket_len = swab32(bp.v->bucket_len); bch2_bpos_swab(&bp.v->pos); } static bool extent_matches_bp(struct bch_fs *c, enum btree_id btree_id, unsigned level, struct bkey_s_c k, struct bkey_s_c_backpointer bp) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { struct bkey_i_backpointer bp2; bch2_extent_ptr_to_bp(c, btree_id, level, k, p, entry, &bp2); if (bpos_eq(bp.k->p, bp2.k.p) && !memcmp(bp.v, &bp2.v, sizeof(bp2.v))) return true; } return false; } static noinline int backpointer_mod_err(struct btree_trans *trans, struct bkey_s_c orig_k, struct bkey_i_backpointer *new_bp, struct bkey_s_c found_bp, bool insert) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; if (insert) { prt_printf(&buf, "existing backpointer found when inserting "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); printbuf_indent_add(&buf, 2); prt_printf(&buf, "found "); bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); bch_err(c, "%s", buf.buf); } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); prt_printf(&buf, "searching for "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&new_bp->k_i)); prt_newline(&buf); prt_printf(&buf, "got "); bch2_bkey_val_to_text(&buf, c, found_bp); prt_newline(&buf); prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); } if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers && __bch2_inconsistent_error(c, &buf)) ret = -BCH_ERR_erofs_unfixed_errors; bch_err(c, "%s", buf.buf); printbuf_exit(&buf); return ret; } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, struct bkey_s_c orig_k, struct bkey_i_backpointer *bp, bool insert) { struct btree_iter bp_iter; struct bkey_s_c k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, BTREE_ITER_intent| BTREE_ITER_slots| BTREE_ITER_with_updates); int ret = bkey_err(k); if (ret) return ret; if (insert ? k.k->type : (k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(k).v, &bp->v, sizeof(bp->v)))) { ret = backpointer_mod_err(trans, orig_k, bp, k, insert); if (ret) goto err; } if (!insert) { bp->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&bp->k, 0); } ret = bch2_trans_update(trans, &bp_iter, &bp->k_i, 0); err: bch2_trans_iter_exit(trans, &bp_iter); return ret; } static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { return (likely(!bch2_backpointers_no_use_write_buffer) ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { return likely(!bch2_backpointers_no_use_write_buffer) ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) : 0; } static int backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; /* * If we're using the btree write buffer, the backpointer we were * looking at may have already been deleted - failure to find what it * pointed to is not an error: */ ret = last_flushed ? bch2_backpointers_maybe_flush(trans, bp.s_c, last_flushed) : 0; if (ret) return ret; prt_printf(&buf, "backpointer doesn't match %s it points to:\n", bp.v->level ? "btree node" : "extent"); bch2_bkey_val_to_text(&buf, c, bp.s_c); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, target_k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) if (p.ptr.dev == bp.k->p.inode) { prt_newline(&buf); struct bkey_i_backpointer bp2; bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); } if (fsck_err(trans, backpointer_to_missing_ptr, "%s", buf.buf)) ret = bch2_backpointer_del(trans, bp.k->p); fsck_err: printbuf_exit(&buf); return ret; } struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, unsigned iter_flags, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) return bkey_s_c_null; bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, 0, bp.v->level, iter_flags); struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) { bch2_trans_iter_exit(trans, iter); return k; } /* * peek_slot() doesn't normally return NULL - except when we ask for a * key at a btree level that doesn't exist. * * We may want to revisit this and change peek_slot(): */ if (!k.k) { bkey_init(&iter->k); iter->k.p = bp.v->pos; k.k = &iter->k; } if (k.k && extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) return k; bch2_trans_iter_exit(trans, iter); if (!bp.v->level) { int ret = backpointer_target_not_found(trans, bp, k, last_flushed); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); return bkey_i_to_s_c(&b->key); } } struct btree *bch2_backpointer_get_node(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct btree_iter *iter, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; BUG_ON(!bp.v->level); bch2_trans_node_iter_init(trans, iter, bp.v->btree_id, bp.v->pos, 0, bp.v->level - 1, 0); struct btree *b = bch2_btree_iter_peek_node(trans, iter); if (IS_ERR_OR_NULL(b)) goto err; BUG_ON(b->c.level != bp.v->level - 1); if (extent_matches_bp(c, bp.v->btree_id, bp.v->level, bkey_i_to_s_c(&b->key), bp)) return b; if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); b = ret ? ERR_PTR(ret) : NULL; } err: bch2_trans_iter_exit(trans, iter); return b; } static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, struct bkey_buf *last_flushed) { if (k.k->type != KEY_TYPE_backpointer) return 0; struct bch_fs *c = trans->c; struct btree_iter alloc_iter = {}; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; struct bpos bucket; if (!bp_pos_to_bucket_nodev_noerror(c, k.k->p, &bucket)) { ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); if (ret) goto out; if (fsck_err(trans, backpointer_to_missing_device, "backpointer for missing device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_backpointer_del(trans, k.k->p); goto out; } alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, bucket, 0); ret = bkey_err(alloc_k); if (ret) goto out; if (alloc_k.k->type != KEY_TYPE_alloc_v4) { ret = bch2_backpointers_maybe_flush(trans, k, last_flushed); if (ret) goto out; if (fsck_err(trans, backpointer_to_missing_alloc, "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", alloc_iter.pos.inode, alloc_iter.pos.offset, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_backpointer_del(trans, k.k->p); } out: fsck_err: bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; } /* verify that every backpointer has a corresponding alloc key */ int bch2_check_btree_backpointers(struct bch_fs *c) { struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); int ret = bch2_trans_run(c, for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, POS_MIN, 0, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_check_backpointer_has_valid_bucket(trans, k, &last_flushed))); bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); return ret; } struct extents_to_bp_state { struct bpos bp_start; struct bpos bp_end; struct bkey_buf last_flushed; }; static int drop_dev_and_update(struct btree_trans *trans, enum btree_id btree, struct bkey_s_c extent, unsigned dev) { struct bkey_i *n = bch2_bkey_make_mut_noupdate(trans, extent); int ret = PTR_ERR_OR_ZERO(n); if (ret) return ret; bch2_bkey_drop_device(bkey_i_to_s(n), dev); return bch2_btree_insert_trans(trans, btree, n, 0); } static int check_extent_checksum(struct btree_trans *trans, enum btree_id btree, struct bkey_s_c extent, enum btree_id o_btree, struct bkey_s_c extent2, unsigned dev) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(extent); const union bch_extent_entry *entry; struct extent_ptr_decoded p; struct printbuf buf = PRINTBUF; void *data_buf = NULL; struct bio *bio = NULL; size_t bytes; int ret = 0; if (bkey_is_btree_ptr(extent.k)) return false; bkey_for_each_ptr_decode(extent.k, ptrs, p, entry) if (p.ptr.dev == dev) goto found; BUG(); found: if (!p.crc.csum_type) return false; bytes = p.crc.compressed_size << 9; struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); if (!ca) return false; data_buf = kvmalloc(bytes, GFP_KERNEL); if (!data_buf) { ret = -ENOMEM; goto err; } bio = bio_alloc(ca->disk_sb.bdev, buf_pages(data_buf, bytes), REQ_OP_READ, GFP_KERNEL); bio->bi_iter.bi_sector = p.ptr.offset; bch2_bio_map(bio, data_buf, bytes); ret = submit_bio_wait(bio); if (ret) goto err; prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); bch2_btree_id_to_text(&buf, btree); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent); prt_newline(&buf); bch2_btree_id_to_text(&buf, o_btree); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent2); struct nonce nonce = extent_nonce(extent.k->bversion, p.crc); struct bch_csum csum = bch2_checksum(c, p.crc.csum_type, nonce, data_buf, bytes); if (fsck_err_on(bch2_crc_cmp(csum, p.crc.csum), trans, dup_backpointer_to_bad_csum_extent, "%s", buf.buf)) ret = drop_dev_and_update(trans, btree, extent, dev) ?: 1; fsck_err: err: if (bio) bio_put(bio); kvfree(data_buf); percpu_ref_put(&ca->io_ref[READ]); printbuf_exit(&buf); return ret; } static int check_bp_exists(struct btree_trans *trans, struct extents_to_bp_state *s, struct bkey_i_backpointer *bp, struct bkey_s_c orig_k) { struct bch_fs *c = trans->c; struct btree_iter other_extent_iter = {}; struct printbuf buf = PRINTBUF; if (bpos_lt(bp->k.p, s->bp_start) || bpos_gt(bp->k.p, s->bp_end)) return 0; struct btree_iter bp_iter; struct bkey_s_c bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, bp->k.p, 0); int ret = bkey_err(bp_k); if (ret) goto err; if (bp_k.k->type != KEY_TYPE_backpointer || memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp->v, sizeof(bp->v))) { ret = bch2_btree_write_buffer_maybe_flush(trans, orig_k, &s->last_flushed); if (ret) goto err; goto check_existing_bp; } out: err: fsck_err: bch2_trans_iter_exit(trans, &other_extent_iter); bch2_trans_iter_exit(trans, &bp_iter); printbuf_exit(&buf); return ret; check_existing_bp: /* Do we have a backpointer for a different extent? */ if (bp_k.k->type != KEY_TYPE_backpointer) goto missing; struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; if (ret) goto err; if (!other_extent.k) goto missing; rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); if (ca) { struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); bkey_for_each_ptr(other_extent_ptrs, ptr) if (ptr->dev == bp->k.p.inode && dev_ptr_stale_rcu(ca, ptr)) { ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret) goto err; goto out; } } rcu_read_unlock(); if (bch2_extents_match(orig_k, other_extent)) { printbuf_reset(&buf); prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); bch2_bkey_val_to_text(&buf, c, orig_k); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); if (other_extent.k->size <= orig_k.k->size) { ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret) goto err; goto out; } else { ret = drop_dev_and_update(trans, bp->v.btree_id, orig_k, bp->k.p.inode); if (ret) goto err; goto missing; } } ret = check_extent_checksum(trans, other_bp.v->btree_id, other_extent, bp->v.btree_id, orig_k, bp->k.p.inode); if (ret < 0) goto err; if (ret) { ret = 0; goto missing; } ret = check_extent_checksum(trans, bp->v.btree_id, orig_k, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret < 0) goto err; if (ret) { ret = 0; goto out; } printbuf_reset(&buf); prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; missing: printbuf_reset(&buf); prt_str(&buf, "missing backpointer\nfor: "); bch2_bkey_val_to_text(&buf, c, orig_k); prt_printf(&buf, "\nwant: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); prt_printf(&buf, "\ngot: "); bch2_bkey_val_to_text(&buf, c, bp_k); if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) ret = bch2_bucket_backpointer_mod(trans, orig_k, bp, true); goto out; } static int check_extent_to_backpointers(struct btree_trans *trans, struct extents_to_bp_state *s, enum btree_id btree, unsigned level, struct bkey_s_c k) { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); bool stale = p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr)); rcu_read_unlock(); if ((check || empty) && !stale) { struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); int ret = check ? check_bp_exists(trans, s, &bp, k) : bch2_bucket_backpointer_mod(trans, k, &bp, true); if (ret) return ret; } } return 0; } static int check_btree_root_to_backpointers(struct btree_trans *trans, struct extents_to_bp_state *s, enum btree_id btree_id, int *level) { struct bch_fs *c = trans->c; struct btree_iter iter; struct btree *b; struct bkey_s_c k; int ret; retry: bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; if (b != btree_node_root(c, b)) { bch2_trans_iter_exit(trans, &iter); goto retry; } *level = b->c.level; k = bkey_i_to_s_c(&b->key); ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k); err: bch2_trans_iter_exit(trans, &iter); return ret; } static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) { return (struct bbpos) { .btree = bp.btree_id, .pos = bp.pos, }; } static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; si_meminfo(&i); u64 mem_bytes = i.totalram * i.mem_unit; return div_u64(mem_bytes * c->opts.fsck_memory_usage_percent, 100); } static size_t btree_nodes_fit_in_ram(struct bch_fs *c) { return div_u64(mem_may_pin_bytes(c), c->opts.btree_node_size); } static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, u64 btree_leaf_mask, u64 btree_interior_mask, struct bbpos start, struct bbpos *end) { struct bch_fs *c = trans->c; s64 mem_may_pin = mem_may_pin_bytes(c); int ret = 0; bch2_btree_cache_unpin(c); btree_interior_mask |= btree_leaf_mask; c->btree_cache.pinned_nodes_mask[0] = btree_leaf_mask; c->btree_cache.pinned_nodes_mask[1] = btree_interior_mask; c->btree_cache.pinned_nodes_start = start; c->btree_cache.pinned_nodes_end = *end = BBPOS_MAX; for (enum btree_id btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { unsigned depth = (BIT_ULL(btree) & btree_leaf_mask) ? 0 : 1; if (!(BIT_ULL(btree) & btree_leaf_mask) && !(BIT_ULL(btree) & btree_interior_mask)) continue; ret = __for_each_btree_node(trans, iter, btree, btree == start.btree ? start.pos : POS_MIN, 0, depth, BTREE_ITER_prefetch, b, ({ mem_may_pin -= btree_buf_bytes(b); if (mem_may_pin <= 0) { c->btree_cache.pinned_nodes_end = *end = BBPOS(btree, b->key.k.p); break; } bch2_node_pin(c, b); 0; })); } return ret; } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct extents_to_bp_state *s) { struct bch_fs *c = trans->c; struct progress_indicator_state progress; int ret = 0; bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1; ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, check_btree_root_to_backpointers(trans, s, btree_id, &level)); if (ret) return ret; while (level >= depth) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, level, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); if (ret) return ret; --level; } } return 0; } enum alloc_sector_counter { ALLOC_dirty, ALLOC_cached, ALLOC_stripe, ALLOC_SECTORS_NR }; static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: case BCH_DATA_user: return ALLOC_dirty; case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: case BCH_DATA_parity: return ALLOC_stripe; default: return -1; } } static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; struct bch_alloc_v4 a_convert; const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); bool need_commit = false; if (a->data_type == BCH_DATA_sb || a->data_type == BCH_DATA_journal || a->data_type == BCH_DATA_parity) return 0; u32 sectors[ALLOC_SECTORS_NR]; memset(sectors, 0, sizeof(sectors)); struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(trans->c, alloc_k.k->p); if (!ca) return 0; struct btree_iter iter; struct bkey_s_c bp_k; int ret = 0; for_each_btree_key_max_norestart(trans, iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, alloc_k.k->p), bucket_pos_to_bp_end(ca, alloc_k.k->p), 0, bp_k, ret) { if (bp_k.k->type != KEY_TYPE_backpointer) continue; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen && (bp.v->bucket_gen != a->gen || bp.v->pad)) { ret = bch2_backpointer_del(trans, bp_k.k->p); if (ret) break; need_commit = true; continue; } if (bp.v->bucket_gen != a->gen) continue; int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); if (alloc_counter < 0) continue; sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) goto err; if (need_commit) { ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) goto err; } if (sectors[ALLOC_dirty] != a->dirty_sectors || sectors[ALLOC_cached] != a->cached_sectors || sectors[ALLOC_stripe] != a->stripe_sectors) { if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); if (ret) goto err; } if (sectors[ALLOC_dirty] > a->dirty_sectors || sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: -BCH_ERR_transaction_restart_nested; goto err; } if (!sectors[ALLOC_dirty] && !sectors[ALLOC_stripe] && !sectors[ALLOC_cached]) __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); else __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); } err: bch2_dev_put(ca); return ret; } static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) { switch (k.k->type) { case KEY_TYPE_btree_ptr_v2: { bool ret = false; rcu_read_lock(); struct bpos pos = bkey_s_c_to_btree_ptr_v2(k).v->min_key; while (pos.inode <= k.k->p.inode) { if (pos.inode >= c->sb.nr_devices) break; struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); if (!ca) goto next; struct bpos bucket = bp_pos_to_bucket(ca, pos); bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, ca->mi.nbuckets, bucket.offset); if (bucket.offset == ca->mi.nbuckets) goto next; ret = bpos_le(bucket_pos_to_bp_end(ca, bucket), k.k->p); if (ret) break; next: pos = SPOS(pos.inode + 1, 0, 0); } rcu_read_unlock(); return ret; } case KEY_TYPE_btree_ptr: return true; default: return false; } } static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, enum btree_id btree, unsigned level) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; if (b) bch2_node_pin(trans->c, b); err: bch2_trans_iter_exit(trans, &iter); return ret; } static int bch2_pin_backpointer_nodes_with_missing(struct btree_trans *trans, struct bpos start, struct bpos *end) { struct bch_fs *c = trans->c; int ret = 0; struct bkey_buf tmp; bch2_bkey_buf_init(&tmp); bch2_btree_cache_unpin(c); *end = SPOS_MAX; s64 mem_may_pin = mem_may_pin_bytes(c); struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, 0, 1, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ if (!backpointer_node_has_missing(c, k)) continue; mem_may_pin -= c->opts.btree_node_size; if (mem_may_pin <= 0) break; bch2_bkey_buf_reassemble(&tmp, c, k); struct btree_path *path = btree_iter_path(trans, &iter); BUG_ON(path->level != 1); bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, path->level - 1); })); if (ret) return ret; struct bpos pinned = SPOS_MAX; mem_may_pin = mem_may_pin_bytes(c); bch2_trans_node_iter_init(trans, &iter, BTREE_ID_backpointers, start, 0, 1, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ if (!backpointer_node_has_missing(c, k)) continue; mem_may_pin -= c->opts.btree_node_size; if (mem_may_pin <= 0) { *end = pinned; break; } bch2_bkey_buf_reassemble(&tmp, c, k); struct btree_path *path = btree_iter_path(trans, &iter); BUG_ON(path->level != 1); int ret2 = btree_node_get_and_pin(trans, tmp.k, path->btree_id, path->level - 1); if (!ret2) pinned = tmp.k->k.p; ret; })); if (ret) return ret; return ret; } int bch2_check_extents_to_backpointers(struct bch_fs *c) { int ret = 0; /* * Can't allow devices to come/go/resize while we have bucket bitmaps * allocated */ down_read(&c->state_lock); for_each_member_device(c, ca) { BUG_ON(ca->bucket_backpointer_mismatches); ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), sizeof(unsigned long), GFP_KERNEL); ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), sizeof(unsigned long), GFP_KERNEL); if (!ca->bucket_backpointer_mismatches || !ca->bucket_backpointer_empty) { bch2_dev_put(ca); ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; goto err_free_bitmaps; } } struct btree_trans *trans = bch2_trans_get(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; bch2_bkey_buf_init(&s.last_flushed); bkey_init(&s.last_flushed.k->k); ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); })); if (ret) goto err; u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); } if (!nr_mismatches && !nr_empty) goto err; bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", nr_mismatches + nr_empty, nr_buckets); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); if (ret) break; if ( bpos_eq(s.bp_start, POS_MIN) && !bpos_eq(s.bp_end, SPOS_MAX)) bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); if (!bpos_eq(s.bp_start, POS_MIN) || !bpos_eq(s.bp_end, SPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_extents_to_backpointers(): "); bch2_bpos_to_text(&buf, s.bp_start); prt_str(&buf, "-"); bch2_bpos_to_text(&buf, s.bp_end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_extents_to_backpointers_pass(trans, &s); if (ret || bpos_eq(s.bp_end, SPOS_MAX)) break; s.bp_start = bpos_successor(s.bp_end); } err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); bch2_btree_cache_unpin(c); err_free_bitmaps: for_each_member_device(c, ca) { kvfree(ca->bucket_backpointer_empty); ca->bucket_backpointer_empty = NULL; kvfree(ca->bucket_backpointer_mismatches); ca->bucket_backpointer_mismatches = NULL; } up_read(&c->state_lock); bch_err_fn(c, ret); return ret; } static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, struct bkey_s_c bp_k, struct bkey_buf *last_flushed) { if (bp_k.k->type != KEY_TYPE_backpointer) return 0; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); struct bbpos pos = bp_to_bbpos(*bp.v); if (bbpos_cmp(pos, start) < 0 || bbpos_cmp(pos, end) > 0) return 0; struct btree_iter iter; struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, 0, last_flushed); int ret = bkey_err(k); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) return 0; if (ret) return ret; bch2_trans_iter_exit(trans, &iter); return ret; } static int check_bucket_backpointers_to_extents(struct btree_trans *trans, struct bch_dev *ca, struct bpos bucket) { u32 restart_count = trans->restart_count; struct bkey_buf last_flushed; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); int ret = for_each_btree_key_max(trans, iter, BTREE_ID_backpointers, bucket_pos_to_bp_start(ca, bucket), bucket_pos_to_bp_end(ca, bucket), 0, k, check_one_backpointer(trans, BBPOS_MIN, BBPOS_MAX, k, &last_flushed) ); bch2_bkey_buf_exit(&last_flushed, trans->c); return ret ?: trans_was_restarted(trans, restart_count); } static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, struct bbpos start, struct bbpos end) { struct bch_fs *c = trans->c; struct bkey_buf last_flushed; struct progress_indicator_state progress; bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, ({ bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); return ret; } int bch2_check_backpointers_to_extents(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; int ret; while (1) { ret = bch2_get_btree_in_memory_pos(trans, BIT_ULL(BTREE_ID_extents)| BIT_ULL(BTREE_ID_reflink), ~0, start, &end); if (ret) break; if (!bbpos_cmp(start, BBPOS_MIN) && bbpos_cmp(end, BBPOS_MAX)) bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", __func__, btree_nodes_fit_in_ram(c)); if (bbpos_cmp(start, BBPOS_MIN) || bbpos_cmp(end, BBPOS_MAX)) { struct printbuf buf = PRINTBUF; prt_str(&buf, "check_backpointers_to_extents(): "); bch2_bbpos_to_text(&buf, start); prt_str(&buf, "-"); bch2_bbpos_to_text(&buf, end); bch_verbose(c, "%s", buf.buf); printbuf_exit(&buf); } ret = bch2_check_backpointers_to_extents_pass(trans, start, end); if (ret || !bbpos_cmp(end, BBPOS_MAX)) break; start = bbpos_successor(end); } bch2_trans_put(trans); bch2_btree_cache_unpin(c); bch_err_fn(c, ret); return ret; } |
| 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BTRFS_DEFRAG_H #define BTRFS_DEFRAG_H #include <linux/types.h> #include <linux/compiler_types.h> struct file_ra_state; struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); void btrfs_add_inode_defrag(struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_defrag_root(struct btrfs_root *root); static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) { return signal_pending(current); } #endif |
| 3866 3857 3664 3662 3674 3656 3660 3667 3677 11 3668 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 | // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/filesystems.c * * Copyright (C) 1991, 1992 Linus Torvalds * * table of configured filesystems */ #include <linux/syscalls.h> #include <linux/fs.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/kmod.h> #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/fs_parser.h> /* * Handling of filesystem drivers list. * Rules: * Inclusion to/removals from/scanning of list are protected by spinlock. * During the unload module must call unregister_filesystem(). * We can access the fields of list element if: * 1) spinlock is held or * 2) we hold the reference to the module. * The latter can be guaranteed by call of try_module_get(); if it * returned 0 we must skip the element, otherwise we got the reference. * Once the reference is obtained we can drop the spinlock. */ static struct file_system_type *file_systems; static DEFINE_RWLOCK(file_systems_lock); /* WARNING: This can be used only if we _already_ own a reference */ struct file_system_type *get_filesystem(struct file_system_type *fs) { __module_get(fs->owner); return fs; } void put_filesystem(struct file_system_type *fs) { module_put(fs->owner); } static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; for (p = &file_systems; *p; p = &(*p)->next) if (strncmp((*p)->name, name, len) == 0 && !(*p)->name[len]) break; return p; } /** * register_filesystem - register a new filesystem * @fs: the file system structure * * Adds the file system passed to the list of file systems the kernel * is aware of for mount and other syscalls. Returns 0 on success, * or a negative errno code on an error. * * The &struct file_system_type that is passed is linked into the kernel * structures and must not be freed until the file system has been * unregistered. */ int register_filesystem(struct file_system_type * fs) { int res = 0; struct file_system_type ** p; if (fs->parameters && !fs_validate_description(fs->name, fs->parameters)) return -EINVAL; BUG_ON(strchr(fs->name, '.')); if (fs->next) return -EBUSY; write_lock(&file_systems_lock); p = find_filesystem(fs->name, strlen(fs->name)); if (*p) res = -EBUSY; else *p = fs; write_unlock(&file_systems_lock); return res; } EXPORT_SYMBOL(register_filesystem); /** * unregister_filesystem - unregister a file system * @fs: filesystem to unregister * * Remove a file system that was previously successfully registered * with the kernel. An error is returned if the file system is not found. * Zero is returned on a success. * * Once this function has returned the &struct file_system_type structure * may be freed or reused. */ int unregister_filesystem(struct file_system_type * fs) { struct file_system_type ** tmp; write_lock(&file_systems_lock); tmp = &file_systems; while (*tmp) { if (fs == *tmp) { *tmp = fs->next; fs->next = NULL; write_unlock(&file_systems_lock); synchronize_rcu(); return 0; } tmp = &(*tmp)->next; } write_unlock(&file_systems_lock); return -EINVAL; } EXPORT_SYMBOL(unregister_filesystem); #ifdef CONFIG_SYSFS_SYSCALL static int fs_index(const char __user * __name) { struct file_system_type * tmp; struct filename *name; int err, index; name = getname(__name); err = PTR_ERR(name); if (IS_ERR(name)) return err; err = -EINVAL; read_lock(&file_systems_lock); for (tmp=file_systems, index=0 ; tmp ; tmp=tmp->next, index++) { if (strcmp(tmp->name, name->name) == 0) { err = index; break; } } read_unlock(&file_systems_lock); putname(name); return err; } static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; int len, res; read_lock(&file_systems_lock); for (tmp = file_systems; tmp; tmp = tmp->next, index--) if (index <= 0 && try_module_get(tmp->owner)) break; read_unlock(&file_systems_lock); if (!tmp) return -EINVAL; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; res = copy_to_user(buf, tmp->name, len) ? -EFAULT : 0; put_filesystem(tmp); return res; } static int fs_maxindex(void) { struct file_system_type * tmp; int index; read_lock(&file_systems_lock); for (tmp = file_systems, index = 0 ; tmp ; tmp = tmp->next, index++) ; read_unlock(&file_systems_lock); return index; } /* * Whee.. Weird sysv syscall. */ SYSCALL_DEFINE3(sysfs, int, option, unsigned long, arg1, unsigned long, arg2) { int retval = -EINVAL; switch (option) { case 1: retval = fs_index((const char __user *) arg1); break; case 2: retval = fs_name(arg1, (char __user *) arg2); break; case 3: retval = fs_maxindex(); break; } return retval; } #endif int __init list_bdev_fs_names(char *buf, size_t size) { struct file_system_type *p; size_t len; int count = 0; read_lock(&file_systems_lock); for (p = file_systems; p; p = p->next) { if (!(p->fs_flags & FS_REQUIRES_DEV)) continue; len = strlen(p->name) + 1; if (len > size) { pr_warn("%s: truncating file system list\n", __func__); break; } memcpy(buf, p->name, len); buf += len; size -= len; count++; } read_unlock(&file_systems_lock); return count; } #ifdef CONFIG_PROC_FS static int filesystems_proc_show(struct seq_file *m, void *v) { struct file_system_type * tmp; read_lock(&file_systems_lock); tmp = file_systems; while (tmp) { seq_printf(m, "%s\t%s\n", (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev", tmp->name); tmp = tmp->next; } read_unlock(&file_systems_lock); return 0; } static int __init proc_filesystems_init(void) { proc_create_single("filesystems", 0, NULL, filesystems_proc_show); return 0; } module_init(proc_filesystems_init); #endif static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); return fs; } struct file_system_type *get_fs_type(const char *name) { struct file_system_type *fs; const char *dot = strchr(name, '.'); int len = dot ? dot - name : strlen(name); fs = __get_fs_type(name, len); if (!fs && (request_module("fs-%.*s", len, name) == 0)) { fs = __get_fs_type(name, len); if (!fs) pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n", len, name); } if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); fs = NULL; } return fs; } EXPORT_SYMBOL(get_fs_type); |
| 13 99 99 99 53 186 187 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "clock.h" #include <linux/freezer.h> #include <linux/kthread.h> #include <linux/preempt.h> static inline bool io_timer_cmp(const void *l, const void *r, void __always_unused *args) { struct io_timer **_l = (struct io_timer **)l; struct io_timer **_r = (struct io_timer **)r; return (*_l)->expire < (*_r)->expire; } static const struct min_heap_callbacks callbacks = { .less = io_timer_cmp, .swp = NULL, }; void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) { spin_lock(&clock->timer_lock); if (time_after_eq64((u64) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); return; } for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) goto out; BUG_ON(!min_heap_push(&clock->timers, &timer, &callbacks, NULL)); out: spin_unlock(&clock->timer_lock); } void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) { spin_lock(&clock->timer_lock); for (size_t i = 0; i < clock->timers.nr; i++) if (clock->timers.data[i] == timer) { min_heap_del(&clock->timers, i, &callbacks, NULL); break; } spin_unlock(&clock->timer_lock); } struct io_clock_wait { struct io_timer io_timer; struct timer_list cpu_timer; struct task_struct *task; int expired; }; static void io_clock_wait_fn(struct io_timer *timer) { struct io_clock_wait *wait = container_of(timer, struct io_clock_wait, io_timer); wait->expired = 1; wake_up_process(wait->task); } static void io_clock_cpu_timeout(struct timer_list *timer) { struct io_clock_wait *wait = container_of(timer, struct io_clock_wait, cpu_timer); wait->expired = 1; wake_up_process(wait->task); } void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { .io_timer.expire = until, .io_timer.fn = io_clock_wait_fn, .io_timer.fn2 = (void *) _RET_IP_, .task = current, }; bch2_io_timer_add(clock, &wait.io_timer); schedule(); bch2_io_timer_del(clock, &wait.io_timer); } void bch2_kthread_io_clock_wait(struct io_clock *clock, u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { .io_timer.expire = io_until, .io_timer.fn = io_clock_wait_fn, .io_timer.fn2 = (void *) _RET_IP_, .task = current, }; bch2_io_timer_add(clock, &wait.io_timer); timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); do { set_current_state(TASK_INTERRUPTIBLE); if (kthread && kthread_should_stop()) break; if (wait.expired) break; schedule(); try_to_freeze(); } while (0); __set_current_state(TASK_RUNNING); timer_delete_sync(&wait.cpu_timer); destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) { struct io_timer *ret = NULL; if (clock->timers.nr && time_after_eq64(now, clock->timers.data[0]->expire)) { ret = *min_heap_peek(&clock->timers); min_heap_pop(&clock->timers, &callbacks, NULL); } return ret; } void __bch2_increment_clock(struct io_clock *clock, u64 sectors) { struct io_timer *timer; u64 now = atomic64_add_return(sectors, &clock->now); spin_lock(&clock->timer_lock); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); spin_unlock(&clock->timer_lock); } void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) { out->atomic++; spin_lock(&clock->timer_lock); u64 now = atomic64_read(&clock->now); printbuf_tabstop_push(out, 40); prt_printf(out, "current time:\t%llu\n", now); for (unsigned i = 0; i < clock->timers.nr; i++) prt_printf(out, "%ps %ps:\t%llu\n", clock->timers.data[i]->fn, clock->timers.data[i]->fn2, clock->timers.data[i]->expire); spin_unlock(&clock->timer_lock); --out->atomic; } void bch2_io_clock_exit(struct io_clock *clock) { free_heap(&clock->timers); free_percpu(clock->pcpu_buf); } int bch2_io_clock_init(struct io_clock *clock) { atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); if (!clock->pcpu_buf) return -BCH_ERR_ENOMEM_io_clock_init; if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) return -BCH_ERR_ENOMEM_io_clock_init; return 0; } |
| 2 2 2 2 2 3 3 3 3 3 3 3 2 2 2 2 2 2 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 | /* * POSIX message queues filesystem for Linux. * * Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl) * Michal Wronski (michal.wronski@gmail.com) * * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com) * Lockless receive & send, fd based notify: * Manfred Spraul (manfred@colorfullife.com) * * Audit: George Wilson (ltcgcw@us.ibm.com) * * This file is released under the GPL. */ #include <linux/capability.h> #include <linux/init.h> #include <linux/pagemap.h> #include <linux/file.h> #include <linux/mount.h> #include <linux/fs_context.h> #include <linux/namei.h> #include <linux/sysctl.h> #include <linux/poll.h> #include <linux/mqueue.h> #include <linux/msg.h> #include <linux/skbuff.h> #include <linux/vmalloc.h> #include <linux/netlink.h> #include <linux/syscalls.h> #include <linux/audit.h> #include <linux/signal.h> #include <linux/mutex.h> #include <linux/nsproxy.h> #include <linux/pid.h> #include <linux/ipc_namespace.h> #include <linux/user_namespace.h> #include <linux/slab.h> #include <linux/sched/wake_q.h> #include <linux/sched/signal.h> #include <linux/sched/user.h> #include <net/sock.h> #include "util.h" struct mqueue_fs_context { struct ipc_namespace *ipc_ns; bool newns; /* Set if newly created ipc namespace */ }; #define MQUEUE_MAGIC 0x19800202 #define DIRENT_SIZE 20 #define FILENT_SIZE 80 #define SEND 0 #define RECV 1 #define STATE_NONE 0 #define STATE_READY 1 struct posix_msg_tree_node { struct rb_node rb_node; struct list_head msg_list; int priority; }; /* * Locking: * * Accesses to a message queue are synchronized by acquiring info->lock. * * There are two notable exceptions: * - The actual wakeup of a sleeping task is performed using the wake_q * framework. info->lock is already released when wake_up_q is called. * - The exit codepaths after sleeping check ext_wait_queue->state without * any locks. If it is STATE_READY, then the syscall is completed without * acquiring info->lock. * * MQ_BARRIER: * To achieve proper release/acquire memory barrier pairing, the state is set to * STATE_READY with smp_store_release(), and it is read with READ_ONCE followed * by smp_acquire__after_ctrl_dep(). In addition, wake_q_add_safe() is used. * * This prevents the following races: * * 1) With the simple wake_q_add(), the task could be gone already before * the increase of the reference happens * Thread A * Thread B * WRITE_ONCE(wait.state, STATE_NONE); * schedule_hrtimeout() * wake_q_add(A) * if (cmpxchg()) // success * ->state = STATE_READY (reordered) * <timeout returns> * if (wait.state == STATE_READY) return; * sysret to user space * sys_exit() * get_task_struct() // UaF * * Solution: Use wake_q_add_safe() and perform the get_task_struct() before * the smp_store_release() that does ->state = STATE_READY. * * 2) Without proper _release/_acquire barriers, the woken up task * could read stale data * * Thread A * Thread B * do_mq_timedreceive * WRITE_ONCE(wait.state, STATE_NONE); * schedule_hrtimeout() * state = STATE_READY; * <timeout returns> * if (wait.state == STATE_READY) return; * msg_ptr = wait.msg; // Access to stale data! * receiver->msg = message; (reordered) * * Solution: use _release and _acquire barriers. * * 3) There is intentionally no barrier when setting current->state * to TASK_INTERRUPTIBLE: spin_unlock(&info->lock) provides the * release memory barrier, and the wakeup is triggered when holding * info->lock, i.e. spin_lock(&info->lock) provided a pairing * acquire memory barrier. */ struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; struct msg_msg *msg; /* ptr of loaded message */ int state; /* one of STATE_* values */ }; struct mqueue_inode_info { spinlock_t lock; struct inode vfs_inode; wait_queue_head_t wait_q; struct rb_root msg_tree; struct rb_node *msg_tree_rightmost; struct posix_msg_tree_node *node_cache; struct mq_attr attr; struct sigevent notify; struct pid *notify_owner; u32 notify_self_exec_id; struct user_namespace *notify_user_ns; struct ucounts *ucounts; /* user who created, for accounting */ struct sock *notify_sock; struct sk_buff *notify_cookie; /* for tasks waiting for free space and messages, respectively */ struct ext_wait_queue e_wait_q[2]; unsigned long qsize; /* size of queue in memory (sum of all msgs) */ }; static struct file_system_type mqueue_fs_type; static const struct inode_operations mqueue_dir_inode_operations; static const struct file_operations mqueue_file_operations; static const struct super_operations mqueue_super_ops; static const struct fs_context_operations mqueue_fs_context_ops; static void remove_notification(struct mqueue_inode_info *info); static struct kmem_cache *mqueue_inode_cachep; static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) { return container_of(inode, struct mqueue_inode_info, vfs_inode); } /* * This routine should be called with the mq_lock held. */ static inline struct ipc_namespace *__get_ns_from_inode(struct inode *inode) { return get_ipc_ns(inode->i_sb->s_fs_info); } static struct ipc_namespace *get_ns_from_inode(struct inode *inode) { struct ipc_namespace *ns; spin_lock(&mq_lock); ns = __get_ns_from_inode(inode); spin_unlock(&mq_lock); return ns; } /* Auxiliary functions to manipulate messages' list */ static int msg_insert(struct msg_msg *msg, struct mqueue_inode_info *info) { struct rb_node **p, *parent = NULL; struct posix_msg_tree_node *leaf; bool rightmost = true; p = &info->msg_tree.rb_node; while (*p) { parent = *p; leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); if (likely(leaf->priority == msg->m_type)) goto insert_msg; else if (msg->m_type < leaf->priority) { p = &(*p)->rb_left; rightmost = false; } else p = &(*p)->rb_right; } if (info->node_cache) { leaf = info->node_cache; info->node_cache = NULL; } else { leaf = kmalloc(sizeof(*leaf), GFP_ATOMIC); if (!leaf) return -ENOMEM; INIT_LIST_HEAD(&leaf->msg_list); } leaf->priority = msg->m_type; if (rightmost) info->msg_tree_rightmost = &leaf->rb_node; rb_link_node(&leaf->rb_node, parent, p); rb_insert_color(&leaf->rb_node, &info->msg_tree); insert_msg: info->attr.mq_curmsgs++; info->qsize += msg->m_ts; list_add_tail(&msg->m_list, &leaf->msg_list); return 0; } static inline void msg_tree_erase(struct posix_msg_tree_node *leaf, struct mqueue_inode_info *info) { struct rb_node *node = &leaf->rb_node; if (info->msg_tree_rightmost == node) info->msg_tree_rightmost = rb_prev(node); rb_erase(node, &info->msg_tree); if (info->node_cache) kfree(leaf); else info->node_cache = leaf; } static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) { struct rb_node *parent = NULL; struct posix_msg_tree_node *leaf; struct msg_msg *msg; try_again: /* * During insert, low priorities go to the left and high to the * right. On receive, we want the highest priorities first, so * walk all the way to the right. */ parent = info->msg_tree_rightmost; if (!parent) { if (info->attr.mq_curmsgs) { pr_warn_once("Inconsistency in POSIX message queue, " "no tree element, but supposedly messages " "should exist!\n"); info->attr.mq_curmsgs = 0; } return NULL; } leaf = rb_entry(parent, struct posix_msg_tree_node, rb_node); if (unlikely(list_empty(&leaf->msg_list))) { pr_warn_once("Inconsistency in POSIX message queue, " "empty leaf node but we haven't implemented " "lazy leaf delete!\n"); msg_tree_erase(leaf, info); goto try_again; } else { msg = list_first_entry(&leaf->msg_list, struct msg_msg, m_list); list_del(&msg->m_list); if (list_empty(&leaf->msg_list)) { msg_tree_erase(leaf, info); } } info->attr.mq_curmsgs--; info->qsize -= msg->m_ts; return msg; } static struct inode *mqueue_get_inode(struct super_block *sb, struct ipc_namespace *ipc_ns, umode_t mode, struct mq_attr *attr) { struct inode *inode; int ret = -ENOMEM; inode = new_inode(sb); if (!inode) goto err; inode->i_ino = get_next_ino(); inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); simple_inode_init_ts(inode); if (S_ISREG(mode)) { struct mqueue_inode_info *info; unsigned long mq_bytes, mq_treesize; inode->i_fop = &mqueue_file_operations; inode->i_size = FILENT_SIZE; /* mqueue specific info */ info = MQUEUE_I(inode); spin_lock_init(&info->lock); init_waitqueue_head(&info->wait_q); INIT_LIST_HEAD(&info->e_wait_q[0].list); INIT_LIST_HEAD(&info->e_wait_q[1].list); info->notify_owner = NULL; info->notify_user_ns = NULL; info->qsize = 0; info->ucounts = NULL; /* set when all is ok */ info->msg_tree = RB_ROOT; info->msg_tree_rightmost = NULL; info->node_cache = NULL; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = min(ipc_ns->mq_msg_max, ipc_ns->mq_msg_default); info->attr.mq_msgsize = min(ipc_ns->mq_msgsize_max, ipc_ns->mq_msgsize_default); if (attr) { info->attr.mq_maxmsg = attr->mq_maxmsg; info->attr.mq_msgsize = attr->mq_msgsize; } /* * We used to allocate a static array of pointers and account * the size of that array as well as one msg_msg struct per * possible message into the queue size. That's no longer * accurate as the queue is now an rbtree and will grow and * shrink depending on usage patterns. We can, however, still * account one msg_msg struct per message, but the nodes are * allocated depending on priority usage, and most programs * only use one, or a handful, of priorities. However, since * this is pinned memory, we need to assume worst case, so * that means the min(mq_maxmsg, max_priorities) * struct * posix_msg_tree_node. */ ret = -EINVAL; if (info->attr.mq_maxmsg <= 0 || info->attr.mq_msgsize <= 0) goto out_inode; if (capable(CAP_SYS_RESOURCE)) { if (info->attr.mq_maxmsg > HARD_MSGMAX || info->attr.mq_msgsize > HARD_MSGSIZEMAX) goto out_inode; } else { if (info->attr.mq_maxmsg > ipc_ns->mq_msg_max || info->attr.mq_msgsize > ipc_ns->mq_msgsize_max) goto out_inode; } ret = -EOVERFLOW; /* check for overflow */ if (info->attr.mq_msgsize > ULONG_MAX/info->attr.mq_maxmsg) goto out_inode; mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * sizeof(struct posix_msg_tree_node); mq_bytes = info->attr.mq_maxmsg * info->attr.mq_msgsize; if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; mq_bytes += mq_treesize; info->ucounts = get_ucounts(current_ucounts()); if (info->ucounts) { long msgqueue; spin_lock(&mq_lock); msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) { dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); spin_unlock(&mq_lock); put_ucounts(info->ucounts); info->ucounts = NULL; /* mqueue_evict_inode() releases info->messages */ ret = -EMFILE; goto out_inode; } spin_unlock(&mq_lock); } } else if (S_ISDIR(mode)) { inc_nlink(inode); /* Some things misbehave if size == 0 on a directory */ inode->i_size = 2 * DIRENT_SIZE; inode->i_op = &mqueue_dir_inode_operations; inode->i_fop = &simple_dir_operations; } return inode; out_inode: iput(inode); err: return ERR_PTR(ret); } static int mqueue_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct ipc_namespace *ns = sb->s_fs_info; sb->s_iflags |= SB_I_NOEXEC | SB_I_NODEV; sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = MQUEUE_MAGIC; sb->s_op = &mqueue_super_ops; inode = mqueue_get_inode(sb, ns, S_IFDIR | S_ISVTX | S_IRWXUGO, NULL); if (IS_ERR(inode)) return PTR_ERR(inode); sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; return 0; } static int mqueue_get_tree(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; /* * With a newly created ipc namespace, we don't need to do a search * for an ipc namespace match, but we still need to set s_fs_info. */ if (ctx->newns) { fc->s_fs_info = ctx->ipc_ns; return get_tree_nodev(fc, mqueue_fill_super); } return get_tree_keyed(fc, mqueue_fill_super, ctx->ipc_ns); } static void mqueue_fs_context_free(struct fs_context *fc) { struct mqueue_fs_context *ctx = fc->fs_private; put_ipc_ns(ctx->ipc_ns); kfree(ctx); } static int mqueue_init_fs_context(struct fs_context *fc) { struct mqueue_fs_context *ctx; ctx = kzalloc(sizeof(struct mqueue_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->ipc_ns = get_ipc_ns(current->nsproxy->ipc_ns); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); fc->fs_private = ctx; fc->ops = &mqueue_fs_context_ops; return 0; } /* * mq_init_ns() is currently the only caller of mq_create_mount(). * So the ns parameter is always a newly created ipc namespace. */ static struct vfsmount *mq_create_mount(struct ipc_namespace *ns) { struct mqueue_fs_context *ctx; struct fs_context *fc; struct vfsmount *mnt; fc = fs_context_for_mount(&mqueue_fs_type, SB_KERNMOUNT); if (IS_ERR(fc)) return ERR_CAST(fc); ctx = fc->fs_private; ctx->newns = true; put_ipc_ns(ctx->ipc_ns); ctx->ipc_ns = get_ipc_ns(ns); put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ipc_ns->user_ns); mnt = fc_mount(fc); put_fs_context(fc); return mnt; } static void init_once(void *foo) { struct mqueue_inode_info *p = foo; inode_init_once(&p->vfs_inode); } static struct inode *mqueue_alloc_inode(struct super_block *sb) { struct mqueue_inode_info *ei; ei = alloc_inode_sb(sb, mqueue_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void mqueue_free_inode(struct inode *inode) { kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); } static void mqueue_evict_inode(struct inode *inode) { struct mqueue_inode_info *info; struct ipc_namespace *ipc_ns; struct msg_msg *msg, *nmsg; LIST_HEAD(tmp_msg); clear_inode(inode); if (S_ISDIR(inode->i_mode)) return; ipc_ns = get_ns_from_inode(inode); info = MQUEUE_I(inode); spin_lock(&info->lock); while ((msg = msg_get(info)) != NULL) list_add_tail(&msg->m_list, &tmp_msg); kfree(info->node_cache); spin_unlock(&info->lock); list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { list_del(&msg->m_list); free_msg(msg); } if (info->ucounts) { unsigned long mq_bytes, mq_treesize; /* Total amount of bytes accounted for the mqueue */ mq_treesize = info->attr.mq_maxmsg * sizeof(struct msg_msg) + min_t(unsigned int, info->attr.mq_maxmsg, MQ_PRIO_MAX) * sizeof(struct posix_msg_tree_node); mq_bytes = mq_treesize + (info->attr.mq_maxmsg * info->attr.mq_msgsize); spin_lock(&mq_lock); dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes); /* * get_ns_from_inode() ensures that the * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns * to which we now hold a reference, or it is NULL. * We can't put it here under mq_lock, though. */ if (ipc_ns) ipc_ns->mq_queues_count--; spin_unlock(&mq_lock); put_ucounts(info->ucounts); info->ucounts = NULL; } if (ipc_ns) put_ipc_ns(ipc_ns); } static int mqueue_create_attr(struct dentry *dentry, umode_t mode, void *arg) { struct inode *dir = dentry->d_parent->d_inode; struct inode *inode; struct mq_attr *attr = arg; int error; struct ipc_namespace *ipc_ns; spin_lock(&mq_lock); ipc_ns = __get_ns_from_inode(dir); if (!ipc_ns) { error = -EACCES; goto out_unlock; } if (ipc_ns->mq_queues_count >= ipc_ns->mq_queues_max && !capable(CAP_SYS_RESOURCE)) { error = -ENOSPC; goto out_unlock; } ipc_ns->mq_queues_count++; spin_unlock(&mq_lock); inode = mqueue_get_inode(dir->i_sb, ipc_ns, mode, attr); if (IS_ERR(inode)) { error = PTR_ERR(inode); spin_lock(&mq_lock); ipc_ns->mq_queues_count--; goto out_unlock; } put_ipc_ns(ipc_ns); dir->i_size += DIRENT_SIZE; simple_inode_init_ts(dir); d_instantiate(dentry, inode); dget(dentry); return 0; out_unlock: spin_unlock(&mq_lock); if (ipc_ns) put_ipc_ns(ipc_ns); return error; } static int mqueue_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { return mqueue_create_attr(dentry, mode, NULL); } static int mqueue_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); simple_inode_init_ts(dir); dir->i_size -= DIRENT_SIZE; drop_nlink(inode); dput(dentry); return 0; } /* * This is routine for system read from queue file. * To avoid mess with doing here some sort of mq_receive we allow * to read only queue size & notification info (the only values * that are interesting from user point of view and aren't accessible * through std routines) */ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, size_t count, loff_t *off) { struct inode *inode = file_inode(filp); struct mqueue_inode_info *info = MQUEUE_I(inode); char buffer[FILENT_SIZE]; ssize_t ret; spin_lock(&info->lock); snprintf(buffer, sizeof(buffer), "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", info->qsize, info->notify_owner ? info->notify.sigev_notify : 0, (info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL) ? info->notify.sigev_signo : 0, pid_vnr(info->notify_owner)); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; ret = simple_read_from_buffer(u_data, count, off, buffer, strlen(buffer)); if (ret <= 0) return ret; inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); return ret; } static int mqueue_flush_file(struct file *filp, fl_owner_t id) { struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp)); spin_lock(&info->lock); if (task_tgid(current) == info->notify_owner) remove_notification(info); spin_unlock(&info->lock); return 0; } static __poll_t mqueue_poll_file(struct file *filp, struct poll_table_struct *poll_tab) { struct mqueue_inode_info *info = MQUEUE_I(file_inode(filp)); __poll_t retval = 0; poll_wait(filp, &info->wait_q, poll_tab); spin_lock(&info->lock); if (info->attr.mq_curmsgs) retval = EPOLLIN | EPOLLRDNORM; if (info->attr.mq_curmsgs < info->attr.mq_maxmsg) retval |= EPOLLOUT | EPOLLWRNORM; spin_unlock(&info->lock); return retval; } /* Adds current to info->e_wait_q[sr] before element with smaller prio */ static void wq_add(struct mqueue_inode_info *info, int sr, struct ext_wait_queue *ewp) { struct ext_wait_queue *walk; list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { if (walk->task->prio <= current->prio) { list_add_tail(&ewp->list, &walk->list); return; } } list_add_tail(&ewp->list, &info->e_wait_q[sr].list); } /* * Puts current task to sleep. Caller must hold queue lock. After return * lock isn't held. * sr: SEND or RECV */ static int wq_sleep(struct mqueue_inode_info *info, int sr, ktime_t *timeout, struct ext_wait_queue *ewp) __releases(&info->lock) { int retval; signed long time; wq_add(info, sr, ewp); for (;;) { /* memory barrier not required, we hold info->lock */ __set_current_state(TASK_INTERRUPTIBLE); spin_unlock(&info->lock); time = schedule_hrtimeout_range_clock(timeout, 0, HRTIMER_MODE_ABS, CLOCK_REALTIME); if (READ_ONCE(ewp->state) == STATE_READY) { /* see MQ_BARRIER for purpose/pairing */ smp_acquire__after_ctrl_dep(); retval = 0; goto out; } spin_lock(&info->lock); /* we hold info->lock, so no memory barrier required */ if (READ_ONCE(ewp->state) == STATE_READY) { retval = 0; goto out_unlock; } if (signal_pending(current)) { retval = -ERESTARTSYS; break; } if (time == 0) { retval = -ETIMEDOUT; break; } } list_del(&ewp->list); out_unlock: spin_unlock(&info->lock); out: return retval; } /* * Returns waiting task that should be serviced first or NULL if none exists */ static struct ext_wait_queue *wq_get_first_waiter( struct mqueue_inode_info *info, int sr) { struct list_head *ptr; ptr = info->e_wait_q[sr].list.prev; if (ptr == &info->e_wait_q[sr].list) return NULL; return list_entry(ptr, struct ext_wait_queue, list); } static inline void set_cookie(struct sk_buff *skb, char code) { ((char *)skb->data)[NOTIFY_COOKIE_LEN-1] = code; } /* * The next function is only to split too long sys_mq_timedsend */ static void __do_notify(struct mqueue_inode_info *info) { /* notification * invoked when there is registered process and there isn't process * waiting synchronously for message AND state of queue changed from * empty to not empty. Here we are sure that no one is waiting * synchronously. */ if (info->notify_owner && info->attr.mq_curmsgs == 1) { switch (info->notify.sigev_notify) { case SIGEV_NONE: break; case SIGEV_SIGNAL: { struct kernel_siginfo sig_i; struct task_struct *task; /* do_mq_notify() accepts sigev_signo == 0, why?? */ if (!info->notify.sigev_signo) break; clear_siginfo(&sig_i); sig_i.si_signo = info->notify.sigev_signo; sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; sig_i.si_value = info->notify.sigev_value; rcu_read_lock(); /* map current pid/uid into info->owner's namespaces */ sig_i.si_pid = task_tgid_nr_ns(current, ns_of_pid(info->notify_owner)); sig_i.si_uid = from_kuid_munged(info->notify_user_ns, current_uid()); /* * We can't use kill_pid_info(), this signal should * bypass check_kill_permission(). It is from kernel * but si_fromuser() can't know this. * We do check the self_exec_id, to avoid sending * signals to programs that don't expect them. */ task = pid_task(info->notify_owner, PIDTYPE_TGID); if (task && task->self_exec_id == info->notify_self_exec_id) { do_send_sig_info(info->notify.sigev_signo, &sig_i, task, PIDTYPE_TGID); } rcu_read_unlock(); break; } case SIGEV_THREAD: set_cookie(info->notify_cookie, NOTIFY_WOKENUP); netlink_sendskb(info->notify_sock, info->notify_cookie); break; } /* after notification unregisters process */ put_pid(info->notify_owner); put_user_ns(info->notify_user_ns); info->notify_owner = NULL; info->notify_user_ns = NULL; } wake_up(&info->wait_q); } static int prepare_timeout(const struct __kernel_timespec __user *u_abs_timeout, struct timespec64 *ts) { if (get_timespec64(ts, u_abs_timeout)) return -EFAULT; if (!timespec64_valid(ts)) return -EINVAL; return 0; } static void remove_notification(struct mqueue_inode_info *info) { if (info->notify_owner != NULL && info->notify.sigev_notify == SIGEV_THREAD) { set_cookie(info->notify_cookie, NOTIFY_REMOVED); netlink_sendskb(info->notify_sock, info->notify_cookie); } put_pid(info->notify_owner); put_user_ns(info->notify_user_ns); info->notify_owner = NULL; info->notify_user_ns = NULL; } static int prepare_open(struct dentry *dentry, int oflag, int ro, umode_t mode, struct filename *name, struct mq_attr *attr) { static const int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, MAY_READ | MAY_WRITE }; int acc; if (d_really_is_negative(dentry)) { if (!(oflag & O_CREAT)) return -ENOENT; if (ro) return ro; audit_inode_parent_hidden(name, dentry->d_parent); return vfs_mkobj(dentry, mode & ~current_umask(), mqueue_create_attr, attr); } /* it already existed */ audit_inode(name, dentry, 0); if ((oflag & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL)) return -EEXIST; if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) return -EINVAL; acc = oflag2acc[oflag & O_ACCMODE]; return inode_permission(&nop_mnt_idmap, d_inode(dentry), acc); } static int do_mq_open(const char __user *u_name, int oflag, umode_t mode, struct mq_attr *attr) { struct vfsmount *mnt = current->nsproxy->ipc_ns->mq_mnt; struct dentry *root = mnt->mnt_root; struct filename *name; struct path path; int fd, error; int ro; audit_mq_open(oflag, mode, attr); name = getname(u_name); if (IS_ERR(name)) return PTR_ERR(name); fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) goto out_putname; ro = mnt_want_write(mnt); /* we'll drop it in any case */ inode_lock(d_inode(root)); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); goto out_putfd; } path.mnt = mntget(mnt); error = prepare_open(path.dentry, oflag, ro, mode, name, attr); if (!error) { struct file *file = dentry_open(&path, oflag, current_cred()); if (!IS_ERR(file)) fd_install(fd, file); else error = PTR_ERR(file); } path_put(&path); out_putfd: if (error) { put_unused_fd(fd); fd = error; } inode_unlock(d_inode(root)); if (!ro) mnt_drop_write(mnt); out_putname: putname(name); return fd; } SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, struct mq_attr __user *, u_attr) { struct mq_attr attr; if (u_attr && copy_from_user(&attr, u_attr, sizeof(struct mq_attr))) return -EFAULT; return do_mq_open(u_name, oflag, mode, u_attr ? &attr : NULL); } SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) { int err; struct filename *name; struct dentry *dentry; struct inode *inode = NULL; struct ipc_namespace *ipc_ns = current->nsproxy->ipc_ns; struct vfsmount *mnt = ipc_ns->mq_mnt; name = getname(u_name); if (IS_ERR(name)) return PTR_ERR(name); audit_inode_parent_hidden(name, mnt->mnt_root); err = mnt_want_write(mnt); if (err) goto out_name; inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_unlock; } inode = d_inode(dentry); if (!inode) { err = -ENOENT; } else { ihold(inode); err = vfs_unlink(&nop_mnt_idmap, d_inode(dentry->d_parent), dentry, NULL); } dput(dentry); out_unlock: inode_unlock(d_inode(mnt->mnt_root)); iput(inode); mnt_drop_write(mnt); out_name: putname(name); return err; } /* Pipelined send and receive functions. * * If a receiver finds no waiting message, then it registers itself in the * list of waiting receivers. A sender checks that list before adding the new * message into the message array. If there is a waiting receiver, then it * bypasses the message array and directly hands the message over to the * receiver. The receiver accepts the message and returns without grabbing the * queue spinlock: * * - Set pointer to message. * - Queue the receiver task for later wakeup (without the info->lock). * - Update its state to STATE_READY. Now the receiver can continue. * - Wake up the process after the lock is dropped. Should the process wake up * before this wakeup (due to a timeout or a signal) it will either see * STATE_READY and continue or acquire the lock to check the state again. * * The same algorithm is used for senders. */ static inline void __pipelined_op(struct wake_q_head *wake_q, struct mqueue_inode_info *info, struct ext_wait_queue *this) { struct task_struct *task; list_del(&this->list); task = get_task_struct(this->task); /* see MQ_BARRIER for purpose/pairing */ smp_store_release(&this->state, STATE_READY); wake_q_add_safe(wake_q, task); } /* pipelined_send() - send a message directly to the task waiting in * sys_mq_timedreceive() (without inserting message into a queue). */ static inline void pipelined_send(struct wake_q_head *wake_q, struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) { receiver->msg = message; __pipelined_op(wake_q, info, receiver); } /* pipelined_receive() - if there is task waiting in sys_mq_timedsend() * gets its message and put to the queue (we have one free place for sure). */ static inline void pipelined_receive(struct wake_q_head *wake_q, struct mqueue_inode_info *info) { struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND); if (!sender) { /* for poll */ wake_up_interruptible(&info->wait_q); return; } if (msg_insert(sender->msg, info)) return; __pipelined_op(wake_q, info, sender); } static int do_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, size_t msg_len, unsigned int msg_prio, struct timespec64 *ts) { struct inode *inode; struct ext_wait_queue wait; struct ext_wait_queue *receiver; struct msg_msg *msg_ptr; struct mqueue_inode_info *info; ktime_t expires, *timeout = NULL; struct posix_msg_tree_node *new_leaf = NULL; int ret = 0; DEFINE_WAKE_Q(wake_q); if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX)) return -EINVAL; if (ts) { expires = timespec64_to_ktime(*ts); timeout = &expires; } audit_mq_sendrecv(mqdes, msg_len, msg_prio, ts); CLASS(fd, f)(mqdes); if (fd_empty(f)) return -EBADF; inode = file_inode(fd_file(f)); if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) return -EBADF; info = MQUEUE_I(inode); audit_file(fd_file(f)); if (unlikely(!(fd_file(f)->f_mode & FMODE_WRITE))) return -EBADF; if (unlikely(msg_len > info->attr.mq_msgsize)) return -EMSGSIZE; /* First try to allocate memory, before doing anything with * existing queues. */ msg_ptr = load_msg(u_msg_ptr, msg_len); if (IS_ERR(msg_ptr)) return PTR_ERR(msg_ptr); msg_ptr->m_ts = msg_len; msg_ptr->m_type = msg_prio; /* * msg_insert really wants us to have a valid, spare node struct so * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will * fall back to that if necessary. */ if (!info->node_cache) new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); spin_lock(&info->lock); if (!info->node_cache && new_leaf) { /* Save our speculative allocation into the cache */ INIT_LIST_HEAD(&new_leaf->msg_list); info->node_cache = new_leaf; new_leaf = NULL; } else { kfree(new_leaf); } if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) { if (fd_file(f)->f_flags & O_NONBLOCK) { ret = -EAGAIN; } else { wait.task = current; wait.msg = (void *) msg_ptr; /* memory barrier not required, we hold info->lock */ WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, SEND, timeout, &wait); /* * wq_sleep must be called with info->lock held, and * returns with the lock released */ goto out_free; } } else { receiver = wq_get_first_waiter(info, RECV); if (receiver) { pipelined_send(&wake_q, info, msg_ptr, receiver); } else { /* adds message to the queue */ ret = msg_insert(msg_ptr, info); if (ret) goto out_unlock; __do_notify(info); } simple_inode_init_ts(inode); } out_unlock: spin_unlock(&info->lock); wake_up_q(&wake_q); out_free: if (ret) free_msg(msg_ptr); return ret; } static int do_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, size_t msg_len, unsigned int __user *u_msg_prio, struct timespec64 *ts) { ssize_t ret; struct msg_msg *msg_ptr; struct inode *inode; struct mqueue_inode_info *info; struct ext_wait_queue wait; ktime_t expires, *timeout = NULL; struct posix_msg_tree_node *new_leaf = NULL; if (ts) { expires = timespec64_to_ktime(*ts); timeout = &expires; } audit_mq_sendrecv(mqdes, msg_len, 0, ts); CLASS(fd, f)(mqdes); if (fd_empty(f)) return -EBADF; inode = file_inode(fd_file(f)); if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) return -EBADF; info = MQUEUE_I(inode); audit_file(fd_file(f)); if (unlikely(!(fd_file(f)->f_mode & FMODE_READ))) return -EBADF; /* checks if buffer is big enough */ if (unlikely(msg_len < info->attr.mq_msgsize)) return -EMSGSIZE; /* * msg_insert really wants us to have a valid, spare node struct so * it doesn't have to kmalloc a GFP_ATOMIC allocation, but it will * fall back to that if necessary. */ if (!info->node_cache) new_leaf = kmalloc(sizeof(*new_leaf), GFP_KERNEL); spin_lock(&info->lock); if (!info->node_cache && new_leaf) { /* Save our speculative allocation into the cache */ INIT_LIST_HEAD(&new_leaf->msg_list); info->node_cache = new_leaf; } else { kfree(new_leaf); } if (info->attr.mq_curmsgs == 0) { if (fd_file(f)->f_flags & O_NONBLOCK) { spin_unlock(&info->lock); ret = -EAGAIN; } else { wait.task = current; /* memory barrier not required, we hold info->lock */ WRITE_ONCE(wait.state, STATE_NONE); ret = wq_sleep(info, RECV, timeout, &wait); msg_ptr = wait.msg; } } else { DEFINE_WAKE_Q(wake_q); msg_ptr = msg_get(info); simple_inode_init_ts(inode); /* There is now free space in queue. */ pipelined_receive(&wake_q, info); spin_unlock(&info->lock); wake_up_q(&wake_q); ret = 0; } if (ret == 0) { ret = msg_ptr->m_ts; if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) || store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { ret = -EFAULT; } free_msg(msg_ptr); } return ret; } SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, size_t, msg_len, unsigned int, msg_prio, const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p); } SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, size_t, msg_len, unsigned int __user *, u_msg_prio, const struct __kernel_timespec __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p); } /* * Notes: the case when user wants us to deregister (with NULL as pointer) * and he isn't currently owner of notification, will be silently discarded. * It isn't explicitly defined in the POSIX. */ static int do_mq_notify(mqd_t mqdes, const struct sigevent *notification) { int ret; struct sock *sock; struct inode *inode; struct mqueue_inode_info *info; struct sk_buff *nc; audit_mq_notify(mqdes, notification); nc = NULL; sock = NULL; if (notification != NULL) { if (unlikely(notification->sigev_notify != SIGEV_NONE && notification->sigev_notify != SIGEV_SIGNAL && notification->sigev_notify != SIGEV_THREAD)) return -EINVAL; if (notification->sigev_notify == SIGEV_SIGNAL && !valid_signal(notification->sigev_signo)) { return -EINVAL; } if (notification->sigev_notify == SIGEV_THREAD) { long timeo; /* create the notify skb */ nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); if (!nc) return -ENOMEM; if (copy_from_user(nc->data, notification->sigev_value.sival_ptr, NOTIFY_COOKIE_LEN)) { kfree_skb(nc); return -EFAULT; } /* TODO: add a header? */ skb_put(nc, NOTIFY_COOKIE_LEN); /* and attach it to the socket */ retry: sock = netlink_getsockbyfd(notification->sigev_signo); if (IS_ERR(sock)) { kfree_skb(nc); return PTR_ERR(sock); } timeo = MAX_SCHEDULE_TIMEOUT; ret = netlink_attachskb(sock, nc, &timeo, NULL); if (ret == 1) goto retry; if (ret) return ret; } } CLASS(fd, f)(mqdes); if (fd_empty(f)) { ret = -EBADF; goto out; } inode = file_inode(fd_file(f)); if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) { ret = -EBADF; goto out; } info = MQUEUE_I(inode); ret = 0; spin_lock(&info->lock); if (notification == NULL) { if (info->notify_owner == task_tgid(current)) { remove_notification(info); inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } } else if (info->notify_owner != NULL) { ret = -EBUSY; } else { switch (notification->sigev_notify) { case SIGEV_NONE: info->notify.sigev_notify = SIGEV_NONE; break; case SIGEV_THREAD: info->notify_sock = sock; info->notify_cookie = nc; sock = NULL; nc = NULL; info->notify.sigev_notify = SIGEV_THREAD; break; case SIGEV_SIGNAL: info->notify.sigev_signo = notification->sigev_signo; info->notify.sigev_value = notification->sigev_value; info->notify.sigev_notify = SIGEV_SIGNAL; info->notify_self_exec_id = current->self_exec_id; break; } info->notify_owner = get_pid(task_tgid(current)); info->notify_user_ns = get_user_ns(current_user_ns()); inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); out: if (sock) netlink_detachskb(sock, nc); return ret; } SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes, const struct sigevent __user *, u_notification) { struct sigevent n, *p = NULL; if (u_notification) { if (copy_from_user(&n, u_notification, sizeof(struct sigevent))) return -EFAULT; p = &n; } return do_mq_notify(mqdes, p); } static int do_mq_getsetattr(int mqdes, struct mq_attr *new, struct mq_attr *old) { struct inode *inode; struct mqueue_inode_info *info; if (new && (new->mq_flags & (~O_NONBLOCK))) return -EINVAL; CLASS(fd, f)(mqdes); if (fd_empty(f)) return -EBADF; if (unlikely(fd_file(f)->f_op != &mqueue_file_operations)) return -EBADF; inode = file_inode(fd_file(f)); info = MQUEUE_I(inode); spin_lock(&info->lock); if (old) { *old = info->attr; old->mq_flags = fd_file(f)->f_flags & O_NONBLOCK; } if (new) { audit_mq_getsetattr(mqdes, new); spin_lock(&fd_file(f)->f_lock); if (new->mq_flags & O_NONBLOCK) fd_file(f)->f_flags |= O_NONBLOCK; else fd_file(f)->f_flags &= ~O_NONBLOCK; spin_unlock(&fd_file(f)->f_lock); inode_set_atime_to_ts(inode, inode_set_ctime_current(inode)); } spin_unlock(&info->lock); return 0; } SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, const struct mq_attr __user *, u_mqstat, struct mq_attr __user *, u_omqstat) { int ret; struct mq_attr mqstat, omqstat; struct mq_attr *new = NULL, *old = NULL; if (u_mqstat) { new = &mqstat; if (copy_from_user(new, u_mqstat, sizeof(struct mq_attr))) return -EFAULT; } if (u_omqstat) old = &omqstat; ret = do_mq_getsetattr(mqdes, new, old); if (ret || !old) return ret; if (copy_to_user(u_omqstat, old, sizeof(struct mq_attr))) return -EFAULT; return 0; } #ifdef CONFIG_COMPAT struct compat_mq_attr { compat_long_t mq_flags; /* message queue flags */ compat_long_t mq_maxmsg; /* maximum number of messages */ compat_long_t mq_msgsize; /* maximum message size */ compat_long_t mq_curmsgs; /* number of messages currently queued */ compat_long_t __reserved[4]; /* ignored for input, zeroed for output */ }; static inline int get_compat_mq_attr(struct mq_attr *attr, const struct compat_mq_attr __user *uattr) { struct compat_mq_attr v; if (copy_from_user(&v, uattr, sizeof(*uattr))) return -EFAULT; memset(attr, 0, sizeof(*attr)); attr->mq_flags = v.mq_flags; attr->mq_maxmsg = v.mq_maxmsg; attr->mq_msgsize = v.mq_msgsize; attr->mq_curmsgs = v.mq_curmsgs; return 0; } static inline int put_compat_mq_attr(const struct mq_attr *attr, struct compat_mq_attr __user *uattr) { struct compat_mq_attr v; memset(&v, 0, sizeof(v)); v.mq_flags = attr->mq_flags; v.mq_maxmsg = attr->mq_maxmsg; v.mq_msgsize = attr->mq_msgsize; v.mq_curmsgs = attr->mq_curmsgs; if (copy_to_user(uattr, &v, sizeof(*uattr))) return -EFAULT; return 0; } COMPAT_SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, compat_mode_t, mode, struct compat_mq_attr __user *, u_attr) { struct mq_attr attr, *p = NULL; if (u_attr && oflag & O_CREAT) { p = &attr; if (get_compat_mq_attr(&attr, u_attr)) return -EFAULT; } return do_mq_open(u_name, oflag, mode, p); } COMPAT_SYSCALL_DEFINE2(mq_notify, mqd_t, mqdes, const struct compat_sigevent __user *, u_notification) { struct sigevent n, *p = NULL; if (u_notification) { if (get_compat_sigevent(&n, u_notification)) return -EFAULT; if (n.sigev_notify == SIGEV_THREAD) n.sigev_value.sival_ptr = compat_ptr(n.sigev_value.sival_int); p = &n; } return do_mq_notify(mqdes, p); } COMPAT_SYSCALL_DEFINE3(mq_getsetattr, mqd_t, mqdes, const struct compat_mq_attr __user *, u_mqstat, struct compat_mq_attr __user *, u_omqstat) { int ret; struct mq_attr mqstat, omqstat; struct mq_attr *new = NULL, *old = NULL; if (u_mqstat) { new = &mqstat; if (get_compat_mq_attr(new, u_mqstat)) return -EFAULT; } if (u_omqstat) old = &omqstat; ret = do_mq_getsetattr(mqdes, new, old); if (ret || !old) return ret; if (put_compat_mq_attr(old, u_omqstat)) return -EFAULT; return 0; } #endif #ifdef CONFIG_COMPAT_32BIT_TIME static int compat_prepare_timeout(const struct old_timespec32 __user *p, struct timespec64 *ts) { if (get_old_timespec32(ts, p)) return -EFAULT; if (!timespec64_valid(ts)) return -EINVAL; return 0; } SYSCALL_DEFINE5(mq_timedsend_time32, mqd_t, mqdes, const char __user *, u_msg_ptr, unsigned int, msg_len, unsigned int, msg_prio, const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = compat_prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedsend(mqdes, u_msg_ptr, msg_len, msg_prio, p); } SYSCALL_DEFINE5(mq_timedreceive_time32, mqd_t, mqdes, char __user *, u_msg_ptr, unsigned int, msg_len, unsigned int __user *, u_msg_prio, const struct old_timespec32 __user *, u_abs_timeout) { struct timespec64 ts, *p = NULL; if (u_abs_timeout) { int res = compat_prepare_timeout(u_abs_timeout, &ts); if (res) return res; p = &ts; } return do_mq_timedreceive(mqdes, u_msg_ptr, msg_len, u_msg_prio, p); } #endif static const struct inode_operations mqueue_dir_inode_operations = { .lookup = simple_lookup, .create = mqueue_create, .unlink = mqueue_unlink, }; static const struct file_operations mqueue_file_operations = { .flush = mqueue_flush_file, .poll = mqueue_poll_file, .read = mqueue_read_file, .llseek = default_llseek, }; static const struct super_operations mqueue_super_ops = { .alloc_inode = mqueue_alloc_inode, .free_inode = mqueue_free_inode, .evict_inode = mqueue_evict_inode, .statfs = simple_statfs, }; static const struct fs_context_operations mqueue_fs_context_ops = { .free = mqueue_fs_context_free, .get_tree = mqueue_get_tree, }; static struct file_system_type mqueue_fs_type = { .name = "mqueue", .init_fs_context = mqueue_init_fs_context, .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT, }; int mq_init_ns(struct ipc_namespace *ns) { struct vfsmount *m; ns->mq_queues_count = 0; ns->mq_queues_max = DFLT_QUEUESMAX; ns->mq_msg_max = DFLT_MSGMAX; ns->mq_msgsize_max = DFLT_MSGSIZEMAX; ns->mq_msg_default = DFLT_MSG; ns->mq_msgsize_default = DFLT_MSGSIZE; m = mq_create_mount(ns); if (IS_ERR(m)) return PTR_ERR(m); ns->mq_mnt = m; return 0; } void mq_clear_sbinfo(struct ipc_namespace *ns) { ns->mq_mnt->mnt_sb->s_fs_info = NULL; } static int __init init_mqueue_fs(void) { int error; mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", sizeof(struct mqueue_inode_info), 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once); if (mqueue_inode_cachep == NULL) return -ENOMEM; if (!setup_mq_sysctls(&init_ipc_ns)) { pr_warn("sysctl registration failed\n"); error = -ENOMEM; goto out_kmem; } error = register_filesystem(&mqueue_fs_type); if (error) goto out_sysctl; spin_lock_init(&mq_lock); error = mq_init_ns(&init_ipc_ns); if (error) goto out_filesystem; return 0; out_filesystem: unregister_filesystem(&mqueue_fs_type); out_sysctl: retire_mq_sysctls(&init_ipc_ns); out_kmem: kmem_cache_destroy(mqueue_inode_cachep); return error; } device_initcall(init_mqueue_fs); |
| 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) International Business Machines Corp., 2006 * * Author: Artem Bityutskiy (Битюцкий Артём) */ #include "ubi.h" #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/module.h> #include <linux/seq_file.h> #include <linux/fault-inject.h> #ifdef CONFIG_MTD_UBI_FAULT_INJECTION static DECLARE_FAULT_ATTR(fault_eccerr_attr); static DECLARE_FAULT_ATTR(fault_bitflips_attr); static DECLARE_FAULT_ATTR(fault_read_failure_attr); static DECLARE_FAULT_ATTR(fault_write_failure_attr); static DECLARE_FAULT_ATTR(fault_erase_failure_attr); static DECLARE_FAULT_ATTR(fault_power_cut_attr); static DECLARE_FAULT_ATTR(fault_io_ff_attr); static DECLARE_FAULT_ATTR(fault_io_ff_bitflips_attr); static DECLARE_FAULT_ATTR(fault_bad_hdr_attr); static DECLARE_FAULT_ATTR(fault_bad_hdr_ebadmsg_attr); #define FAIL_ACTION(name, fault_attr) \ bool should_fail_##name(void) \ { \ return should_fail(&fault_attr, 1); \ } FAIL_ACTION(eccerr, fault_eccerr_attr) FAIL_ACTION(bitflips, fault_bitflips_attr) FAIL_ACTION(read_failure, fault_read_failure_attr) FAIL_ACTION(write_failure, fault_write_failure_attr) FAIL_ACTION(erase_failure, fault_erase_failure_attr) FAIL_ACTION(power_cut, fault_power_cut_attr) FAIL_ACTION(io_ff, fault_io_ff_attr) FAIL_ACTION(io_ff_bitflips, fault_io_ff_bitflips_attr) FAIL_ACTION(bad_hdr, fault_bad_hdr_attr) FAIL_ACTION(bad_hdr_ebadmsg, fault_bad_hdr_ebadmsg_attr) #endif /** * ubi_dump_flash - dump a region of flash. * @ubi: UBI device description object * @pnum: the physical eraseblock number to dump * @offset: the starting offset within the physical eraseblock to dump * @len: the length of the region to dump */ void ubi_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len) { int err; size_t read; void *buf; loff_t addr = (loff_t)pnum * ubi->peb_size + offset; buf = vmalloc(len); if (!buf) return; err = mtd_read(ubi->mtd, addr, len, &read, buf); if (err && err != -EUCLEAN) { ubi_err(ubi, "err %d while reading %d bytes from PEB %d:%d, read %zd bytes", err, len, pnum, offset, read); goto out; } ubi_msg(ubi, "dumping %d bytes of data from PEB %d, offset %d", len, pnum, offset); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf, len, 1); out: vfree(buf); return; } /** * ubi_dump_ec_hdr - dump an erase counter header. * @ec_hdr: the erase counter header to dump */ void ubi_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr) { pr_err("Erase counter header dump:\n"); pr_err("\tmagic %#08x\n", be32_to_cpu(ec_hdr->magic)); pr_err("\tversion %d\n", (int)ec_hdr->version); pr_err("\tec %llu\n", (long long)be64_to_cpu(ec_hdr->ec)); pr_err("\tvid_hdr_offset %d\n", be32_to_cpu(ec_hdr->vid_hdr_offset)); pr_err("\tdata_offset %d\n", be32_to_cpu(ec_hdr->data_offset)); pr_err("\timage_seq %d\n", be32_to_cpu(ec_hdr->image_seq)); pr_err("\thdr_crc %#08x\n", be32_to_cpu(ec_hdr->hdr_crc)); pr_err("erase counter header hexdump:\n"); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, ec_hdr, UBI_EC_HDR_SIZE, 1); } /** * ubi_dump_vid_hdr - dump a volume identifier header. * @vid_hdr: the volume identifier header to dump */ void ubi_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr) { pr_err("Volume identifier header dump:\n"); pr_err("\tmagic %08x\n", be32_to_cpu(vid_hdr->magic)); pr_err("\tversion %d\n", (int)vid_hdr->version); pr_err("\tvol_type %d\n", (int)vid_hdr->vol_type); pr_err("\tcopy_flag %d\n", (int)vid_hdr->copy_flag); pr_err("\tcompat %d\n", (int)vid_hdr->compat); pr_err("\tvol_id %d\n", be32_to_cpu(vid_hdr->vol_id)); pr_err("\tlnum %d\n", be32_to_cpu(vid_hdr->lnum)); pr_err("\tdata_size %d\n", be32_to_cpu(vid_hdr->data_size)); pr_err("\tused_ebs %d\n", be32_to_cpu(vid_hdr->used_ebs)); pr_err("\tdata_pad %d\n", be32_to_cpu(vid_hdr->data_pad)); pr_err("\tsqnum %llu\n", (unsigned long long)be64_to_cpu(vid_hdr->sqnum)); pr_err("\thdr_crc %08x\n", be32_to_cpu(vid_hdr->hdr_crc)); pr_err("Volume identifier header hexdump:\n"); print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, vid_hdr, UBI_VID_HDR_SIZE, 1); } /** * ubi_dump_vol_info - dump volume information. * @vol: UBI volume description object */ void ubi_dump_vol_info(const struct ubi_volume *vol) { pr_err("Volume information dump:\n"); pr_err("\tvol_id %d\n", vol->vol_id); pr_err("\treserved_pebs %d\n", vol->reserved_pebs); pr_err("\talignment %d\n", vol->alignment); pr_err("\tdata_pad %d\n", vol->data_pad); pr_err("\tvol_type %d\n", vol->vol_type); pr_err("\tname_len %d\n", vol->name_len); pr_err("\tusable_leb_size %d\n", vol->usable_leb_size); pr_err("\tused_ebs %d\n", vol->used_ebs); pr_err("\tused_bytes %lld\n", vol->used_bytes); pr_err("\tlast_eb_bytes %d\n", vol->last_eb_bytes); pr_err("\tcorrupted %d\n", vol->corrupted); pr_err("\tupd_marker %d\n", vol->upd_marker); pr_err("\tskip_check %d\n", vol->skip_check); if (vol->name_len <= UBI_VOL_NAME_MAX && strnlen(vol->name, vol->name_len + 1) == vol->name_len) { pr_err("\tname %s\n", vol->name); } else { pr_err("\t1st 5 characters of name: %c%c%c%c%c\n", vol->name[0], vol->name[1], vol->name[2], vol->name[3], vol->name[4]); } } /** * ubi_dump_vtbl_record - dump a &struct ubi_vtbl_record object. * @r: the object to dump * @idx: volume table index */ void ubi_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx) { int name_len = be16_to_cpu(r->name_len); pr_err("Volume table record %d dump:\n", idx); pr_err("\treserved_pebs %d\n", be32_to_cpu(r->reserved_pebs)); pr_err("\talignment %d\n", be32_to_cpu(r->alignment)); pr_err("\tdata_pad %d\n", be32_to_cpu(r->data_pad)); pr_err("\tvol_type %d\n", (int)r->vol_type); pr_err("\tupd_marker %d\n", (int)r->upd_marker); pr_err("\tname_len %d\n", name_len); if (r->name[0] == '\0') { pr_err("\tname NULL\n"); return; } if (name_len <= UBI_VOL_NAME_MAX && strnlen(&r->name[0], name_len + 1) == name_len) { pr_err("\tname %s\n", &r->name[0]); } else { pr_err("\t1st 5 characters of name: %c%c%c%c%c\n", r->name[0], r->name[1], r->name[2], r->name[3], r->name[4]); } pr_err("\tcrc %#08x\n", be32_to_cpu(r->crc)); } /** * ubi_dump_av - dump a &struct ubi_ainf_volume object. * @av: the object to dump */ void ubi_dump_av(const struct ubi_ainf_volume *av) { pr_err("Volume attaching information dump:\n"); pr_err("\tvol_id %d\n", av->vol_id); pr_err("\thighest_lnum %d\n", av->highest_lnum); pr_err("\tleb_count %d\n", av->leb_count); pr_err("\tcompat %d\n", av->compat); pr_err("\tvol_type %d\n", av->vol_type); pr_err("\tused_ebs %d\n", av->used_ebs); pr_err("\tlast_data_size %d\n", av->last_data_size); pr_err("\tdata_pad %d\n", av->data_pad); } /** * ubi_dump_aeb - dump a &struct ubi_ainf_peb object. * @aeb: the object to dump * @type: object type: 0 - not corrupted, 1 - corrupted */ void ubi_dump_aeb(const struct ubi_ainf_peb *aeb, int type) { pr_err("eraseblock attaching information dump:\n"); pr_err("\tec %d\n", aeb->ec); pr_err("\tpnum %d\n", aeb->pnum); if (type == 0) { pr_err("\tlnum %d\n", aeb->lnum); pr_err("\tscrub %d\n", aeb->scrub); pr_err("\tsqnum %llu\n", aeb->sqnum); } } /** * ubi_dump_mkvol_req - dump a &struct ubi_mkvol_req object. * @req: the object to dump */ void ubi_dump_mkvol_req(const struct ubi_mkvol_req *req) { char nm[17]; pr_err("Volume creation request dump:\n"); pr_err("\tvol_id %d\n", req->vol_id); pr_err("\talignment %d\n", req->alignment); pr_err("\tbytes %lld\n", (long long)req->bytes); pr_err("\tvol_type %d\n", req->vol_type); pr_err("\tname_len %d\n", req->name_len); memcpy(nm, req->name, 16); nm[16] = 0; pr_err("\t1st 16 characters of name: %s\n", nm); } /* * Root directory for UBI stuff in debugfs. Contains sub-directories which * contain the stuff specific to particular UBI devices. */ static struct dentry *dfs_rootdir; #ifdef CONFIG_MTD_UBI_FAULT_INJECTION static void dfs_create_fault_entry(struct dentry *parent) { struct dentry *dir; dir = debugfs_create_dir("fault_inject", parent); if (IS_ERR_OR_NULL(dir)) { int err = dir ? PTR_ERR(dir) : -ENODEV; pr_warn("UBI error: cannot create \"fault_inject\" debugfs directory, error %d\n", err); return; } fault_create_debugfs_attr("emulate_eccerr", dir, &fault_eccerr_attr); fault_create_debugfs_attr("emulate_read_failure", dir, &fault_read_failure_attr); fault_create_debugfs_attr("emulate_bitflips", dir, &fault_bitflips_attr); fault_create_debugfs_attr("emulate_write_failure", dir, &fault_write_failure_attr); fault_create_debugfs_attr("emulate_erase_failure", dir, &fault_erase_failure_attr); fault_create_debugfs_attr("emulate_power_cut", dir, &fault_power_cut_attr); fault_create_debugfs_attr("emulate_io_ff", dir, &fault_io_ff_attr); fault_create_debugfs_attr("emulate_io_ff_bitflips", dir, &fault_io_ff_bitflips_attr); fault_create_debugfs_attr("emulate_bad_hdr", dir, &fault_bad_hdr_attr); fault_create_debugfs_attr("emulate_bad_hdr_ebadmsg", dir, &fault_bad_hdr_ebadmsg_attr); } #endif /** * ubi_debugfs_init - create UBI debugfs directory. * * Create UBI debugfs directory. Returns zero in case of success and a negative * error code in case of failure. */ int ubi_debugfs_init(void) { if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; dfs_rootdir = debugfs_create_dir("ubi", NULL); if (IS_ERR_OR_NULL(dfs_rootdir)) { int err = dfs_rootdir ? PTR_ERR(dfs_rootdir) : -ENODEV; pr_err("UBI error: cannot create \"ubi\" debugfs directory, error %d\n", err); return err; } #ifdef CONFIG_MTD_UBI_FAULT_INJECTION dfs_create_fault_entry(dfs_rootdir); #endif return 0; } /** * ubi_debugfs_exit - remove UBI debugfs directory. */ void ubi_debugfs_exit(void) { if (IS_ENABLED(CONFIG_DEBUG_FS)) debugfs_remove(dfs_rootdir); } /* Read an UBI debugfs file */ static ssize_t dfs_file_read(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { unsigned long ubi_num = (unsigned long)file->private_data; struct dentry *dent = file->f_path.dentry; struct ubi_device *ubi; struct ubi_debug_info *d; char buf[16]; int val; ubi = ubi_get_device(ubi_num); if (!ubi) return -ENODEV; d = &ubi->dbg; if (dent == d->dfs_chk_gen) val = d->chk_gen; else if (dent == d->dfs_chk_io) val = d->chk_io; else if (dent == d->dfs_chk_fastmap) val = d->chk_fastmap; else if (dent == d->dfs_disable_bgt) val = d->disable_bgt; else if (dent == d->dfs_emulate_bitflips) val = d->emulate_bitflips; else if (dent == d->dfs_emulate_io_failures) val = d->emulate_io_failures; else if (dent == d->dfs_emulate_failures) { snprintf(buf, sizeof(buf), "0x%04x\n", d->emulate_failures); count = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); goto out; } else if (dent == d->dfs_emulate_power_cut) { snprintf(buf, sizeof(buf), "%u\n", d->emulate_power_cut); count = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); goto out; } else if (dent == d->dfs_power_cut_min) { snprintf(buf, sizeof(buf), "%u\n", d->power_cut_min); count = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); goto out; } else if (dent == d->dfs_power_cut_max) { snprintf(buf, sizeof(buf), "%u\n", d->power_cut_max); count = simple_read_from_buffer(user_buf, count, ppos, buf, strlen(buf)); goto out; } else { count = -EINVAL; goto out; } if (val) buf[0] = '1'; else buf[0] = '0'; buf[1] = '\n'; buf[2] = 0x00; count = simple_read_from_buffer(user_buf, count, ppos, buf, 2); out: ubi_put_device(ubi); return count; } /* Write an UBI debugfs file */ static ssize_t dfs_file_write(struct file *file, const char __user *user_buf, size_t count, loff_t *ppos) { unsigned long ubi_num = (unsigned long)file->private_data; struct dentry *dent = file->f_path.dentry; struct ubi_device *ubi; struct ubi_debug_info *d; size_t buf_size; char buf[16] = {0}; int val; ubi = ubi_get_device(ubi_num); if (!ubi) return -ENODEV; d = &ubi->dbg; buf_size = min_t(size_t, count, (sizeof(buf) - 1)); if (copy_from_user(buf, user_buf, buf_size)) { count = -EFAULT; goto out; } if (dent == d->dfs_emulate_failures) { if (kstrtouint(buf, 0, &d->emulate_failures) != 0) count = -EINVAL; goto out; } else if (dent == d->dfs_power_cut_min) { if (kstrtouint(buf, 0, &d->power_cut_min) != 0) count = -EINVAL; goto out; } else if (dent == d->dfs_power_cut_max) { if (kstrtouint(buf, 0, &d->power_cut_max) != 0) count = -EINVAL; goto out; } else if (dent == d->dfs_emulate_power_cut) { if (kstrtoint(buf, 0, &val) != 0) count = -EINVAL; else d->emulate_power_cut = val; goto out; } if (buf[0] == '1') val = 1; else if (buf[0] == '0') val = 0; else { count = -EINVAL; goto out; } if (dent == d->dfs_chk_gen) d->chk_gen = val; else if (dent == d->dfs_chk_io) d->chk_io = val; else if (dent == d->dfs_chk_fastmap) d->chk_fastmap = val; else if (dent == d->dfs_disable_bgt) d->disable_bgt = val; else if (dent == d->dfs_emulate_bitflips) d->emulate_bitflips = val; else if (dent == d->dfs_emulate_io_failures) d->emulate_io_failures = val; else count = -EINVAL; out: ubi_put_device(ubi); return count; } /* File operations for all UBI debugfs files except * detailed_erase_block_info */ static const struct file_operations dfs_fops = { .read = dfs_file_read, .write = dfs_file_write, .open = simple_open, .owner = THIS_MODULE, }; /* As long as the position is less then that total number of erase blocks, * we still have more to print. */ static void *eraseblk_count_seq_start(struct seq_file *s, loff_t *pos) { struct ubi_device *ubi = s->private; if (*pos < ubi->peb_count) return pos; return NULL; } /* Since we are using the position as the iterator, we just need to check if we * are done and increment the position. */ static void *eraseblk_count_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct ubi_device *ubi = s->private; (*pos)++; if (*pos < ubi->peb_count) return pos; return NULL; } static void eraseblk_count_seq_stop(struct seq_file *s, void *v) { } static int eraseblk_count_seq_show(struct seq_file *s, void *iter) { struct ubi_device *ubi = s->private; struct ubi_wl_entry *wl; int *block_number = iter; int erase_count = -1; int err; /* If this is the start, print a header */ if (*block_number == 0) seq_puts(s, "physical_block_number\terase_count\n"); err = ubi_io_is_bad(ubi, *block_number); if (err) return err; spin_lock(&ubi->wl_lock); wl = ubi->lookuptbl[*block_number]; if (wl) erase_count = wl->ec; spin_unlock(&ubi->wl_lock); if (erase_count < 0) return 0; seq_printf(s, "%-22d\t%-11d\n", *block_number, erase_count); return 0; } static const struct seq_operations eraseblk_count_seq_ops = { .start = eraseblk_count_seq_start, .next = eraseblk_count_seq_next, .stop = eraseblk_count_seq_stop, .show = eraseblk_count_seq_show }; static int eraseblk_count_open(struct inode *inode, struct file *f) { struct seq_file *s; int err; err = seq_open(f, &eraseblk_count_seq_ops); if (err) return err; s = f->private_data; s->private = ubi_get_device((unsigned long)inode->i_private); if (!s->private) return -ENODEV; else return 0; } static int eraseblk_count_release(struct inode *inode, struct file *f) { struct seq_file *s = f->private_data; struct ubi_device *ubi = s->private; ubi_put_device(ubi); return seq_release(inode, f); } static const struct file_operations eraseblk_count_fops = { .owner = THIS_MODULE, .open = eraseblk_count_open, .read = seq_read, .llseek = seq_lseek, .release = eraseblk_count_release, }; /** * ubi_debugfs_init_dev - initialize debugfs for an UBI device. * @ubi: UBI device description object * * This function creates all debugfs files for UBI device @ubi. Returns zero in * case of success and a negative error code in case of failure. */ int ubi_debugfs_init_dev(struct ubi_device *ubi) { unsigned long ubi_num = ubi->ubi_num; struct ubi_debug_info *d = &ubi->dbg; umode_t mode = S_IRUSR | S_IWUSR; int n; if (!IS_ENABLED(CONFIG_DEBUG_FS)) return 0; n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN, UBI_DFS_DIR_NAME, ubi->ubi_num); if (n >= UBI_DFS_DIR_LEN) { /* The array size is too small */ return -EINVAL; } d->dfs_dir = debugfs_create_dir(d->dfs_dir_name, dfs_rootdir); d->dfs_chk_gen = debugfs_create_file("chk_gen", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_chk_io = debugfs_create_file("chk_io", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_chk_fastmap = debugfs_create_file("chk_fastmap", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_disable_bgt = debugfs_create_file("tst_disable_bgt", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_emulate_bitflips = debugfs_create_file("tst_emulate_bitflips", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_emulate_io_failures = debugfs_create_file("tst_emulate_io_failures", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_emulate_power_cut = debugfs_create_file("tst_emulate_power_cut", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_power_cut_min = debugfs_create_file("tst_emulate_power_cut_min", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); d->dfs_power_cut_max = debugfs_create_file("tst_emulate_power_cut_max", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); debugfs_create_file("detailed_erase_block_info", S_IRUSR, d->dfs_dir, (void *)ubi_num, &eraseblk_count_fops); #ifdef CONFIG_MTD_UBI_FAULT_INJECTION d->dfs_emulate_failures = debugfs_create_file("emulate_failures", mode, d->dfs_dir, (void *)ubi_num, &dfs_fops); #endif return 0; } /** * ubi_debugfs_exit_dev - free all debugfs files corresponding to device @ubi * @ubi: UBI device description object */ void ubi_debugfs_exit_dev(struct ubi_device *ubi) { if (IS_ENABLED(CONFIG_DEBUG_FS)) debugfs_remove_recursive(ubi->dbg.dfs_dir); } /** * ubi_dbg_power_cut - emulate a power cut if it is time to do so * @ubi: UBI device description object * @caller: Flags set to indicate from where the function is being called * * Returns non-zero if a power cut was emulated, zero if not. */ int ubi_dbg_power_cut(struct ubi_device *ubi, int caller) { unsigned int range; if ((ubi->dbg.emulate_power_cut & caller) == 0) return 0; if (ubi->dbg.power_cut_counter == 0) { ubi->dbg.power_cut_counter = ubi->dbg.power_cut_min; if (ubi->dbg.power_cut_max > ubi->dbg.power_cut_min) { range = ubi->dbg.power_cut_max - ubi->dbg.power_cut_min; ubi->dbg.power_cut_counter += get_random_u32_below(range); } return 0; } ubi->dbg.power_cut_counter--; if (ubi->dbg.power_cut_counter) return 0; return 1; } |
| 803 158 661 642 84 19 18 19 9 11 18 9 18 8 10 9 7 6 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 | // SPDX-License-Identifier: GPL-2.0 /* * class.c - basic device class management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2003-2004 Greg Kroah-Hartman * Copyright (c) 2003-2004 IBM Corp. */ #include <linux/device/class.h> #include <linux/device.h> #include <linux/module.h> #include <linux/init.h> #include <linux/string.h> #include <linux/kdev_t.h> #include <linux/err.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "base.h" /* /sys/class */ static struct kset *class_kset; #define to_class_attr(_attr) container_of(_attr, struct class_attribute, attr) /** * class_to_subsys - Turn a struct class into a struct subsys_private * * @class: pointer to the struct bus_type to look up * * The driver core internals need to work on the subsys_private structure, not * the external struct class pointer. This function walks the list of * registered classes in the system and finds the matching one and returns the * internal struct subsys_private that relates to that class. * * Note, the reference count of the return value is INCREMENTED if it is not * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ struct subsys_private *class_to_subsys(const struct class *class) { struct subsys_private *sp = NULL; struct kobject *kobj; if (!class || !class_kset) return NULL; spin_lock(&class_kset->list_lock); if (list_empty(&class_kset->list)) goto done; list_for_each_entry(kobj, &class_kset->list, entry) { struct kset *kset = container_of(kobj, struct kset, kobj); sp = container_of_const(kset, struct subsys_private, subsys); if (sp->class == class) goto done; } sp = NULL; done: sp = subsys_get(sp); spin_unlock(&class_kset->list_lock); return sp; } static ssize_t class_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->show) ret = class_attr->show(cp->class, class_attr, buf); return ret; } static ssize_t class_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct class_attribute *class_attr = to_class_attr(attr); struct subsys_private *cp = to_subsys_private(kobj); ssize_t ret = -EIO; if (class_attr->store) ret = class_attr->store(cp->class, class_attr, buf, count); return ret; } static void class_release(struct kobject *kobj) { struct subsys_private *cp = to_subsys_private(kobj); const struct class *class = cp->class; pr_debug("class '%s': release.\n", class->name); if (class->class_release) class->class_release(class); else pr_debug("class '%s' does not have a release() function, " "be careful\n", class->name); lockdep_unregister_key(&cp->lock_key); kfree(cp); } static const struct kobj_ns_type_operations *class_child_ns_type(const struct kobject *kobj) { const struct subsys_private *cp = to_subsys_private(kobj); const struct class *class = cp->class; return class->ns_type; } static const struct sysfs_ops class_sysfs_ops = { .show = class_attr_show, .store = class_attr_store, }; static const struct kobj_type class_ktype = { .sysfs_ops = &class_sysfs_ops, .release = class_release, .child_ns_type = class_child_ns_type, }; int class_create_file_ns(const struct class *cls, const struct class_attribute *attr, const void *ns) { struct subsys_private *sp = class_to_subsys(cls); int error; if (!sp) return -EINVAL; error = sysfs_create_file_ns(&sp->subsys.kobj, &attr->attr, ns); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(class_create_file_ns); void class_remove_file_ns(const struct class *cls, const struct class_attribute *attr, const void *ns) { struct subsys_private *sp = class_to_subsys(cls); if (!sp) return; sysfs_remove_file_ns(&sp->subsys.kobj, &attr->attr, ns); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_remove_file_ns); static struct device *klist_class_to_dev(struct klist_node *n) { struct device_private *p = to_device_private_class(n); return p->device; } static void klist_class_dev_get(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); get_device(dev); } static void klist_class_dev_put(struct klist_node *n) { struct device *dev = klist_class_to_dev(n); put_device(dev); } int class_register(const struct class *cls) { struct subsys_private *cp; struct lock_class_key *key; int error; pr_debug("device class '%s': registering\n", cls->name); if (cls->ns_type && !cls->namespace) { pr_err("%s: class '%s' does not have namespace\n", __func__, cls->name); return -EINVAL; } if (!cls->ns_type && cls->namespace) { pr_err("%s: class '%s' does not have ns_type\n", __func__, cls->name); return -EINVAL; } cp = kzalloc(sizeof(*cp), GFP_KERNEL); if (!cp) return -ENOMEM; klist_init(&cp->klist_devices, klist_class_dev_get, klist_class_dev_put); INIT_LIST_HEAD(&cp->interfaces); kset_init(&cp->glue_dirs); key = &cp->lock_key; lockdep_register_key(key); __mutex_init(&cp->mutex, "subsys mutex", key); error = kobject_set_name(&cp->subsys.kobj, "%s", cls->name); if (error) goto err_out; cp->subsys.kobj.kset = class_kset; cp->subsys.kobj.ktype = &class_ktype; cp->class = cls; error = kset_register(&cp->subsys); if (error) goto err_out; error = sysfs_create_groups(&cp->subsys.kobj, cls->class_groups); if (error) { kobject_del(&cp->subsys.kobj); kfree_const(cp->subsys.kobj.name); goto err_out; } return 0; err_out: lockdep_unregister_key(key); kfree(cp); return error; } EXPORT_SYMBOL_GPL(class_register); void class_unregister(const struct class *cls) { struct subsys_private *sp = class_to_subsys(cls); if (!sp) return; pr_debug("device class '%s': unregistering\n", cls->name); sysfs_remove_groups(&sp->subsys.kobj, cls->class_groups); kset_unregister(&sp->subsys); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_unregister); static void class_create_release(const struct class *cls) { pr_debug("%s called for %s\n", __func__, cls->name); kfree(cls); } /** * class_create - create a struct class structure * @name: pointer to a string for the name of this class. * * This is used to create a struct class pointer that can then be used * in calls to device_create(). * * Returns &struct class pointer on success, or ERR_PTR() on error. * * Note, the pointer created here is to be destroyed when finished by * making a call to class_destroy(). */ struct class *class_create(const char *name) { struct class *cls; int retval; cls = kzalloc(sizeof(*cls), GFP_KERNEL); if (!cls) { retval = -ENOMEM; goto error; } cls->name = name; cls->class_release = class_create_release; retval = class_register(cls); if (retval) goto error; return cls; error: kfree(cls); return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(class_create); /** * class_destroy - destroys a struct class structure * @cls: pointer to the struct class that is to be destroyed * * Note, the pointer to be destroyed must have been created with a call * to class_create(). */ void class_destroy(const struct class *cls) { if (IS_ERR_OR_NULL(cls)) return; class_unregister(cls); } EXPORT_SYMBOL_GPL(class_destroy); /** * class_dev_iter_init - initialize class device iterator * @iter: class iterator to initialize * @class: the class we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize class iterator @iter such that it iterates over devices * of @class. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ void class_dev_iter_init(struct class_dev_iter *iter, const struct class *class, const struct device *start, const struct device_type *type) { struct subsys_private *sp = class_to_subsys(class); struct klist_node *start_knode = NULL; memset(iter, 0, sizeof(*iter)); if (!sp) { pr_crit("%s: class %p was not registered yet\n", __func__, class); return; } if (start) start_knode = &start->p->knode_class; klist_iter_init_node(&sp->klist_devices, &iter->ki, start_knode); iter->type = type; iter->sp = sp; } EXPORT_SYMBOL_GPL(class_dev_iter_init); /** * class_dev_iter_next - iterate to the next device * @iter: class iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into class code. */ struct device *class_dev_iter_next(struct class_dev_iter *iter) { struct klist_node *knode; struct device *dev; if (!iter->sp) return NULL; while (1) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = klist_class_to_dev(knode); if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(class_dev_iter_next); /** * class_dev_iter_exit - finish iteration * @iter: class iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void class_dev_iter_exit(struct class_dev_iter *iter) { klist_iter_exit(&iter->ki); subsys_put(iter->sp); } EXPORT_SYMBOL_GPL(class_dev_iter_exit); /** * class_for_each_device - device iterator * @class: the class we're iterating * @start: the device to start with in the list, if any. * @data: data for the callback * @fn: function to be called for each device * * Iterate over @class's list of devices, and call @fn for each, * passing it @data. If @start is set, the list iteration will start * there, otherwise if it is NULL, the iteration starts at the * beginning of the list. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * @fn is allowed to do anything including calling back into class * code. There's no locking restriction. */ int class_for_each_device(const struct class *class, const struct device *start, void *data, device_iter_t fn) { struct subsys_private *sp = class_to_subsys(class); struct class_dev_iter iter; struct device *dev; int error = 0; if (!class) return -EINVAL; if (!sp) { WARN(1, "%s called for class '%s' before it was registered", __func__, class->name); return -EINVAL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { error = fn(dev, data); if (error) break; } class_dev_iter_exit(&iter); subsys_put(sp); return error; } EXPORT_SYMBOL_GPL(class_for_each_device); /** * class_find_device - device iterator for locating a particular device * @class: the class we're iterating * @start: Device to begin with * @data: data for the match function * @match: function to check device * * This is similar to the class_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. * * Note, you will need to drop the reference with put_device() after use. * * @match is allowed to do anything including calling back into class * code. There's no locking restriction. */ struct device *class_find_device(const struct class *class, const struct device *start, const void *data, device_match_t match) { struct subsys_private *sp = class_to_subsys(class); struct class_dev_iter iter; struct device *dev; if (!class) return NULL; if (!sp) { WARN(1, "%s called for class '%s' before it was registered", __func__, class->name); return NULL; } class_dev_iter_init(&iter, class, start, NULL); while ((dev = class_dev_iter_next(&iter))) { if (match(dev, data)) { get_device(dev); break; } } class_dev_iter_exit(&iter); subsys_put(sp); return dev; } EXPORT_SYMBOL_GPL(class_find_device); int class_interface_register(struct class_interface *class_intf) { struct subsys_private *sp; const struct class *parent; struct class_dev_iter iter; struct device *dev; if (!class_intf || !class_intf->class) return -ENODEV; parent = class_intf->class; sp = class_to_subsys(parent); if (!sp) return -EINVAL; /* * Reference in sp is now incremented and will be dropped when * the interface is removed in the call to class_interface_unregister() */ mutex_lock(&sp->mutex); list_add_tail(&class_intf->node, &sp->interfaces); if (class_intf->add_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->add_dev(dev); class_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); return 0; } EXPORT_SYMBOL_GPL(class_interface_register); void class_interface_unregister(struct class_interface *class_intf) { struct subsys_private *sp; const struct class *parent = class_intf->class; struct class_dev_iter iter; struct device *dev; if (!parent) return; sp = class_to_subsys(parent); if (!sp) return; mutex_lock(&sp->mutex); list_del_init(&class_intf->node); if (class_intf->remove_dev) { class_dev_iter_init(&iter, parent, NULL, NULL); while ((dev = class_dev_iter_next(&iter))) class_intf->remove_dev(dev); class_dev_iter_exit(&iter); } mutex_unlock(&sp->mutex); /* * Decrement the reference count twice, once for the class_to_subsys() * call in the start of this function, and the second one from the * reference increment in class_interface_register() */ subsys_put(sp); subsys_put(sp); } EXPORT_SYMBOL_GPL(class_interface_unregister); ssize_t show_class_attr_string(const struct class *class, const struct class_attribute *attr, char *buf) { struct class_attribute_string *cs; cs = container_of(attr, struct class_attribute_string, attr); return sysfs_emit(buf, "%s\n", cs->str); } EXPORT_SYMBOL_GPL(show_class_attr_string); struct class_compat { struct kobject *kobj; }; /** * class_compat_register - register a compatibility class * @name: the name of the class * * Compatibility class are meant as a temporary user-space compatibility * workaround when converting a family of class devices to a bus devices. */ struct class_compat *class_compat_register(const char *name) { struct class_compat *cls; cls = kmalloc(sizeof(struct class_compat), GFP_KERNEL); if (!cls) return NULL; cls->kobj = kobject_create_and_add(name, &class_kset->kobj); if (!cls->kobj) { kfree(cls); return NULL; } return cls; } EXPORT_SYMBOL_GPL(class_compat_register); /** * class_compat_unregister - unregister a compatibility class * @cls: the class to unregister */ void class_compat_unregister(struct class_compat *cls) { kobject_put(cls->kobj); kfree(cls); } EXPORT_SYMBOL_GPL(class_compat_unregister); /** * class_compat_create_link - create a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device */ int class_compat_create_link(struct class_compat *cls, struct device *dev) { return sysfs_create_link(cls->kobj, &dev->kobj, dev_name(dev)); } EXPORT_SYMBOL_GPL(class_compat_create_link); /** * class_compat_remove_link - remove a compatibility class device link to * a bus device * @cls: the compatibility class * @dev: the target bus device */ void class_compat_remove_link(struct class_compat *cls, struct device *dev) { sysfs_remove_link(cls->kobj, dev_name(dev)); } EXPORT_SYMBOL_GPL(class_compat_remove_link); /** * class_is_registered - determine if at this moment in time, a class is * registered in the driver core or not. * @class: the class to check * * Returns a boolean to state if the class is registered in the driver core * or not. Note that the value could switch right after this call is made, * so only use this in places where you "know" it is safe to do so (usually * to determine if the specific class has been registered yet or not). * * Be careful in using this. */ bool class_is_registered(const struct class *class) { struct subsys_private *sp = class_to_subsys(class); bool is_initialized = false; if (sp) { is_initialized = true; subsys_put(sp); } return is_initialized; } EXPORT_SYMBOL_GPL(class_is_registered); int __init classes_init(void) { class_kset = kset_create_and_add("class", NULL, NULL); if (!class_kset) return -ENOMEM; return 0; } |
| 109 109 109 109 109 3 115 46 38 40 29 114 131 3 3 118 97 29 133 94 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_ALLOC_BACKGROUND_H #define _BCACHEFS_ALLOC_BACKGROUND_H #include "bcachefs.h" #include "alloc_types.h" #include "buckets.h" #include "debug.h" #include "super.h" /* How out of date a pointer gen is allowed to be: */ #define BUCKET_GC_GEN_MAX 96U static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); bool ret = ca && bucket_valid(ca, pos.offset); rcu_read_unlock(); return ret; } static inline u64 bucket_to_u64(struct bpos bucket) { return (bucket.inode << 48) | bucket.offset; } static inline struct bpos u64_to_bucket(u64 bucket) { return POS(bucket >> 48, bucket & ~(~0ULL << 48)); } static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) { return a.gen - a.oldest_gen; } static inline void alloc_to_bucket(struct bucket *dst, struct bch_alloc_v4 src) { dst->gen = src.gen; dst->data_type = src.data_type; dst->stripe_sectors = src.stripe_sectors; dst->dirty_sectors = src.dirty_sectors; dst->cached_sectors = src.cached_sectors; dst->stripe = src.stripe; } static inline void __bucket_m_to_alloc(struct bch_alloc_v4 *dst, struct bucket src) { dst->gen = src.gen; dst->data_type = src.data_type; dst->stripe_sectors = src.stripe_sectors; dst->dirty_sectors = src.dirty_sectors; dst->cached_sectors = src.cached_sectors; dst->stripe = src.stripe; } static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b) { struct bch_alloc_v4 ret = {}; __bucket_m_to_alloc(&ret, b); return ret; } static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) { switch (data_type) { case BCH_DATA_cached: case BCH_DATA_stripe: return BCH_DATA_user; default: return data_type; } } static inline bool bucket_data_type_mismatch(enum bch_data_type bucket, enum bch_data_type ptr) { return !data_type_is_empty(bucket) && bucket_data_type(bucket) != bucket_data_type(ptr); } /* * It is my general preference to use unsigned types for unsigned quantities - * however, these helpers are used in disk accounting calculations run by * triggers where the output will be negated and added to an s64. unsigned is * right out even though all these quantities will fit in 32 bits, since it * won't be sign extended correctly; u64 will negate "correctly", but s64 is the * simpler option here. */ static inline s64 bch2_bucket_sectors_total(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors + a.cached_sectors; } static inline s64 bch2_bucket_sectors_dirty(struct bch_alloc_v4 a) { return a.stripe_sectors + a.dirty_sectors; } static inline s64 bch2_bucket_sectors(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.cached_sectors : bch2_bucket_sectors_dirty(a); } static inline s64 bch2_bucket_sectors_fragmented(struct bch_dev *ca, struct bch_alloc_v4 a) { int d = bch2_bucket_sectors(a); return d ? max(0, ca->mi.bucket_size - d) : 0; } static inline s64 bch2_gc_bucket_sectors_fragmented(struct bch_dev *ca, struct bucket a) { int d = a.stripe_sectors + a.dirty_sectors; return d ? max(0, ca->mi.bucket_size - d) : 0; } static inline s64 bch2_bucket_sectors_unstriped(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_stripe ? a.dirty_sectors : 0; } static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, enum bch_data_type data_type) { if (a.stripe) return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; if (bch2_bucket_sectors_dirty(a)) return bucket_data_type(data_type); if (a.cached_sectors) return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) return BCH_DATA_need_discard; if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) return BCH_DATA_need_gc_gens; return BCH_DATA_free; } static inline void alloc_data_type_set(struct bch_alloc_v4 *a, enum bch_data_type data_type) { a->data_type = alloc_data_type(*a, data_type); } static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) { return a.data_type == BCH_DATA_cached ? a.io_time[READ] & LRU_TIME_MAX : 0; } #define DATA_TYPES_MOVABLE \ ((1U << BCH_DATA_btree)| \ (1U << BCH_DATA_user)| \ (1U << BCH_DATA_stripe)) static inline bool data_type_movable(enum bch_data_type type) { return (1U << type) & DATA_TYPES_MOVABLE; } static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, struct bch_dev *ca) { if (a.data_type >= BCH_DATA_NR) return 0; if (!data_type_movable(a.data_type) || !bch2_bucket_sectors_fragmented(ca, a)) return 0; /* * avoid overflowing LRU_TIME_BITS on a corrupted fs, when * bucket_sectors_dirty is (much) bigger than bucket_size */ u64 d = min_t(s64, bch2_bucket_sectors_dirty(a), ca->mi.bucket_size); return div_u64(d * (1ULL << 31), ca->mi.bucket_size); } static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) { return ((u64) alloc_gc_gen(a) >> 4) << 56; } static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) { pos.offset |= alloc_freespace_genbits(a); return pos; } static inline unsigned alloc_v4_u64s_noerror(const struct bch_alloc_v4 *a) { return (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: BCH_ALLOC_V4_U64s_V0) + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * (sizeof(struct bch_backpointer) / sizeof(u64)); } static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) { unsigned ret = alloc_v4_u64s_noerror(a); BUG_ON(ret > U8_MAX - BKEY_U64s); return ret; } static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) { set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); } struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update_noupdate(struct btree_trans *, struct btree_iter *, struct bpos); struct bkey_i_alloc_v4 * bch2_trans_start_alloc_update(struct btree_trans *, struct bpos, enum btree_iter_update_trigger_flags); void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) { const struct bch_alloc_v4 *ret; if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) goto slowpath; ret = bkey_s_c_to_alloc_v4(k).v; if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) goto slowpath; return ret; slowpath: __bch2_alloc_to_v4(k, convert); return convert; } struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); int bch2_alloc_v1_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); int bch2_alloc_v2_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); int bch2_alloc_v3_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_validate = bch2_alloc_v1_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v2_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 8, \ }) #define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v3_validate, \ .val_to_text = bch2_alloc_to_text, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 16, \ }) #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v4_validate, \ .val_to_text = bch2_alloc_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ }) int bch2_bucket_gens_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ .key_validate = bch2_bucket_gens_validate, \ .val_to_text = bch2_bucket_gens_to_text, \ }) int bch2_bucket_gens_init(struct bch_fs *); static inline bool bkey_is_alloc(const struct bkey *k) { return k->type == KEY_TYPE_alloc || k->type == KEY_TYPE_alloc_v2 || k->type == KEY_TYPE_alloc_v3; } int bch2_alloc_read(struct bch_fs *); int bch2_alloc_key_to_dev_counters(struct btree_trans *, struct bch_dev *, const struct bch_alloc_v4 *, const struct bch_alloc_v4 *, unsigned); int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_s, enum btree_iter_update_trigger_flags); int bch2_check_discard_freespace_key(struct btree_trans *, struct btree_iter *, u8 *, bool); int bch2_check_alloc_info(struct bch_fs *); int bch2_check_alloc_to_lru_refs(struct bch_fs *); void bch2_dev_do_discards(struct bch_dev *); void bch2_do_discards(struct bch_fs *); static inline u64 should_invalidate_buckets(struct bch_dev *ca, struct bch_dev_usage u) { u64 want_free = ca->mi.nbuckets >> 7; u64 free = max_t(s64, 0, u.buckets[BCH_DATA_free] + u.buckets[BCH_DATA_need_discard] - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); } void bch2_dev_do_invalidates(struct bch_dev *); void bch2_do_invalidates(struct bch_fs *); static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) { return (void *) ((u64 *) &a->v + (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: BCH_ALLOC_V4_U64s_V0)); } static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) { return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); } int bch2_dev_freespace_init(struct bch_fs *, struct bch_dev *, u64, u64); int bch2_fs_freespace_init(struct bch_fs *); int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_background_exit(struct bch_dev *); void bch2_dev_allocator_background_init(struct bch_dev *); void bch2_fs_allocator_background_init(struct bch_fs *); #endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ |
| 7019 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Landlock - Credential hooks * * Copyright © 2019-2020 Mickaël Salaün <mic@digikod.net> * Copyright © 2019-2020 ANSSI * Copyright © 2021-2025 Microsoft Corporation */ #ifndef _SECURITY_LANDLOCK_CRED_H #define _SECURITY_LANDLOCK_CRED_H #include <linux/container_of.h> #include <linux/cred.h> #include <linux/init.h> #include <linux/rcupdate.h> #include "access.h" #include "limits.h" #include "ruleset.h" #include "setup.h" /** * struct landlock_cred_security - Credential security blob * * This structure is packed to minimize the size of struct * landlock_file_security. However, it is always aligned in the LSM cred blob, * see lsm_set_blob_size(). */ struct landlock_cred_security { /** * @domain: Immutable ruleset enforced on a task. */ struct landlock_ruleset *domain; #ifdef CONFIG_AUDIT /** * @domain_exec: Bitmask identifying the domain layers that were enforced by * the current task's executed file (i.e. no new execve(2) since * landlock_restrict_self(2)). */ u16 domain_exec; /** * @log_subdomains_off: Set if the domain descendants's log_status should be * set to %LANDLOCK_LOG_DISABLED. This is not a landlock_hierarchy * configuration because it applies to future descendant domains and it does * not require a current domain. */ u8 log_subdomains_off : 1; #endif /* CONFIG_AUDIT */ } __packed; #ifdef CONFIG_AUDIT /* Makes sure all layer executions can be stored. */ static_assert(BITS_PER_TYPE(typeof_member(struct landlock_cred_security, domain_exec)) >= LANDLOCK_MAX_NUM_LAYERS); #endif /* CONFIG_AUDIT */ static inline struct landlock_cred_security * landlock_cred(const struct cred *cred) { return cred->security + landlock_blob_sizes.lbs_cred; } static inline struct landlock_ruleset *landlock_get_current_domain(void) { return landlock_cred(current_cred())->domain; } /* * The call needs to come from an RCU read-side critical section. */ static inline const struct landlock_ruleset * landlock_get_task_domain(const struct task_struct *const task) { return landlock_cred(__task_cred(task))->domain; } static inline bool landlocked(const struct task_struct *const task) { bool has_dom; if (task == current) return !!landlock_get_current_domain(); rcu_read_lock(); has_dom = !!landlock_get_task_domain(task); rcu_read_unlock(); return has_dom; } /** * landlock_get_applicable_subject - Return the subject's Landlock credential * if its enforced domain applies to (i.e. * handles) at least one of the access rights * specified in @masks * * @cred: credential * @masks: access masks * @handle_layer: returned youngest layer handling a subset of @masks. Not set * if the function returns NULL. * * Returns: landlock_cred(@cred) if any access rights specified in @masks is * handled, or NULL otherwise. */ static inline const struct landlock_cred_security * landlock_get_applicable_subject(const struct cred *const cred, const struct access_masks masks, size_t *const handle_layer) { const union access_masks_all masks_all = { .masks = masks, }; const struct landlock_ruleset *domain; ssize_t layer_level; if (!cred) return NULL; domain = landlock_cred(cred)->domain; if (!domain) return NULL; for (layer_level = domain->num_layers - 1; layer_level >= 0; layer_level--) { union access_masks_all layer = { .masks = domain->access_masks[layer_level], }; if (layer.all & masks_all.all) { if (handle_layer) *handle_layer = layer_level; return landlock_cred(cred); } } return NULL; } __init void landlock_add_cred_hooks(void); #endif /* _SECURITY_LANDLOCK_CRED_H */ |
| 1 1 1 1 2 1 1 20 20 3 13 5 5 1 2 1 1 1 2 1 2 2 1 2 2 10 1 3 4 4 2 6 6 2 3 5 4 1 1 1 13 1 1 2 4 2 1 1 5 1 1 1 31 25 7 1 1 1 1 1 16 1 17 17 33 32 33 33 35 35 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/fs_context.c * * Copyright (C) 1992 Rick Sladkey * Conversion to new mount api Copyright (C) David Howells * * NFS mount handling. * * Split from fs/nfs/super.c by David Howells <dhowells@redhat.com> */ #include <linux/compat.h> #include <linux/module.h> #include <linux/fs.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <net/handshake.h> #include "nfs.h" #include "internal.h" #include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_MOUNT #if IS_ENABLED(CONFIG_NFS_V3) #define NFS_DEFAULT_VERSION 3 #else #define NFS_DEFAULT_VERSION 2 #endif #define NFS_MAX_CONNECTIONS 16 enum nfs_param { Opt_ac, Opt_acdirmax, Opt_acdirmin, Opt_acl, Opt_acregmax, Opt_acregmin, Opt_actimeo, Opt_addr, Opt_bg, Opt_bsize, Opt_clientaddr, Opt_cto, Opt_alignwrite, Opt_fatal_neterrors, Opt_fg, Opt_fscache, Opt_fscache_flag, Opt_hard, Opt_intr, Opt_local_lock, Opt_lock, Opt_lookupcache, Opt_migration, Opt_minorversion, Opt_mountaddr, Opt_mounthost, Opt_mountport, Opt_mountproto, Opt_mountvers, Opt_namelen, Opt_nconnect, Opt_max_connect, Opt_port, Opt_posix, Opt_proto, Opt_rdirplus, Opt_rdirplus_none, Opt_rdirplus_force, Opt_rdma, Opt_resvport, Opt_retrans, Opt_retry, Opt_rsize, Opt_sec, Opt_sharecache, Opt_sloppy, Opt_soft, Opt_softerr, Opt_softreval, Opt_source, Opt_tcp, Opt_timeo, Opt_trunkdiscovery, Opt_udp, Opt_v, Opt_vers, Opt_wsize, Opt_write, Opt_xprtsec, }; enum { Opt_fatal_neterrors_default, Opt_fatal_neterrors_enetunreach, Opt_fatal_neterrors_none, }; static const struct constant_table nfs_param_enums_fatal_neterrors[] = { { "default", Opt_fatal_neterrors_default }, { "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach }, { "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach }, { "none", Opt_fatal_neterrors_none }, {} }; enum { Opt_local_lock_all, Opt_local_lock_flock, Opt_local_lock_none, Opt_local_lock_posix, }; static const struct constant_table nfs_param_enums_local_lock[] = { { "all", Opt_local_lock_all }, { "flock", Opt_local_lock_flock }, { "posix", Opt_local_lock_posix }, { "none", Opt_local_lock_none }, {} }; enum { Opt_lookupcache_all, Opt_lookupcache_none, Opt_lookupcache_positive, }; static const struct constant_table nfs_param_enums_lookupcache[] = { { "all", Opt_lookupcache_all }, { "none", Opt_lookupcache_none }, { "pos", Opt_lookupcache_positive }, { "positive", Opt_lookupcache_positive }, {} }; enum { Opt_write_lazy, Opt_write_eager, Opt_write_wait, }; static const struct constant_table nfs_param_enums_write[] = { { "lazy", Opt_write_lazy }, { "eager", Opt_write_eager }, { "wait", Opt_write_wait }, {} }; static const struct fs_parameter_spec nfs_fs_parameters[] = { fsparam_flag_no("ac", Opt_ac), fsparam_u32 ("acdirmax", Opt_acdirmax), fsparam_u32 ("acdirmin", Opt_acdirmin), fsparam_flag_no("acl", Opt_acl), fsparam_u32 ("acregmax", Opt_acregmax), fsparam_u32 ("acregmin", Opt_acregmin), fsparam_u32 ("actimeo", Opt_actimeo), fsparam_string("addr", Opt_addr), fsparam_flag ("bg", Opt_bg), fsparam_u32 ("bsize", Opt_bsize), fsparam_string("clientaddr", Opt_clientaddr), fsparam_flag_no("cto", Opt_cto), fsparam_flag_no("alignwrite", Opt_alignwrite), fsparam_enum("fatal_neterrors", Opt_fatal_neterrors, nfs_param_enums_fatal_neterrors), fsparam_flag ("fg", Opt_fg), fsparam_flag_no("fsc", Opt_fscache_flag), fsparam_string("fsc", Opt_fscache), fsparam_flag ("hard", Opt_hard), __fsparam(NULL, "intr", Opt_intr, fs_param_neg_with_no|fs_param_deprecated, NULL), fsparam_enum ("local_lock", Opt_local_lock, nfs_param_enums_local_lock), fsparam_flag_no("lock", Opt_lock), fsparam_enum ("lookupcache", Opt_lookupcache, nfs_param_enums_lookupcache), fsparam_flag_no("migration", Opt_migration), fsparam_u32 ("minorversion", Opt_minorversion), fsparam_string("mountaddr", Opt_mountaddr), fsparam_string("mounthost", Opt_mounthost), fsparam_u32 ("mountport", Opt_mountport), fsparam_string("mountproto", Opt_mountproto), fsparam_u32 ("mountvers", Opt_mountvers), fsparam_u32 ("namlen", Opt_namelen), fsparam_u32 ("nconnect", Opt_nconnect), fsparam_u32 ("max_connect", Opt_max_connect), fsparam_string("nfsvers", Opt_vers), fsparam_u32 ("port", Opt_port), fsparam_flag_no("posix", Opt_posix), fsparam_string("proto", Opt_proto), fsparam_flag_no("rdirplus", Opt_rdirplus), // rdirplus|nordirplus fsparam_string("rdirplus", Opt_rdirplus), // rdirplus=... fsparam_flag ("rdma", Opt_rdma), fsparam_flag_no("resvport", Opt_resvport), fsparam_u32 ("retrans", Opt_retrans), fsparam_string("retry", Opt_retry), fsparam_u32 ("rsize", Opt_rsize), fsparam_string("sec", Opt_sec), fsparam_flag_no("sharecache", Opt_sharecache), fsparam_flag ("sloppy", Opt_sloppy), fsparam_flag ("soft", Opt_soft), fsparam_flag ("softerr", Opt_softerr), fsparam_flag ("softreval", Opt_softreval), fsparam_string("source", Opt_source), fsparam_flag ("tcp", Opt_tcp), fsparam_u32 ("timeo", Opt_timeo), fsparam_flag_no("trunkdiscovery", Opt_trunkdiscovery), fsparam_flag ("udp", Opt_udp), fsparam_flag ("v2", Opt_v), fsparam_flag ("v3", Opt_v), fsparam_flag ("v4", Opt_v), fsparam_flag ("v4.0", Opt_v), fsparam_flag ("v4.1", Opt_v), fsparam_flag ("v4.2", Opt_v), fsparam_string("vers", Opt_vers), fsparam_enum ("write", Opt_write, nfs_param_enums_write), fsparam_u32 ("wsize", Opt_wsize), fsparam_string("xprtsec", Opt_xprtsec), {} }; enum { Opt_vers_2, Opt_vers_3, Opt_vers_4, Opt_vers_4_0, Opt_vers_4_1, Opt_vers_4_2, }; static const struct constant_table nfs_vers_tokens[] = { { "2", Opt_vers_2 }, { "3", Opt_vers_3 }, { "4", Opt_vers_4 }, { "4.0", Opt_vers_4_0 }, { "4.1", Opt_vers_4_1 }, { "4.2", Opt_vers_4_2 }, {} }; enum { Opt_xprt_rdma, Opt_xprt_rdma6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_udp, Opt_xprt_udp6, nr__Opt_xprt }; static const struct constant_table nfs_xprt_protocol_tokens[] = { { "rdma", Opt_xprt_rdma }, { "rdma6", Opt_xprt_rdma6 }, { "tcp", Opt_xprt_tcp }, { "tcp6", Opt_xprt_tcp6 }, { "udp", Opt_xprt_udp }, { "udp6", Opt_xprt_udp6 }, {} }; enum { Opt_sec_krb5, Opt_sec_krb5i, Opt_sec_krb5p, Opt_sec_lkey, Opt_sec_lkeyi, Opt_sec_lkeyp, Opt_sec_none, Opt_sec_spkm, Opt_sec_spkmi, Opt_sec_spkmp, Opt_sec_sys, nr__Opt_sec }; static const struct constant_table nfs_secflavor_tokens[] = { { "krb5", Opt_sec_krb5 }, { "krb5i", Opt_sec_krb5i }, { "krb5p", Opt_sec_krb5p }, { "lkey", Opt_sec_lkey }, { "lkeyi", Opt_sec_lkeyi }, { "lkeyp", Opt_sec_lkeyp }, { "none", Opt_sec_none }, { "null", Opt_sec_none }, { "spkm3", Opt_sec_spkm }, { "spkm3i", Opt_sec_spkmi }, { "spkm3p", Opt_sec_spkmp }, { "sys", Opt_sec_sys }, {} }; enum { Opt_xprtsec_none, Opt_xprtsec_tls, Opt_xprtsec_mtls, nr__Opt_xprtsec }; static const struct constant_table nfs_xprtsec_policies[] = { { "none", Opt_xprtsec_none }, { "tls", Opt_xprtsec_tls }, { "mtls", Opt_xprtsec_mtls }, {} }; static const struct constant_table nfs_rdirplus_tokens[] = { { "none", Opt_rdirplus_none }, { "force", Opt_rdirplus_force }, {} }; /* * Sanity-check a server address provided by the mount command. * * Address family must be initialized, and address must not be * the ANY address for that family. */ static int nfs_verify_server_address(struct sockaddr_storage *addr) { switch (addr->ss_family) { case AF_INET: { struct sockaddr_in *sa = (struct sockaddr_in *)addr; return sa->sin_addr.s_addr != htonl(INADDR_ANY); } case AF_INET6: { struct in6_addr *sa = &((struct sockaddr_in6 *)addr)->sin6_addr; return !ipv6_addr_any(sa); } } return 0; } #ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) { return true; } #else static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) { if (ctx->version == 4) return true; return false; } #endif /* * Sanity check the NFS transport protocol. */ static int nfs_validate_transport_protocol(struct fs_context *fc, struct nfs_fs_context *ctx) { switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_UDP: if (nfs_server_transport_udp_invalid(ctx)) goto out_invalid_transport_udp; break; case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: break; default: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; } if (ctx->xprtsec.policy != RPC_XPRTSEC_NONE) switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_TCP: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP_TLS; break; default: goto out_invalid_xprtsec_policy; } return 0; out_invalid_transport_udp: return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); out_invalid_xprtsec_policy: return nfs_invalf(fc, "NFS: Transport does not support xprtsec"); } /* * For text based NFSv2/v3 mounts, the mount protocol transport default * settings should depend upon the specified NFS transport. */ static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx) { if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP || ctx->mount_server.protocol == XPRT_TRANSPORT_TCP) return; switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_UDP: ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; } } /* * Add 'flavor' to 'auth_info' if not already present. * Returns true if 'flavor' ends up in the list, false otherwise */ static int nfs_auth_info_add(struct fs_context *fc, struct nfs_auth_info *auth_info, rpc_authflavor_t flavor) { unsigned int i; unsigned int max_flavor_len = ARRAY_SIZE(auth_info->flavors); /* make sure this flavor isn't already in the list */ for (i = 0; i < auth_info->flavor_len; i++) { if (flavor == auth_info->flavors[i]) return 0; } if (auth_info->flavor_len + 1 >= max_flavor_len) return nfs_invalf(fc, "NFS: too many sec= flavors"); auth_info->flavors[auth_info->flavor_len++] = flavor; return 0; } /* * Parse the value of the 'sec=' option. */ static int nfs_parse_security_flavors(struct fs_context *fc, struct fs_parameter *param) { struct nfs_fs_context *ctx = nfs_fc2context(fc); rpc_authflavor_t pseudoflavor; char *string = param->string, *p; int ret; trace_nfs_mount_assign(param->key, string); while ((p = strsep(&string, ":")) != NULL) { if (!*p) continue; switch (lookup_constant(nfs_secflavor_tokens, p, -1)) { case Opt_sec_none: pseudoflavor = RPC_AUTH_NULL; break; case Opt_sec_sys: pseudoflavor = RPC_AUTH_UNIX; break; case Opt_sec_krb5: pseudoflavor = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: pseudoflavor = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: pseudoflavor = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: pseudoflavor = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: pseudoflavor = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: pseudoflavor = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: pseudoflavor = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: pseudoflavor = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: pseudoflavor = RPC_AUTH_GSS_SPKMP; break; default: return nfs_invalf(fc, "NFS: sec=%s option not recognized", p); } ret = nfs_auth_info_add(fc, &ctx->auth_info, pseudoflavor); if (ret < 0) return ret; } return 0; } static int nfs_parse_xprtsec_policy(struct fs_context *fc, struct fs_parameter *param) { struct nfs_fs_context *ctx = nfs_fc2context(fc); trace_nfs_mount_assign(param->key, param->string); switch (lookup_constant(nfs_xprtsec_policies, param->string, -1)) { case Opt_xprtsec_none: ctx->xprtsec.policy = RPC_XPRTSEC_NONE; break; case Opt_xprtsec_tls: ctx->xprtsec.policy = RPC_XPRTSEC_TLS_ANON; break; case Opt_xprtsec_mtls: ctx->xprtsec.policy = RPC_XPRTSEC_TLS_X509; break; default: return nfs_invalf(fc, "NFS: Unrecognized transport security policy"); } return 0; } static int nfs_parse_version_string(struct fs_context *fc, const char *string) { struct nfs_fs_context *ctx = nfs_fc2context(fc); ctx->flags &= ~NFS_MOUNT_VER3; switch (lookup_constant(nfs_vers_tokens, string, -1)) { case Opt_vers_2: ctx->version = 2; break; case Opt_vers_3: ctx->flags |= NFS_MOUNT_VER3; ctx->version = 3; break; case Opt_vers_4: /* Backward compatibility option. In future, * the mount program should always supply * a NFSv4 minor version number. */ ctx->version = 4; break; case Opt_vers_4_0: ctx->version = 4; ctx->minorversion = 0; break; case Opt_vers_4_1: ctx->version = 4; ctx->minorversion = 1; break; case Opt_vers_4_2: ctx->version = 4; ctx->minorversion = 2; break; default: return nfs_invalf(fc, "NFS: Unsupported NFS version"); } return 0; } /* * Parse a single mount parameter. */ static int nfs_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; struct nfs_fs_context *ctx = nfs_fc2context(fc); unsigned short protofamily, mountfamily; unsigned int len; int ret, opt; trace_nfs_mount_option(param); opt = fs_parse(fc, nfs_fs_parameters, param, &result); if (opt < 0) return (opt == -ENOPARAM && ctx->sloppy) ? 1 : opt; if (fc->security) ctx->has_sec_mnt_opts = 1; switch (opt) { case Opt_source: if (fc->source) return nfs_invalf(fc, "NFS: Multiple sources not supported"); fc->source = param->string; param->string = NULL; break; /* * boolean options: foo/nofoo */ case Opt_soft: ctx->flags |= NFS_MOUNT_SOFT; ctx->flags &= ~NFS_MOUNT_SOFTERR; break; case Opt_softerr: ctx->flags |= NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL; ctx->flags &= ~NFS_MOUNT_SOFT; break; case Opt_hard: ctx->flags &= ~(NFS_MOUNT_SOFT | NFS_MOUNT_SOFTERR | NFS_MOUNT_SOFTREVAL); break; case Opt_softreval: if (result.negated) ctx->flags &= ~NFS_MOUNT_SOFTREVAL; else ctx->flags |= NFS_MOUNT_SOFTREVAL; break; case Opt_posix: if (result.negated) ctx->flags &= ~NFS_MOUNT_POSIX; else ctx->flags |= NFS_MOUNT_POSIX; break; case Opt_cto: if (result.negated) ctx->flags |= NFS_MOUNT_NOCTO; else ctx->flags &= ~NFS_MOUNT_NOCTO; break; case Opt_trunkdiscovery: if (result.negated) ctx->flags &= ~NFS_MOUNT_TRUNK_DISCOVERY; else ctx->flags |= NFS_MOUNT_TRUNK_DISCOVERY; break; case Opt_alignwrite: if (result.negated) ctx->flags |= NFS_MOUNT_NO_ALIGNWRITE; else ctx->flags &= ~NFS_MOUNT_NO_ALIGNWRITE; break; case Opt_ac: if (result.negated) ctx->flags |= NFS_MOUNT_NOAC; else ctx->flags &= ~NFS_MOUNT_NOAC; break; case Opt_lock: if (result.negated) { ctx->lock_status = NFS_LOCK_NOLOCK; ctx->flags |= NFS_MOUNT_NONLM; ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } else { ctx->lock_status = NFS_LOCK_LOCK; ctx->flags &= ~NFS_MOUNT_NONLM; ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } break; case Opt_udp: ctx->flags &= ~NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_tcp: case Opt_rdma: ctx->flags |= NFS_MOUNT_TCP; /* for side protocols */ ret = xprt_find_transport_ident(param->key); if (ret < 0) goto out_bad_transport; ctx->nfs_server.protocol = ret; break; case Opt_acl: if (result.negated) ctx->flags |= NFS_MOUNT_NOACL; else ctx->flags &= ~NFS_MOUNT_NOACL; break; case Opt_rdirplus: if (result.negated) { ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; ctx->flags |= NFS_MOUNT_NORDIRPLUS; } else if (!param->string) { ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS); } else { switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) { case Opt_rdirplus_none: ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS; ctx->flags |= NFS_MOUNT_NORDIRPLUS; break; case Opt_rdirplus_force: ctx->flags &= ~NFS_MOUNT_NORDIRPLUS; ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS; break; default: goto out_invalid_value; } } break; case Opt_sharecache: if (result.negated) ctx->flags |= NFS_MOUNT_UNSHARED; else ctx->flags &= ~NFS_MOUNT_UNSHARED; break; case Opt_resvport: if (result.negated) ctx->flags |= NFS_MOUNT_NORESVPORT; else ctx->flags &= ~NFS_MOUNT_NORESVPORT; break; case Opt_fscache_flag: if (result.negated) ctx->options &= ~NFS_OPTION_FSCACHE; else ctx->options |= NFS_OPTION_FSCACHE; kfree(ctx->fscache_uniq); ctx->fscache_uniq = NULL; break; case Opt_fscache: trace_nfs_mount_assign(param->key, param->string); ctx->options |= NFS_OPTION_FSCACHE; kfree(ctx->fscache_uniq); ctx->fscache_uniq = param->string; param->string = NULL; break; case Opt_migration: if (result.negated) ctx->options &= ~NFS_OPTION_MIGRATION; else ctx->options |= NFS_OPTION_MIGRATION; break; /* * options that take numeric values */ case Opt_port: if (result.uint_32 > USHRT_MAX) goto out_of_bounds; ctx->nfs_server.port = result.uint_32; break; case Opt_rsize: ctx->rsize = result.uint_32; break; case Opt_wsize: ctx->wsize = result.uint_32; break; case Opt_bsize: ctx->bsize = result.uint_32; break; case Opt_timeo: if (result.uint_32 < 1 || result.uint_32 > INT_MAX) goto out_of_bounds; ctx->timeo = result.uint_32; break; case Opt_retrans: if (result.uint_32 > INT_MAX) goto out_of_bounds; ctx->retrans = result.uint_32; break; case Opt_acregmin: ctx->acregmin = result.uint_32; break; case Opt_acregmax: ctx->acregmax = result.uint_32; break; case Opt_acdirmin: ctx->acdirmin = result.uint_32; break; case Opt_acdirmax: ctx->acdirmax = result.uint_32; break; case Opt_actimeo: ctx->acregmin = result.uint_32; ctx->acregmax = result.uint_32; ctx->acdirmin = result.uint_32; ctx->acdirmax = result.uint_32; break; case Opt_namelen: ctx->namlen = result.uint_32; break; case Opt_mountport: if (result.uint_32 > USHRT_MAX) goto out_of_bounds; ctx->mount_server.port = result.uint_32; break; case Opt_mountvers: if (result.uint_32 < NFS_MNT_VERSION || result.uint_32 > NFS_MNT3_VERSION) goto out_of_bounds; ctx->mount_server.version = result.uint_32; break; case Opt_minorversion: if (result.uint_32 > NFS4_MAX_MINOR_VERSION) goto out_of_bounds; ctx->minorversion = result.uint_32; break; /* * options that take text values */ case Opt_v: ret = nfs_parse_version_string(fc, param->key + 1); if (ret < 0) return ret; break; case Opt_vers: if (!param->string) goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); ret = nfs_parse_version_string(fc, param->string); if (ret < 0) return ret; break; case Opt_sec: ret = nfs_parse_security_flavors(fc, param); if (ret < 0) return ret; break; case Opt_xprtsec: ret = nfs_parse_xprtsec_policy(fc, param); if (ret < 0) return ret; break; case Opt_proto: if (!param->string) goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); protofamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: protofamily = AF_INET6; fallthrough; case Opt_xprt_udp: ctx->flags &= ~NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: protofamily = AF_INET6; fallthrough; case Opt_xprt_tcp: ctx->flags |= NFS_MOUNT_TCP; ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma6: protofamily = AF_INET6; fallthrough; case Opt_xprt_rdma: /* vector side protocols to TCP */ ctx->flags |= NFS_MOUNT_TCP; ret = xprt_find_transport_ident(param->string); if (ret < 0) goto out_bad_transport; ctx->nfs_server.protocol = ret; break; default: goto out_bad_transport; } ctx->protofamily = protofamily; break; case Opt_mountproto: if (!param->string) goto out_invalid_value; trace_nfs_mount_assign(param->key, param->string); mountfamily = AF_INET; switch (lookup_constant(nfs_xprt_protocol_tokens, param->string, -1)) { case Opt_xprt_udp6: mountfamily = AF_INET6; fallthrough; case Opt_xprt_udp: ctx->mount_server.protocol = XPRT_TRANSPORT_UDP; break; case Opt_xprt_tcp6: mountfamily = AF_INET6; fallthrough; case Opt_xprt_tcp: ctx->mount_server.protocol = XPRT_TRANSPORT_TCP; break; case Opt_xprt_rdma: /* not used for side protocols */ default: goto out_bad_transport; } ctx->mountfamily = mountfamily; break; case Opt_addr: trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->nfs_server.address, sizeof(ctx->nfs_server._address)); if (len == 0) goto out_invalid_address; ctx->nfs_server.addrlen = len; break; case Opt_clientaddr: trace_nfs_mount_assign(param->key, param->string); kfree(ctx->client_address); ctx->client_address = param->string; param->string = NULL; break; case Opt_mounthost: trace_nfs_mount_assign(param->key, param->string); kfree(ctx->mount_server.hostname); ctx->mount_server.hostname = param->string; param->string = NULL; break; case Opt_mountaddr: trace_nfs_mount_assign(param->key, param->string); len = rpc_pton(fc->net_ns, param->string, param->size, &ctx->mount_server.address, sizeof(ctx->mount_server._address)); if (len == 0) goto out_invalid_address; ctx->mount_server.addrlen = len; break; case Opt_nconnect: trace_nfs_mount_assign(param->key, param->string); if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_CONNECTIONS) goto out_of_bounds; ctx->nfs_server.nconnect = result.uint_32; break; case Opt_max_connect: trace_nfs_mount_assign(param->key, param->string); if (result.uint_32 < 1 || result.uint_32 > NFS_MAX_TRANSPORTS) goto out_of_bounds; ctx->nfs_server.max_connect = result.uint_32; break; case Opt_fatal_neterrors: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_fatal_neterrors_default: if (fc->net_ns != &init_net) ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; else ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; break; case Opt_fatal_neterrors_enetunreach: ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; break; case Opt_fatal_neterrors_none: ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL; break; default: goto out_invalid_value; } break; case Opt_lookupcache: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_lookupcache_all: ctx->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); break; case Opt_lookupcache_positive: ctx->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; break; case Opt_lookupcache_none: ctx->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; break; default: goto out_invalid_value; } break; case Opt_local_lock: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_local_lock_all: ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); break; case Opt_local_lock_flock: ctx->flags |= NFS_MOUNT_LOCAL_FLOCK; break; case Opt_local_lock_posix: ctx->flags |= NFS_MOUNT_LOCAL_FCNTL; break; case Opt_local_lock_none: ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); break; default: goto out_invalid_value; } break; case Opt_write: trace_nfs_mount_assign(param->key, param->string); switch (result.uint_32) { case Opt_write_lazy: ctx->flags &= ~(NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT); break; case Opt_write_eager: ctx->flags |= NFS_MOUNT_WRITE_EAGER; ctx->flags &= ~NFS_MOUNT_WRITE_WAIT; break; case Opt_write_wait: ctx->flags |= NFS_MOUNT_WRITE_EAGER | NFS_MOUNT_WRITE_WAIT; break; default: goto out_invalid_value; } break; /* * Special options */ case Opt_sloppy: ctx->sloppy = true; break; } return 0; out_invalid_value: return nfs_invalf(fc, "NFS: Bad mount option value specified"); out_invalid_address: return nfs_invalf(fc, "NFS: Bad IP address specified"); out_of_bounds: return nfs_invalf(fc, "NFS: Value for '%s' out of range", param->key); out_bad_transport: return nfs_invalf(fc, "NFS: Unrecognized transport protocol"); } /* * Split fc->source into "hostname:export_path". * * The leftmost colon demarks the split between the server's hostname * and the export path. If the hostname starts with a left square * bracket, then it may contain colons. * * Note: caller frees hostname and export path, even on error. */ static int nfs_parse_source(struct fs_context *fc, size_t maxnamlen, size_t maxpathlen) { struct nfs_fs_context *ctx = nfs_fc2context(fc); const char *dev_name = fc->source; size_t len; const char *end; if (unlikely(!dev_name || !*dev_name)) return -EINVAL; /* Is the host name protected with square brakcets? */ if (*dev_name == '[') { end = strchr(++dev_name, ']'); if (end == NULL || end[1] != ':') goto out_bad_devname; len = end - dev_name; end++; } else { const char *comma; end = strchr(dev_name, ':'); if (end == NULL) goto out_bad_devname; len = end - dev_name; /* kill possible hostname list: not supported */ comma = memchr(dev_name, ',', len); if (comma) len = comma - dev_name; } if (len > maxnamlen) goto out_hostname; kfree(ctx->nfs_server.hostname); /* N.B. caller will free nfs_server.hostname in all cases */ ctx->nfs_server.hostname = kmemdup_nul(dev_name, len, GFP_KERNEL); if (!ctx->nfs_server.hostname) goto out_nomem; len = strlen(++end); if (len > maxpathlen) goto out_path; ctx->nfs_server.export_path = kmemdup_nul(end, len, GFP_KERNEL); if (!ctx->nfs_server.export_path) goto out_nomem; trace_nfs_mount_path(ctx->nfs_server.export_path); return 0; out_bad_devname: return nfs_invalf(fc, "NFS: device name not in host:path format"); out_nomem: nfs_errorf(fc, "NFS: not enough memory to parse device name"); return -ENOMEM; out_hostname: nfs_errorf(fc, "NFS: server hostname too long"); return -ENAMETOOLONG; out_path: nfs_errorf(fc, "NFS: export pathname too long"); return -ENAMETOOLONG; } static inline bool is_remount_fc(struct fs_context *fc) { return fc->root != NULL; } /* * Parse monolithic NFS2/NFS3 mount data * - fills in the mount root filehandle * * For option strings, user space handles the following behaviors: * * + DNS: mapping server host name to IP address ("addr=" option) * * + failure mode: how to behave if a mount request can't be handled * immediately ("fg/bg" option) * * + retry: how often to retry a mount request ("retry=" option) * * + breaking back: trying proto=udp after proto=tcp, v2 after v3, * mountproto=tcp after mountproto=udp, and so on */ static int nfs23_parse_monolithic(struct fs_context *fc, struct nfs_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_fh *mntfh = ctx->mntfh; struct sockaddr_storage *sap = &ctx->nfs_server._address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; int ret; if (data == NULL) goto out_no_data; ctx->version = NFS_DEFAULT_VERSION; switch (data->version) { case 1: data->namlen = 0; fallthrough; case 2: data->bsize = 0; fallthrough; case 3: if (data->flags & NFS_MOUNT_VER3) goto out_no_v3; data->root.size = NFS2_FHSIZE; memcpy(data->root.data, data->old_root.data, NFS2_FHSIZE); /* Turn off security negotiation */ extra_flags |= NFS_MOUNT_SECFLAVOUR; fallthrough; case 4: if (data->flags & NFS_MOUNT_SECFLAVOUR) goto out_no_sec; fallthrough; case 5: memset(data->context, 0, sizeof(data->context)); fallthrough; case 6: if (data->flags & NFS_MOUNT_VER3) { if (data->root.size > NFS3_FHSIZE || data->root.size == 0) goto out_invalid_fh; mntfh->size = data->root.size; ctx->version = 3; } else { mntfh->size = NFS2_FHSIZE; ctx->version = 2; } memcpy(mntfh->data, data->root.data, mntfh->size); if (mntfh->size < sizeof(mntfh->data)) memset(mntfh->data + mntfh->size, 0, sizeof(mntfh->data) - mntfh->size); /* * for proto == XPRT_TRANSPORT_UDP, which is what uses * to_exponential, implying shift: limit the shift value * to BITS_PER_LONG (majortimeo is unsigned long) */ if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */ if (data->retrans >= 64) /* shift value is too large */ goto out_invalid_data; /* * Translate to nfs_fs_context, which nfs_fill_super * can deal with. */ ctx->flags = data->flags & NFS_MOUNT_FLAGMASK; ctx->flags |= extra_flags; ctx->rsize = data->rsize; ctx->wsize = data->wsize; ctx->timeo = data->timeo; ctx->retrans = data->retrans; ctx->acregmin = data->acregmin; ctx->acregmax = data->acregmax; ctx->acdirmin = data->acdirmin; ctx->acdirmax = data->acdirmax; ctx->need_mount = false; if (!is_remount_fc(fc)) { memcpy(sap, &data->addr, sizeof(data->addr)); ctx->nfs_server.addrlen = sizeof(data->addr); ctx->nfs_server.port = ntohs(data->addr.sin_port); } if (sap->ss_family != AF_INET || !nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) ctx->nfs_server.protocol = XPRT_TRANSPORT_UDP; /* N.B. caller will free nfs_server.hostname in all cases */ ctx->nfs_server.hostname = kstrdup(data->hostname, GFP_KERNEL); if (!ctx->nfs_server.hostname) goto out_nomem; ctx->namlen = data->namlen; ctx->bsize = data->bsize; if (data->flags & NFS_MOUNT_SECFLAVOUR) ctx->selected_flavor = data->pseudoflavor; else ctx->selected_flavor = RPC_AUTH_UNIX; if (!(data->flags & NFS_MOUNT_NONLM)) ctx->flags &= ~(NFS_MOUNT_LOCAL_FLOCK| NFS_MOUNT_LOCAL_FCNTL); else ctx->flags |= (NFS_MOUNT_LOCAL_FLOCK| NFS_MOUNT_LOCAL_FCNTL); /* * The legacy version 6 binary mount data from userspace has a * field used only to transport selinux information into the * kernel. To continue to support that functionality we * have a touch of selinux knowledge here in the NFS code. The * userspace code converted context=blah to just blah so we are * converting back to the full string selinux understands. */ if (data->context[0]){ #ifdef CONFIG_SECURITY_SELINUX int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; ret = vfs_parse_fs_string(fc, "context", data->context, strlen(data->context)); if (ret < 0) return ret; #else return -EINVAL; #endif } break; default: goto generic; } ret = nfs_validate_transport_protocol(fc, ctx); if (ret) return ret; ctx->skip_reconfig_option_check = true; return 0; generic: return generic_parse_monolithic(fc, data); out_no_data: if (is_remount_fc(fc)) { ctx->skip_reconfig_option_check = true; return 0; } return nfs_invalf(fc, "NFS: mount program didn't pass any mount data"); out_no_v3: return nfs_invalf(fc, "NFS: nfs_mount_data version does not support v3"); out_no_sec: return nfs_invalf(fc, "NFS: nfs_mount_data version supports only AUTH_SYS"); out_nomem: return -ENOMEM; out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_invalid_fh: return nfs_invalf(fc, "NFS: invalid root filehandle"); out_invalid_data: return nfs_invalf(fc, "NFS: invalid binary mount data"); } #if IS_ENABLED(CONFIG_NFS_V4) struct compat_nfs_string { compat_uint_t len; compat_uptr_t data; }; static inline void compat_nfs_string(struct nfs_string *dst, struct compat_nfs_string *src) { dst->data = compat_ptr(src->data); dst->len = src->len; } struct compat_nfs4_mount_data_v1 { compat_int_t version; compat_int_t flags; compat_int_t rsize; compat_int_t wsize; compat_int_t timeo; compat_int_t retrans; compat_int_t acregmin; compat_int_t acregmax; compat_int_t acdirmin; compat_int_t acdirmax; struct compat_nfs_string client_addr; struct compat_nfs_string mnt_path; struct compat_nfs_string hostname; compat_uint_t host_addrlen; compat_uptr_t host_addr; compat_int_t proto; compat_int_t auth_flavourlen; compat_uptr_t auth_flavours; }; static void nfs4_compat_mount_data_conv(struct nfs4_mount_data *data) { struct compat_nfs4_mount_data_v1 *compat = (struct compat_nfs4_mount_data_v1 *)data; /* copy the fields backwards */ data->auth_flavours = compat_ptr(compat->auth_flavours); data->auth_flavourlen = compat->auth_flavourlen; data->proto = compat->proto; data->host_addr = compat_ptr(compat->host_addr); data->host_addrlen = compat->host_addrlen; compat_nfs_string(&data->hostname, &compat->hostname); compat_nfs_string(&data->mnt_path, &compat->mnt_path); compat_nfs_string(&data->client_addr, &compat->client_addr); data->acdirmax = compat->acdirmax; data->acdirmin = compat->acdirmin; data->acregmax = compat->acregmax; data->acregmin = compat->acregmin; data->retrans = compat->retrans; data->timeo = compat->timeo; data->wsize = compat->wsize; data->rsize = compat->rsize; data->flags = compat->flags; data->version = compat->version; } /* * Validate NFSv4 mount options */ static int nfs4_parse_monolithic(struct fs_context *fc, struct nfs4_mount_data *data) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct sockaddr_storage *sap = &ctx->nfs_server._address; int ret; char *c; if (!data) { if (is_remount_fc(fc)) goto done; return nfs_invalf(fc, "NFS4: mount program didn't pass any mount data"); } ctx->version = 4; if (data->version != 1) return generic_parse_monolithic(fc, data); if (in_compat_syscall()) nfs4_compat_mount_data_conv(data); if (data->host_addrlen > sizeof(ctx->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; ctx->nfs_server.addrlen = data->host_addrlen; if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; if (!nfs_verify_server_address(sap)) goto out_no_address; ctx->nfs_server.port = ntohs(((struct sockaddr_in *)sap)->sin_port); if (data->auth_flavourlen) { rpc_authflavor_t pseudoflavor; if (data->auth_flavourlen > 1) goto out_inval_auth; if (copy_from_user(&pseudoflavor, data->auth_flavours, sizeof(pseudoflavor))) return -EFAULT; ctx->selected_flavor = pseudoflavor; } else { ctx->selected_flavor = RPC_AUTH_UNIX; } c = strndup_user(data->hostname.data, NFS4_MAXNAMLEN); if (IS_ERR(c)) return PTR_ERR(c); ctx->nfs_server.hostname = c; c = strndup_user(data->mnt_path.data, NFS4_MAXPATHLEN); if (IS_ERR(c)) return PTR_ERR(c); ctx->nfs_server.export_path = c; trace_nfs_mount_path(c); c = strndup_user(data->client_addr.data, 16); if (IS_ERR(c)) return PTR_ERR(c); ctx->client_address = c; /* * Translate to nfs_fs_context, which nfs_fill_super * can deal with. */ ctx->flags = data->flags & NFS4_MOUNT_FLAGMASK; ctx->rsize = data->rsize; ctx->wsize = data->wsize; ctx->timeo = data->timeo; ctx->retrans = data->retrans; ctx->acregmin = data->acregmin; ctx->acregmax = data->acregmax; ctx->acdirmin = data->acdirmin; ctx->acdirmax = data->acdirmax; ctx->nfs_server.protocol = data->proto; ret = nfs_validate_transport_protocol(fc, ctx); if (ret) return ret; done: ctx->skip_reconfig_option_check = true; return 0; out_inval_auth: return nfs_invalf(fc, "NFS4: Invalid number of RPC auth flavours %d", data->auth_flavourlen); out_no_address: return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); } #endif /* * Parse a monolithic block of data from sys_mount(). */ static int nfs_fs_context_parse_monolithic(struct fs_context *fc, void *data) { if (fc->fs_type == &nfs_fs_type) return nfs23_parse_monolithic(fc, data); #if IS_ENABLED(CONFIG_NFS_V4) if (fc->fs_type == &nfs4_fs_type) return nfs4_parse_monolithic(fc, data); #endif return nfs_invalf(fc, "NFS: Unsupported monolithic data version"); } /* * Validate the preparsed information in the config. */ static int nfs_fs_context_validate(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct nfs_subversion *nfs_mod; struct sockaddr_storage *sap = &ctx->nfs_server._address; int max_namelen = PAGE_SIZE; int max_pathlen = NFS_MAXPATHLEN; int port = 0; int ret; if (!fc->source) goto out_no_device_name; /* Check for sanity first. */ if (ctx->minorversion && ctx->version != 4) goto out_minorversion_mismatch; if (ctx->options & NFS_OPTION_MIGRATION && (ctx->version != 4 || ctx->minorversion != 0)) goto out_migration_misuse; /* Verify that any proto=/mountproto= options match the address * families in the addr=/mountaddr= options. */ if (ctx->protofamily != AF_UNSPEC && ctx->protofamily != ctx->nfs_server.address.sa_family) goto out_proto_mismatch; if (ctx->mountfamily != AF_UNSPEC) { if (ctx->mount_server.addrlen) { if (ctx->mountfamily != ctx->mount_server.address.sa_family) goto out_mountproto_mismatch; } else { if (ctx->mountfamily != ctx->nfs_server.address.sa_family) goto out_mountproto_mismatch; } } if (!nfs_verify_server_address(sap)) goto out_no_address; ret = nfs_validate_transport_protocol(fc, ctx); if (ret) return ret; if (ctx->version == 4) { if (IS_ENABLED(CONFIG_NFS_V4)) { if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) port = NFS_RDMA_PORT; else port = NFS_PORT; max_namelen = NFS4_MAXNAMLEN; max_pathlen = NFS4_MAXPATHLEN; ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL | NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); } else { goto out_v4_not_compiled; } } else { nfs_set_mount_transport_protocol(ctx); if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) port = NFS_RDMA_PORT; } nfs_set_port(sap, &ctx->nfs_server.port, port); ret = nfs_parse_source(fc, max_namelen, max_pathlen); if (ret < 0) return ret; /* Load the NFS protocol module if we haven't done so yet */ if (!ctx->nfs_mod) { nfs_mod = find_nfs_version(ctx->version); if (IS_ERR(nfs_mod)) { ret = PTR_ERR(nfs_mod); goto out_version_unavailable; } ctx->nfs_mod = nfs_mod; } /* Ensure the filesystem context has the correct fs_type */ if (fc->fs_type != ctx->nfs_mod->nfs_fs) { module_put(fc->fs_type->owner); __module_get(ctx->nfs_mod->nfs_fs->owner); fc->fs_type = ctx->nfs_mod->nfs_fs; } return 0; out_no_device_name: return nfs_invalf(fc, "NFS: Device name not specified"); out_v4_not_compiled: nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); return -EPROTONOSUPPORT; out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_mountproto_mismatch: return nfs_invalf(fc, "NFS: Mount server address does not match mountproto= option"); out_proto_mismatch: return nfs_invalf(fc, "NFS: Server address does not match proto= option"); out_minorversion_mismatch: return nfs_invalf(fc, "NFS: Mount option vers=%u does not support minorversion=%u", ctx->version, ctx->minorversion); out_migration_misuse: return nfs_invalf(fc, "NFS: 'Migration' not supported for this NFS version"); out_version_unavailable: nfs_errorf(fc, "NFS: Version unavailable"); return ret; } /* * Create an NFS superblock by the appropriate method. */ static int nfs_get_tree(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); int err = nfs_fs_context_validate(fc); if (err) return err; if (!ctx->internal) return ctx->nfs_mod->rpc_ops->try_get_tree(fc); else return nfs_get_tree_common(fc); } /* * Handle duplication of a configuration. The caller copied *src into *sc, but * it can't deal with resource pointers in the filesystem context, so we have * to do that. We need to clear pointers, copy data or get extra refs as * appropriate. */ static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) { struct nfs_fs_context *src = nfs_fc2context(src_fc), *ctx; ctx = kmemdup(src, sizeof(struct nfs_fs_context), GFP_KERNEL); if (!ctx) return -ENOMEM; ctx->mntfh = nfs_alloc_fhandle(); if (!ctx->mntfh) { kfree(ctx); return -ENOMEM; } nfs_copy_fh(ctx->mntfh, src->mntfh); get_nfs_version(ctx->nfs_mod); ctx->client_address = NULL; ctx->mount_server.hostname = NULL; ctx->nfs_server.export_path = NULL; ctx->nfs_server.hostname = NULL; ctx->fscache_uniq = NULL; ctx->clone_data.fattr = NULL; fc->fs_private = ctx; return 0; } static void nfs_fs_context_free(struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); if (ctx) { if (ctx->server) nfs_free_server(ctx->server); if (ctx->nfs_mod) put_nfs_version(ctx->nfs_mod); kfree(ctx->client_address); kfree(ctx->mount_server.hostname); kfree(ctx->nfs_server.export_path); kfree(ctx->nfs_server.hostname); kfree(ctx->fscache_uniq); nfs_free_fhandle(ctx->mntfh); nfs_free_fattr(ctx->clone_data.fattr); kfree(ctx); } } static const struct fs_context_operations nfs_fs_context_ops = { .free = nfs_fs_context_free, .dup = nfs_fs_context_dup, .parse_param = nfs_fs_context_parse_param, .parse_monolithic = nfs_fs_context_parse_monolithic, .get_tree = nfs_get_tree, .reconfigure = nfs_reconfigure, }; /* * Prepare superblock configuration. We use the namespaces attached to the * context. This may be the current process's namespaces, or it may be a * container's namespaces. */ static int nfs_init_fs_context(struct fs_context *fc) { struct nfs_fs_context *ctx; ctx = kzalloc(sizeof(struct nfs_fs_context), GFP_KERNEL); if (unlikely(!ctx)) return -ENOMEM; ctx->mntfh = nfs_alloc_fhandle(); if (unlikely(!ctx->mntfh)) { kfree(ctx); return -ENOMEM; } ctx->protofamily = AF_UNSPEC; ctx->mountfamily = AF_UNSPEC; ctx->mount_server.port = NFS_UNSPEC_PORT; if (fc->root) { /* reconfigure, start with the current config */ struct nfs_server *nfss = fc->root->d_sb->s_fs_info; struct net *net = nfss->nfs_client->cl_net; ctx->flags = nfss->flags; ctx->rsize = nfss->rsize; ctx->wsize = nfss->wsize; ctx->retrans = nfss->client->cl_timeout->to_retries; ctx->selected_flavor = nfss->client->cl_auth->au_flavor; ctx->acregmin = nfss->acregmin / HZ; ctx->acregmax = nfss->acregmax / HZ; ctx->acdirmin = nfss->acdirmin / HZ; ctx->acdirmax = nfss->acdirmax / HZ; ctx->timeo = 10U * nfss->client->cl_timeout->to_initval / HZ; ctx->nfs_server.port = nfss->port; ctx->nfs_server.addrlen = nfss->nfs_client->cl_addrlen; ctx->version = nfss->nfs_client->rpc_ops->version; ctx->minorversion = nfss->nfs_client->cl_minorversion; memcpy(&ctx->nfs_server._address, &nfss->nfs_client->cl_addr, ctx->nfs_server.addrlen); if (fc->net_ns != net) { put_net(fc->net_ns); fc->net_ns = get_net(net); } ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; get_nfs_version(ctx->nfs_mod); } else { /* defaults */ ctx->timeo = NFS_UNSPEC_TIMEO; ctx->retrans = NFS_UNSPEC_RETRANS; ctx->acregmin = NFS_DEF_ACREGMIN; ctx->acregmax = NFS_DEF_ACREGMAX; ctx->acdirmin = NFS_DEF_ACDIRMIN; ctx->acdirmax = NFS_DEF_ACDIRMAX; ctx->nfs_server.port = NFS_UNSPEC_PORT; ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; ctx->selected_flavor = RPC_AUTH_MAXFLAVOR; ctx->minorversion = 0; ctx->need_mount = true; ctx->xprtsec.policy = RPC_XPRTSEC_NONE; ctx->xprtsec.cert_serial = TLS_NO_CERT; ctx->xprtsec.privkey_serial = TLS_NO_PRIVKEY; if (fc->net_ns != &init_net) ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL; fc->s_iflags |= SB_I_STABLE_WRITES; } fc->fs_private = ctx; fc->ops = &nfs_fs_context_ops; return 0; } struct file_system_type nfs_fs_type = { .owner = THIS_MODULE, .name = "nfs", .init_fs_context = nfs_init_fs_context, .parameters = nfs_fs_parameters, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("nfs"); EXPORT_SYMBOL_GPL(nfs_fs_type); #if IS_ENABLED(CONFIG_NFS_V4) struct file_system_type nfs4_fs_type = { .owner = THIS_MODULE, .name = "nfs4", .init_fs_context = nfs_init_fs_context, .parameters = nfs_fs_parameters, .kill_sb = nfs_kill_super, .fs_flags = FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("nfs4"); MODULE_ALIAS("nfs4"); EXPORT_SYMBOL_GPL(nfs4_fs_type); #endif /* CONFIG_NFS_V4 */ |
| 1391 1825 226 1748 1013 290 1826 1391 8 9 8 48 41 2 46 2 2 206 79 162 162 9 9 253 479 601 479 163 414 163 8 232 8 8 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HIGHMEM_H #define _LINUX_HIGHMEM_H #include <linux/fs.h> #include <linux/kernel.h> #include <linux/bug.h> #include <linux/cacheflush.h> #include <linux/kmsan.h> #include <linux/mm.h> #include <linux/uaccess.h> #include <linux/hardirq.h> #include "highmem-internal.h" /** * kmap - Map a page for long term usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can only be invoked from preemptible task context because on 32bit * systems with CONFIG_HIGHMEM enabled this function might sleep. * * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area * this returns the virtual address of the direct kernel mapping. * * The returned virtual address is globally visible and valid up to the * point where it is unmapped via kunmap(). The pointer can be handed to * other contexts. * * For highmem pages on 32bit systems this can be slow as the mapping space * is limited and protected by a global lock. In case that there is no * mapping slot available the function blocks until a slot is released via * kunmap(). */ static inline void *kmap(struct page *page); /** * kunmap - Unmap the virtual address mapped by kmap() * @page: Pointer to the page which was mapped by kmap() * * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of * pages in the low memory area. */ static inline void kunmap(struct page *page); /** * kmap_to_page - Get the page for a kmap'ed address * @addr: The address to look up * * Returns: The page which is mapped to @addr. */ static inline struct page *kmap_to_page(void *addr); /** * kmap_flush_unused - Flush all unused kmap mappings in order to * remove stray mappings */ static inline void kmap_flush_unused(void); /** * kmap_local_page - Map a page for temporary usage * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * Can be invoked from any context, including interrupts. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation: * * addr1 = kmap_local_page(page1); * addr2 = kmap_local_page(page2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While kmap_local_page() is significantly faster than kmap() for the highmem * case it comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_page() can rely on this side effect. */ static inline void *kmap_local_page(struct page *page); /** * kmap_local_folio - Map a page in this folio for temporary usage * @folio: The folio containing the page. * @offset: The byte offset within the folio which identifies the page. * * Requires careful handling when nesting multiple mappings because the map * management is stack based. The unmap has to be in the reverse order of * the map operation:: * * addr1 = kmap_local_folio(folio1, offset1); * addr2 = kmap_local_folio(folio2, offset2); * ... * kunmap_local(addr2); * kunmap_local(addr1); * * Unmapping addr1 before addr2 is invalid and causes malfunction. * * Contrary to kmap() mappings the mapping is only valid in the context of * the caller and cannot be handed to other contexts. * * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the * virtual address of the direct mapping. Only real highmem pages are * temporarily mapped. * * While it is significantly faster than kmap() for the highmem case it * comes with restrictions about the pointer validity. * * On HIGHMEM enabled systems mapping a highmem page has the side effect of * disabling migration in order to keep the virtual address stable across * preemption. No caller of kmap_local_folio() can rely on this side effect. * * Context: Can be invoked from any context. * Return: The virtual address of @offset. */ static inline void *kmap_local_folio(struct folio *folio, size_t offset); /** * kmap_atomic - Atomically map a page for temporary usage - Deprecated! * @page: Pointer to the page to be mapped * * Returns: The virtual address of the mapping * * In fact a wrapper around kmap_local_page() which also disables pagefaults * and, depending on PREEMPT_RT configuration, also CPU migration and * preemption. Therefore users should not count on the latter two side effects. * * Mappings should always be released by kunmap_atomic(). * * Do not use in new code. Use kmap_local_page() instead. * * It is used in atomic context when code wants to access the contents of a * page that might be allocated from high memory (see __GFP_HIGHMEM), for * example a page in the pagecache. The API has two functions, and they * can be used in a manner similar to the following:: * * // Find the page of interest. * struct page *page = find_get_page(mapping, offset); * * // Gain access to the contents of that page. * void *vaddr = kmap_atomic(page); * * // Do something to the contents of that page. * memset(vaddr, 0, PAGE_SIZE); * * // Unmap that page. * kunmap_atomic(vaddr); * * Note that the kunmap_atomic() call takes the result of the kmap_atomic() * call, not the argument. * * If you need to map two pages because you want to copy from one page to * another you need to keep the kmap_atomic calls strictly nested, like: * * vaddr1 = kmap_atomic(page1); * vaddr2 = kmap_atomic(page2); * * memcpy(vaddr1, vaddr2, PAGE_SIZE); * * kunmap_atomic(vaddr2); * kunmap_atomic(vaddr1); */ static inline void *kmap_atomic(struct page *page); /* Highmem related interfaces for management code */ static inline unsigned long nr_free_highpages(void); static inline unsigned long totalhigh_pages(void); #ifndef ARCH_HAS_FLUSH_ANON_PAGE static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { } #endif #ifndef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE static inline void flush_kernel_vmap_range(void *vaddr, int size) { } static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { } #endif /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) { void *addr = kmap_local_page(page); clear_user_page(addr, vaddr, page); kunmap_local(addr); } #endif #ifndef vma_alloc_zeroed_movable_folio /** * vma_alloc_zeroed_movable_folio - Allocate a zeroed page for a VMA. * @vma: The VMA the page is to be allocated for. * @vaddr: The virtual address the page will be inserted into. * * This function will allocate a page suitable for inserting into this * VMA at this virtual address. It may be allocated from highmem or * the movable zone. An architecture may provide its own implementation. * * Return: A folio containing one allocated and zeroed page or NULL if * we are out of memory. */ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { struct folio *folio; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio && user_alloc_needs_zeroing()) clear_user_highpage(&folio->page, vaddr); return folio; } #endif static inline void clear_highpage(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kaddr); kunmap_local(kaddr); } static inline void clear_highpage_kasan_tagged(struct page *page) { void *kaddr = kmap_local_page(page); clear_page(kasan_reset_tag(kaddr)); kunmap_local(kaddr); } #ifndef __HAVE_ARCH_TAG_CLEAR_HIGHPAGE static inline void tag_clear_highpage(struct page *page) { } #endif /* * If we pass in a base or tail page, we can zero up to PAGE_SIZE. * If we pass in a head page, we can zero up to the size of the compound page. */ #ifdef CONFIG_HIGHMEM void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2); #else static inline void zero_user_segments(struct page *page, unsigned start1, unsigned end1, unsigned start2, unsigned end2) { void *kaddr = kmap_local_page(page); unsigned int i; BUG_ON(end1 > page_size(page) || end2 > page_size(page)); if (end1 > start1) memset(kaddr + start1, 0, end1 - start1); if (end2 > start2) memset(kaddr + start2, 0, end2 - start2); kunmap_local(kaddr); for (i = 0; i < compound_nr(page); i++) flush_dcache_page(page + i); } #endif static inline void zero_user_segment(struct page *page, unsigned start, unsigned end) { zero_user_segments(page, start, end, 0, 0); } static inline void zero_user(struct page *page, unsigned start, unsigned size) { zero_user_segments(page, start, start + size, 0, 0); } #ifndef __HAVE_ARCH_COPY_USER_HIGHPAGE static inline void copy_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifndef __HAVE_ARCH_COPY_HIGHPAGE static inline void copy_highpage(struct page *to, struct page *from) { char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } #endif #ifdef copy_mc_to_kernel /* * If architecture supports machine check exception handling, define the * #MC versions of copy_user_highpage and copy_highpage. They copy a memory * page with #MC in source page (@from) handled, and return the number * of bytes not copied if there was a #MC, otherwise 0 for success. */ static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } static inline int copy_mc_highpage(struct page *to, struct page *from) { unsigned long ret; char *vfrom, *vto; vfrom = kmap_local_page(from); vto = kmap_local_page(to); ret = copy_mc_to_kernel(vto, vfrom, PAGE_SIZE); if (!ret) kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); if (ret) memory_failure_queue(page_to_pfn(from), 0); return ret; } #else static inline int copy_mc_user_highpage(struct page *to, struct page *from, unsigned long vaddr, struct vm_area_struct *vma) { copy_user_highpage(to, from, vaddr, vma); return 0; } static inline int copy_mc_highpage(struct page *to, struct page *from) { copy_highpage(to, from); return 0; } #endif static inline void memcpy_page(struct page *dst_page, size_t dst_off, struct page *src_page, size_t src_off, size_t len) { char *dst = kmap_local_page(dst_page); char *src = kmap_local_page(src_page); VM_BUG_ON(dst_off + len > PAGE_SIZE || src_off + len > PAGE_SIZE); memcpy(dst + dst_off, src + src_off, len); kunmap_local(src); kunmap_local(dst); } static inline void memset_page(struct page *page, size_t offset, int val, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, val, len); kunmap_local(addr); } static inline void memcpy_from_page(char *to, struct page *page, size_t offset, size_t len) { char *from = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to, from + offset, len); kunmap_local(from); } static inline void memcpy_to_page(struct page *page, size_t offset, const char *from, size_t len) { char *to = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memcpy(to + offset, from, len); flush_dcache_page(page); kunmap_local(to); } static inline void memzero_page(struct page *page, size_t offset, size_t len) { char *addr = kmap_local_page(page); VM_BUG_ON(offset + len > PAGE_SIZE); memset(addr + offset, 0, len); flush_dcache_page(page); kunmap_local(addr); } /** * memcpy_from_folio - Copy a range of bytes from a folio. * @to: The memory to copy to. * @folio: The folio to read from. * @offset: The first byte in the folio to read. * @len: The number of bytes to copy. */ static inline void memcpy_from_folio(char *to, struct folio *folio, size_t offset, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { const char *from = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(from); to += chunk; offset += chunk; len -= chunk; } while (len > 0); } /** * memcpy_to_folio - Copy a range of bytes to a folio. * @folio: The folio to write to. * @offset: The first byte in the folio to store to. * @from: The memory to copy from. * @len: The number of bytes to copy. */ static inline void memcpy_to_folio(struct folio *folio, size_t offset, const char *from, size_t len) { VM_BUG_ON(offset + len > folio_size(folio)); do { char *to = kmap_local_folio(folio, offset); size_t chunk = len; if (folio_test_highmem(folio) && chunk > PAGE_SIZE - offset_in_page(offset)) chunk = PAGE_SIZE - offset_in_page(offset); memcpy(to, from, chunk); kunmap_local(to); from += chunk; offset += chunk; len -= chunk; } while (len > 0); flush_dcache_folio(folio); } /** * folio_zero_tail - Zero the tail of a folio. * @folio: The folio to zero. * @offset: The byte offset in the folio to start zeroing at. * @kaddr: The address the folio is currently mapped to. * * If you have already used kmap_local_folio() to map a folio, written * some data to it and now need to zero the end of the folio (and flush * the dcache), you can use this function. If you do not have the * folio kmapped (eg the folio has been partially populated by DMA), * use folio_zero_range() or folio_zero_segment() instead. * * Return: An address which can be passed to kunmap_local(). */ static inline __must_check void *folio_zero_tail(struct folio *folio, size_t offset, void *kaddr) { size_t len = folio_size(folio) - offset; if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memset(kaddr, 0, max); kunmap_local(kaddr); len -= max; offset += max; max = PAGE_SIZE; kaddr = kmap_local_folio(folio, offset); } } memset(kaddr, 0, len); flush_dcache_folio(folio); return kaddr; } /** * folio_fill_tail - Copy some data to a folio and pad with zeroes. * @folio: The destination folio. * @offset: The offset into @folio at which to start copying. * @from: The data to copy. * @len: How many bytes of data to copy. * * This function is most useful for filesystems which support inline data. * When they want to copy data from the inode into the page cache, this * function does everything for them. It supports large folios even on * HIGHMEM configurations. */ static inline void folio_fill_tail(struct folio *folio, size_t offset, const char *from, size_t len) { char *to = kmap_local_folio(folio, offset); VM_BUG_ON(offset + len > folio_size(folio)); if (folio_test_highmem(folio)) { size_t max = PAGE_SIZE - offset_in_page(offset); while (len > max) { memcpy(to, from, max); kunmap_local(to); len -= max; from += max; offset += max; max = PAGE_SIZE; to = kmap_local_folio(folio, offset); } } memcpy(to, from, len); to = folio_zero_tail(folio, offset + len, to + len); kunmap_local(to); } /** * memcpy_from_file_folio - Copy some bytes from a file folio. * @to: The destination buffer. * @folio: The folio to copy from. * @pos: The position in the file. * @len: The maximum number of bytes to copy. * * Copy up to @len bytes from this folio. This may be limited by PAGE_SIZE * if the folio comes from HIGHMEM, and by the size of the folio. * * Return: The number of bytes copied from the folio. */ static inline size_t memcpy_from_file_folio(char *to, struct folio *folio, loff_t pos, size_t len) { size_t offset = offset_in_folio(folio, pos); char *from = kmap_local_folio(folio, offset); if (folio_test_highmem(folio)) { offset = offset_in_page(offset); len = min_t(size_t, len, PAGE_SIZE - offset); } else len = min(len, folio_size(folio) - offset); memcpy(to, from, len); kunmap_local(from); return len; } /** * folio_zero_segments() - Zero two byte ranges in a folio. * @folio: The folio to write to. * @start1: The first byte to zero. * @xend1: One more than the last byte in the first range. * @start2: The first byte to zero in the second range. * @xend2: One more than the last byte in the second range. */ static inline void folio_zero_segments(struct folio *folio, size_t start1, size_t xend1, size_t start2, size_t xend2) { zero_user_segments(&folio->page, start1, xend1, start2, xend2); } /** * folio_zero_segment() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @xend: One more than the last byte to zero. */ static inline void folio_zero_segment(struct folio *folio, size_t start, size_t xend) { zero_user_segments(&folio->page, start, xend, 0, 0); } /** * folio_zero_range() - Zero a byte range in a folio. * @folio: The folio to write to. * @start: The first byte to zero. * @length: The number of bytes to zero. */ static inline void folio_zero_range(struct folio *folio, size_t start, size_t length) { zero_user_segments(&folio->page, start, start + length, 0, 0); } /** * folio_release_kmap - Unmap a folio and drop a refcount. * @folio: The folio to release. * @addr: The address previously returned by a call to kmap_local_folio(). * * It is common, eg in directory handling to kmap a folio. This function * unmaps the folio and drops the refcount that was being held to keep the * folio alive while we accessed it. */ static inline void folio_release_kmap(struct folio *folio, void *addr) { kunmap_local(addr); folio_put(folio); } static inline void unmap_and_put_page(struct page *page, void *addr) { folio_release_kmap(page_folio(page), addr); } #endif /* _LINUX_HIGHMEM_H */ |
| 329 329 329 30 44 15 30 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2012 Fusion-io All rights reserved. * Copyright (C) 2012 Intel Corp. All rights reserved. */ #include <linux/sched.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/raid/pq.h> #include <linux/hash.h> #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> #include "messages.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" #include "file-item.h" #include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 /* * set when this rbio is sitting in the hash, but it is just a cache * of past RMW */ #define RBIO_CACHE_BIT 2 /* * set when it is safe to trust the stripe_pages for caching */ #define RBIO_CACHE_READY_BIT 3 #define RBIO_CACHE_SIZE 1024 #define BTRFS_STRIPE_HASH_TABLE_BITS 11 static void dump_bioc(const struct btrfs_fs_info *fs_info, const struct btrfs_io_context *bioc) { if (unlikely(!bioc)) { btrfs_crit(fs_info, "bioc=NULL"); return; } btrfs_crit(fs_info, "bioc logical=%llu full_stripe=%llu size=%llu map_type=0x%llx mirror=%u replace_nr_stripes=%u replace_stripe_src=%d num_stripes=%u", bioc->logical, bioc->full_stripe_logical, bioc->size, bioc->map_type, bioc->mirror_num, bioc->replace_nr_stripes, bioc->replace_stripe_src, bioc->num_stripes); for (int i = 0; i < bioc->num_stripes; i++) { btrfs_crit(fs_info, " nr=%d devid=%llu physical=%llu", i, bioc->stripes[i].dev->devid, bioc->stripes[i].physical); } } static void btrfs_dump_rbio(const struct btrfs_fs_info *fs_info, const struct btrfs_raid_bio *rbio) { if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; dump_bioc(fs_info, rbio->bioc); btrfs_crit(fs_info, "rbio flags=0x%lx nr_sectors=%u nr_data=%u real_stripes=%u stripe_nsectors=%u scrubp=%u dbitmap=0x%lx", rbio->flags, rbio->nr_sectors, rbio->nr_data, rbio->real_stripes, rbio->stripe_nsectors, rbio->scrubp, rbio->dbitmap); } #define ASSERT_RBIO(expr, rbio) \ ({ \ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ (rbio)->bioc->fs_info : NULL; \ \ btrfs_dump_rbio(__fs_info, (rbio)); \ } \ ASSERT((expr)); \ }) #define ASSERT_RBIO_STRIPE(expr, rbio, stripe_nr) \ ({ \ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ (rbio)->bioc->fs_info : NULL; \ \ btrfs_dump_rbio(__fs_info, (rbio)); \ btrfs_crit(__fs_info, "stripe_nr=%d", (stripe_nr)); \ } \ ASSERT((expr)); \ }) #define ASSERT_RBIO_SECTOR(expr, rbio, sector_nr) \ ({ \ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ (rbio)->bioc->fs_info : NULL; \ \ btrfs_dump_rbio(__fs_info, (rbio)); \ btrfs_crit(__fs_info, "sector_nr=%d", (sector_nr)); \ } \ ASSERT((expr)); \ }) #define ASSERT_RBIO_LOGICAL(expr, rbio, logical) \ ({ \ if (IS_ENABLED(CONFIG_BTRFS_ASSERT) && unlikely(!(expr))) { \ const struct btrfs_fs_info *__fs_info = (rbio)->bioc ? \ (rbio)->bioc->fs_info : NULL; \ \ btrfs_dump_rbio(__fs_info, (rbio)); \ btrfs_crit(__fs_info, "logical=%llu", (logical)); \ } \ ASSERT((expr)); \ }) /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash { struct list_head hash_list; spinlock_t lock; }; /* Used by the raid56 code to lock stripes for read/modify/write */ struct btrfs_stripe_hash_table { struct list_head stripe_cache; spinlock_t cache_lock; int cache_size; struct btrfs_stripe_hash table[]; }; /* * A bvec like structure to present a sector inside a page. * * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. */ struct sector_ptr { struct page *page; unsigned int pgoff:24; unsigned int uptodate:8; }; static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static int finish_parity_scrub(struct btrfs_raid_bio *rbio); static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); kfree(rbio->bio_sectors); kfree(rbio->stripe_sectors); kfree(rbio->finish_pointers); } static void free_raid_bio(struct btrfs_raid_bio *rbio) { int i; if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); WARN_ON(!list_empty(&rbio->hash_list)); WARN_ON(!bio_list_empty(&rbio->bio_list)); for (i = 0; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) { __free_page(rbio->stripe_pages[i]); rbio->stripe_pages[i] = NULL; } } btrfs_put_bioc(rbio->bioc); free_raid_bio_pointers(rbio); kfree(rbio); } static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { INIT_WORK(&rbio->work, work_func); queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* * the stripe hash table is used for locking, and to collect * bios in hopes of making a full stripe */ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; int i; if (info->stripe_hash_table) return 0; /* * The table is large, starting with order 4 and can go as high as * order 7 in case lock debugging is turned on. * * Try harder to allocate and fallback to vmalloc to lower the chance * of a failing mount. */ table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL); if (!table) return -ENOMEM; spin_lock_init(&table->cache_lock); INIT_LIST_HEAD(&table->stripe_cache); h = table->table; for (i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); } x = cmpxchg(&info->stripe_hash_table, NULL, table); kvfree(x); return 0; } /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array. We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * * once the caching is done, we set the cache ready * bit. */ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) { int i; int ret; ret = alloc_rbio_pages(rbio); if (ret) return; for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ if (!rbio->bio_sectors[i].page) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is * read from disk. */ if (i < rbio->nr_data * rbio->stripe_nsectors) ASSERT(rbio->stripe_sectors[i].uptodate); continue; } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, rbio->stripe_sectors[i].pgoff, rbio->bio_sectors[i].page, rbio->bio_sectors[i].pgoff, rbio->bioc->fs_info->sectorsize); rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } /* * we hash on the first logical address of the stripe */ static int rbio_bucket(struct btrfs_raid_bio *rbio) { u64 num = rbio->bioc->full_stripe_logical; /* * we shift down quite a bit. We're using byte * addressing, and most of the lower bits are zeros. * This tends to upset hash_64, and it consistently * returns just one or two different values. * * shifting off the lower bits fixes things. */ return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, unsigned int page_nr) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; int i; ASSERT(page_nr < rbio->nr_pages); for (i = sectors_per_page * page_nr; i < sectors_per_page * page_nr + sectors_per_page; i++) { if (!rbio->stripe_sectors[i].uptodate) return false; } return true; } /* * Update the stripe_sectors[] array to use correct page and pgoff * * Should be called every time any page pointer in stripes_pages[] got modified. */ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; u32 offset; int i; for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; rbio->stripe_sectors[i].pgoff = offset_in_page(offset); } } static void steal_rbio_page(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest, int page_nr) { const u32 sectorsize = src->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; int i; if (dest->stripe_pages[page_nr]) __free_page(dest->stripe_pages[page_nr]); dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; src->stripe_pages[page_nr] = NULL; /* Also update the sector->uptodate bits. */ for (i = sectors_per_page * page_nr; i < sectors_per_page * page_nr + sectors_per_page; i++) dest->stripe_sectors[i].uptodate = true; } static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) { const int sector_nr = (page_nr << PAGE_SHIFT) >> rbio->bioc->fs_info->sectorsize_bits; /* * We have ensured PAGE_SIZE is aligned with sectorsize, thus * we won't have a page which is half data half parity. * * Thus if the first sector of the page belongs to data stripes, then * the full page belongs to data stripes. */ return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); } /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. * * This will also update the involved stripe_sectors[] which are referring to * the old pages. */ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { struct page *p = src->stripe_pages[i]; /* * We don't need to steal P/Q pages as they will always be * regenerated for RMW or full write anyway. */ if (!is_data_stripe_page(src, i)) continue; /* * If @src already has RBIO_CACHE_READY_BIT, it should have * all data stripe pages present and uptodate. */ ASSERT(p); ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); index_stripe_sectors(src); } /* * merging means we take the bio_list from the victim and * splice it into the destination. The victim should * be discarded afterwards. * * must be called with dest->rbio_list_lock held */ static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. */ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); } /* * used to prune items that are in the cache. The caller * must hold the hash table lock. */ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { int bucket = rbio_bucket(rbio); struct btrfs_stripe_hash_table *table; struct btrfs_stripe_hash *h; int freeit = 0; /* * check the bit again under the hash table lock. */ if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; h = table->table + bucket; /* hold the lock for the bucket because we may be * removing it from the hash table */ spin_lock(&h->lock); /* * hold the lock for the bio list because we need * to make sure the bio list is empty */ spin_lock(&rbio->bio_list_lock); if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { list_del_init(&rbio->stripe_cache); table->cache_size -= 1; freeit = 1; /* if the bio list isn't empty, this rbio is * still involved in an IO. We take it out * of the cache list, and drop the ref that * was held for the list. * * If the bio_list was empty, we also remove * the rbio from the hash_table, and drop * the corresponding ref */ if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } } spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); if (freeit) free_raid_bio(rbio); } /* * prune a given rbio from the cache */ static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; spin_lock(&table->cache_lock); __remove_rbio_from_cache(rbio); spin_unlock(&table->cache_lock); } /* * remove everything in the cache */ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) { struct btrfs_stripe_hash_table *table; struct btrfs_raid_bio *rbio; table = info->stripe_hash_table; spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { rbio = list_entry(table->stripe_cache.next, struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } spin_unlock(&table->cache_lock); } /* * remove all cached entries and free the hash table * used by unmount */ void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) { if (!info->stripe_hash_table) return; btrfs_clear_rbio_cache(info); kvfree(info->stripe_hash_table); info->stripe_hash_table = NULL; } /* * insert an rbio into the stripe cache. It * must have already been prepared by calling * cache_rbio_pages * * If this rbio was already cached, it gets * moved to the front of the lru. * * If the size of the rbio cache is too big, we * prune an item. */ static void cache_rbio(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash_table *table; if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) return; table = rbio->bioc->fs_info->stripe_hash_table; spin_lock(&table->cache_lock); spin_lock(&rbio->bio_list_lock); /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); } else { list_add(&rbio->stripe_cache, &table->stripe_cache); table->cache_size += 1; } spin_unlock(&rbio->bio_list_lock); if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; found = list_entry(table->stripe_cache.prev, struct btrfs_raid_bio, stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); } spin_unlock(&table->cache_lock); } /* * helper function to run the xor_blocks api. It is only * able to do MAX_XOR_BLOCKS at a time, so we need to * loop through. */ static void run_xor(void **pages, int src_cnt, ssize_t len) { int src_off = 0; int xor_src_cnt = 0; void *dest = pages[src_cnt]; while(src_cnt > 0) { xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); xor_blocks(xor_src_cnt, len, dest, pages + src_off); src_cnt -= xor_src_cnt; src_off += xor_src_cnt; } } /* * Returns true if the bio list inside this rbio covers an entire stripe (no * rmw required). */ static int rbio_is_full(struct btrfs_raid_bio *rbio) { unsigned long size = rbio->bio_list_bytes; int ret = 1; spin_lock(&rbio->bio_list_lock); if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock(&rbio->bio_list_lock); return ret; } /* * returns 1 if it is safe to merge two rbios together. * The merging is safe if the two rbios correspond to * the same stripe and if they are both going in the same * direction (read vs write), and if neither one is * locked for final IO * * The caller is responsible for locking such that * rmw_locked is safe to test */ static int rbio_can_merge(struct btrfs_raid_bio *last, struct btrfs_raid_bio *cur) { if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) return 0; /* * we can't merge with cached rbios, since the * idea is that when we merge the destination * rbio is going to run our IO for us. We can * steal from cached rbios though, other functions * handle that. */ if (test_bit(RBIO_CACHE_BIT, &last->flags) || test_bit(RBIO_CACHE_BIT, &cur->flags)) return 0; if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical) return 0; /* we can't merge with different operations */ if (last->operation != cur->operation) return 0; /* * We've need read the full stripe from the drive. * check and repair the parity and write the new results. * * We're not allowed to add any new bios to the * bio list here, anyone else that wants to * change this stripe needs to do their own rmw. */ if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; if (last->operation == BTRFS_RBIO_READ_REBUILD) return 0; return 1; } static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { ASSERT_RBIO_STRIPE(stripe_nr < rbio->real_stripes, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr < rbio->stripe_nsectors, rbio, sector_nr); return stripe_nr * rbio->stripe_nsectors + sector_nr; } /* Return a sector from rbio->stripe_sectors, not from the bio list */ static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, unsigned int stripe_nr, unsigned int sector_nr) { return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, sector_nr)]; } /* Grab a sector inside P stripe */ static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, unsigned int sector_nr) { return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); } /* Grab a sector inside Q stripe, return NULL if not RAID6 */ static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); } /* * The first stripe in the table for a logical address * has the lock. rbios are added in one of three ways: * * 1) Nobody has the stripe locked yet. The rbio is given * the lock and 0 is returned. The caller must start the IO * themselves. * * 2) Someone has the stripe locked, but we're able to merge * with the lock owner. The rbio is freed and the IO will * start automatically along with the existing rbio. 1 is returned. * * 3) Someone has the stripe locked, but we're not able to merge. * The rbio is added to the lock owner's plug list, or merged into * an rbio already on the plug list. When the lock owner unlocks, * the next rbio on the list is run and the IO is started automatically. * 1 is returned * * If we return 0, the caller still owns the rbio and must continue with * IO submission. If we return 1, the caller must assume the rbio has * already been freed. */ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) { struct btrfs_stripe_hash *h; struct btrfs_raid_bio *cur; struct btrfs_raid_bio *pending; struct btrfs_raid_bio *freeit = NULL; struct btrfs_raid_bio *cache_drop = NULL; int ret = 0; h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio); spin_lock(&h->lock); list_for_each_entry(cur, &h->hash_list, hash_list) { if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical) continue; spin_lock(&cur->bio_list_lock); /* Can we steal this cached rbio's pages? */ if (bio_list_empty(&cur->bio_list) && list_empty(&cur->plug_list) && test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; spin_unlock(&cur->bio_list_lock); goto lockit; } /* Can we merge into the lock owner? */ if (rbio_can_merge(cur, rbio)) { merge_rbio(cur, rbio); spin_unlock(&cur->bio_list_lock); freeit = rbio; ret = 1; goto out; } /* * We couldn't merge with the running rbio, see if we can merge * with the pending ones. We don't have to check for rmw_locked * because there is no way they are inside finish_rmw right now */ list_for_each_entry(pending, &cur->plug_list, plug_list) { if (rbio_can_merge(pending, rbio)) { merge_rbio(pending, rbio); spin_unlock(&cur->bio_list_lock); freeit = rbio; ret = 1; goto out; } } /* * No merging, put us on the tail of the plug list, our rbio * will be started with the currently running rbio unlocks */ list_add_tail(&rbio->plug_list, &cur->plug_list); spin_unlock(&cur->bio_list_lock); ret = 1; goto out; } lockit: refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock(&h->lock); if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) free_raid_bio(freeit); return ret; } static void recover_rbio_work_locked(struct work_struct *work); /* * called as rmw or parity rebuild is completed. If the plug list has more * rbios waiting for this stripe, the next one on the list will be started */ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) { int bucket; struct btrfs_stripe_hash *h; int keep_cache = 0; bucket = rbio_bucket(rbio); h = rbio->bioc->fs_info->stripe_hash_table->table + bucket; if (list_empty(&rbio->plug_list)) cache_rbio(rbio); spin_lock(&h->lock); spin_lock(&rbio->bio_list_lock); if (!list_empty(&rbio->hash_list)) { /* * if we're still cached and there is no other IO * to perform, just leave this rbio here for others * to steal from later */ if (list_empty(&rbio->plug_list) && test_bit(RBIO_CACHE_BIT, &rbio->flags)) { keep_cache = 1; clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); BUG_ON(!bio_list_empty(&rbio->bio_list)); goto done; } list_del_init(&rbio->hash_list); refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios * waiting for the chance to lock this stripe. * hand the lock over to one of them. */ if (!list_empty(&rbio->plug_list)) { struct btrfs_raid_bio *next; struct list_head *head = rbio->plug_list.next; next = list_entry(head, struct btrfs_raid_bio, plug_list); list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); if (next->operation == BTRFS_RBIO_READ_REBUILD) { start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; } } done: spin_unlock(&rbio->bio_list_lock); spin_unlock(&h->lock); done_nolock: if (!keep_cache) remove_rbio_from_cache(rbio); } static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; cur->bi_status = err; bio_endio(cur); cur = next; } } /* * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); rbio->csum_buf = NULL; rbio->csum_bitmap = NULL; /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before before unlock_stripe() so there will be no new bio * for this bio. */ bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the * hash list, rbio may be merged with others so that rbio->bio_list * becomes non-empty. * Once unlock_stripe() is done, rbio->bio_list will not be updated any * more and we can call bio_endio() on all queued bios. */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) rbio_endio_bio_list(extra, err); } /* * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) * @sector_nr: Sector number inside the stripe, * valid range [0, stripe_nsectors) * @bio_list_only: Whether to use sectors inside the bio list only. * * The read/modify/write code wants to reuse the original bio page as much * as possible, and only use stripe_sectors as fallback. */ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr, bool bio_list_only) { struct sector_ptr *sector; int index; ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->real_stripes, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); index = stripe_nr * rbio->stripe_nsectors + sector_nr; ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; if (sector->page || bio_list_only) { /* Don't return sector without a valid page pointer */ if (!sector->page) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; } spin_unlock(&rbio->bio_list_lock); return &rbio->stripe_sectors[index]; } /* * allocation and initial setup for the btrfs_raid_bio. Not * this does not allocate any pages for rbio->pages. */ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, struct btrfs_io_context *bioc) { const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes; const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; const unsigned int num_pages = stripe_npages * real_stripes; const unsigned int stripe_nsectors = BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); /* * Our current stripe len should be fixed to 64k thus stripe_nsectors * (at most 16) should be no larger than BITS_PER_LONG. */ ASSERT(stripe_nsectors <= BITS_PER_LONG); /* * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256 * (limited by u8). */ ASSERT(real_stripes >= 2); ASSERT(real_stripes <= U8_MAX); rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), GFP_NOFS); rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || !rbio->finish_pointers || !rbio->error_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); } bio_list_init(&rbio->bio_list); init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); btrfs_get_bioc(bioc); rbio->bioc = bioc; rbio->nr_pages = num_pages; rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; refcount_set(&rbio->refs, 1); atomic_set(&rbio->stripes_pending, 0); ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); ASSERT(rbio->nr_data > 0); return rbio; } /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, false); if (ret < 0) return ret; /* Mapping all sectors */ index_stripe_sectors(rbio); return 0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, rbio->stripe_pages + data_pages, false); if (ret < 0) return ret; index_stripe_sectors(rbio); return 0; } /* * Return the total number of errors found in the vertical stripe of @sector_nr. * * @faila and @failb will also be updated to the first and second stripe * number of the errors. */ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, int *faila, int *failb) { int stripe_nr; int found_errors = 0; if (faila || failb) { /* * Both @faila and @failb should be valid pointers if any of * them is specified. */ ASSERT(faila && failb); *faila = -1; *failb = -1; } for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; if (test_bit(total_sector_nr, rbio->error_bitmap)) { found_errors++; if (faila) { /* Update faila and failb. */ if (*faila < 0) *faila = stripe_nr; else if (*failb < 0) *failb = stripe_nr; } } } return found_errors; } /* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. * Return <0 for error. */ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, struct bio_list *bio_list, struct sector_ptr *sector, unsigned int stripe_nr, unsigned int sector_nr, enum req_op op) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe; u64 disk_start; /* * Note: here stripe_nr has taken device replace into consideration, * thus it can be larger than rbio->real_stripe. * So here we check against bioc->num_stripes, not rbio->real_stripes. */ ASSERT_RBIO_STRIPE(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); ASSERT(sector->page); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) { int found_errors; set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); /* Check if we have reached tolerance early. */ found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); if (found_errors > rbio->bioc->max_errors) return -EIO; return 0; } /* see if we can add this page onto our existing bio */ if (last) { u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT; last_end += last->bi_iter.bi_size; /* * we can't merge these if they are from different * devices or if they are not contiguous */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, sector->page, sectorsize, sector->pgoff); if (ret == sectorsize) return 0; } } /* put a new bio on the list */ bio = bio_alloc(stripe->dev->bdev, max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), op, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec bvec; struct bvec_iter iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; bio_for_each_segment(bvec, bio, iter) { u32 bvec_offset; for (bvec_offset = 0; bvec_offset < bvec.bv_len; bvec_offset += sectorsize, offset += sectorsize) { int index = offset / sectorsize; struct sector_ptr *sector = &rbio->bio_sectors[index]; sector->page = bvec.bv_page; sector->pgoff = bvec.bv_offset + bvec_offset; ASSERT(sector->pgoff < PAGE_SIZE); } } } /* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe * reconstruction. * * This must be called before you trust the answers from page_in_rbio */ static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; spin_lock(&rbio->bio_list_lock); bio_list_for_each(bio, &rbio->bio_list) index_one_bio(rbio, bio); spin_unlock(&rbio->bio_list_lock); } static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, struct raid56_bio_trace_info *trace_info) { const struct btrfs_io_context *bioc = rbio->bioc; int i; ASSERT(bioc); /* We rely on bio->bi_bdev to find the stripe number. */ if (!bio->bi_bdev) goto not_found; for (i = 0; i < bioc->num_stripes; i++) { if (bio->bi_bdev != bioc->stripes[i].dev->bdev) continue; trace_info->stripe_nr = i; trace_info->devid = bioc->stripes[i].dev->devid; trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - bioc->stripes[i].physical; return; } not_found: trace_info->devid = -1; trace_info->offset = -1; trace_info->stripe_nr = -1; } static inline void bio_list_put(struct bio_list *bio_list) { struct bio *bio; while ((bio = bio_list_pop(bio_list))) bio_put(bio); } static void assert_rbio(struct btrfs_raid_bio *rbio) { if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; /* * At least two stripes (2 disks RAID5), and since real_stripes is U8, * we won't go beyond 256 disks anyway. */ ASSERT_RBIO(rbio->real_stripes >= 2, rbio); ASSERT_RBIO(rbio->nr_data > 0, rbio); /* * This is another check to make sure nr data stripes is smaller * than total stripes. */ ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } /* Generate PQ for one vertical stripe. */ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { void **pointers = rbio->finish_pointers; const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct sector_ptr *sector; int stripe; const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); pointers[stripe] = kmap_local_page(sector->page) + sector->pgoff; } /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); sector->uptodate = 1; pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; if (has_qstripe) { /* * RAID6, add the qstripe and call the library function * to fill in our p/q */ sector = rbio_qstripe_sector(rbio, sectornr); sector->uptodate = 1; pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { /* The total sector number inside the full stripe. */ int total_sector_nr; int sectornr; int stripe; int ret; ASSERT(bio_list_size(bio_list) == 0); /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); /* * Reset errors, as we may have errors inherited from from degraded * write. */ bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; /* This vertical stripe has no data, skip it. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue; if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) continue; } else { sector = rbio_stripe_sector(rbio, stripe, sectornr); } ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) goto error; } if (likely(!rbio->bioc->replace_nr_stripes)) return 0; /* * Make a copy for the replace target device. * * Thus the source stripe number (in replace_stripe_src) should be valid. */ ASSERT(rbio->bioc->replace_stripe_src >= 0); for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; /* * For RAID56, there is only one device that can be replaced, * and replace_stripe_src[0] indicates the stripe number we * need to copy from. */ if (stripe != rbio->bioc->replace_stripe_src) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. */ ASSERT(sectornr == 0); total_sector_nr += rbio->stripe_nsectors - 1; continue; } /* This vertical stripe has no data, skip it. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue; if (stripe < rbio->nr_data) { sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (!sector) continue; } else { sector = rbio_stripe_sector(rbio, stripe, sectornr); } ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto error; } return 0; error: bio_list_put(bio_list); return -EIO; } static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; int total_nr_sector = offset >> fs_info->sectorsize_bits; ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); bitmap_set(rbio->error_bitmap, total_nr_sector, bio->bi_iter.bi_size >> fs_info->sectorsize_bits); /* * Special handling for raid56_alloc_missing_rbio() used by * scrub/replace. Unlike call path in raid56_parity_recover(), they * pass an empty bio here. Thus we have to find out the missing device * and mark the stripe error instead. */ if (bio->bi_iter.bi_size == 0) { bool found_missing = false; int stripe_nr; for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { found_missing = true; bitmap_set(rbio->error_bitmap, stripe_nr * rbio->stripe_nsectors, rbio->stripe_nsectors); } } ASSERT(found_missing); } } /* * For subpage case, we can no longer set page Up-to-date directly for * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, struct page *page, unsigned int pgoff) { int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; if (sector->page == page && sector->pgoff == pgoff) return sector; } return NULL; } /* * this sets each page in the bio uptodate. It should only be used on private * rbio pages, nothing that comes in from the higher layers */ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct sector_ptr *sector; int pgoff; for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; pgoff += sectorsize) { sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); ASSERT(sector); if (sector) sector->uptodate = 1; } } } static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { struct bio_vec *bv = bio_first_bvec_all(bio); int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector; sector = &rbio->stripe_sectors[i]; if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) break; sector = &rbio->bio_sectors[i]; if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) break; } ASSERT(i < rbio->nr_sectors); return i; } static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { int total_sector_nr = get_bio_sector_nr(rbio, bio); u32 bio_size = 0; struct bio_vec *bvec; int i; bio_for_each_bvec_all(bvec, bio, i) bio_size += bvec->bv_len; /* * Since we can have multiple bios touching the error_bitmap, we cannot * call bitmap_set() without protection. * * Instead use set_bit() for each bit, as set_bit() itself is atomic. */ for (i = total_sector_nr; i < total_sector_nr + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) set_bit(i, rbio->error_bitmap); } /* Verify the data sectors at read time. */ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; int total_sector_nr = get_bio_sector_nr(rbio, bio); struct bio_vec *bvec; struct bvec_iter_all iter_all; /* No data csum for the whole stripe, no need to verify. */ if (!rbio->csum_bitmap || !rbio->csum_buf) return; /* P/Q stripes, they have no data csum to verify against. */ if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) return; bio_for_each_segment_all(bvec, bio, iter_all) { int bv_offset; for (bv_offset = bvec->bv_offset; bv_offset < bvec->bv_offset + bvec->bv_len; bv_offset += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; int ret; /* No csum for this sector, skip to the next sector. */ if (!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, bv_offset, csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } } } static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; if (bio->bi_status) { rbio_update_error_bitmap(rbio, bio); } else { set_bio_pages_uptodate(rbio, bio); verify_bio_data_sectors(rbio, bio); } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); } static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_read_end_io; if (trace_raid56_read_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); trace_raid56_read(rbio, bio, &trace_info); } submit_bio(bio); } wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); } static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) { const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, false); if (ret < 0) return ret; index_stripe_sectors(rbio); return 0; } /* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged * we collect it into a list. When the unplug comes down, * we sort the list by logical block number and merge * everything we can into the same rbios */ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; }; /* * rbios on the plug list are sorted for easier merging. */ static int plug_cmp(void *priv, const struct list_head *a, const struct list_head *b) { const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, plug_list); const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, plug_list); u64 a_sector = ra->bio_list.head->bi_iter.bi_sector; u64 b_sector = rb->bio_list.head->bi_iter.bi_sector; if (a_sector < b_sector) return -1; if (a_sector > b_sector) return 1; return 0; } static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { /* We have a full stripe, queue it down. */ start_async_work(cur, rmw_rbio_work); continue; } if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); free_raid_bio(cur); continue; } start_async_work(last, rmw_rbio_work); } last = cur; } if (last) start_async_work(last, rmw_rbio_work); kfree(plug); } /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; const u64 full_stripe_start = rbio->bioc->full_stripe_logical; const u32 orig_len = orig_bio->bi_iter.bi_size; const u32 sectorsize = fs_info->sectorsize; u64 cur_logical; ASSERT_RBIO_LOGICAL(orig_logical >= full_stripe_start && orig_logical + orig_len <= full_stripe_start + rbio->nr_data * BTRFS_STRIPE_LEN, rbio, orig_logical); bio_list_add(&rbio->bio_list, orig_bio); rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; /* Update the dbitmap. */ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; cur_logical += sectorsize) { int bit = ((u32)(cur_logical - full_stripe_start) >> fs_info->sectorsize_bits) % rbio->stripe_nsectors; set_bit(bit, &rbio->dbitmap); } } /* * our main entry point for writes from the rest of the FS. */ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); bio_endio(bio); return; } rbio->operation = BTRFS_RBIO_WRITE; rbio_add_bio(rbio, bio); /* * Don't plug on full rbios, just get them out the door * as quickly as we can */ if (!rbio_is_full(rbio)) { cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { plug->info = fs_info; INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); return; } } /* * Either we don't have any existing plug, or we're doing a full stripe, * queue the rmw work now. */ start_async_work(rbio, rmw_rbio_work); } static int verify_one_sector(struct btrfs_raid_bio *rbio, int stripe_nr, int sector_nr) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) return 0; /* No way to verify P/Q as they are not covered by data csum. */ if (stripe_nr >= rbio->nr_data) return 0; /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } ASSERT(sector->page); csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, csum_buf, csum_expected); return ret; } /* * Recover a vertical stripe specified by @sector_nr. * @*pointers are the pre-allocated pointers by the caller, so we don't * need to allocate/free the pointers again and again. */ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; int found_errors; int faila; int failb; int stripe_nr; int ret = 0; /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && !test_bit(sector_nr, &rbio->dbitmap)) return 0; found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* * No errors in the vertical stripe, skip it. Can happen for recovery * which only part of a stripe failed csum check. */ if (!found_errors) return 0; if (found_errors > rbio->bioc->max_errors) return -EIO; /* * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order. */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* * If we're rebuilding a read, we have to use pages from the * bio list if possible. */ if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } ASSERT(sector->page); pointers[stripe_nr] = kmap_local_page(sector->page) + sector->pgoff; unmap_array[stripe_nr] = pointers[stripe_nr]; } /* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) /* * Just the P stripe has failed, without * a bad data or Q stripe. * We have nothing to do, just skip the * recovery for this stripe. */ goto cleanup; /* * a single failure in raid6 is rebuilt * in the pstripe code below */ goto pstripe; } /* * If the q stripe is failed, do a pstripe reconstruction from * the xors. * If both the q stripe and the P stripe are failed, we're * here due to a crc mismatch and we can't give them the * data they want. */ if (failb == rbio->real_stripes - 1) { if (faila == rbio->real_stripes - 2) /* * Only P and Q are corrupted. * We only care about data stripes recovery, * can skip this vertical stripe. */ goto cleanup; /* * Otherwise we have one bad data stripe and * a good P stripe. raid5! */ goto pstripe; } if (failb == rbio->real_stripes - 2) { raid6_datap_recov(rbio->real_stripes, sectorsize, faila, pointers); } else { raid6_2data_recov(rbio->real_stripes, sectorsize, faila, failb, pointers); } } else { void *p; /* Rebuild from P stripe here (raid5 or raid6). */ ASSERT(failb == -1); pstripe: /* Copy parity block into failed block to start with */ memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); /* Rearrange the pointer array */ p = pointers[faila]; for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; stripe_nr++) pointers[stripe_nr] = pointers[stripe_nr + 1]; pointers[rbio->nr_data - 1] = p; /* Xor in the rest */ run_xor(pointers, rbio->nr_data - 1, sectorsize); } /* * No matter if this is a RMW or recovery, we should have all * failed sectors repaired in the vertical stripe, thus they are now * uptodate. * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate. * * If possible, also check if the repaired sector matches its data * checksum. */ if (faila >= 0) { ret = verify_one_sector(rbio, faila, sector_nr); if (ret < 0) goto cleanup; sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } if (failb >= 0) { ret = verify_one_sector(rbio, failb, sector_nr); if (ret < 0) goto cleanup; sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); return ret; } static int recover_sectors(struct btrfs_raid_bio *rbio) { void **pointers = NULL; void **unmap_array = NULL; int sectornr; int ret = 0; /* * @pointers array stores the pointer for each sector. * * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works. */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers || !unmap_array) { ret = -ENOMEM; goto out; } if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); } index_rbio_pages(rbio); for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { ret = recover_vertical(rbio, sectornr, pointers, unmap_array); if (ret < 0) break; } out: kfree(pointers); kfree(unmap_array); return ret; } static void recover_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; /* * Either we're doing recover for a read failure or degraded write, * caller should have set error bitmap correctly. */ ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); /* For recovery, we need to read all sectors including P/Q. */ ret = alloc_rbio_pages(rbio); if (ret < 0) goto out; index_rbio_pages(rbio); /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. * As we may read out some stale data but higher layer is not reading * that stale part. * * So here we always re-read everything in recovery path. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; /* * Skip the range which has error. It can be a range which is * marked error (for csum mismatch), or it can be a missing * device. */ if (!rbio->bioc->stripes[stripe].dev->bdev || test_bit(total_sector_nr, rbio->error_bitmap)) { /* * Also set the error bit for missing device, which * may not yet have its error bit set. */ set_bit(total_sector_nr, rbio->error_bitmap); continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret < 0) { bio_list_put(&bio_list); goto out; } } submit_read_wait_bio_list(rbio, &bio_list); ret = recover_sectors(rbio); out: rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void recover_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; rbio = container_of(work, struct btrfs_raid_bio, work); if (!lock_stripe_add(rbio)) recover_rbio(rbio); } static void recover_rbio_work_locked(struct work_struct *work) { recover_rbio(container_of(work, struct btrfs_raid_bio, work)); } static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) { bool found = false; int sector_nr; /* * This is for RAID6 extra recovery tries, thus mirror number should * be large than 2. * Mirror 1 means read from data stripes. Mirror 2 means rebuild using * RAID5 methods. */ ASSERT(mirror_num > 2); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; int faila; int failb; found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); /* This vertical stripe doesn't have errors. */ if (!found_errors) continue; /* * If we found errors, there should be only one error marked * by previous set_rbio_range_error(). */ ASSERT(found_errors == 1); found = true; /* Now select another stripe to mark as error. */ failb = rbio->real_stripes - (mirror_num - 1); if (failb <= faila) failb--; /* Set the extra bit in error bitmap. */ if (failb >= 0) set_bit(failb * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); } /* We should found at least one vertical stripe with error.*/ ASSERT(found); } /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, * so we assume the bio they send down corresponds to a failed part * of the drive. */ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, int mirror_num) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); bio_endio(bio); return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); set_rbio_range_error(rbio, bio); /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ if (mirror_num > 2) set_rbio_raid6_extra_error(rbio, mirror_num); start_async_work(rbio, recover_rbio_work); } static void fill_data_csums(struct btrfs_raid_bio *rbio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct btrfs_root *csum_root = btrfs_csum_root(fs_info, rbio->bioc->full_stripe_logical); const u64 start = rbio->bioc->full_stripe_logical; const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << fs_info->sectorsize_bits; int ret; /* The rbio should not have its csum buffer initialized. */ ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); /* * Skip the csum search if: * * - The rbio doesn't belong to data block groups * Then we are doing IO for tree blocks, no need to search csums. * * - The rbio belongs to mixed block groups * This is to avoid deadlock, as we're already holding the full * stripe lock, if we trigger a metadata read, and it needs to do * raid56 recovery, we will deadlock. */ if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) return; rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * fs_info->csum_size, GFP_NOFS); rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, GFP_NOFS); if (!rbio->csum_buf || !rbio->csum_bitmap) { ret = -ENOMEM; goto error; } ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1, rbio->csum_buf, rbio->csum_bitmap); if (ret < 0) goto error; if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) goto no_csum; return; error: /* * We failed to allocate memory or grab the csum, but it's not fatal, * we can still continue. But better to warn users that RMW is no * longer safe for this particular sub-stripe write. */ btrfs_warn_rl(fs_info, "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", rbio->bioc->full_stripe_logical, ret); no_csum: kfree(rbio->csum_buf); bitmap_free(rbio->csum_bitmap); rbio->csum_buf = NULL; rbio->csum_bitmap = NULL; } static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) { struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; /* * Fill the data csums we need for data verification. We need to fill * the csum_bitmap/csum_buf first, as our endio function will try to * verify the data sectors. */ fill_data_csums(rbio); /* * Build a list of bios to read all sectors (including data and P/Q). * * This behavior is to compensate the later csum verification and recovery. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; int stripe = total_sector_nr / rbio->stripe_nsectors; int sectornr = total_sector_nr % rbio->stripe_nsectors; sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; } } /* * We may or may not have any corrupted sectors (including missing dev * and csum mismatch), just let recover_sectors() to handle them all. */ submit_read_wait_bio_list(rbio, &bio_list); return recover_sectors(rbio); } static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; if (err) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); } static void submit_write_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { struct bio *bio; atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); while ((bio = bio_list_pop(bio_list))) { bio->bi_end_io = raid_wait_write_end_io; if (trace_raid56_write_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); trace_raid56_write(rbio, bio, &trace_info); } submit_bio(bio); } } /* * To determine if we need to read any sector from the disk. * Should only be utilized in RMW path, to skip cached rbio. */ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) { int i; for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; /* * We have a sector which doesn't have page nor uptodate, * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. */ if (!sector->page || !sector->uptodate) return true; } return false; } static void rmw_rbio(struct btrfs_raid_bio *rbio) { struct bio_list bio_list; int sectornr; int ret = 0; /* * Allocate the pages for parity first, as P/Q pages will always be * needed for both full-stripe and sub-stripe writes. */ ret = alloc_rbio_parity_pages(rbio); if (ret < 0) goto out; /* * Either full stripe write, or we have every data sector already * cached, can go to write path immediately. */ if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) { /* * Now we're doing sub-stripe write, also need all data stripes * to do the full RMW. */ ret = alloc_rbio_data_pages(rbio); if (ret < 0) goto out; index_rbio_pages(rbio); ret = rmw_read_wait_recover(rbio); if (ret < 0) goto out; } /* * At this stage we're not allowed to add any new bios to the * bio list any more, anyone else that wants to change this stripe * needs to do their own rmw. */ spin_lock(&rbio->bio_list_lock); set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock(&rbio->bio_list_lock); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); /* * We don't cache full rbios because we're assuming * the higher layers are unlikely to use this area of * the disk again soon. If they do use it again, * hopefully they will send another full bio. */ if (!rbio_is_full(rbio)) cache_rbio_pages(rbio); else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) generate_pq_vertical(rbio, sectornr); bio_list_init(&bio_list); ret = rmw_assemble_write_bios(rbio, &bio_list); if (ret < 0) goto out; /* We should have at least one bio assembled. */ ASSERT(bio_list_size(&bio_list)); submit_write_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); /* We may have more errors than our tolerance during the read. */ for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { int found_errors; found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); if (found_errors > rbio->bioc->max_errors) { ret = -EIO; break; } } out: rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; rbio = container_of(work, struct btrfs_raid_bio, work); if (lock_stripe_add(rbio) == 0) rmw_rbio(rbio); } static void rmw_rbio_work_locked(struct work_struct *work) { rmw_rbio(container_of(work, struct btrfs_raid_bio, work)); } /* * The following code is used to scrub/replace the parity stripe * * Caller must have already increased bio_counter for getting @bioc. * * Note: We need make sure all the pages that add into the scrub/replace * raid bio are correct and not be changed during the scrub/replace. That * is those pages just hold metadata or file data with checksum. */ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); /* * This is a special bio which is used to hold the completion handler * and make the scrub rbio is similar to the other types */ ASSERT(!bio->bi_iter.bi_size); rbio->operation = BTRFS_RBIO_PARITY_SCRUB; /* * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted * to the end position, so this search can start from the first parity * stripe. */ for (i = rbio->nr_data; i < rbio->real_stripes; i++) { if (bioc->stripes[i].dev == scrub_dev) { rbio->scrubp = i; break; } } ASSERT_RBIO_STRIPE(i < rbio->real_stripes, rbio, i); bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); return rbio; } /* * We just scrub the parity that we have correct data on the same horizontal, * so we needn't allocate all pages for all the stripes. */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int total_sector_nr; for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct page *page; int sectornr = total_sector_nr % rbio->stripe_nsectors; int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; if (!test_bit(sectornr, &rbio->dbitmap)) continue; if (rbio->stripe_pages[index]) continue; page = alloc_page(GFP_NOFS); if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; } index_stripe_sectors(rbio); return 0; } static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; int sectornr; bool has_qstripe; struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; int is_replace = 0; int ret; bio_list_init(&bio_list); if (rbio->real_stripes - rbio->nr_data == 1) has_qstripe = false; else if (rbio->real_stripes - rbio->nr_data == 2) has_qstripe = true; else BUG(); /* * Replace is running and our P/Q stripe is being replaced, then we * need to duplicate the final write to replace target. */ if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) { is_replace = 1; bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* * Because the higher layers(scrubber) are unlikely to * use this area of the disk again soon, so don't cache * it. */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ q_sector.page = alloc_page(GFP_NOFS); if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; void *parity; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); pointers[stripe] = kmap_local_page(sector->page) + sector->pgoff; } if (has_qstripe) { assert_rbio(rbio); /* RAID6, call the library function to fill in our P/Q */ raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ memcpy(pointers[nr_data], pointers[0], sectorsize); run_xor(pointers + 1, nr_data - 1, sectorsize); } /* Check scrubbing parity and repair it */ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); parity = kmap_local_page(sector->page) + sector->pgoff; if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } kunmap_local(pointers[nr_data]); __free_page(p_sector.page); p_sector.page = NULL; if (q_sector.page) { kunmap_local(pointers[rbio->real_stripes - 1]); __free_page(q_sector.page); q_sector.page = NULL; } /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } if (!is_replace) goto submit_write; /* * Replace is running and our parity stripe needs to be duplicated to * the target device. Check we have a valid source stripe number. */ ASSERT_RBIO(rbio->bioc->replace_stripe_src >= 0, rbio); for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->real_stripes, sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } submit_write: submit_write_bios(rbio, &bio_list); return 0; cleanup: bio_list_put(&bio_list); return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) { if (stripe >= 0 && stripe < rbio->nr_data) return 1; return 0; } static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { void **pointers = NULL; void **unmap_array = NULL; int sector_nr; int ret = 0; /* * @pointers array stores the pointer for each sector. * * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works. */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers || !unmap_array) { ret = -ENOMEM; goto out; } for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int dfail = 0, failp = -1; int faila; int failb; int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, &failb); if (found_errors > rbio->bioc->max_errors) { ret = -EIO; goto out; } if (found_errors == 0) continue; /* We should have at least one error here. */ ASSERT(faila >= 0 || failb >= 0); if (is_data_stripe(rbio, faila)) dfail++; else if (is_parity_stripe(faila)) failp = faila; if (is_data_stripe(rbio, failb)) dfail++; else if (is_parity_stripe(failb)) failp = failb; /* * Because we can not use a scrubbing parity to repair the * data, so the capability of the repair is declined. (In the * case of RAID5, we can not repair anything.) */ if (dfail > rbio->bioc->max_errors - 1) { ret = -EIO; goto out; } /* * If all data is good, only parity is correctly, just repair * the parity, no need to recover data stripes. */ if (dfail == 0) continue; /* * Here means we got one corrupted data stripe and one * corrupted parity on RAID6, if the corrupted parity is * scrubbing parity, luckily, use the other one to repair the * data, or we can not repair the data stripe. */ if (failp != rbio->scrubp) { ret = -EIO; goto out; } ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); if (ret < 0) goto out; } out: kfree(pointers); kfree(unmap_array); return ret; } static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) { struct bio_list bio_list = BIO_EMPTY_LIST; int total_sector_nr; int ret = 0; /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { int sectornr = total_sector_nr % rbio->stripe_nsectors; int stripe = total_sector_nr / rbio->stripe_nsectors; struct sector_ptr *sector; /* No data in the vertical stripe, no need to read. */ if (!test_bit(sectornr, &rbio->dbitmap)) continue; /* * We want to find all the sectors missing from the rbio and * read them from the disk. If sector_in_rbio() finds a sector * in the bio list we don't need to read it off the stripe. */ sector = sector_in_rbio(rbio, stripe, sectornr, 1); if (sector) continue; sector = rbio_stripe_sector(rbio, stripe, sectornr); /* * The bio cache may have handed us an uptodate sector. If so, * use it. */ if (sector->uptodate) continue; ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) { bio_list_put(&bio_list); return ret; } } submit_read_wait_bio_list(rbio, &bio_list); return 0; } static void scrub_rbio(struct btrfs_raid_bio *rbio) { int sector_nr; int ret; ret = alloc_rbio_essential_pages(rbio); if (ret) goto out; bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); ret = scrub_assemble_read_bios(rbio); if (ret < 0) goto out; /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) goto out; /* * We have every sector properly prepared. Can finish the scrub * and writeback the good content. */ ret = finish_parity_scrub(rbio); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); if (found_errors > rbio->bioc->max_errors) { ret = -EIO; break; } } out: rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } static void scrub_rbio_work_locked(struct work_struct *work) { scrub_rbio(container_of(work, struct btrfs_raid_bio, work)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) start_async_work(rbio, scrub_rbio_work_locked); } /* * This is for scrub call sites where we already have correct data contents. * This allows us to avoid reading data stripes again. * * Unfortunately here we have to do page copy, other than reusing the pages. * This is due to the fact rbio has its own page management for its cache. */ void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio, struct page **data_pages, u64 data_logical) { const u64 offset_in_full_stripe = data_logical - rbio->bioc->full_stripe_logical; const int page_index = offset_in_full_stripe >> PAGE_SHIFT; const u32 sectorsize = rbio->bioc->fs_info->sectorsize; const u32 sectors_per_page = PAGE_SIZE / sectorsize; int ret; /* * If we hit ENOMEM temporarily, but later at * raid56_parity_submit_scrub_rbio() time it succeeded, we just do * the extra read, not a big deal. * * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time, * the bio would got proper error number set. */ ret = alloc_rbio_data_pages(rbio); if (ret < 0) return; /* data_logical must be at stripe boundary and inside the full stripe. */ ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN)); ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT)); for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) { struct page *dst = rbio->stripe_pages[page_nr + page_index]; struct page *src = data_pages[page_nr]; memcpy_page(dst, 0, src, 0, PAGE_SIZE); for (int sector_nr = sectors_per_page * page_index; sector_nr < sectors_per_page * (page_index + 1); sector_nr++) rbio->stripe_sectors[sector_nr].uptodate = true; } } |
| 11 11 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 | // SPDX-License-Identifier: GPL-2.0 /* * Block stat tracking code * * Copyright (C) 2016 Jens Axboe */ #include <linux/kernel.h> #include <linux/rculist.h> #include "blk-stat.h" #include "blk-mq.h" #include "blk.h" struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) { stat->min = -1ULL; stat->max = stat->nr_samples = stat->mean = 0; stat->batch = 0; } /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); dst->max = max(dst->max, src->max); dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, dst->nr_samples + src->nr_samples); dst->nr_samples += src->nr_samples; } void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) { stat->min = min(stat->min, value); stat->max = max(stat->max, value); stat->batch += value; stat->nr_samples++; } void blk_stat_add(struct request *rq, u64 now) { struct request_queue *q = rq->q; struct blk_stat_callback *cb; struct blk_rq_stat *stat; int bucket, cpu; u64 value; value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { if (!blk_stat_is_active(cb)) continue; bucket = cb->bucket_fn(rq); if (bucket < 0) continue; stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; blk_rq_stat_add(stat, value); } put_cpu(); rcu_read_unlock(); } static void blk_stat_timer_fn(struct timer_list *t) { struct blk_stat_callback *cb = from_timer(cb, t, timer); unsigned int bucket; int cpu; for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cb->stat[bucket]); for_each_online_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) { blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); blk_rq_stat_init(&cpu_stat[bucket]); } } cb->timer_fn(cb); } struct blk_stat_callback * blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), int (*bucket_fn)(const struct request *), unsigned int buckets, void *data) { struct blk_stat_callback *cb; cb = kmalloc(sizeof(*cb), GFP_KERNEL); if (!cb) return NULL; cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), GFP_KERNEL); if (!cb->stat) { kfree(cb); return NULL; } cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), __alignof__(struct blk_rq_stat)); if (!cb->cpu_stat) { kfree(cb->stat); kfree(cb); return NULL; } cb->timer_fn = timer_fn; cb->bucket_fn = bucket_fn; cb->data = data; cb->buckets = buckets; timer_setup(&cb->timer, blk_stat_timer_fn, 0); return cb; } void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; unsigned long flags; int cpu; for_each_possible_cpu(cpu) { struct blk_rq_stat *cpu_stat; cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); for (bucket = 0; bucket < cb->buckets; bucket++) blk_rq_stat_init(&cpu_stat[bucket]); } spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); timer_delete_sync(&cb->timer); } static void blk_stat_free_callback_rcu(struct rcu_head *head) { struct blk_stat_callback *cb; cb = container_of(head, struct blk_stat_callback, rcu); free_percpu(cb->cpu_stat); kfree(cb->stat); kfree(cb); } void blk_stat_free_callback(struct blk_stat_callback *cb) { if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } void blk_stat_disable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); struct blk_queue_stats *blk_alloc_queue_stats(void) { struct blk_queue_stats *stats; stats = kmalloc(sizeof(*stats), GFP_KERNEL); if (!stats) return NULL; INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); stats->accounting = 0; return stats; } void blk_free_queue_stats(struct blk_queue_stats *stats) { if (!stats) return; WARN_ON(!list_empty(&stats->callbacks)); kfree(stats); } |
| 5 5 3 3 1 1 1 5 5 5 5 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 | // SPDX-License-Identifier: GPL-2.0 /* * HugeTLB Vmemmap Optimization (HVO) * * Copyright (c) 2020, ByteDance. All rights reserved. * * Author: Muchun Song <songmuchun@bytedance.com> * * See Documentation/mm/vmemmap_dedup.rst */ #define pr_fmt(fmt) "HugeTLB: " fmt #include <linux/pgtable.h> #include <linux/moduleparam.h> #include <linux/bootmem_info.h> #include <linux/mmdebug.h> #include <linux/pagewalk.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> #include "hugetlb_vmemmap.h" /** * struct vmemmap_remap_walk - walk vmemmap page table * * @remap_pte: called for each lowest-level entry (PTE). * @nr_walked: the number of walked pte. * @reuse_page: the page which is reused for the tail vmemmap pages. * @reuse_addr: the virtual address of the @reuse_page page. * @vmemmap_pages: the list head of the vmemmap pages that can be freed * or is mapped from. * @flags: used to modify behavior in vmemmap page table walking * operations. */ struct vmemmap_remap_walk { void (*remap_pte)(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk); unsigned long nr_walked; struct page *reuse_page; unsigned long reuse_addr; struct list_head *vmemmap_pages; /* Skip the TLB flush when we split the PMD */ #define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0) /* Skip the TLB flush when we remap the PTE */ #define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1) /* synchronize_rcu() to avoid writes from page_ref_add_unless() */ #define VMEMMAP_SYNCHRONIZE_RCU BIT(2) unsigned long flags; }; static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start, struct vmemmap_remap_walk *walk) { pmd_t __pmd; int i; unsigned long addr = start; pte_t *pgtable; pgtable = pte_alloc_one_kernel(&init_mm); if (!pgtable) return -ENOMEM; pmd_populate_kernel(&init_mm, &__pmd, pgtable); for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { pte_t entry, *pte; pgprot_t pgprot = PAGE_KERNEL; entry = mk_pte(head + i, pgprot); pte = pte_offset_kernel(&__pmd, addr); set_pte_at(&init_mm, addr, pte, entry); } spin_lock(&init_mm.page_table_lock); if (likely(pmd_leaf(*pmd))) { /* * Higher order allocations from buddy allocator must be able to * be treated as indepdenent small pages (as they can be freed * individually). */ if (!PageReserved(head)) split_page(head, get_order(PMD_SIZE)); /* Make pte visible before pmd. See comment in pmd_install(). */ smp_wmb(); pmd_populate_kernel(&init_mm, pmd, pgtable); if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, start + PMD_SIZE); } else { pte_free_kernel(&init_mm, pgtable); } spin_unlock(&init_mm.page_table_lock); return 0; } static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next, struct mm_walk *walk) { int ret = 0; struct page *head; struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* Only splitting, not remapping the vmemmap pages. */ if (!vmemmap_walk->remap_pte) walk->action = ACTION_CONTINUE; spin_lock(&init_mm.page_table_lock); head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL; /* * Due to HugeTLB alignment requirements and the vmemmap * pages being at the start of the hotplugged memory * region in memory_hotplug.memmap_on_memory case. Checking * the vmemmap page associated with the first vmemmap page * if it is self-hosted is sufficient. * * [ hotplugged memory ] * [ section ][...][ section ] * [ vmemmap ][ usable memory ] * ^ | ^ | * +--+ | | * +------------------------+ */ if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) { struct page *page = head ? head + pte_index(addr) : pte_page(ptep_get(pte_offset_kernel(pmd, addr))); if (PageVmemmapSelfHosted(page)) ret = -ENOTSUPP; } spin_unlock(&init_mm.page_table_lock); if (!head || ret) return ret; return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk); } static int vmemmap_pte_entry(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { struct vmemmap_remap_walk *vmemmap_walk = walk->private; /* * The reuse_page is found 'first' in page table walking before * starting remapping. */ if (!vmemmap_walk->reuse_page) vmemmap_walk->reuse_page = pte_page(ptep_get(pte)); else vmemmap_walk->remap_pte(pte, addr, vmemmap_walk); vmemmap_walk->nr_walked++; return 0; } static const struct mm_walk_ops vmemmap_remap_ops = { .pmd_entry = vmemmap_pmd_entry, .pte_entry = vmemmap_pte_entry, }; static int vmemmap_remap_range(unsigned long start, unsigned long end, struct vmemmap_remap_walk *walk) { int ret; VM_BUG_ON(!PAGE_ALIGNED(start | end)); mmap_read_lock(&init_mm); ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops, NULL, walk); mmap_read_unlock(&init_mm); if (ret) return ret; if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH)) flush_tlb_kernel_range(start, end); return 0; } /* * Free a vmemmap page. A vmemmap page can be allocated from the memblock * allocator or buddy allocator. If the PG_reserved flag is set, it means * that it allocated from the memblock allocator, just free it via the * free_bootmem_page(). Otherwise, use __free_page(). */ static inline void free_vmemmap_page(struct page *page) { if (PageReserved(page)) { memmap_boot_pages_add(-1); free_bootmem_page(page); } else { memmap_pages_add(-1); __free_page(page); } } /* Free a list of the vmemmap pages */ static void free_vmemmap_page_list(struct list_head *list) { struct page *page, *next; list_for_each_entry_safe(page, next, list, lru) free_vmemmap_page(page); } static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { /* * Remap the tail pages as read-only to catch illegal write operation * to the tail pages. */ pgprot_t pgprot = PAGE_KERNEL_RO; struct page *page = pte_page(ptep_get(pte)); pte_t entry; /* Remapping the head page requires r/w */ if (unlikely(addr == walk->reuse_addr)) { pgprot = PAGE_KERNEL; list_del(&walk->reuse_page->lru); /* * Makes sure that preceding stores to the page contents from * vmemmap_remap_free() become visible before the set_pte_at() * write. */ smp_wmb(); } entry = mk_pte(walk->reuse_page, pgprot); list_add(&page->lru, walk->vmemmap_pages); set_pte_at(&init_mm, addr, pte, entry); } /* * How many struct page structs need to be reset. When we reuse the head * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message * of "corrupted mapping in tail page". We need to reset at least 4 (one * head struct page struct and three tail struct page structs) struct page * structs. */ #define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { struct page *from = start + NR_RESET_STRUCT_PAGE; BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, struct vmemmap_remap_walk *walk) { pgprot_t pgprot = PAGE_KERNEL; struct page *page; void *to; BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page); page = list_first_entry(walk->vmemmap_pages, struct page, lru); list_del(&page->lru); to = page_to_virt(page); copy_page(to, (void *)walk->reuse_addr); reset_struct_pages(to); /* * Makes sure that preceding stores to the page contents become visible * before the set_pte_at() write. */ smp_wmb(); set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot)); } /** * vmemmap_remap_split - split the vmemmap virtual address range [@start, @end) * backing PMDs of the directmap into PTEs * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_split(unsigned long start, unsigned long end, unsigned long reuse) { struct vmemmap_remap_walk walk = { .remap_pte = NULL, .flags = VMEMMAP_SPLIT_NO_TLB_FLUSH, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); return vmemmap_remap_range(reuse, end, &walk); } /** * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end) * to the page which @reuse is mapped to, then free vmemmap * which the range are mapped to. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @vmemmap_pages: list to deposit vmemmap pages to be freed. It is callers * responsibility to free pages. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_free(unsigned long start, unsigned long end, unsigned long reuse, struct list_head *vmemmap_pages, unsigned long flags) { int ret; struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_remap_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = flags, }; int nid = page_to_nid((struct page *)reuse); gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; /* * Allocate a new head vmemmap page to avoid breaking a contiguous * block of struct page memory when freeing it back to page allocator * in free_vmemmap_page_list(). This will allow the likely contiguous * struct page backing memory to be kept contiguous and allowing for * more allocations of hugepages. Fallback to the currently * mapped head page in case should it fail to allocate. */ walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0); if (walk.reuse_page) { copy_page(page_to_virt(walk.reuse_page), (void *)walk.reuse_addr); list_add(&walk.reuse_page->lru, vmemmap_pages); memmap_pages_add(1); } /* * In order to make remapping routine most efficient for the huge pages, * the routine of vmemmap page table walking has the following rules * (see more details from the vmemmap_pte_range()): * * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE) * should be continuous. * - The @reuse address is part of the range [@reuse, @end) that we are * walking which is passed to vmemmap_remap_range(). * - The @reuse address is the first in the complete range. * * So we need to make sure that @start and @reuse meet the above rules. */ BUG_ON(start - reuse != PAGE_SIZE); ret = vmemmap_remap_range(reuse, end, &walk); if (ret && walk.nr_walked) { end = reuse + walk.nr_walked * PAGE_SIZE; /* * vmemmap_pages contains pages from the previous * vmemmap_remap_range call which failed. These * are pages which were removed from the vmemmap. * They will be restored in the following call. */ walk = (struct vmemmap_remap_walk) { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = vmemmap_pages, .flags = 0, }; vmemmap_remap_range(reuse, end, &walk); } return ret; } static int alloc_vmemmap_page_list(unsigned long start, unsigned long end, struct list_head *list) { gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL; unsigned long nr_pages = (end - start) >> PAGE_SHIFT; int nid = page_to_nid((struct page *)start); struct page *page, *next; int i; for (i = 0; i < nr_pages; i++) { page = alloc_pages_node(nid, gfp_mask, 0); if (!page) goto out; list_add(&page->lru, list); } memmap_pages_add(nr_pages); return 0; out: list_for_each_entry_safe(page, next, list, lru) __free_page(page); return -ENOMEM; } /** * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, end) * to the page which is from the @vmemmap_pages * respectively. * @start: start address of the vmemmap virtual address range that we want * to remap. * @end: end address of the vmemmap virtual address range that we want to * remap. * @reuse: reuse address. * @flags: modifications to vmemmap_remap_walk flags * * Return: %0 on success, negative error code otherwise. */ static int vmemmap_remap_alloc(unsigned long start, unsigned long end, unsigned long reuse, unsigned long flags) { LIST_HEAD(vmemmap_pages); struct vmemmap_remap_walk walk = { .remap_pte = vmemmap_restore_pte, .reuse_addr = reuse, .vmemmap_pages = &vmemmap_pages, .flags = flags, }; /* See the comment in the vmemmap_remap_free(). */ BUG_ON(start - reuse != PAGE_SIZE); if (alloc_vmemmap_page_list(start, end, &vmemmap_pages)) return -ENOMEM; return vmemmap_remap_range(reuse, end, &walk); } DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key); EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key); static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON); static int __init hugetlb_vmemmap_optimize_param(char *buf) { return kstrtobool(buf, &vmemmap_optimize_enabled); } early_param("hugetlb_free_vmemmap", hugetlb_vmemmap_optimize_param); static int __hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio, unsigned long flags) { int ret; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!folio_test_hugetlb_vmemmap_optimized(folio)) return 0; if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * The pages which the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end) are mapped to are freed to the buddy allocator, and * the range is mapped to the page which @vmemmap_reuse is mapped to. * When a HugeTLB page is freed to the buddy allocator, previously * discarded vmemmap pages must be allocated and remapping. */ ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags); if (!ret) { folio_clear_hugetlb_vmemmap_optimized(folio); static_branch_dec(&hugetlb_optimize_vmemmap_key); } return ret; } /** * hugetlb_vmemmap_restore_folio - restore previously optimized (by * hugetlb_vmemmap_optimize_folio()) vmemmap pages which * will be reallocated and remapped. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be restored. * * Return: %0 if @folio's vmemmap pages have been reallocated and remapped, * negative error code otherwise. */ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio) { return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU); } /** * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list. * @h: hstate. * @folio_list: list of folios. * @non_hvo_folios: Output list of folios for which vmemmap exists. * * Return: number of folios for which vmemmap was restored, or an error code * if an error was encountered restoring vmemmap for a folio. * Folios that have vmemmap are moved to the non_hvo_folios * list. Processing of entries stops when the first error is * encountered. The folio that experienced the error and all * non-processed folios will remain on folio_list. */ long hugetlb_vmemmap_restore_folios(const struct hstate *h, struct list_head *folio_list, struct list_head *non_hvo_folios) { struct folio *folio, *t_folio; long restored = 0; long ret = 0; unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; list_for_each_entry_safe(folio, t_folio, folio_list, lru) { if (folio_test_hugetlb_vmemmap_optimized(folio)) { ret = __hugetlb_vmemmap_restore_folio(h, folio, flags); /* only need to synchronize_rcu() once for each batch */ flags &= ~VMEMMAP_SYNCHRONIZE_RCU; if (ret) break; restored++; } /* Add non-optimized folios to output list */ list_move(&folio->lru, non_hvo_folios); } if (restored) flush_tlb_all(); if (!ret) ret = restored; return ret; } /* Return true iff a HugeTLB whose vmemmap should and can be optimized. */ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio) { if (folio_test_hugetlb_vmemmap_optimized(folio)) return false; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(h)) return false; return true; } static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio, struct list_head *vmemmap_pages, unsigned long flags) { int ret = 0; unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio); if (!vmemmap_should_optimize_folio(h, folio)) return ret; static_branch_inc(&hugetlb_optimize_vmemmap_key); if (flags & VMEMMAP_SYNCHRONIZE_RCU) synchronize_rcu(); /* * Very Subtle * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed * immediately after remapping. As a result, subsequent accesses * and modifications to struct pages associated with the hugetlb * page could be to the OLD struct pages. Set the vmemmap optimized * flag here so that it is copied to the new head page. This keeps * the old and new struct pages in sync. * If there is an error during optimization, we will immediately FLUSH * the TLB and clear the flag below. */ folio_set_hugetlb_vmemmap_optimized(folio); vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end) * to the page which @vmemmap_reuse is mapped to. Add pages previously * mapping the range to vmemmap_pages list so that they can be freed by * the caller. */ ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse, vmemmap_pages, flags); if (ret) { static_branch_dec(&hugetlb_optimize_vmemmap_key); folio_clear_hugetlb_vmemmap_optimized(folio); } return ret; } /** * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages. * @h: struct hstate. * @folio: the folio whose vmemmap pages will be optimized. * * This function only tries to optimize @folio's vmemmap pages and does not * guarantee that the optimization will succeed after it returns. The caller * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's * vmemmap pages have been optimized. */ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio) { LIST_HEAD(vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU); free_vmemmap_page_list(&vmemmap_pages); } static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio) { unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end; unsigned long vmemmap_reuse; if (!vmemmap_should_optimize_folio(h, folio)) return 0; vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h); vmemmap_reuse = vmemmap_start; vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE; /* * Split PMDs on the vmemmap virtual address range [@vmemmap_start, * @vmemmap_end] */ return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse); } static void __hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list, bool boot) { struct folio *folio; int nr_to_optimize; LIST_HEAD(vmemmap_pages); unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU; nr_to_optimize = 0; list_for_each_entry(folio, folio_list, lru) { int ret; unsigned long spfn, epfn; if (boot && folio_test_hugetlb_vmemmap_optimized(folio)) { /* * Already optimized by pre-HVO, just map the * mirrored tail page structs RO. */ spfn = (unsigned long)&folio->page; epfn = spfn + pages_per_huge_page(h); vmemmap_wrprotect_hvo(spfn, epfn, folio_nid(folio), HUGETLB_VMEMMAP_RESERVE_SIZE); register_page_bootmem_memmap(pfn_to_section_nr(spfn), &folio->page, HUGETLB_VMEMMAP_RESERVE_SIZE); static_branch_inc(&hugetlb_optimize_vmemmap_key); continue; } nr_to_optimize++; ret = hugetlb_vmemmap_split_folio(h, folio); /* * Spliting the PMD requires allocating a page, thus lets fail * early once we encounter the first OOM. No point in retrying * as it can be dynamically done on remap with the memory * we get back from the vmemmap deduplication. */ if (ret == -ENOMEM) break; } if (!nr_to_optimize) /* * All pre-HVO folios, nothing left to do. It's ok if * there is a mix of pre-HVO and not yet HVO-ed folios * here, as __hugetlb_vmemmap_optimize_folio() will * skip any folios that already have the optimized flag * set, see vmemmap_should_optimize_folio(). */ goto out; flush_tlb_all(); list_for_each_entry(folio, folio_list, lru) { int ret; ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); /* only need to synchronize_rcu() once for each batch */ flags &= ~VMEMMAP_SYNCHRONIZE_RCU; /* * Pages to be freed may have been accumulated. If we * encounter an ENOMEM, free what we have and try again. * This can occur in the case that both spliting fails * halfway and head page allocation also failed. In this * case __hugetlb_vmemmap_optimize_folio() would free memory * allowing more vmemmap remaps to occur. */ if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) { flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); INIT_LIST_HEAD(&vmemmap_pages); __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags); } } out: flush_tlb_all(); free_vmemmap_page_list(&vmemmap_pages); } void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, false); } void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list) { __hugetlb_vmemmap_optimize_folios(h, folio_list, true); } #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT /* Return true of a bootmem allocated HugeTLB page should be pre-HVO-ed */ static bool vmemmap_should_optimize_bootmem_page(struct huge_bootmem_page *m) { unsigned long section_size, psize, pmd_vmemmap_size; phys_addr_t paddr; if (!READ_ONCE(vmemmap_optimize_enabled)) return false; if (!hugetlb_vmemmap_optimizable(m->hstate)) return false; psize = huge_page_size(m->hstate); paddr = virt_to_phys(m); /* * Pre-HVO only works if the bootmem huge page * is aligned to the section size. */ section_size = (1UL << PA_SECTION_SHIFT); if (!IS_ALIGNED(paddr, section_size) || !IS_ALIGNED(psize, section_size)) return false; /* * The pre-HVO code does not deal with splitting PMDS, * so the bootmem page must be aligned to the number * of base pages that can be mapped with one vmemmap PMD. */ pmd_vmemmap_size = (PMD_SIZE / (sizeof(struct page))) << PAGE_SHIFT; if (!IS_ALIGNED(paddr, pmd_vmemmap_size) || !IS_ALIGNED(psize, pmd_vmemmap_size)) return false; return true; } /* * Initialize memmap section for a gigantic page, HVO-style. */ void __init hugetlb_vmemmap_init_early(int nid) { unsigned long psize, paddr, section_size; unsigned long ns, i, pnum, pfn, nr_pages; unsigned long start, end; struct huge_bootmem_page *m = NULL; void *map; /* * Noting to do if bootmem pages were not allocated * early in boot, or if HVO wasn't enabled in the * first place. */ if (!hugetlb_bootmem_allocated()) return; if (!READ_ONCE(vmemmap_optimize_enabled)) return; section_size = (1UL << PA_SECTION_SHIFT); list_for_each_entry(m, &huge_boot_pages[nid], list) { if (!vmemmap_should_optimize_bootmem_page(m)) continue; nr_pages = pages_per_huge_page(m->hstate); psize = nr_pages << PAGE_SHIFT; paddr = virt_to_phys(m); pfn = PHYS_PFN(paddr); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); if (vmemmap_populate_hvo(start, end, nid, HUGETLB_VMEMMAP_RESERVE_SIZE) < 0) continue; memmap_boot_pages_add(HUGETLB_VMEMMAP_RESERVE_SIZE / PAGE_SIZE); pnum = pfn_to_section_nr(pfn); ns = psize / section_size; for (i = 0; i < ns; i++) { sparse_init_early_section(nid, map, pnum, SECTION_IS_VMEMMAP_PREINIT); map += section_map_size(); pnum++; } m->flags |= HUGE_BOOTMEM_HVO; } } void __init hugetlb_vmemmap_init_late(int nid) { struct huge_bootmem_page *m, *tm; unsigned long phys, nr_pages, start, end; unsigned long pfn, nr_mmap; struct hstate *h; void *map; if (!hugetlb_bootmem_allocated()) return; if (!READ_ONCE(vmemmap_optimize_enabled)) return; list_for_each_entry_safe(m, tm, &huge_boot_pages[nid], list) { if (!(m->flags & HUGE_BOOTMEM_HVO)) continue; phys = virt_to_phys(m); h = m->hstate; pfn = PHYS_PFN(phys); nr_pages = pages_per_huge_page(h); if (!hugetlb_bootmem_page_zones_valid(nid, m)) { /* * Oops, the hugetlb page spans multiple zones. * Remove it from the list, and undo HVO. */ list_del(&m->list); map = pfn_to_page(pfn); start = (unsigned long)map; end = start + nr_pages * sizeof(struct page); vmemmap_undo_hvo(start, end, nid, HUGETLB_VMEMMAP_RESERVE_SIZE); nr_mmap = end - start - HUGETLB_VMEMMAP_RESERVE_SIZE; memmap_boot_pages_add(DIV_ROUND_UP(nr_mmap, PAGE_SIZE)); memblock_phys_free(phys, huge_page_size(h)); continue; } else m->flags |= HUGE_BOOTMEM_ZONES_VALID; } } #endif static const struct ctl_table hugetlb_vmemmap_sysctls[] = { { .procname = "hugetlb_optimize_vmemmap", .data = &vmemmap_optimize_enabled, .maxlen = sizeof(vmemmap_optimize_enabled), .mode = 0644, .proc_handler = proc_dobool, }, }; static int __init hugetlb_vmemmap_init(void) { const struct hstate *h; /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */ BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES); for_each_hstate(h) { if (hugetlb_vmemmap_optimizable(h)) { register_sysctl_init("vm", hugetlb_vmemmap_sysctls); break; } } return 0; } late_initcall(hugetlb_vmemmap_init); |
| 12 12 257 257 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 | // SPDX-License-Identifier: GPL-2.0 /* * shstk.c - Intel shadow stack support * * Copyright (c) 2021, Intel Corporation. * Yu-cheng Yu <yu-cheng.yu@intel.com> */ #include <linux/sched.h> #include <linux/bitops.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/sched/signal.h> #include <linux/compat.h> #include <linux/sizes.h> #include <linux/user.h> #include <linux/syscalls.h> #include <asm/msr.h> #include <asm/fpu/xstate.h> #include <asm/fpu/types.h> #include <asm/shstk.h> #include <asm/special_insns.h> #include <asm/fpu/api.h> #include <asm/prctl.h> #define SS_FRAME_SIZE 8 static bool features_enabled(unsigned long features) { return current->thread.features & features; } static void features_set(unsigned long features) { current->thread.features |= features; } static void features_clr(unsigned long features) { current->thread.features &= ~features; } /* * Create a restore token on the shadow stack. A token is always 8-byte * and aligned to 8. */ static int create_rstor_token(unsigned long ssp, unsigned long *token_addr) { unsigned long addr; /* Token must be aligned */ if (!IS_ALIGNED(ssp, 8)) return -EINVAL; addr = ssp - SS_FRAME_SIZE; /* * SSP is aligned, so reserved bits and mode bit are a zero, just mark * the token 64-bit. */ ssp |= BIT(0); if (write_user_shstk_64((u64 __user *)addr, (u64)ssp)) return -EFAULT; if (token_addr) *token_addr = addr; return 0; } /* * VM_SHADOW_STACK will have a guard page. This helps userspace protect * itself from attacks. The reasoning is as follows: * * The shadow stack pointer(SSP) is moved by CALL, RET, and INCSSPQ. The * INCSSP instruction can increment the shadow stack pointer. It is the * shadow stack analog of an instruction like: * * addq $0x80, %rsp * * However, there is one important difference between an ADD on %rsp * and INCSSP. In addition to modifying SSP, INCSSP also reads from the * memory of the first and last elements that were "popped". It can be * thought of as acting like this: * * READ_ONCE(ssp); // read+discard top element on stack * ssp += nr_to_pop * 8; // move the shadow stack * READ_ONCE(ssp-8); // read+discard last popped stack element * * The maximum distance INCSSP can move the SSP is 2040 bytes, before * it would read the memory. Therefore a single page gap will be enough * to prevent any operation from shifting the SSP to an adjacent stack, * since it would have to land in the gap at least once, causing a * fault. */ static unsigned long alloc_shstk(unsigned long addr, unsigned long size, unsigned long token_offset, bool set_res_tok) { int flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_ABOVE4G; struct mm_struct *mm = current->mm; unsigned long mapped_addr, unused; if (addr) flags |= MAP_FIXED_NOREPLACE; mmap_write_lock(mm); mapped_addr = do_mmap(NULL, addr, size, PROT_READ, flags, VM_SHADOW_STACK | VM_WRITE, 0, &unused, NULL); mmap_write_unlock(mm); if (!set_res_tok || IS_ERR_VALUE(mapped_addr)) goto out; if (create_rstor_token(mapped_addr + token_offset, NULL)) { vm_munmap(mapped_addr, size); return -EINVAL; } out: return mapped_addr; } static unsigned long adjust_shstk_size(unsigned long size) { if (size) return PAGE_ALIGN(size); return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G)); } static void unmap_shadow_stack(u64 base, u64 size) { int r; r = vm_munmap(base, size); /* * mmap_write_lock_killable() failed with -EINTR. This means * the process is about to die and have it's MM cleaned up. * This task shouldn't ever make it back to userspace. In this * case it is ok to leak a shadow stack, so just exit out. */ if (r == -EINTR) return; /* * For all other types of vm_munmap() failure, either the * system is out of memory or there is bug. */ WARN_ON_ONCE(r); } static int shstk_setup(void) { struct thread_shstk *shstk = ¤t->thread.shstk; unsigned long addr, size; /* Already enabled */ if (features_enabled(ARCH_SHSTK_SHSTK)) return 0; /* Also not supported for 32 bit */ if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || in_ia32_syscall()) return -EOPNOTSUPP; size = adjust_shstk_size(0); addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return PTR_ERR((void *)addr); fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, addr + size); wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); fpregs_unlock(); shstk->base = addr; shstk->size = size; features_set(ARCH_SHSTK_SHSTK); return 0; } void reset_thread_features(void) { memset(¤t->thread.shstk, 0, sizeof(struct thread_shstk)); current->thread.features = 0; current->thread.features_locked = 0; } unsigned long shstk_alloc_thread_stack(struct task_struct *tsk, unsigned long clone_flags, unsigned long stack_size) { struct thread_shstk *shstk = &tsk->thread.shstk; unsigned long addr, size; /* * If shadow stack is not enabled on the new thread, skip any * switch to a new shadow stack. */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; /* * For CLONE_VFORK the child will share the parents shadow stack. * Make sure to clear the internal tracking of the thread shadow * stack so the freeing logic run for child knows to leave it alone. */ if (clone_flags & CLONE_VFORK) { shstk->base = 0; shstk->size = 0; return 0; } /* * For !CLONE_VM the child will use a copy of the parents shadow * stack. */ if (!(clone_flags & CLONE_VM)) return 0; size = adjust_shstk_size(stack_size); addr = alloc_shstk(0, size, 0, false); if (IS_ERR_VALUE(addr)) return addr; shstk->base = addr; shstk->size = size; return addr + size; } static unsigned long get_user_shstk_addr(void) { unsigned long long ssp; fpregs_lock_and_load(); rdmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return ssp; } #define SHSTK_DATA_BIT BIT(63) static int put_shstk_data(u64 __user *addr, u64 data) { if (WARN_ON_ONCE(data & SHSTK_DATA_BIT)) return -EINVAL; /* * Mark the high bit so that the sigframe can't be processed as a * return address. */ if (write_user_shstk_64(addr, data | SHSTK_DATA_BIT)) return -EFAULT; return 0; } static int get_shstk_data(unsigned long *data, unsigned long __user *addr) { unsigned long ldata; if (unlikely(get_user(ldata, addr))) return -EFAULT; if (!(ldata & SHSTK_DATA_BIT)) return -EINVAL; *data = ldata & ~SHSTK_DATA_BIT; return 0; } static int shstk_push_sigframe(unsigned long *ssp) { unsigned long target_ssp = *ssp; /* Token must be aligned */ if (!IS_ALIGNED(target_ssp, 8)) return -EINVAL; *ssp -= SS_FRAME_SIZE; if (put_shstk_data((void __user *)*ssp, target_ssp)) return -EFAULT; return 0; } static int shstk_pop_sigframe(unsigned long *ssp) { struct vm_area_struct *vma; unsigned long token_addr; bool need_to_check_vma; int err = 1; /* * It is possible for the SSP to be off the end of a shadow stack by 4 * or 8 bytes. If the shadow stack is at the start of a page or 4 bytes * before it, it might be this case, so check that the address being * read is actually shadow stack. */ if (!IS_ALIGNED(*ssp, 8)) return -EINVAL; need_to_check_vma = PAGE_ALIGN(*ssp) == *ssp; if (need_to_check_vma) mmap_read_lock_killable(current->mm); err = get_shstk_data(&token_addr, (unsigned long __user *)*ssp); if (unlikely(err)) goto out_err; if (need_to_check_vma) { vma = find_vma(current->mm, *ssp); if (!vma || !(vma->vm_flags & VM_SHADOW_STACK)) { err = -EFAULT; goto out_err; } mmap_read_unlock(current->mm); } /* Restore SSP aligned? */ if (unlikely(!IS_ALIGNED(token_addr, 8))) return -EINVAL; /* SSP in userspace? */ if (unlikely(token_addr >= TASK_SIZE_MAX)) return -EINVAL; *ssp = token_addr; return 0; out_err: if (need_to_check_vma) mmap_read_unlock(current->mm); return err; } int setup_signal_shadow_stack(struct ksignal *ksig) { void __user *restorer = ksig->ka.sa.sa_restorer; unsigned long ssp; int err; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return 0; if (!restorer) return -EINVAL; ssp = get_user_shstk_addr(); if (unlikely(!ssp)) return -EINVAL; err = shstk_push_sigframe(&ssp); if (unlikely(err)) return err; /* Push restorer address */ ssp -= SS_FRAME_SIZE; err = write_user_shstk_64((u64 __user *)ssp, (u64)restorer); if (unlikely(err)) return -EFAULT; fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; } int restore_signal_shadow_stack(void) { unsigned long ssp; int err; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return 0; ssp = get_user_shstk_addr(); if (unlikely(!ssp)) return -EINVAL; err = shstk_pop_sigframe(&ssp); if (unlikely(err)) return err; fpregs_lock_and_load(); wrmsrl(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; } void shstk_free(struct task_struct *tsk) { struct thread_shstk *shstk = &tsk->thread.shstk; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || !features_enabled(ARCH_SHSTK_SHSTK)) return; /* * When fork() with CLONE_VM fails, the child (tsk) already has a * shadow stack allocated, and exit_thread() calls this function to * free it. In this case the parent (current) and the child share * the same mm struct. */ if (!tsk->mm || tsk->mm != current->mm) return; /* * If shstk->base is NULL, then this task is not managing its * own shadow stack (CLONE_VFORK). So skip freeing it. */ if (!shstk->base) return; /* * shstk->base is NULL for CLONE_VFORK child tasks, and so is * normal. But size = 0 on a shstk->base is not normal and * indicated an attempt to free the thread shadow stack twice. * Warn about it. */ if (WARN_ON(!shstk->size)) return; unmap_shadow_stack(shstk->base, shstk->size); shstk->size = 0; } static int wrss_control(bool enable) { u64 msrval; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; /* * Only enable WRSS if shadow stack is enabled. If shadow stack is not * enabled, WRSS will already be disabled, so don't bother clearing it * when disabling. */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return -EPERM; /* Already enabled/disabled? */ if (features_enabled(ARCH_SHSTK_WRSS) == enable) return 0; fpregs_lock_and_load(); rdmsrl(MSR_IA32_U_CET, msrval); if (enable) { features_set(ARCH_SHSTK_WRSS); msrval |= CET_WRSS_EN; } else { features_clr(ARCH_SHSTK_WRSS); if (!(msrval & CET_WRSS_EN)) goto unlock; msrval &= ~CET_WRSS_EN; } wrmsrl(MSR_IA32_U_CET, msrval); unlock: fpregs_unlock(); return 0; } static int shstk_disable(void) { if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; /* Already disabled? */ if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; fpregs_lock_and_load(); /* Disable WRSS too when disabling shadow stack */ wrmsrl(MSR_IA32_U_CET, 0); wrmsrl(MSR_IA32_PL3_SSP, 0); fpregs_unlock(); shstk_free(current); features_clr(ARCH_SHSTK_SHSTK | ARCH_SHSTK_WRSS); return 0; } SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsigned int, flags) { bool set_tok = flags & SHADOW_STACK_SET_TOKEN; unsigned long aligned_size; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK)) return -EOPNOTSUPP; if (flags & ~SHADOW_STACK_SET_TOKEN) return -EINVAL; /* If there isn't space for a token */ if (set_tok && size < 8) return -ENOSPC; if (addr && addr < SZ_4G) return -ERANGE; /* * An overflow would result in attempting to write the restore token * to the wrong location. Not catastrophic, but just return the right * error code and block it. */ aligned_size = PAGE_ALIGN(size); if (aligned_size < size) return -EOVERFLOW; return alloc_shstk(addr, aligned_size, size, set_tok); } long shstk_prctl(struct task_struct *task, int option, unsigned long arg2) { unsigned long features = arg2; if (option == ARCH_SHSTK_STATUS) { return put_user(task->thread.features, (unsigned long __user *)arg2); } if (option == ARCH_SHSTK_LOCK) { task->thread.features_locked |= features; return 0; } /* Only allow via ptrace */ if (task != current) { if (option == ARCH_SHSTK_UNLOCK && IS_ENABLED(CONFIG_CHECKPOINT_RESTORE)) { task->thread.features_locked &= ~features; return 0; } return -EINVAL; } /* Do not allow to change locked features */ if (features & task->thread.features_locked) return -EPERM; /* Only support enabling/disabling one feature at a time. */ if (hweight_long(features) > 1) return -EINVAL; if (option == ARCH_SHSTK_DISABLE) { if (features & ARCH_SHSTK_WRSS) return wrss_control(false); if (features & ARCH_SHSTK_SHSTK) return shstk_disable(); return -EINVAL; } /* Handle ARCH_SHSTK_ENABLE */ if (features & ARCH_SHSTK_SHSTK) return shstk_setup(); if (features & ARCH_SHSTK_WRSS) return wrss_control(true); return -EINVAL; } int shstk_update_last_frame(unsigned long val) { unsigned long ssp; if (!features_enabled(ARCH_SHSTK_SHSTK)) return 0; ssp = get_user_shstk_addr(); return write_user_shstk_64((u64 __user *)ssp, (u64)val); } bool shstk_is_enabled(void) { return features_enabled(ARCH_SHSTK_SHSTK); } |
| 88 88 5 5 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 | /* * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin * cleaned up code to current version of sparse and added the slicing-by-8 * algorithm to the closely similar existing slicing-by-4 algorithm. * * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was * subsequently included in the kernel, thus was re-licensed under the * GNU GPL v2. * * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> * Same crc32 function was used in 5 other places in the kernel. * I made one version, and deleted the others. * There are various incantations of crc32(). Some use a seed of 0 or ~0. * Some xor at the end with ~0. The generic crc32() function takes * seed as an argument, and doesn't xor at the end. Then individual * users can do whatever they need. * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. * fs/jffs2 uses seed 0, doesn't xor with ~0. * fs/partitions/efi.c uses seed ~0, xor's with ~0. * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ /* see: Documentation/staging/crc32.rst for a description of algorithms */ #include <linux/crc32.h> #include <linux/crc32poly.h> #include <linux/module.h> #include <linux/types.h> #include "crc32table.h" MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>"); MODULE_DESCRIPTION("Various CRC32 calculations"); MODULE_LICENSE("GPL"); u32 crc32_le_base(u32 crc, const u8 *p, size_t len) { while (len--) crc = (crc >> 8) ^ crc32table_le[(crc & 255) ^ *p++]; return crc; } EXPORT_SYMBOL(crc32_le_base); u32 crc32c_base(u32 crc, const u8 *p, size_t len) { while (len--) crc = (crc >> 8) ^ crc32ctable_le[(crc & 255) ^ *p++]; return crc; } EXPORT_SYMBOL(crc32c_base); /* * This multiplies the polynomials x and y modulo the given modulus. * This follows the "little-endian" CRC convention that the lsbit * represents the highest power of x, and the msbit represents x^0. */ static u32 gf2_multiply(u32 x, u32 y, u32 modulus) { u32 product = x & 1 ? y : 0; int i; for (i = 0; i < 31; i++) { product = (product >> 1) ^ (product & 1 ? modulus : 0); x >>= 1; product ^= x & 1 ? y : 0; } return product; } /** * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient) * @len: The number of bytes. @crc is multiplied by x^(8*@len) * @polynomial: The modulus used to reduce the result to 32 bits. * * It's possible to parallelize CRC computations by computing a CRC * over separate ranges of a buffer, then summing them. * This shifts the given CRC by 8*len bits (i.e. produces the same effect * as appending len bytes of zero to the data), in time proportional * to log(len). */ static u32 crc32_generic_shift(u32 crc, size_t len, u32 polynomial) { u32 power = polynomial; /* CRC of x^32 */ int i; /* Shift up to 32 bits in the simple linear way */ for (i = 0; i < 8 * (int)(len & 3); i++) crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0); len >>= 2; if (!len) return crc; for (;;) { /* "power" is x^(2^i), modulo the polynomial */ if (len & 1) crc = gf2_multiply(crc, power, polynomial); len >>= 1; if (!len) break; /* Square power, advancing to x^(2^(i+1)) */ power = gf2_multiply(power, power, polynomial); } return crc; } u32 crc32_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32_POLY_LE); } EXPORT_SYMBOL(crc32_le_shift); u32 crc32c_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32C_POLY_LE); } EXPORT_SYMBOL(crc32c_shift); u32 crc32_be_base(u32 crc, const u8 *p, size_t len) { while (len--) crc = (crc << 8) ^ crc32table_be[(crc >> 24) ^ *p++]; return crc; } EXPORT_SYMBOL(crc32_be_base); |
| 233 232 233 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ocfs2_lockid.h * * Defines OCFS2 lockid bits. * * Copyright (C) 2002, 2005 Oracle. All rights reserved. */ #ifndef OCFS2_LOCKID_H #define OCFS2_LOCKID_H /* lock ids are made up in the following manner: * name[0] --> type * name[1-6] --> 6 pad characters, reserved for now * name[7-22] --> block number, expressed in hex as 16 chars * name[23-30] --> i_generation, expressed in hex 8 chars * name[31] --> '\0' */ #define OCFS2_LOCK_ID_MAX_LEN 32 #define OCFS2_LOCK_ID_PAD "000000" #define OCFS2_DENTRY_LOCK_INO_START 18 enum ocfs2_lock_type { OCFS2_LOCK_TYPE_META = 0, OCFS2_LOCK_TYPE_DATA, OCFS2_LOCK_TYPE_SUPER, OCFS2_LOCK_TYPE_RENAME, OCFS2_LOCK_TYPE_RW, OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, OCFS2_LOCK_TYPE_QINFO, OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) { char c; switch (type) { case OCFS2_LOCK_TYPE_META: c = 'M'; break; case OCFS2_LOCK_TYPE_DATA: c = 'D'; break; case OCFS2_LOCK_TYPE_SUPER: c = 'S'; break; case OCFS2_LOCK_TYPE_RENAME: c = 'R'; break; case OCFS2_LOCK_TYPE_RW: c = 'W'; break; case OCFS2_LOCK_TYPE_DENTRY: c = 'N'; break; case OCFS2_LOCK_TYPE_OPEN: c = 'O'; break; case OCFS2_LOCK_TYPE_FLOCK: c = 'F'; break; case OCFS2_LOCK_TYPE_QINFO: c = 'Q'; break; case OCFS2_LOCK_TYPE_NFS_SYNC: c = 'Y'; break; case OCFS2_LOCK_TYPE_ORPHAN_SCAN: c = 'P'; break; case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; case OCFS2_LOCK_TYPE_TRIM_FS: c = 'I'; break; default: c = '\0'; } return c; } static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_META] = "Meta", [OCFS2_LOCK_TYPE_DATA] = "Data", [OCFS2_LOCK_TYPE_SUPER] = "Super", [OCFS2_LOCK_TYPE_RENAME] = "Rename", /* Need to differentiate from [R]ename.. serializing writes is the * important job it does, anyway. */ [OCFS2_LOCK_TYPE_RW] = "Write/Read", [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) { #ifdef __KERNEL__ BUG_ON(type >= OCFS2_NUM_LOCK_TYPES); #endif return ocfs2_lock_type_strings[type]; } #endif /* OCFS2_LOCKID_H */ |
| 23 2 2 19 19 13 12 8 13 13 12 12 2 10 10 10 1 1 1 9 6 1 2 2 7 1 10 1 9 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | // SPDX-License-Identifier: GPL-2.0 /* * Opening fs-verity files * * Copyright 2019 Google LLC */ #include "fsverity_private.h" #include <linux/mm.h> #include <linux/slab.h> static struct kmem_cache *fsverity_info_cachep; /** * fsverity_init_merkle_tree_params() - initialize Merkle tree parameters * @params: the parameters struct to initialize * @inode: the inode for which the Merkle tree is being built * @hash_algorithm: number of hash algorithm to use * @log_blocksize: log base 2 of block size to use * @salt: pointer to salt (optional) * @salt_size: size of salt, possibly 0 * * Validate the hash algorithm and block size, then compute the tree topology * (num levels, num blocks in each level, etc.) and initialize @params. * * Return: 0 on success, -errno on failure */ int fsverity_init_merkle_tree_params(struct merkle_tree_params *params, const struct inode *inode, unsigned int hash_algorithm, unsigned int log_blocksize, const u8 *salt, size_t salt_size) { const struct fsverity_hash_alg *hash_alg; int err; u64 blocks; u64 blocks_in_level[FS_VERITY_MAX_LEVELS]; u64 offset; int level; memset(params, 0, sizeof(*params)); hash_alg = fsverity_get_hash_alg(inode, hash_algorithm); if (IS_ERR(hash_alg)) return PTR_ERR(hash_alg); params->hash_alg = hash_alg; params->digest_size = hash_alg->digest_size; params->hashstate = fsverity_prepare_hash_state(hash_alg, salt, salt_size); if (IS_ERR(params->hashstate)) { err = PTR_ERR(params->hashstate); params->hashstate = NULL; fsverity_err(inode, "Error %d preparing hash state", err); goto out_err; } /* * fs/verity/ directly assumes that the Merkle tree block size is a * power of 2 less than or equal to PAGE_SIZE. Another restriction * arises from the interaction between fs/verity/ and the filesystems * themselves: filesystems expect to be able to verify a single * filesystem block of data at a time. Therefore, the Merkle tree block * size must also be less than or equal to the filesystem block size. * * The above are the only hard limitations, so in theory the Merkle tree * block size could be as small as twice the digest size. However, * that's not useful, and it would result in some unusually deep and * large Merkle trees. So we currently require that the Merkle tree * block size be at least 1024 bytes. That's small enough to test the * sub-page block case on systems with 4K pages, but not too small. */ if (log_blocksize < 10 || log_blocksize > PAGE_SHIFT || log_blocksize > inode->i_blkbits) { fsverity_warn(inode, "Unsupported log_blocksize: %u", log_blocksize); err = -EINVAL; goto out_err; } params->log_blocksize = log_blocksize; params->block_size = 1 << log_blocksize; params->log_blocks_per_page = PAGE_SHIFT - log_blocksize; params->blocks_per_page = 1 << params->log_blocks_per_page; if (WARN_ON_ONCE(!is_power_of_2(params->digest_size))) { err = -EINVAL; goto out_err; } if (params->block_size < 2 * params->digest_size) { fsverity_warn(inode, "Merkle tree block size (%u) too small for hash algorithm \"%s\"", params->block_size, hash_alg->name); err = -EINVAL; goto out_err; } params->log_digestsize = ilog2(params->digest_size); params->log_arity = log_blocksize - params->log_digestsize; params->hashes_per_block = 1 << params->log_arity; /* * Compute the number of levels in the Merkle tree and create a map from * level to the starting block of that level. Level 'num_levels - 1' is * the root and is stored first. Level 0 is the level directly "above" * the data blocks and is stored last. */ /* Compute number of levels and the number of blocks in each level */ blocks = ((u64)inode->i_size + params->block_size - 1) >> log_blocksize; while (blocks > 1) { if (params->num_levels >= FS_VERITY_MAX_LEVELS) { fsverity_err(inode, "Too many levels in Merkle tree"); err = -EFBIG; goto out_err; } blocks = (blocks + params->hashes_per_block - 1) >> params->log_arity; blocks_in_level[params->num_levels++] = blocks; } /* Compute the starting block of each level */ offset = 0; for (level = (int)params->num_levels - 1; level >= 0; level--) { params->level_start[level] = offset; offset += blocks_in_level[level]; } /* * With block_size != PAGE_SIZE, an in-memory bitmap will need to be * allocated to track the "verified" status of hash blocks. Don't allow * this bitmap to get too large. For now, limit it to 1 MiB, which * limits the file size to about 4.4 TB with SHA-256 and 4K blocks. * * Together with the fact that the data, and thus also the Merkle tree, * cannot have more than ULONG_MAX pages, this implies that hash block * indices can always fit in an 'unsigned long'. But to be safe, we * explicitly check for that too. Note, this is only for hash block * indices; data block indices might not fit in an 'unsigned long'. */ if ((params->block_size != PAGE_SIZE && offset > 1 << 23) || offset > ULONG_MAX) { fsverity_err(inode, "Too many blocks in Merkle tree"); err = -EFBIG; goto out_err; } params->tree_size = offset << log_blocksize; params->tree_pages = PAGE_ALIGN(params->tree_size) >> PAGE_SHIFT; return 0; out_err: kfree(params->hashstate); memset(params, 0, sizeof(*params)); return err; } /* * Compute the file digest by hashing the fsverity_descriptor excluding the * builtin signature and with the sig_size field set to 0. */ static int compute_file_digest(const struct fsverity_hash_alg *hash_alg, struct fsverity_descriptor *desc, u8 *file_digest) { __le32 sig_size = desc->sig_size; int err; desc->sig_size = 0; err = fsverity_hash_buffer(hash_alg, desc, sizeof(*desc), file_digest); desc->sig_size = sig_size; return err; } /* * Create a new fsverity_info from the given fsverity_descriptor (with optional * appended builtin signature), and check the signature if present. The * fsverity_descriptor must have already undergone basic validation. */ struct fsverity_info *fsverity_create_info(const struct inode *inode, struct fsverity_descriptor *desc) { struct fsverity_info *vi; int err; vi = kmem_cache_zalloc(fsverity_info_cachep, GFP_KERNEL); if (!vi) return ERR_PTR(-ENOMEM); vi->inode = inode; err = fsverity_init_merkle_tree_params(&vi->tree_params, inode, desc->hash_algorithm, desc->log_blocksize, desc->salt, desc->salt_size); if (err) { fsverity_err(inode, "Error %d initializing Merkle tree parameters", err); goto fail; } memcpy(vi->root_hash, desc->root_hash, vi->tree_params.digest_size); err = compute_file_digest(vi->tree_params.hash_alg, desc, vi->file_digest); if (err) { fsverity_err(inode, "Error %d computing file digest", err); goto fail; } err = fsverity_verify_signature(vi, desc->signature, le32_to_cpu(desc->sig_size)); if (err) goto fail; if (vi->tree_params.block_size != PAGE_SIZE) { /* * When the Merkle tree block size and page size differ, we use * a bitmap to keep track of which hash blocks have been * verified. This bitmap must contain one bit per hash block, * including alignment to a page boundary at the end. * * Eventually, to support extremely large files in an efficient * way, it might be necessary to make pages of this bitmap * reclaimable. But for now, simply allocating the whole bitmap * is a simple solution that works well on the files on which * fsverity is realistically used. E.g., with SHA-256 and 4K * blocks, a 100MB file only needs a 24-byte bitmap, and the * bitmap for any file under 17GB fits in a 4K page. */ unsigned long num_bits = vi->tree_params.tree_pages << vi->tree_params.log_blocks_per_page; vi->hash_block_verified = kvcalloc(BITS_TO_LONGS(num_bits), sizeof(unsigned long), GFP_KERNEL); if (!vi->hash_block_verified) { err = -ENOMEM; goto fail; } } return vi; fail: fsverity_free_info(vi); return ERR_PTR(err); } void fsverity_set_info(struct inode *inode, struct fsverity_info *vi) { /* * Multiple tasks may race to set ->i_verity_info, so use * cmpxchg_release(). This pairs with the smp_load_acquire() in * fsverity_get_info(). I.e., here we publish ->i_verity_info with a * RELEASE barrier so that other tasks can ACQUIRE it. */ if (cmpxchg_release(&inode->i_verity_info, NULL, vi) != NULL) { /* Lost the race, so free the fsverity_info we allocated. */ fsverity_free_info(vi); /* * Afterwards, the caller may access ->i_verity_info directly, * so make sure to ACQUIRE the winning fsverity_info. */ (void)fsverity_get_info(inode); } } void fsverity_free_info(struct fsverity_info *vi) { if (!vi) return; kfree(vi->tree_params.hashstate); kvfree(vi->hash_block_verified); kmem_cache_free(fsverity_info_cachep, vi); } static bool validate_fsverity_descriptor(struct inode *inode, const struct fsverity_descriptor *desc, size_t desc_size) { if (desc_size < sizeof(*desc)) { fsverity_err(inode, "Unrecognized descriptor size: %zu bytes", desc_size); return false; } if (desc->version != 1) { fsverity_err(inode, "Unrecognized descriptor version: %u", desc->version); return false; } if (memchr_inv(desc->__reserved, 0, sizeof(desc->__reserved))) { fsverity_err(inode, "Reserved bits set in descriptor"); return false; } if (desc->salt_size > sizeof(desc->salt)) { fsverity_err(inode, "Invalid salt_size: %u", desc->salt_size); return false; } if (le64_to_cpu(desc->data_size) != inode->i_size) { fsverity_err(inode, "Wrong data_size: %llu (desc) != %lld (inode)", le64_to_cpu(desc->data_size), inode->i_size); return false; } if (le32_to_cpu(desc->sig_size) > desc_size - sizeof(*desc)) { fsverity_err(inode, "Signature overflows verity descriptor"); return false; } return true; } /* * Read the inode's fsverity_descriptor (with optional appended builtin * signature) from the filesystem, and do basic validation of it. */ int fsverity_get_descriptor(struct inode *inode, struct fsverity_descriptor **desc_ret) { int res; struct fsverity_descriptor *desc; res = inode->i_sb->s_vop->get_verity_descriptor(inode, NULL, 0); if (res < 0) { fsverity_err(inode, "Error %d getting verity descriptor size", res); return res; } if (res > FS_VERITY_MAX_DESCRIPTOR_SIZE) { fsverity_err(inode, "Verity descriptor is too large (%d bytes)", res); return -EMSGSIZE; } desc = kmalloc(res, GFP_KERNEL); if (!desc) return -ENOMEM; res = inode->i_sb->s_vop->get_verity_descriptor(inode, desc, res); if (res < 0) { fsverity_err(inode, "Error %d reading verity descriptor", res); kfree(desc); return res; } if (!validate_fsverity_descriptor(inode, desc, res)) { kfree(desc); return -EINVAL; } *desc_ret = desc; return 0; } /* Ensure the inode has an ->i_verity_info */ static int ensure_verity_info(struct inode *inode) { struct fsverity_info *vi = fsverity_get_info(inode); struct fsverity_descriptor *desc; int err; if (vi) return 0; err = fsverity_get_descriptor(inode, &desc); if (err) return err; vi = fsverity_create_info(inode, desc); if (IS_ERR(vi)) { err = PTR_ERR(vi); goto out_free_desc; } fsverity_set_info(inode, vi); err = 0; out_free_desc: kfree(desc); return err; } int __fsverity_file_open(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) return -EPERM; return ensure_verity_info(inode); } EXPORT_SYMBOL_GPL(__fsverity_file_open); int __fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { if (attr->ia_valid & ATTR_SIZE) return -EPERM; return 0; } EXPORT_SYMBOL_GPL(__fsverity_prepare_setattr); void __fsverity_cleanup_inode(struct inode *inode) { fsverity_free_info(inode->i_verity_info); inode->i_verity_info = NULL; } EXPORT_SYMBOL_GPL(__fsverity_cleanup_inode); void __init fsverity_init_info_cache(void) { fsverity_info_cachep = KMEM_CACHE_USERCOPY( fsverity_info, SLAB_RECLAIM_ACCOUNT | SLAB_PANIC, file_digest); } |
| 70 403 335 13 188 173 293 335 296 232 367 367 368 368 5 25 438 415 25 16 17 15 24 6 6 6 5 5 5 9 9 7 7 7 9 9 9 9 7 2 2 2 9 6 9 6 5 1 6 1 314 322 6 9 9 438 437 433 431 17 6 2 18 292 294 293 63 1 293 246 48 376 156 220 315 376 315 315 184 315 316 183 21 294 294 294 294 294 313 376 375 156 219 308 70 377 308 70 378 378 3 82 256 38 378 469 389 103 429 428 429 428 429 429 334 335 331 335 162 6 160 173 26 173 335 32 335 128 52 13 94 127 120 83 128 93 189 17 173 173 171 19 19 112 82 19 166 126 120 132 114 101 128 331 331 49 329 329 329 95 95 77 90 329 334 160 261 335 82 324 307 335 308 106 308 202 202 200 2 200 2 335 331 292 112 308 88 268 214 71 192 188 328 268 242 54 190 210 71 71 71 414 393 8 10 63 10 57 17 17 55 40 23 23 40 40 40 328 328 241 212 328 328 2 2 2 2 2 2 2 355 355 25 1 234 41 16 41 217 208 408 408 70 21 409 81 374 253 191 111 79 86 353 409 70 409 409 387 35 28 213 89 408 372 408 370 36 414 277 205 27 287 62 216 352 132 408 323 325 408 320 245 164 409 245 331 40 316 10 10 2 2 328 407 409 320 397 71 407 18 409 374 371 373 374 374 373 374 374 374 374 374 374 373 373 374 366 6 373 1 164 209 373 372 1 1 1 1 1 1 1 1 1 1 374 373 21 374 374 373 374 374 374 373 352 21 374 374 25 25 25 19 6 25 25 25 25 25 25 25 25 25 16 16 16 16 16 393 11 392 1 393 392 392 392 331 162 11 392 239 154 154 139 55 392 392 331 330 330 11 329 330 143 26 26 26 121 26 26 23 4 23 23 4 4 23 23 23 26 415 280 143 124 124 98 33 124 20 51 56 23 19 2 22 6 7 7 124 68 74 124 124 121 124 124 1 4 77 74 72 416 416 61 37 26 5 6 90 89 90 90 90 61 417 260 246 18 63 63 61 20 61 144 149 149 149 149 149 149 294 292 88 417 88 42 42 42 42 88 88 47 62 62 17 57 18 62 88 88 88 88 88 77 77 77 74 22 74 74 73 4 89 70 21 21 21 2 21 21 21 21 21 21 21 581 220 478 475 61 61 475 61 61 60 61 416 392 394 394 260 249 250 210 59 149 417 415 413 417 392 90 89 145 330 147 330 335 148 416 89 89 89 89 89 89 21 70 70 429 429 379 60 34 85 58 58 338 336 385 61 416 26 153 309 11 330 89 330 416 363 93 329 33 70 416 197 33 190 190 190 28 162 161 161 150 12 162 1 161 1 162 209 193 35 209 7 209 192 35 192 10 34 12 190 27 7 23 144 94 94 197 5 5 8 8 14 9 1 8 2 2 3 5 5 6 6 6 5 3 9 9 9 18 18 18 18 1 1 6 10 10 2 11 14 4 10 11 2 9 9 16 22 2 20 3 16 16 3 7 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 6196 6197 6198 6199 6200 6201 6202 6203 6204 6205 6206 6207 6208 6209 6210 6211 6212 6213 6214 6215 6216 6217 6218 6219 6220 6221 6222 6223 6224 6225 6226 6227 6228 6229 6230 6231 6232 6233 6234 6235 6236 6237 6238 6239 6240 6241 6242 6243 6244 6245 6246 6247 6248 6249 6250 6251 6252 6253 6254 6255 6256 6257 6258 6259 6260 6261 6262 6263 6264 6265 6266 6267 6268 6269 6270 6271 6272 6273 6274 6275 6276 6277 6278 6279 6280 6281 6282 6283 6284 6285 6286 6287 6288 6289 6290 6291 6292 6293 6294 6295 6296 6297 6298 6299 6300 6301 6302 6303 6304 6305 6306 6307 6308 6309 6310 6311 6312 6313 6314 6315 6316 6317 6318 6319 6320 6321 6322 6323 6324 6325 6326 6327 6328 6329 6330 6331 6332 6333 6334 6335 6336 6337 6338 6339 6340 6341 6342 6343 6344 6345 6346 6347 6348 6349 6350 6351 6352 6353 6354 6355 6356 6357 6358 6359 6360 6361 6362 6363 6364 6365 6366 6367 6368 6369 6370 6371 6372 6373 6374 6375 6376 6377 6378 6379 6380 6381 6382 6383 6384 6385 6386 6387 6388 6389 6390 6391 6392 6393 6394 6395 6396 6397 6398 6399 6400 6401 6402 6403 6404 6405 6406 6407 6408 6409 6410 6411 6412 6413 6414 6415 6416 6417 6418 6419 6420 6421 6422 6423 6424 6425 6426 6427 6428 6429 6430 6431 6432 6433 6434 6435 6436 6437 6438 6439 6440 6441 6442 6443 6444 6445 6446 6447 6448 6449 6450 6451 6452 6453 6454 6455 6456 6457 6458 6459 6460 6461 6462 6463 6464 6465 6466 6467 6468 6469 6470 6471 6472 6473 6474 6475 6476 6477 6478 6479 6480 6481 6482 6483 6484 6485 6486 6487 6488 6489 6490 6491 6492 6493 6494 6495 6496 6497 6498 6499 6500 6501 6502 6503 6504 6505 6506 6507 6508 6509 6510 6511 6512 6513 6514 6515 6516 6517 6518 6519 6520 6521 6522 6523 6524 6525 6526 6527 6528 6529 6530 6531 6532 6533 6534 6535 6536 6537 6538 6539 6540 6541 6542 6543 6544 6545 6546 6547 6548 6549 6550 6551 6552 6553 6554 6555 6556 6557 6558 6559 6560 6561 6562 6563 6564 6565 6566 6567 6568 6569 6570 6571 6572 6573 6574 6575 6576 6577 6578 6579 6580 6581 6582 6583 6584 6585 6586 6587 6588 6589 6590 6591 6592 6593 6594 6595 6596 6597 6598 6599 6600 6601 6602 6603 6604 6605 6606 6607 6608 6609 6610 6611 6612 6613 6614 6615 6616 6617 6618 6619 6620 6621 6622 6623 6624 6625 6626 6627 6628 6629 6630 6631 6632 6633 6634 6635 6636 6637 6638 6639 6640 6641 6642 6643 6644 6645 6646 6647 6648 6649 6650 6651 6652 6653 6654 6655 6656 6657 6658 6659 6660 6661 6662 6663 6664 6665 6666 6667 6668 6669 6670 6671 6672 6673 6674 6675 6676 6677 6678 6679 6680 6681 6682 6683 6684 6685 6686 6687 6688 6689 6690 6691 6692 6693 6694 6695 6696 6697 6698 6699 6700 6701 6702 6703 6704 6705 6706 6707 6708 6709 6710 6711 6712 6713 6714 6715 6716 6717 6718 6719 6720 6721 6722 6723 6724 6725 6726 6727 6728 6729 6730 6731 6732 6733 6734 6735 6736 6737 6738 6739 6740 6741 6742 6743 6744 6745 6746 6747 6748 6749 6750 6751 6752 6753 6754 6755 6756 6757 6758 6759 6760 6761 6762 6763 6764 6765 6766 6767 6768 6769 6770 6771 6772 6773 6774 6775 6776 6777 6778 6779 6780 6781 6782 6783 6784 6785 6786 6787 6788 6789 6790 6791 6792 6793 6794 6795 6796 6797 6798 6799 6800 6801 6802 6803 6804 6805 6806 6807 6808 6809 6810 6811 6812 6813 6814 6815 6816 6817 6818 6819 6820 6821 6822 6823 6824 6825 6826 6827 6828 6829 6830 6831 6832 6833 6834 6835 6836 6837 6838 6839 6840 6841 6842 6843 6844 6845 6846 6847 6848 6849 6850 6851 6852 6853 6854 6855 6856 6857 6858 6859 6860 6861 6862 6863 6864 6865 6866 6867 6868 6869 6870 6871 6872 6873 6874 6875 6876 6877 6878 6879 6880 6881 6882 6883 6884 6885 6886 6887 6888 6889 6890 6891 6892 6893 6894 6895 6896 6897 6898 6899 6900 6901 6902 6903 6904 6905 6906 6907 6908 6909 6910 6911 6912 6913 6914 6915 6916 6917 6918 6919 6920 6921 6922 6923 6924 6925 6926 6927 6928 6929 6930 6931 6932 6933 6934 6935 6936 6937 6938 6939 6940 6941 6942 6943 6944 6945 6946 6947 6948 6949 6950 6951 6952 6953 6954 6955 6956 6957 6958 6959 6960 6961 6962 6963 6964 6965 6966 6967 6968 6969 6970 6971 6972 6973 6974 6975 6976 6977 6978 6979 6980 6981 6982 6983 6984 6985 6986 6987 6988 6989 6990 6991 6992 6993 6994 6995 6996 6997 6998 6999 7000 7001 7002 7003 7004 7005 7006 7007 7008 7009 7010 7011 7012 7013 7014 7015 7016 7017 7018 7019 7020 7021 7022 7023 7024 7025 7026 7027 7028 7029 7030 7031 7032 7033 7034 7035 7036 7037 7038 7039 7040 7041 7042 7043 7044 7045 7046 7047 7048 7049 7050 7051 7052 7053 7054 7055 7056 | // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ /* * mballoc.c contains the multiblocks allocation routines */ #include "ext4_jbd2.h" #include "mballoc.h" #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/nospec.h> #include <linux/backing-dev.h> #include <linux/freezer.h> #include <trace/events/ext4.h> #include <kunit/static_stub.h> /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() * - search for metadata in few groups * * TODO v4: * - normalization should take into account whether file is still open * - discard preallocations if no free space left (policy?) * - don't normalize tails * - quota * - reservation for superuser * * TODO v3: * - bitmap read-ahead (proposed by Oleg Drokin aka green) * - track min/max extents in each group for better group selection * - mb_mark_used() may allocate chunk right after splitting buddy * - tree of groups sorted by number of free blocks * - error handling */ /* * The allocation request involve request for multiple number of blocks * near to the goal(block) value specified. * * During initialization phase of the allocator we decide to use the * group preallocation or inode preallocation depending on the size of * the file. The size of the file could be the resulting file size we * would have after allocation, or the current file size, which ever * is larger. If the size is less than sbi->s_mb_stream_request we * select to use the group preallocation. The default value of * s_mb_stream_request is 16 blocks. This can also be tuned via * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in * terms of number of blocks. * * The main motivation for having small file use group preallocation is to * ensure that we have small files closer together on the disk. * * First stage the allocator looks at the inode prealloc list, * ext4_inode_info->i_prealloc_list, which contains list of prealloc * spaces for this particular inode. The inode prealloc space is * represented as: * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space * pa_len -> length for this prealloc space (in clusters) * pa_free -> free space available in this prealloc space (in clusters) * * The inode preallocation space is used looking at the _logical_ start * block. If only the logical file block falls within the range of prealloc * space we will consume the particular prealloc space. This makes sure that * we have contiguous physical blocks representing the file blocks * * The important thing to be noted in case of inode prealloc space is that * we don't modify the values associated to inode prealloc space except * pa_free. * * If we are not able to find blocks in the inode prealloc space and if we * have the group allocation flag set then we look at the locality group * prealloc space. These are per CPU prealloc list represented as * * ext4_sb_info.s_locality_groups[smp_processor_id()] * * The reason for having a per cpu locality group is to reduce the contention * between CPUs. It is possible to get scheduled at this point. * * The locality group prealloc space is used looking at whether we have * enough free space (pa_free) within the prealloc space. * * If we can't allocate blocks via inode prealloc or/and locality group * prealloc then we look at the buddy cache. The buddy cache is represented * by ext4_sb_info.s_buddy_cache (struct inode) whose file offset gets * mapped to the buddy and bitmap information regarding different * groups. The buddy information is attached to buddy cache inode so that * we can access them through the page cache. The information regarding * each group is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are stored in the * inode as: * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we * take up 2 blocks. A page can contain blocks_per_page (PAGE_SIZE / * blocksize) blocks. So it can have information regarding groups_per_page * which is blocks_per_page/2 * * The buddy cache inode is not stored on disk. The inode is thrown * away when the filesystem is unmounted. * * We look for count number of blocks in the buddy cache. If we were able * to locate that many free blocks we return with additional information * regarding rest of the contiguous physical block available * * Before allocating blocks via buddy cache we normalize the request * blocks. This ensure we ask for more blocks that we needed. The extra * blocks that we get after allocation is added to the respective prealloc * list. In case of inode preallocation we follow a list of heuristics * based on file size. This can be found in ext4_mb_normalize_request. If * we are doing a group prealloc we try to normalize the request to * sbi->s_mb_group_prealloc. The default value of s_mb_group_prealloc is * dependent on the cluster size; for non-bigalloc file systems, it is * 512 blocks. This can be tuned via * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in * terms of number of blocks. If we have mounted the file system with -O * stripe=<value> option the group prealloc request is normalized to the * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * * If "mb_optimize_scan" mount option is set, we maintain in memory group info * structures in two data structures: * * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) * * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) * * This is an array of lists where the index in the array represents the * largest free order in the buddy bitmap of the participating group infos of * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total * number of buddy bitmap orders possible) number of lists. Group-infos are * placed in appropriate lists. * * 2) Average fragment size lists (sbi->s_mb_avg_fragment_size) * * Locking: sbi->s_mb_avg_fragment_size_locks(array of rw locks) * * This is an array of lists where in the i-th list there are groups with * average fragment size >= 2^i and < 2^(i+1). The average fragment size * is computed as ext4_group_info->bb_free / ext4_group_info->bb_fragments. * Note that we don't bother with a special list for completely empty groups * so we only have MB_NUM_ORDERS(sb) lists. * * When "mb_optimize_scan" mount option is set, mballoc consults the above data * structures to decide the order in which groups are to be traversed for * fulfilling an allocation request. * * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order * >= the order of the request. We directly look at the largest free order list * in the data structure (1) above where largest_free_order = order of the * request. If that list is empty, we look at remaining list in the increasing * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED * lookup in O(1) time. * * At CR_GOAL_LEN_FAST, we only consider groups where * average fragment size > request size. So, we lookup a group which has average * fragment size just above or equal to request size using our average fragment * size group lists (data structure 2) in O(1) time. * * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in * CR_GOAL_LEN_FAST suggests that there is no BG that has avg * fragment size > goal length. So before falling to the slower * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big * enough average fragment size. This increases the chances of finding a * suitable block group in O(1) time and results in faster allocation at the * cost of reduced size of allocation. * * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and * CR_GOAL_LEN_FAST phase. * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The * value of s_mb_order2_reqs can be tuned via * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to * stripe size (sbi->s_stripe), we try to search for contiguous block in * stripe size. This should result in better allocation on RAID setups. If * not, we search in the specific group using bitmap for best extents. The * tunable min_to_scan and max_to_scan control the behaviour here. * min_to_scan indicate how long the mballoc __must__ look for a best * extent and max_to_scan indicates how long the mballoc __can__ look for a * best extent in the found extents. Searching for the blocks starts with * the group specified as the goal value in allocation context via * ac_g_ex. Each group is first checked based on the criteria whether it * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. For * non rotational devices, this value defaults to 0 and for rotational devices * this is set to MB_DEFAULT_LINEAR_LIMIT. * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the * subsequent request. */ /* * mballoc operates on the following data: * - on-disk bitmap * - in-core buddy (actually includes buddy and bitmap) * - preallocation descriptors (PAs) * * there are two types of preallocations: * - inode * assiged to specific inode and can be used for this inode only. * it describes part of inode's space preallocated to specific * physical blocks. any block from that preallocated can be used * independent. the descriptor just tracks number of blocks left * unused. so, before taking some block from descriptor, one must * make sure corresponded logical block isn't allocated yet. this * also means that freeing any block within descriptor's range * must discard all preallocated blocks. * - locality group * assigned to specific locality group which does not translate to * permanent set of inodes: inode can join and leave group. space * from this type of preallocation can be used for any inode. thus * it's consumed from the beginning to the end. * * relation between them can be expressed as: * in-core buddy = on-disk bitmap + preallocation descriptors * * this mean blocks mballoc considers used are: * - allocated blocks (persistent) * - preallocated blocks (non-persistent) * * consistency in mballoc world means that at any time a block is either * free or used in ALL structures. notice: "any time" should not be read * literally -- time is discrete and delimited by locks. * * to keep it simple, we don't use block numbers, instead we count number of * blocks: how many blocks marked used/free in on-disk bitmap, buddy and PA. * * all operations can be expressed as: * - init buddy: buddy = on-disk + PAs * - new PA: buddy += N; PA = N * - use inode PA: on-disk += N; PA -= N * - discard inode PA buddy -= on-disk - PA; PA = 0 * - use locality group PA on-disk += N; PA -= N * - discard locality group PA buddy -= PA; PA = 0 * note: 'buddy -= on-disk - PA' is used to show that on-disk bitmap * is used in real operation because we can't know actual used * bits from PA, only from on-disk bitmap * * if we follow this strict logic, then all operations above should be atomic. * given some of them can block, we'd have to use something like semaphores * killing performance on high-end SMP hardware. let's try to relax it using * the following knowledge: * 1) if buddy is referenced, it's already initialized * 2) while block is used in buddy and the buddy is referenced, * nobody can re-allocate that block * 3) we work on bitmaps and '+' actually means 'set bits'. if on-disk has * bit set and PA claims same block, it's OK. IOW, one can set bit in * on-disk bitmap if buddy has same bit set or/and PA covers corresponded * block * * so, now we're building a concurrency table: * - init buddy vs. * - new PA * blocks for PA are allocated in the buddy, buddy must be referenced * until PA is linked to allocation group to avoid concurrent buddy init * - use inode PA * we need to make sure that either on-disk bitmap or PA has uptodate data * given (3) we care that PA-=N operation doesn't interfere with init * - discard inode PA * the simplest way would be to have buddy initialized by the discard * - use locality group PA * again PA-=N must be serialized with init * - discard locality group PA * the simplest way would be to have buddy initialized by the discard * - new PA vs. * - use inode PA * i_data_sem serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * some mutex should serialize them * - discard locality group PA * discard process must wait until PA isn't used by another process * - use inode PA * - use inode PA * i_data_sem or another mutex should serializes them * - discard inode PA * discard process must wait until PA isn't used by another process * - use locality group PA * nothing wrong here -- they're different PAs covering different blocks * - discard locality group PA * discard process must wait until PA isn't used by another process * * now we're ready to make few consequences: * - PA is referenced and while it is no discard is possible * - PA is referenced until block isn't marked in on-disk bitmap * - PA changes only after on-disk bitmap * - discard must not compete with init. either init is done before * any discard or they're serialized somehow * - buddy init as sum of on-disk bitmap and PAs is done atomically * * a special case when we've used PA to emptiness. no need to modify buddy * in this case, but we should care about concurrent init * */ /* * Logic in few words: * * - allocation: * load group * find blocks * mark bits in on-disk bitmap * release group * * - use preallocation: * find proper PA (per-inode or group) * load group * mark bits in on-disk bitmap * release group * release PA * * - free: * load group * mark bits in on-disk bitmap * release group * * - discard preallocations in group: * mark PAs deleted * move them onto local list * load on-disk bitmap * load group * remove PA from object (inode or locality group) * mark free blocks in-core * * - discard inode's preallocations: */ /* * Locking rules * * Locks: * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) * - cr_power2_aligned lists lock (cr_power2_aligned) * - cr_goal_len_fast lists lock (cr_goal_len_fast) * * Paths: * - new pa * object * group * * - find and use pa: * pa * * - release consumed pa: * pa * group * object * * - generate in-core bitmap: * group * pa * * - discard all for given object (inode, locality group): * object * pa * group * * - discard all for given group: * group * pa * group * object * * - allocation path (ext4_mb_regular_allocator) * group * cr_power2_aligned/cr_goal_len_fast */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for * each unique s_blocksize_bits */ #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" }; static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr); static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks); /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block * allocation in ext4_mb_new_blocks(). * 2. We increment this percpu discard_pa_seq counter when we either allocate * or free these blocks i.e. while marking those blocks as used/free in * mb_mark_used()/mb_free_blocks(). * 3. We also increment this percpu seq counter when we successfully identify * that the bb_prealloc_list is not empty and hence proceed for discarding * of those PAs inside ext4_mb_discard_group_preallocations(). * * Now to make sure that the regular fast path of block allocation is not * affected, as a small optimization we only sample the percpu seq counter * on that cpu. Only when the block allocation fails and when freed blocks * found were 0, that is when we sample percpu seq counter for all cpus using * below function ext4_get_discard_pa_seq_sum(). This happens after making * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. */ static DEFINE_PER_CPU(u64, discard_pa_seq); static inline u64 ext4_get_discard_pa_seq_sum(void) { int __cpu; u64 __seq = 0; for_each_possible_cpu(__cpu) __seq += per_cpu(discard_pa_seq, __cpu); return __seq; } static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 *bit += ((unsigned long) addr & 7UL) << 3; addr = (void *) ((unsigned long) addr & ~7UL); #elif BITS_PER_LONG == 32 *bit += ((unsigned long) addr & 3UL) << 3; addr = (void *) ((unsigned long) addr & ~3UL); #else #error "how many bits you are?!" #endif return addr; } static inline int mb_test_bit(int bit, void *addr) { /* * ext4_test_bit on architecture like powerpc * needs unsigned long aligned address */ addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_bit(bit, addr); } static inline void mb_set_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_set_bit(bit, addr); } static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } static inline int mb_test_and_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); return ext4_test_and_clear_bit(bit, addr); } static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static inline int mb_find_next_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; addr = mb_correct_addr_and_bit(&fix, addr); tmpmax = max + fix; start += fix; ret = ext4_find_next_bit(addr, tmpmax, start) - fix; if (ret > max) return max; return ret; } static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); if (order > e4b->bd_blkbits + 1) { *max = 0; return NULL; } /* at order 0 we see each particular block */ if (order == 0) { *max = 1 << (e4b->bd_blkbits + 3); return e4b->bd_bitmap; } bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order]; *max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order]; return bb; } #ifdef DOUBLE_CHECK static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int i; struct super_block *sb = e4b->bd_sb; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(EXT4_SB(sb), first + i); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing block already freed " "(bit %u)", first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { int i; if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); } } static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; b1 = (unsigned char *) e4b->bd_info->bb_bitmap; b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { ext4_msg(e4b->bd_sb, KERN_ERR, "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc", e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } } } static void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { struct buffer_head *bh; grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); if (!grp->bb_bitmap) return; bh = ext4_read_block_bitmap(sb, group); if (IS_ERR_OR_NULL(bh)) { kfree(grp->bb_bitmap); grp->bb_bitmap = NULL; return; } memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); put_bh(bh); } static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { kfree(grp->bb_bitmap); } #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) { return; } static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, struct ext4_group_info *grp, ext4_group_t group) { return; } static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) { return; } #endif #ifdef AGGRESSIVE_CHECK #define MB_CHECK_ASSERT(assert) \ do { \ if (!(assert)) { \ printk(KERN_EMERG \ "Assertion failure in %s() at %s:%d: \"%s\"\n", \ function, file, line, # assert); \ BUG(); \ } \ } while (0) static void __mb_check_buddy(struct ext4_buddy *e4b, char *file, const char *function, int line) { struct super_block *sb = e4b->bd_sb; int order = e4b->bd_blkbits + 1; int max; int max2; int i; int j; int k; int count; struct ext4_group_info *grp; int fragments = 0; int fstart; struct list_head *cur; void *buddy; void *buddy2; if (e4b->bd_info->bb_check_counter++ % 10) return; while (order > 1) { buddy = mb_find_buddy(e4b, order, &max); MB_CHECK_ASSERT(buddy); buddy2 = mb_find_buddy(e4b, order - 1, &max2); MB_CHECK_ASSERT(buddy2); MB_CHECK_ASSERT(buddy != buddy2); MB_CHECK_ASSERT(max * 2 == max2); count = 0; for (i = 0; i < max; i++) { if (mb_test_bit(i, buddy)) { /* only single bit in buddy2 may be 0 */ if (!mb_test_bit(i << 1, buddy2)) { MB_CHECK_ASSERT( mb_test_bit((i<<1)+1, buddy2)); } continue; } /* both bits in buddy2 must be 1 */ MB_CHECK_ASSERT(mb_test_bit(i << 1, buddy2)); MB_CHECK_ASSERT(mb_test_bit((i << 1) + 1, buddy2)); for (j = 0; j < (1 << order); j++) { k = (i * (1 << order)) + j; MB_CHECK_ASSERT( !mb_test_bit(k, e4b->bd_bitmap)); } count++; } MB_CHECK_ASSERT(e4b->bd_info->bb_counters[order] == count); order--; } fstart = -1; buddy = mb_find_buddy(e4b, 0, &max); for (i = 0; i < max; i++) { if (!mb_test_bit(i, buddy)) { MB_CHECK_ASSERT(i >= e4b->bd_info->bb_first_free); if (fstart == -1) { fragments++; fstart = i; } continue; } fstart = -1; /* check used bits only */ for (j = 0; j < e4b->bd_blkbits + 1; j++) { buddy2 = mb_find_buddy(e4b, j, &max2); k = i >> j; MB_CHECK_ASSERT(k < max2); MB_CHECK_ASSERT(mb_test_bit(k, buddy2)); } } MB_CHECK_ASSERT(!EXT4_MB_GRP_NEED_INIT(e4b->bd_info)); MB_CHECK_ASSERT(e4b->bd_info->bb_fragments == fragments); grp = ext4_get_group_info(sb, e4b->bd_group); if (!grp) return; list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. * Clear the bits in bitmap which the blocks of the chunk(s) covered, * then increase bb_counters[] for corresponded chunk size. */ static void ext4_mb_mark_free_simple(struct super_block *sb, void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); border = 2 << sb->s_blocksize_bits; while (len > 0) { /* find how many blocks can be covered since this position */ max = ffs(first | border) - 1; /* find how many blocks of power 2 we need to mark */ min = fls(len) - 1; if (max < min) min = max; chunk = 1 << min; /* mark multiblock chunks only */ grp->bb_counters[min]++; if (min > 0) mb_clear_bit(first >> min, buddy + sbi->s_mb_offsets[min]); len -= chunk; first += chunk; } } static int mb_avg_fragment_size_order(struct super_block *sb, ext4_grpblk_t len) { int order; /* * We don't bother with a special lists groups with only 1 block free * extents and for completely empty groups. */ order = fls(len) - 2; if (order < 0) return 0; if (order == MB_NUM_ORDERS(sb)) order--; if (WARN_ON_ONCE(order > MB_NUM_ORDERS(sb))) order = MB_NUM_ORDERS(sb) - 1; return order; } /* Move group to appropriate avg_fragment_size list */ static void mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int new_order; if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0) return; new_order = mb_avg_fragment_size_order(sb, grp->bb_free / grp->bb_fragments); if (new_order == grp->bb_avg_fragment_size_order) return; if (grp->bb_avg_fragment_size_order != -1) { write_lock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); list_del(&grp->bb_avg_fragment_size_node); write_unlock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); } grp->bb_avg_fragment_size_order = new_order; write_lock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); list_add_tail(&grp->bb_avg_fragment_size_node, &sbi->s_mb_avg_fragment_size[grp->bb_avg_fragment_size_order]); write_unlock(&sbi->s_mb_avg_fragment_size_locks[ grp->bb_avg_fragment_size_order]); } /* * Choose next group by traversing largest_free_order lists. Updates *new_cr if * cr level needs an update. */ static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *iter; int i; if (ac->ac_status == AC_STATUS_FOUND) return; if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED)) atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions); for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { if (list_empty(&sbi->s_mb_largest_free_orders[i])) continue; read_lock(&sbi->s_mb_largest_free_orders_locks[i]); if (list_empty(&sbi->s_mb_largest_free_orders[i])) { read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); continue; } list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], bb_largest_free_order_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]); if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) { *group = iter->bb_group; ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED; read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); return; } } read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); } /* Increment cr and search again if no group is found */ *new_cr = CR_GOAL_LEN_FAST; } /* * Find a suitable group of given order from the average fragments list. */ static struct ext4_group_info * ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order]; rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order]; struct ext4_group_info *grp = NULL, *iter; enum criteria cr = ac->ac_criteria; if (list_empty(frag_list)) return NULL; read_lock(frag_list_lock); if (list_empty(frag_list)) { read_unlock(frag_list_lock); return NULL; } list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) { if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]); if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) { grp = iter; break; } } read_unlock(frag_list_lock); return grp; } /* * Choose next group by traversing average fragment size list of suitable * order. Updates *new_cr if cr level needs an update. */ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; int i; if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions); } for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); i < MB_NUM_ORDERS(ac->ac_sb); i++) { grp = ext4_mb_find_good_group_avg_frag_lists(ac, i); if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED; return; } } /* * CR_BEST_AVAIL_LEN works based on the concept that we have * a larger normalized goal len request which can be trimmed to * a smaller goal len such that it can still satisfy original * request len. However, allocation request for non-regular * files never gets normalized. * See function ext4_mb_normalize_request() (EXT4_MB_HINT_DATA). */ if (ac->ac_flags & EXT4_MB_HINT_DATA) *new_cr = CR_BEST_AVAIL_LEN; else *new_cr = CR_GOAL_LEN_SLOW; } /* * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment * order we have and proactively trim the goal request length to that order to * find a suitable group faster. * * This optimizes allocation speed at the cost of slightly reduced * preallocations. However, we make sure that we don't trim the request too * much and fall to CR_GOAL_LEN_SLOW in that case. */ static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = NULL; int i, order, min_order; unsigned long num_stripe_clusters = 0; if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) { if (sbi->s_mb_stats) atomic_inc(&sbi->s_bal_best_avail_bad_suggestions); } /* * mb_avg_fragment_size_order() returns order in a way that makes * retrieving back the length using (1 << order) inaccurate. Hence, use * fls() instead since we need to know the actual length while modifying * goal length. */ order = fls(ac->ac_g_ex.fe_len) - 1; if (WARN_ON_ONCE(order - 1 > MB_NUM_ORDERS(ac->ac_sb))) order = MB_NUM_ORDERS(ac->ac_sb); min_order = order - sbi->s_mb_best_avail_max_trim_order; if (min_order < 0) min_order = 0; if (sbi->s_stripe > 0) { /* * We are assuming that stripe size is always a multiple of * cluster ratio otherwise __ext4_fill_super exists early. */ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe); if (1 << min_order < num_stripe_clusters) /* * We consider 1 order less because later we round * up the goal len to num_stripe_clusters */ min_order = fls(num_stripe_clusters) - 1; } if (1 << min_order < ac->ac_o_ex.fe_len) min_order = fls(ac->ac_o_ex.fe_len); for (i = order; i >= min_order; i--) { int frag_order; /* * Scale down goal len to make sure we find something * in the free fragments list. Basically, reduce * preallocations. */ ac->ac_g_ex.fe_len = 1 << i; if (num_stripe_clusters > 0) { /* * Try to round up the adjusted goal length to * stripe size (in cluster units) multiple for * efficiency. */ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len, num_stripe_clusters); } frag_order = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len); grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order); if (grp) { *group = grp->bb_group; ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED; return; } } /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; *new_cr = CR_GOAL_LEN_SLOW; } static inline int should_optimize_scan(struct ext4_allocation_context *ac) { if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) return 0; if (ac->ac_criteria >= CR_GOAL_LEN_SLOW) return 0; if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) return 0; return 1; } /* * Return next linear group for allocation. */ static ext4_group_t next_linear_group(ext4_group_t group, ext4_group_t ngroups) { /* * Artificially restricted ngroups for non-extent * files makes group > ngroups possible on first loop. */ return group + 1 >= ngroups ? 0 : group + 1; } /* * ext4_mb_choose_next_group: choose next group for allocation. * * @ac Allocation Context * @new_cr This is an output parameter. If the there is no good group * available at current CR level, this field is updated to indicate * the new cr level that should be used. * @group This is an input / output parameter. As an input it indicates the * next group that the allocator intends to use for allocation. As * output, this field indicates the next group that should be used as * determined by the optimization functions. * @ngroups Total number of groups */ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups) { *new_cr = ac->ac_criteria; if (!should_optimize_scan(ac)) { *group = next_linear_group(*group, ngroups); return; } /* * Optimized scanning can return non adjacent groups which can cause * seek overhead for rotational disks. So try few linear groups before * trying optimized scan. */ if (ac->ac_groups_linear_remaining) { *group = next_linear_group(*group, ngroups); ac->ac_groups_linear_remaining--; return; } if (*new_cr == CR_POWER2_ALIGNED) { ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group); } else if (*new_cr == CR_GOAL_LEN_FAST) { ext4_mb_choose_next_group_goal_fast(ac, new_cr, group); } else if (*new_cr == CR_BEST_AVAIL_LEN) { ext4_mb_choose_next_group_best_avail(ac, new_cr, group); } else { /* * TODO: For CR_GOAL_LEN_SLOW, we can arrange groups in an * rb tree sorted by bb_free. But until that happens, we should * never come here. */ WARN_ON(1); } } /* * Cache the order of the largest free extent we have available in this block * group. */ static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); int i; for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) if (grp->bb_counters[i] > 0) break; /* No need to move between order lists? */ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || i == grp->bb_largest_free_order) { grp->bb_largest_free_order = i; return; } if (grp->bb_largest_free_order >= 0) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_del_init(&grp->bb_largest_free_order_node); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } grp->bb_largest_free_order = i; if (grp->bb_largest_free_order >= 0 && grp->bb_free) { write_lock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); list_add_tail(&grp->bb_largest_free_order_node, &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); write_unlock(&sbi->s_mb_largest_free_orders_locks[ grp->bb_largest_free_order]); } } static noinline_for_stack void ext4_mb_generate_buddy(struct super_block *sb, void *buddy, void *bitmap, ext4_group_t group, struct ext4_group_info *grp) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_grpblk_t first; ext4_grpblk_t len; unsigned free = 0; unsigned fragments = 0; unsigned long long period = get_cycles(); /* initialize buddy from bitmap which is aggregation * of on-disk bitmap and preallocations */ i = mb_find_next_zero_bit(bitmap, max, 0); grp->bb_first_free = i; while (i < max) { fragments++; first = i; i = mb_find_next_bit(bitmap, max, i); len = i - first; free += len; if (len > 1) ext4_mb_mark_free_simple(sb, buddy, first, len, grp); else grp->bb_counters[0]++; if (i < max) i = mb_find_next_zero_bit(bitmap, max, i); } grp->bb_fragments = fragments; if (free != grp->bb_free) { ext4_grp_locked_error(sb, group, 0, 0, "block bitmap and bg descriptor " "inconsistent: %u vs %u free clusters", free, grp->bb_free); /* * If we intend to continue, we consider group descriptor * corrupt and update bb_free using bitmap value */ grp->bb_free = free; ext4_mark_group_bitmap_corrupted(sb, group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); } mb_set_largest_free_order(sb, grp); mb_update_avg_fragment_size(sb, grp); clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; atomic_inc(&sbi->s_mb_buddies_generated); atomic64_add(period, &sbi->s_mb_generation_time); } static void mb_regenerate_buddy(struct ext4_buddy *e4b) { int count; int order = 1; void *buddy; while ((buddy = mb_find_buddy(e4b, order++, &count))) mb_set_bits(buddy, 0, count); e4b->bd_info->bb_fragments = 0; memset(e4b->bd_info->bb_counters, 0, sizeof(*e4b->bd_info->bb_counters) * (e4b->bd_sb->s_blocksize_bits + 2)); ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy, e4b->bd_bitmap, e4b->bd_group, e4b->bd_info); } /* The buddy information is attached the buddy cache inode * for convenience. The information regarding each group * is loaded via ext4_mb_load_buddy. The information involve * block bitmap and buddy information. The information are * stored in the inode as * * { page } * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. * So for each group we take up 2 blocks. A page can * contain blocks_per_page (PAGE_SIZE / blocksize) blocks. * So it can have information regarding groups_per_page which * is blocks_per_page/2 * * Locking note: This routine takes the block group lock of all groups * for this page; do not hold this lock when calling this routine! */ static int ext4_mb_init_cache(struct folio *folio, char *incore, gfp_t gfp) { ext4_group_t ngroups; unsigned int blocksize; int blocks_per_page; int groups_per_page; int err = 0; int i; ext4_group_t first_group, group; int first_block; struct super_block *sb; struct buffer_head *bhs; struct buffer_head **bh = NULL; struct inode *inode; char *data; char *bitmap; struct ext4_group_info *grinfo; inode = folio->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; mb_debug(sb, "init folio %lu\n", folio->index); groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; /* allocate buffer_heads to read bitmaps */ if (groups_per_page > 1) { i = sizeof(struct buffer_head *) * groups_per_page; bh = kzalloc(i, gfp); if (bh == NULL) return -ENOMEM; } else bh = &bhs; first_group = folio->index * blocks_per_page / 2; /* read all groups the folio covers into the cache */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { if (group >= ngroups) break; grinfo = ext4_get_group_info(sb, group); if (!grinfo) continue; /* * If page is uptodate then we came here after online resize * which added some new uninitialized group info structs, so * we must skip all initialized uptodate buddies on the folio, * which may be currently in use by an allocating task. */ if (folio_test_uptodate(folio) && !EXT4_MB_GRP_NEED_INIT(grinfo)) { bh[i] = NULL; continue; } bh[i] = ext4_read_block_bitmap_nowait(sb, group, false); if (IS_ERR(bh[i])) { err = PTR_ERR(bh[i]); bh[i] = NULL; goto out; } mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { int err2; if (!bh[i]) continue; err2 = ext4_wait_block_bitmap(sb, group, bh[i]); if (!err) err = err2; } first_block = folio->index * blocks_per_page; for (i = 0; i < blocks_per_page; i++) { group = (first_block + i) >> 1; if (group >= ngroups) break; if (!bh[group - first_group]) /* skip initialized uptodate buddy */ continue; if (!buffer_verified(bh[group - first_group])) /* Skip faulty bitmaps */ continue; err = 0; /* * data carry information regarding this * particular group in the format specified * above * */ data = folio_address(folio) + (i * blocksize); bitmap = bh[group - first_group]->b_data; /* * We place the buddy block and bitmap block * close together */ grinfo = ext4_get_group_info(sb, group); if (!grinfo) { err = -EFSCORRUPTED; goto out; } if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); mb_debug(sb, "put buddy for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ ext4_lock_group(sb, group); /* init the buddy */ memset(data, 0xff, blocksize); ext4_mb_generate_buddy(sb, data, incore, group, grinfo); ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ BUG_ON(incore != NULL); mb_debug(sb, "put bitmap for group %u in folio %lu/%x\n", group, folio->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); /* see comments in ext4_mb_put_pa() */ ext4_lock_group(sb, group); memcpy(data, bitmap, blocksize); /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); WARN_ON_ONCE(!RB_EMPTY_ROOT(&grinfo->bb_free_root)); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be * generated using this */ incore = data; } } folio_mark_uptodate(folio); out: if (bh) { for (i = 0; i < groups_per_page; i++) brelse(bh[i]); if (bh != &bhs) kfree(bh); } return err; } /* * Lock the buddy and bitmap pages. This make sure other parallel init_group * on the same buddy page doesn't happen whild holding the buddy page lock. * Return locked buddy and bitmap pages on e4b struct. If buddy and bitmap * are on the same page e4b->bd_buddy_folio is NULL and return value is 0. */ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { struct inode *inode = EXT4_SB(sb)->s_buddy_cache; int block, pnum, poff; int blocks_per_page; struct folio *folio; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); if (blocks_per_page >= 2) { /* buddy and bitmap are on the same page */ return 0; } /* blocks_per_page == 1, hence we need another page for the buddy */ folio = __filemap_get_folio(inode->i_mapping, block + 1, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (IS_ERR(folio)) return PTR_ERR(folio); BUG_ON(folio->mapping != inode->i_mapping); e4b->bd_buddy_folio = folio; return 0; } static void ext4_mb_put_buddy_page_lock(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) { folio_unlock(e4b->bd_bitmap_folio); folio_put(e4b->bd_bitmap_folio); } if (e4b->bd_buddy_folio) { folio_unlock(e4b->bd_buddy_folio); folio_put(e4b->bd_buddy_folio); } } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) { struct ext4_group_info *this_grp; struct ext4_buddy e4b; struct folio *folio; int ret = 0; might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); if (!this_grp) return -EFSCORRUPTED; /* * This ensures that we don't reinit the buddy cache * page which map to the group from which we are already * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. * The call to ext4_mb_get_buddy_page_lock will mark the * page accessed. */ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b, gfp); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { /* * somebody initialized the group * return without doing anything */ goto err; } folio = e4b.bd_bitmap_folio; ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } if (e4b.bd_buddy_folio == NULL) { /* * If both the bitmap and buddy are in * the same page we don't need to force * init the buddy */ ret = 0; goto err; } /* init buddy cache */ folio = e4b.bd_buddy_folio; ret = ext4_mb_init_cache(folio, e4b.bd_bitmap, gfp); if (ret) goto err; if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } err: ext4_mb_put_buddy_page_lock(&e4b); return ret; } /* * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! */ static noinline_for_stack int ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b, gfp_t gfp) { int blocks_per_page; int block; int pnum; int poff; struct folio *folio; int ret; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; might_sleep(); mb_debug(sb, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); if (!grp) return -EFSCORRUPTED; e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = grp; e4b->bd_sb = sb; e4b->bd_group = group; e4b->bd_buddy_folio = NULL; e4b->bd_bitmap_folio = NULL; if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* * we need full data about the group * to make a good selection */ ret = ext4_mb_init_group(sb, group, gfp); if (ret) return ret; } /* * the buddy cache inode stores the block bitmap * and buddy information in consecutive blocks. * So for each group we need two blocks. */ block = group * 2; pnum = block / blocks_per_page; poff = block % blocks_per_page; /* Avoid locking the folio in the fast path ... */ folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) /* * drop the folio reference and try * to get the folio with lock. If we * are not uptodate that implies * somebody just created the folio but * is yet to initialize it. So * wait for it to initialize. */ folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, NULL, gfp); if (ret) { folio_unlock(folio); goto err; } mb_cmp_bitmaps(e4b, folio_address(folio) + (poff * sb->s_blocksize)); } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_bitmap_folio = folio; e4b->bd_bitmap = folio_address(folio) + (poff * sb->s_blocksize); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { if (!IS_ERR(folio)) folio_put(folio); folio = __filemap_get_folio(inode->i_mapping, pnum, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); if (!IS_ERR(folio)) { if (WARN_RATELIMIT(folio->mapping != inode->i_mapping, "ext4: buddy bitmap's mapping != inode->i_mapping\n")) { /* should never happen */ folio_unlock(folio); ret = -EINVAL; goto err; } if (!folio_test_uptodate(folio)) { ret = ext4_mb_init_cache(folio, e4b->bd_bitmap, gfp); if (ret) { folio_unlock(folio); goto err; } } folio_unlock(folio); } } if (IS_ERR(folio)) { ret = PTR_ERR(folio); goto err; } if (!folio_test_uptodate(folio)) { ret = -EIO; goto err; } /* Folios marked accessed already */ e4b->bd_buddy_folio = folio; e4b->bd_buddy = folio_address(folio) + (poff * sb->s_blocksize); return 0; err: if (!IS_ERR_OR_NULL(folio)) folio_put(folio); if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; return ret; } static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { return ext4_mb_load_buddy_gfp(sb, group, e4b, GFP_NOFS); } static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) { if (e4b->bd_bitmap_folio) folio_put(e4b->bd_bitmap_folio); if (e4b->bd_buddy_folio) folio_put(e4b->bd_buddy_folio); } static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1, max; void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(block >= (1 << (e4b->bd_blkbits + 3))); while (order <= e4b->bd_blkbits + 1) { bb = mb_find_buddy(e4b, order, &max); if (!mb_test_bit(block >> order, bb)) { /* this block is part of buddy of order 'order' */ return order; } order++; } return 0; } static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); *addr = 0; cur += 32; continue; } mb_clear_bit(cur, bm); cur++; } } /* clear bits in given range * will return first found zero bit if any, -1 otherwise */ static int mb_test_and_clear_bits(void *bm, int cur, int len) { __u32 *addr; int zero_bit = -1; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: clear whole word at once */ addr = bm + (cur >> 3); if (*addr != (__u32)(-1) && zero_bit == -1) zero_bit = cur + mb_find_next_zero_bit(addr, 32, 0); *addr = 0; cur += 32; continue; } if (!mb_test_and_clear_bit(cur, bm) && zero_bit == -1) zero_bit = cur; cur++; } return zero_bit; } void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; len = cur + len; while (cur < len) { if ((cur & 31) == 0 && (len - cur) >= 32) { /* fast path: set whole word at once */ addr = bm + (cur >> 3); *addr = 0xffffffff; cur += 32; continue; } mb_set_bit(cur, bm); cur++; } } static inline int mb_buddy_adjust_border(int* bit, void* bitmap, int side) { if (mb_test_bit(*bit + side, bitmap)) { mb_clear_bit(*bit, bitmap); (*bit) -= side; return 1; } else { (*bit) += side; mb_set_bit(*bit, bitmap); return -1; } } static void mb_buddy_mark_free(struct ext4_buddy *e4b, int first, int last) { int max; int order = 1; void *buddy = mb_find_buddy(e4b, order, &max); while (buddy) { void *buddy2; /* Bits in range [first; last] are known to be set since * corresponding blocks were allocated. Bits in range * (first; last) will stay set because they form buddies on * upper layer. We just deal with borders if they don't * align with upper layer and then go up. * Releasing entire group is all about clearing * single bit of highest order buddy. */ /* Example: * --------------------------------- * | 1 | 1 | 1 | 1 | * --------------------------------- * | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | * --------------------------------- * 0 1 2 3 4 5 6 7 * \_____________________/ * * Neither [1] nor [6] is aligned to above layer. * Left neighbour [0] is free, so mark it busy, * decrease bb_counters and extend range to * [0; 6] * Right neighbour [7] is busy. It can't be coaleasced with [6], so * mark [6] free, increase bb_counters and shrink range to * [0; 5]. * Then shift range to [0; 2], go up and do the same. */ if (first & 1) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&first, buddy, -1); if (!(last & 1)) e4b->bd_info->bb_counters[order] += mb_buddy_adjust_border(&last, buddy, 1); if (first > last) break; order++; buddy2 = mb_find_buddy(e4b, order, &max); if (!buddy2) { mb_clear_bits(buddy, first, last - first + 1); e4b->bd_info->bb_counters[order - 1] += last - first + 1; break; } first >>= 1; last >>= 1; buddy = buddy2; } } static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, int first, int count) { int left_is_free = 0; int right_is_free = 0; int block; int last = first + count - 1; struct super_block *sb = e4b->bd_sb; if (WARN_ON(count == 0)) return; BUG_ON(last >= (sb->s_blocksize << 3)); assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); /* Don't bother if the block group is corrupt. */ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return; mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ if (first != 0) left_is_free = !mb_test_bit(first - 1, e4b->bd_bitmap); block = mb_test_and_clear_bits(e4b->bd_bitmap, first, count); if (last + 1 < EXT4_SB(sb)->s_mb_maxs[0]) right_is_free = !mb_test_bit(last + 1, e4b->bd_bitmap); if (unlikely(block != -1)) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t blocknr; /* * Fastcommit replay can free already freed blocks which * corrupts allocation info. Regenerate it. */ if (sbi->s_mount_state & EXT4_FC_REPLAY) { mb_regenerate_buddy(e4b); goto check; } blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += EXT4_C2B(sbi, block); ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, inode ? inode->i_ino : 0, blocknr, "freeing already freed block (bit %u); block bitmap corrupt.", block); return; } this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; /* let's maintain fragments counter */ if (left_is_free && right_is_free) e4b->bd_info->bb_fragments--; else if (!left_is_free && !right_is_free) e4b->bd_info->bb_fragments++; /* buddy[0] == bd_bitmap is a special case, so handle * it right away and let mb_buddy_mark_free stay free of * zero order checks. * Check if neighbours are to be coaleasced, * adjust bitmap bb_counters and borders appropriately. */ if (first & 1) { first += !left_is_free; e4b->bd_info->bb_counters[0] += left_is_free ? -1 : 1; } if (!(last & 1)) { last -= !right_is_free; e4b->bd_info->bb_counters[0] += right_is_free ? -1 : 1; } if (first <= last) mb_buddy_mark_free(e4b, first >> 1, last >> 1); mb_set_largest_free_order(sb, e4b->bd_info); mb_update_avg_fragment_size(sb, e4b->bd_info); check: mb_check_buddy(e4b); } static int mb_find_extent(struct ext4_buddy *e4b, int block, int needed, struct ext4_free_extent *ex) { int max, order, next; void *buddy; assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); if (mb_test_bit(block, buddy)) { ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; return 0; } /* find actual order */ order = mb_find_order_for_block(e4b, block); ex->fe_len = (1 << order) - (block & ((1 << order) - 1)); ex->fe_start = block; ex->fe_group = e4b->bd_group; block = block >> order; while (needed > ex->fe_len && mb_find_buddy(e4b, order, &max)) { if (block + 1 >= max) break; next = (block + 1) * (1 << order); if (mb_test_bit(next, e4b->bd_bitmap)) break; order = mb_find_order_for_block(e4b, next); block = next >> order; ex->fe_len += 1 << order; } if (ex->fe_start + ex->fe_len > EXT4_CLUSTERS_PER_GROUP(e4b->bd_sb)) { /* Should never happen! (but apparently sometimes does?!?) */ WARN_ON(1); ext4_grp_locked_error(e4b->bd_sb, e4b->bd_group, 0, 0, "corruption or bug in mb_find_extent " "block=%d, order=%d needed=%d ex=%u/%d/%d@%u", block, order, needed, ex->fe_group, ex->fe_start, ex->fe_len, ex->fe_logical); ex->fe_len = 0; ex->fe_start = 0; ex->fe_group = 0; } return ex->fe_len; } static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) { int ord; int mlen = 0; int max = 0; int start = ex->fe_start; int len = ex->fe_len; unsigned ret = 0; int len0 = len; void *buddy; int ord_start, ord_end; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) mlen = !mb_test_bit(start - 1, e4b->bd_bitmap); if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0]) max = !mb_test_bit(start + len, e4b->bd_bitmap); if (mlen && max) e4b->bd_info->bb_fragments++; else if (!mlen && !max) e4b->bd_info->bb_fragments--; /* let's maintain buddy itself */ while (len) { ord = mb_find_order_for_block(e4b, start); if (((start >> ord) << ord) == start && len >= (1 << ord)) { /* the whole chunk may be allocated at once! */ mlen = 1 << ord; buddy = mb_find_buddy(e4b, ord, &max); BUG_ON((start >> ord) >= max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; start += mlen; len -= mlen; BUG_ON(len < 0); continue; } /* store for history */ if (ret == 0) ret = len | (ord << 16); BUG_ON(ord <= 0); buddy = mb_find_buddy(e4b, ord, &max); mb_set_bit(start >> ord, buddy); e4b->bd_info->bb_counters[ord]--; ord_start = (start >> ord) << ord; ord_end = ord_start + (1 << ord); /* first chunk */ if (start > ord_start) ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, ord_start, start - ord_start, e4b->bd_info); /* last chunk */ if (start + len < ord_end) { ext4_mb_mark_free_simple(e4b->bd_sb, e4b->bd_buddy, start + len, ord_end - (start + len), e4b->bd_info); break; } len = start + len - ord_end; start = ord_end; } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); mb_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); return ret; } /* * Must be called under group lock! */ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int ret; BUG_ON(ac->ac_b_ex.fe_group != e4b->bd_group); BUG_ON(ac->ac_status == AC_STATUS_FOUND); ac->ac_b_ex.fe_len = min(ac->ac_b_ex.fe_len, ac->ac_g_ex.fe_len); ac->ac_b_ex.fe_logical = ac->ac_g_ex.fe_logical; ret = mb_mark_used(e4b, &ac->ac_b_ex); /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ ac->ac_f_ex = ac->ac_b_ex; ac->ac_status = AC_STATUS_FOUND; ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; /* * take the page reference. We want the page to be pinned * so that we don't get a ext4_mb_init_cache_call for this * group until we update the bitmap. That would mean we * double allocate blocks. The reference is dropped * in ext4_mb_release_context */ ac->ac_bitmap_folio = e4b->bd_bitmap_folio; folio_get(ac->ac_bitmap_folio); ac->ac_buddy_folio = e4b->bd_buddy_folio; folio_get(ac->ac_buddy_folio); /* store last allocated for subsequent stream allocation */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { spin_lock(&sbi->s_md_lock); sbi->s_mb_last_group = ac->ac_f_ex.fe_group; sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } /* * As we've just preallocated more space than * user requested originally, we store allocated * space in a special descriptor. */ if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_buddy *e4b, int finish_group) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; if (ac->ac_status == AC_STATUS_FOUND) return; /* * We don't want to scan for a whole year */ if (ac->ac_found > sbi->s_mb_max_to_scan && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { ac->ac_status = AC_STATUS_BREAK; return; } /* * Haven't found good chunk so far, let's continue */ if (bex->fe_len < gex->fe_len) return; if (finish_group || ac->ac_found > sbi->s_mb_min_to_scan) ext4_mb_use_best_found(ac, e4b); } /* * The routine checks whether found extent is good enough. If it is, * then the extent gets marked used and flag is set to the context * to stop scanning. Otherwise, the extent is compared with the * previous found extent and if new one is better, then it's stored * in the context. Later, the best found extent will be used, if * mballoc can't find good enough extent. * * The algorithm used is roughly as follows: * * * If free extent found is exactly as big as goal, then * stop the scan and use it immediately * * * If free extent found is smaller than goal, then keep retrying * upto a max of sbi->s_mb_max_to_scan times (default 200). After * that stop scanning and use whatever we have. * * * If free extent found is bigger than goal, then keep retrying * upto a max of sbi->s_mb_min_to_scan times (default 10) before * stopping the scan and using the extent. * * * FIXME: real allocation policy is to be designed yet! */ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *ex, struct ext4_buddy *e4b) { struct ext4_free_extent *bex = &ac->ac_b_ex; struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); BUG_ON(ex->fe_len > EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_CLUSTERS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; /* * The special case - take what you catch first */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * Let's check whether the chuck is good enough */ if (ex->fe_len == gex->fe_len) { *bex = *ex; ext4_mb_use_best_found(ac, e4b); return; } /* * If this is first found extent, just store it in the context */ if (bex->fe_len == 0) { *bex = *ex; return; } /* * If new found extent is better, store it in the context */ if (bex->fe_len < gex->fe_len) { /* if the request isn't satisfied, any found extent * larger than previous best one is better */ if (ex->fe_len > bex->fe_len) *bex = *ex; } else if (ex->fe_len > gex->fe_len) { /* if the request is satisfied, then we try to find * an extent that still satisfy the request, but is * smaller than previous one */ if (ex->fe_len < bex->fe_len) *bex = *ex; } ext4_mb_check_limits(ac, e4b, 0); } static noinline_for_stack void ext4_mb_try_best_found(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct ext4_free_extent ex = ac->ac_b_ex; ext4_group_t group = ex.fe_group; int max; int err; BUG_ON(ex.fe_len <= 0); err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex); if (max > 0) { ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); } static noinline_for_stack int ext4_mb_find_by_goal(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { ext4_group_t group = ac->ac_g_ex.fe_group; int max; int err; struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct ext4_free_extent ex; if (!grp) return -EFSCORRUPTED; if (!(ac->ac_flags & (EXT4_MB_HINT_TRY_GOAL | EXT4_MB_HINT_GOAL_ONLY))) return 0; if (grp->bb_free == 0) return 0; err = ext4_mb_load_buddy(ac->ac_sb, group, e4b); if (err) return err; ext4_lock_group(ac->ac_sb, group); if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) goto out; max = mb_find_extent(e4b, ac->ac_g_ex.fe_start, ac->ac_g_ex.fe_len, &ex); ex.fe_logical = 0xDEADFA11; /* debug value */ if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == EXT4_NUM_B2C(sbi, sbi->s_stripe)) { ext4_fsblk_t start; start = ext4_grp_offs_to_block(ac->ac_sb, &ex); /* use do_div to get remainder (would be 64-bit modulo) */ if (do_div(start, sbi->s_stripe) == 0) { ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } } else if (max >= ac->ac_g_ex.fe_len) { BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { /* Sometimes, caller may want to merge even small * number of blocks to an existing extent */ BUG_ON(ex.fe_len <= 0); BUG_ON(ex.fe_group != ac->ac_g_ex.fe_group); BUG_ON(ex.fe_start != ac->ac_g_ex.fe_start); ac->ac_found++; ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); } out: ext4_unlock_group(ac->ac_sb, group); ext4_mb_unload_buddy(e4b); return 0; } /* * The routine scans buddy structures (not bitmap!) from given order * to max order and tries to find big enough chunk to satisfy the req */ static noinline_for_stack void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_group_info *grp = e4b->bd_info; void *buddy; int i; int k; int max; BUG_ON(ac->ac_2order <= 0); for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; buddy = mb_find_buddy(e4b, i, &max); if (WARN_RATELIMIT(buddy == NULL, "ext4: mb_simple_scan_group: mb_find_buddy failed, (%d)\n", i)) continue; k = mb_find_next_zero_bit(buddy, max, 0); if (k >= max) { ext4_mark_group_bitmap_corrupted(ac->ac_sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0, "%d free clusters of order %d. But found 0", grp->bb_counters[i], i); break; } ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ac->ac_b_ex.fe_len = 1 << i; ac->ac_b_ex.fe_start = k << i; ac->ac_b_ex.fe_group = e4b->bd_group; ext4_mb_use_best_found(ac, e4b); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); break; } } /* * The routine scans the group and measures all found extents. * In order to optimize scanning, caller must pass number of * free blocks in the group, so the routine can know upper limit. */ static noinline_for_stack void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; int i, j, freelen; int free; free = e4b->bd_info->bb_free; if (WARN_ON(free <= 0)) return; i = e4b->bd_info->bb_first_free; while (free && ac->ac_status == AC_STATUS_CONTINUE) { i = mb_find_next_zero_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); if (i >= EXT4_CLUSTERS_PER_GROUP(sb)) { /* * IF we have corrupt bitmap, we won't find any * free blocks even though group info says we * have free blocks */ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But bitmap says 0", free); break; } if (!ext4_mb_cr_expensive(ac->ac_criteria)) { /* * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are * sure that this group will have a large enough * continuous free extent, so skip over the smaller free * extents */ j = mb_find_next_bit(bitmap, EXT4_CLUSTERS_PER_GROUP(sb), i); freelen = j - i; if (freelen < ac->ac_g_ex.fe_len) { i = j; free -= freelen; continue; } } mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex); if (WARN_ON(ex.fe_len <= 0)) break; if (free < ex.fe_len) { ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, "%d free clusters as per " "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly * indicate that the bitmap is corrupt. So exit * without claiming the space. */ break; } ex.fe_logical = 0xDEADC0DE; /* debug value */ ext4_mb_measure_extent(ac, &ex, e4b); i += ex.fe_len; free -= ex.fe_len; } ext4_mb_check_limits(ac, e4b, 1); } /* * This is a special case for storages like raid5 * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, struct ext4_buddy *e4b) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); void *bitmap = e4b->bd_bitmap; struct ext4_free_extent ex; ext4_fsblk_t first_group_block; ext4_fsblk_t a; ext4_grpblk_t i, stripe; int max; BUG_ON(sbi->s_stripe == 0); /* find first stripe-aligned block in group */ first_group_block = ext4_group_first_block_no(sb, e4b->bd_group); a = first_group_block + sbi->s_stripe - 1; do_div(a, sbi->s_stripe); i = (a * sbi->s_stripe) - first_group_block; stripe = EXT4_NUM_B2C(sbi, sbi->s_stripe); i = EXT4_B2C(sbi, i); while (i < EXT4_CLUSTERS_PER_GROUP(sb)) { if (!mb_test_bit(i, bitmap)) { max = mb_find_extent(e4b, i, stripe, &ex); if (max >= stripe) { ac->ac_found++; ac->ac_cX_found[ac->ac_criteria]++; ex.fe_logical = 0xDEADF00D; /* debug value */ ac->ac_b_ex = ex; ext4_mb_use_best_found(ac, e4b); break; } } i += stripe; } } /* * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable * for the allocation or not. */ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return false; free = grp->bb_free; if (free == 0) return false; fragments = grp->bb_fragments; if (fragments == 0) return false; switch (cr) { case CR_POWER2_ALIGNED: BUG_ON(ac->ac_2order == 0); /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) return false; if (free < ac->ac_g_ex.fe_len) return false; if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) return false; return true; case CR_GOAL_LEN_FAST: case CR_BEST_AVAIL_LEN: if ((free / fragments) >= ac->ac_g_ex.fe_len) return true; break; case CR_GOAL_LEN_SLOW: if (free >= ac->ac_g_ex.fe_len) return true; break; case CR_ANY_FREE: return true; default: BUG(); } return false; } /* * This could return negative error code if something goes wrong * during ext4_mb_init_group(). This should not be called with * ext4_lock_group() held. * * Note: because we are conditionally operating with the group lock in * the EXT4_MB_STRICT_CHECK case, we need to fake out sparse in this * function using __acquire and __release. This means we need to be * super careful before messing with the error path handling via "goto * out"! */ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_group_t group, enum criteria cr) { struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; ext4_grpblk_t free; int ret = 0; if (!grp) return -EFSCORRUPTED; if (sbi->s_mb_stats) atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } free = grp->bb_free; if (free == 0) goto out; /* * In all criterias except CR_ANY_FREE we try to avoid groups that * can't possibly satisfy the full goal request due to insufficient * free blocks. */ if (cr < CR_ANY_FREE && free < ac->ac_g_ex.fe_len) goto out; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) goto out; if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); int ret; /* * CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic * search to find large good chunks almost for free. If buddy * data is not ready, then this optimization makes no sense. But * we never skip the first block group in a flex_bg, since this * gets used for metadata block allocation, and we want to make * sure we locate metadata blocks in the first block group in * the flex_bg if possible. */ if (!ext4_mb_cr_expensive(cr) && (!sbi->s_log_groups_per_flex || ((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) && !(ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) return 0; ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) return ret; } if (should_lock) { ext4_lock_group(sb, group); __release(ext4_group_lock_ptr(sb, group)); } ret = ext4_mb_good_group(ac, group, cr); out: if (should_lock) { __acquire(ext4_group_lock_ptr(sb, group)); ext4_unlock_group(sb, group); } return ret; } /* * Start prefetching @nr block bitmaps starting at @group. * Return the next group which needs to be prefetched. */ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group, unsigned int nr, int *cnt) { ext4_group_t ngroups = ext4_get_groups_count(sb); struct buffer_head *bh; struct blk_plug plug; blk_start_plug(&plug); while (nr-- > 0) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); struct ext4_group_info *grp = ext4_get_group_info(sb, group); /* * Prefetch block groups with free blocks; but don't * bother if it is marked uninitialized on disk, since * it won't require I/O to read. Also only try to * prefetch once, so we avoid getblk() call, which can * be expensive. */ if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0 ) { bh = ext4_read_block_bitmap_nowait(sb, group, true); if (bh && !IS_ERR(bh)) { if (!buffer_uptodate(bh) && cnt) (*cnt)++; brelse(bh); } } if (++group >= ngroups) group = 0; } blk_finish_plug(&plug); return group; } /* * Prefetching reads the block bitmap into the buffer cache; but we * need to make sure that the buddy bitmap in the page cache has been * initialized. Note that ext4_mb_init_group() will block if the I/O * is not yet completed, or indeed if it was not initiated by * ext4_mb_prefetch did not start the I/O. * * TODO: We should actually kick off the buddy bitmap setup in a work * queue when the buffer I/O is completed, so that we don't block * waiting for the block allocation bitmap read to finish when * ext4_mb_prefetch_fini is called from ext4_mb_regular_allocator(). */ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group, unsigned int nr) { struct ext4_group_desc *gdp; struct ext4_group_info *grp; while (nr-- > 0) { if (!group) group = ext4_get_groups_count(sb); group--; gdp = ext4_get_group_desc(sb, group, NULL); grp = ext4_get_group_info(sb, group); if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) && ext4_free_group_clusters(sb, gdp) > 0) { if (ext4_mb_init_group(sb, group, GFP_NOFS)) break; } } } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t prefetch_grp = 0, ngroups, group, i; enum criteria new_cr, cr = CR_GOAL_LEN_FAST; int err = 0, first_err = 0; unsigned int nr = 0, prefetch_ios = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; int lost; sb = ac->ac_sb; sbi = EXT4_SB(sb); ngroups = ext4_get_groups_count(sb); /* non-extent files are limited to low blocks/groups */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))) ngroups = sbi->s_blockfile_groups; BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ err = ext4_mb_find_by_goal(ac, &e4b); if (err || ac->ac_status == AC_STATUS_FOUND) goto out; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) goto out; /* * ac->ac_2order is set only if the fe_len is a power of 2 * if ac->ac_2order is set we also set criteria to CR_POWER2_ALIGNED * so that we try exact allocation using buddy. */ i = fls(ac->ac_g_ex.fe_len); ac->ac_2order = 0; /* * We search using buddy data only if the order of the request * is greater than equal to the sbi_s_mb_order2_reqs * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { if (is_power_of_2(ac->ac_g_ex.fe_len)) ac->ac_2order = array_index_nospec(i - 1, MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ spin_lock(&sbi->s_md_lock); ac->ac_g_ex.fe_group = sbi->s_mb_last_group; ac->ac_g_ex.fe_start = sbi->s_mb_last_start; spin_unlock(&sbi->s_md_lock); } /* * Let's just scan groups to find more-less suitable blocks We * start with CR_GOAL_LEN_FAST, unless it is power of 2 * aligned, in which case let's do that faster approach first. */ if (ac->ac_2order) cr = CR_POWER2_ALIGNED; repeat: for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) { ac->ac_criteria = cr; /* * searching for the right group start * from the goal value specified */ group = ac->ac_g_ex.fe_group; ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; nr = 0; for (i = 0, new_cr = cr; i < ngroups; i++, ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups)) { int ret = 0; cond_resched(); if (new_cr != cr) { cr = new_cr; goto repeat; } /* * Batch reads of the block allocation bitmaps * to get multiple READs in flight; limit * prefetching at inexpensive CR, otherwise mballoc * can spend a lot of time loading imperfect groups */ if ((prefetch_grp == group) && (ext4_mb_cr_expensive(cr) || prefetch_ios < sbi->s_mb_prefetch_limit)) { nr = sbi->s_mb_prefetch; if (ext4_has_feature_flex_bg(sb)) { nr = 1 << sbi->s_log_groups_per_flex; nr -= group & (nr - 1); nr = min(nr, sbi->s_mb_prefetch); } prefetch_grp = ext4_mb_prefetch(sb, group, nr, &prefetch_ios); } /* This now checks without needing the buddy page */ ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!first_err) first_err = ret; continue; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) goto out; ext4_lock_group(sb, group); /* * We need to check again after locking the * block group */ ret = ext4_mb_good_group(ac, group, cr); if (ret == 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); continue; } ac->ac_groups_scanned++; if (cr == CR_POWER2_ALIGNED) ext4_mb_simple_scan_group(ac, &e4b); else { bool is_stripe_aligned = (sbi->s_stripe >= sbi->s_cluster_ratio) && !(ac->ac_g_ex.fe_len % EXT4_NUM_B2C(sbi, sbi->s_stripe)); if ((cr == CR_GOAL_LEN_FAST || cr == CR_BEST_AVAIL_LEN) && is_stripe_aligned) ext4_mb_scan_aligned(ac, &e4b); if (ac->ac_status == AC_STATUS_CONTINUE) ext4_mb_complex_scan_group(ac, &e4b); } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); if (ac->ac_status != AC_STATUS_CONTINUE) break; } /* Processed all groups and haven't found blocks */ if (sbi->s_mb_stats && i == ngroups) atomic64_inc(&sbi->s_bal_cX_failed[cr]); if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN) /* Reset goal length to original goal length before * falling into CR_GOAL_LEN_SLOW */ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len; } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { /* * We've been searching too long. Let's try to allocate * the best chunk we've found so far */ ext4_mb_try_best_found(ac, &e4b); if (ac->ac_status != AC_STATUS_FOUND) { /* * Someone more lucky has already allocated it. * The only thing we can do is just take first * found block(s) */ lost = atomic_inc_return(&sbi->s_mb_lost_chunks); mb_debug(sb, "lost chunk, group: %u, start: %d, len: %d, lost: %d\n", ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, lost); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; ac->ac_flags |= EXT4_MB_HINT_FIRST; cr = CR_ANY_FREE; goto repeat; } } if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, ac->ac_flags, cr, err); if (nr) ext4_mb_prefetch_fini(sb, prefetch_grp, nr); return err; } static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group; ++*pos; if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i, err; char nbuf[16]; struct ext4_buddy e4b; struct ext4_group_info *grinfo; unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); if (!grinfo) return 0; /* Load the group info in memory only if not already loaded. */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grinfo))) { err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { seq_printf(seq, "#%-5u: %s\n", group, ext4_decode_error(NULL, err, nbuf)); return 0; } ext4_mb_unload_buddy(&e4b); } /* * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ memcpy(sg, grinfo, i); seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? sg->bb_counters[i] : 0); seq_puts(seq, " ]"); if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; } static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_groups_ops = { .start = ext4_mb_seq_groups_start, .next = ext4_mb_seq_groups_next, .stop = ext4_mb_seq_groups_stop, .show = ext4_mb_seq_groups_show, }; int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) { struct super_block *sb = seq->private; struct ext4_sb_info *sbi = EXT4_SB(sb); seq_puts(seq, "mballoc:\n"); if (!sbi->s_mb_stats) { seq_puts(seq, "\tmb stats collection turned off.\n"); seq_puts( seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); return 0; } seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); /* CR_POWER2_ALIGNED stats */ seq_puts(seq, "\tcr_p2_aligned_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions)); /* CR_GOAL_LEN_FAST stats */ seq_puts(seq, "\tcr_goal_fast_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_goal_fast_bad_suggestions)); /* CR_BEST_AVAIL_LEN stats */ seq_puts(seq, "\tcr_best_avail_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN])); seq_printf(seq, "\t\tbad_suggestions: %u\n", atomic_read(&sbi->s_bal_best_avail_bad_suggestions)); /* CR_GOAL_LEN_SLOW stats */ seq_puts(seq, "\tcr_goal_slow_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tgroups_considered: %llu\n", atomic64_read( &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW])); /* CR_ANY_FREE stats */ seq_puts(seq, "\tcr_any_free_stats:\n"); seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE])); seq_printf( seq, "\t\tgroups_considered: %llu\n", atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE])); seq_printf(seq, "\t\textents_scanned: %u\n", atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE])); seq_printf(seq, "\t\tuseless_loops: %llu\n", atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE])); /* Aggregates */ seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); seq_printf(seq, "\t\tlen_goal_hits: %u\n", atomic_read(&sbi->s_bal_len_goals)); seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); seq_printf(seq, "\tbuddies_generated: %u/%u\n", atomic_read(&sbi->s_mb_buddies_generated), ext4_get_groups_count(sb)); seq_printf(seq, "\tbuddies_time_used: %llu\n", atomic64_read(&sbi->s_mb_generation_time)); seq_printf(seq, "\tpreallocated: %u\n", atomic_read(&sbi->s_mb_preallocated)); seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded)); return 0; } static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = pde_data(file_inode(seq->file)); unsigned long position; ++*pos; if (*pos < 0 || *pos >= 2*MB_NUM_ORDERS(sb)) return NULL; position = *pos + 1; return (void *) ((unsigned long) position); } static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) { struct super_block *sb = pde_data(file_inode(seq->file)); struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long position = ((unsigned long) v); struct ext4_group_info *grp; unsigned int count; position--; if (position >= MB_NUM_ORDERS(sb)) { position -= MB_NUM_ORDERS(sb); if (position == 0) seq_puts(seq, "avg_fragment_size_lists:\n"); count = 0; read_lock(&sbi->s_mb_avg_fragment_size_locks[position]); list_for_each_entry(grp, &sbi->s_mb_avg_fragment_size[position], bb_avg_fragment_size_node) count++; read_unlock(&sbi->s_mb_avg_fragment_size_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } if (position == 0) { seq_printf(seq, "optimize_scan: %d\n", test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); seq_puts(seq, "max_free_order_lists:\n"); } count = 0; read_lock(&sbi->s_mb_largest_free_orders_locks[position]); list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], bb_largest_free_order_node) count++; read_unlock(&sbi->s_mb_largest_free_orders_locks[position]); seq_printf(seq, "\tlist_order_%u_groups: %u\n", (unsigned int)position, count); return 0; } static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) { } const struct seq_operations ext4_mb_seq_structs_summary_ops = { .start = ext4_mb_seq_structs_summary_start, .next = ext4_mb_seq_structs_summary_next, .stop = ext4_mb_seq_structs_summary_stop, .show = ext4_mb_seq_structs_summary_show, }; static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep = ext4_groupinfo_caches[cache_index]; BUG_ON(!cachep); return cachep; } /* * Allocate the top-level s_group_info array for the specified number * of groups */ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned size; struct ext4_group_info ***old_groupinfo, ***new_groupinfo; size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); if (size <= sbi->s_group_info_size) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; } rcu_read_lock(); old_groupinfo = rcu_dereference(sbi->s_group_info); if (old_groupinfo) memcpy(new_groupinfo, old_groupinfo, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); rcu_read_unlock(); rcu_assign_pointer(sbi->s_group_info, new_groupinfo); sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } /* Create and initialize ext4_group_info data for the given group. */ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, struct ext4_group_desc *desc) { int i; int metalen = 0; int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb); struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_info **meta_group_info; struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); /* * First check if this group is the first of a reserved block. * If it's true, we have to allocate a new table of pointers * to ext4_group_info structures */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); return -ENOMEM; } rcu_read_lock(); rcu_dereference(sbi->s_group_info)[idx] = meta_group_info; rcu_read_unlock(); } meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx); i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; } set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(meta_group_info[i]->bb_state)); /* * initialize bb_free to be able to skip * empty groups without initialization */ if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { meta_group_info[i]->bb_free = ext4_free_clusters_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = ext4_free_group_clusters(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); INIT_LIST_HEAD(&meta_group_info[i]->bb_avg_fragment_size_node); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ meta_group_info[i]->bb_avg_fragment_size_order = -1; /* uninit */ meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: /* If a meta_group_info table has been allocated, release it now */ if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { struct ext4_group_info ***group_info; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); kfree(group_info[idx]); group_info[idx] = NULL; rcu_read_unlock(); } return -ENOMEM; } /* ext4_mb_add_groupinfo */ static int ext4_mb_init_backend(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; struct ext4_sb_info *sbi = EXT4_SB(sb); int err; struct ext4_group_desc *desc; struct ext4_group_info ***group_info; struct kmem_cache *cachep; err = ext4_mb_alloc_groupinfo(sb, ngroups); if (err) return err; sbi->s_buddy_cache = new_inode(sb); if (sbi->s_buddy_cache == NULL) { ext4_msg(sb, KERN_ERR, "can't get new inode"); goto err_freesgi; } /* To avoid potentially colliding with an valid on-disk inode number, * use EXT4_BAD_INO for the buddy cache inode number. This inode is * not in the inode hash, so it should never be found by iget(), but * this will avoid confusion if it ever shows up during debugging. */ sbi->s_buddy_cache->i_ino = EXT4_BAD_INO; EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; for (i = 0; i < ngroups; i++) { cond_resched(); desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { ext4_msg(sb, KERN_ERR, "can't read descriptor %u", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) goto err_freebuddy; } if (ext4_has_feature_flex_bg(sb)) { /* a single flex group is supposed to be read by a single IO. * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is * unsigned integer, so the maximum shift is 32. */ if (sbi->s_es->s_log_groups_per_flex >= 32) { ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); goto err_freebuddy; } sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { sbi->s_mb_prefetch = 32; } if (sbi->s_mb_prefetch > ext4_get_groups_count(sb)) sbi->s_mb_prefetch = ext4_get_groups_count(sb); /* * now many real IOs to prefetch within a single allocation at * CR_POWER2_ALIGNED. Given CR_POWER2_ALIGNED is an CPU-related * optimization we shouldn't try to load too many groups, at some point * we should start to use what we've got in memory. * with an average random access time 5ms, it'd take a second to get * 200 groups (* N with flex_bg), so let's make this limit 4 */ sbi->s_mb_prefetch_limit = sbi->s_mb_prefetch * 4; if (sbi->s_mb_prefetch_limit > ext4_get_groups_count(sb)) sbi->s_mb_prefetch_limit = ext4_get_groups_count(sb); return 0; err_freebuddy: cachep = get_groupinfo_cache(sb->s_blocksize_bits); while (i-- > 0) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); if (grp) kmem_cache_free(cachep, grp); } i = sbi->s_group_info_size; rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); while (i-- > 0) kfree(group_info[i]); rcu_read_unlock(); iput(sbi->s_buddy_cache); err_freesgi: rcu_read_lock(); kvfree(rcu_dereference(sbi->s_group_info)); rcu_read_unlock(); return -ENOMEM; } static void ext4_groupinfo_destroy_slabs(void) { int i; for (i = 0; i < NR_GRPINFO_CACHES; i++) { kmem_cache_destroy(ext4_groupinfo_caches[i]); ext4_groupinfo_caches[i] = NULL; } } static int ext4_groupinfo_create_slab(size_t size) { static DEFINE_MUTEX(ext4_grpinfo_slab_create_mutex); int slab_size; int blocksize_bits = order_base_2(size); int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; struct kmem_cache *cachep; if (cache_index >= NR_GRPINFO_CACHES) return -EINVAL; if (unlikely(cache_index < 0)) cache_index = 0; mutex_lock(&ext4_grpinfo_slab_create_mutex); if (ext4_groupinfo_caches[cache_index]) { mutex_unlock(&ext4_grpinfo_slab_create_mutex); return 0; /* Already created */ } slab_size = offsetof(struct ext4_group_info, bb_counters[blocksize_bits + 2]); cachep = kmem_cache_create(ext4_groupinfo_slab_names[cache_index], slab_size, 0, SLAB_RECLAIM_ACCOUNT, NULL); ext4_groupinfo_caches[cache_index] = cachep; mutex_unlock(&ext4_grpinfo_slab_create_mutex); if (!cachep) { printk(KERN_EMERG "EXT4-fs: no memory for groupinfo slab cache\n"); return -ENOMEM; } return 0; } static void ext4_discard_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_discard_work); struct super_block *sb = sbi->s_sb; struct ext4_free_data *fd, *nfd; struct ext4_buddy e4b; LIST_HEAD(discard_list); ext4_group_t grp, load_grp; int err = 0; spin_lock(&sbi->s_md_lock); list_splice_init(&sbi->s_discard_list, &discard_list); spin_unlock(&sbi->s_md_lock); load_grp = UINT_MAX; list_for_each_entry_safe(fd, nfd, &discard_list, efd_list) { /* * If filesystem is umounting or no memory or suffering * from no space, give up the discard */ if ((sb->s_flags & SB_ACTIVE) && !err && !atomic_read(&sbi->s_retry_alloc_pending)) { grp = fd->efd_group; if (grp != load_grp) { if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); err = ext4_mb_load_buddy(sb, grp, &e4b); if (err) { kmem_cache_free(ext4_free_data_cachep, fd); load_grp = UINT_MAX; continue; } else { load_grp = grp; } } ext4_lock_group(sb, grp); ext4_try_to_trim_range(sb, &e4b, fd->efd_start_cluster, fd->efd_start_cluster + fd->efd_count - 1, 1); ext4_unlock_group(sb, grp); } kmem_cache_free(ext4_free_data_cachep, fd); } if (load_grp != UINT_MAX) ext4_mb_unload_buddy(&e4b); } int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; unsigned offset, offset_incr; unsigned max; int ret; i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { ret = -ENOMEM; goto out; } i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; goto out; } ret = ext4_groupinfo_create_slab(sb->s_blocksize); if (ret < 0) goto out; /* order 0 is regular bitmap */ sbi->s_mb_maxs[0] = sb->s_blocksize << 3; sbi->s_mb_offsets[0] = 0; i = 1; offset = 0; offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; offset += offset_incr; offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i < MB_NUM_ORDERS(sb)); sbi->s_mb_avg_fragment_size = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size) { ret = -ENOMEM; goto out; } sbi->s_mb_avg_fragment_size_locks = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), GFP_KERNEL); if (!sbi->s_mb_avg_fragment_size_locks) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) { INIT_LIST_HEAD(&sbi->s_mb_avg_fragment_size[i]); rwlock_init(&sbi->s_mb_avg_fragment_size_locks[i]); } sbi->s_mb_largest_free_orders = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders) { ret = -ENOMEM; goto out; } sbi->s_mb_largest_free_orders_locks = kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), GFP_KERNEL); if (!sbi->s_mb_largest_free_orders_locks) { ret = -ENOMEM; goto out; } for (i = 0; i < MB_NUM_ORDERS(sb); i++) { INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); } spin_lock_init(&sbi->s_md_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list[0]); INIT_LIST_HEAD(&sbi->s_freed_data_list[1]); INIT_LIST_HEAD(&sbi->s_discard_list); INIT_WORK(&sbi->s_discard_work, ext4_discard_work); atomic_set(&sbi->s_retry_alloc_pending, 0); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; sbi->s_mb_min_to_scan = MB_DEFAULT_MIN_TO_SCAN; sbi->s_mb_stats = MB_DEFAULT_STATS; sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD; sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS; sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER; /* * The default group preallocation is 512, which for 4k block * sizes translates to 2 megabytes. However for bigalloc file * systems, this is probably too big (i.e, if the cluster size * is 1 megabyte, then group preallocation size becomes half a * gigabyte!). As a default, we will keep a two megabyte * group pralloc size for cluster sizes up to 64k, and after * that, we will force a minimum group preallocation size of * 32 clusters. This translates to 8 megs when the cluster * size is 256k, and 32 megs when the cluster size is 1 meg, * which seems reasonable as a default. */ sbi->s_mb_group_prealloc = max(MB_DEFAULT_GROUP_PREALLOC >> sbi->s_cluster_bits, 32); /* * If there is a s_stripe > 1, then we set the s_mb_group_prealloc * to the lowest multiple of s_stripe which is bigger than * the s_mb_group_prealloc as determined above. We want * the preallocation size to be an exact multiple of the * RAID stripe size so that preallocations don't fragment * the stripes. */ if (sbi->s_stripe > 1) { sbi->s_mb_group_prealloc = roundup( sbi->s_mb_group_prealloc, EXT4_NUM_B2C(sbi, sbi->s_stripe)); } sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { ret = -ENOMEM; goto out; } for_each_possible_cpu(i) { struct ext4_locality_group *lg; lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); spin_lock_init(&lg->lg_prealloc_lock); } if (bdev_nonrot(sb->s_bdev)) sbi->s_mb_max_linear_groups = 0; else sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) goto out_free_locality_groups; return 0; out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: kfree(sbi->s_mb_avg_fragment_size); kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; return ret; } /* need to called with the ext4 group lock held */ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; int count = 0; list_for_each_safe(cur, tmp, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; kmem_cache_free(ext4_pspace_cachep, pa); } return count; } void ext4_mb_release(struct super_block *sb) { ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); int count; if (test_opt(sb, DISCARD)) { /* * wait the discard work to drain all of ext4_free_data */ flush_work(&sbi->s_discard_work); WARN_ON_ONCE(!list_empty(&sbi->s_discard_list)); } if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); if (!grinfo) continue; mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); count = ext4_mb_cleanup_pa(grinfo); if (count) mb_debug(sb, "mballoc: %d PAs left\n", count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); rcu_read_lock(); group_info = rcu_dereference(sbi->s_group_info); for (i = 0; i < num_meta_group_infos; i++) kfree(group_info[i]); kvfree(group_info); rcu_read_unlock(); } kfree(sbi->s_mb_avg_fragment_size); kfree(sbi->s_mb_avg_fragment_size_locks); kfree(sbi->s_mb_largest_free_orders); kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", atomic_read(&sbi->s_bal_allocated), atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, "mballoc: %u generated and it took %llu", atomic_read(&sbi->s_mb_buddies_generated), atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), atomic_read(&sbi->s_mb_discarded)); } free_percpu(sbi->s_locality_groups); } static inline int ext4_issue_discard(struct super_block *sb, ext4_group_t block_group, ext4_grpblk_t cluster, int count) { ext4_fsblk_t discard_block; discard_block = (EXT4_C2B(EXT4_SB(sb), cluster) + ext4_group_first_block_no(sb, block_group)); count = EXT4_C2B(EXT4_SB(sb), count); trace_ext4_discard_blocks(sb, (unsigned long long) discard_block, count); return sb_issue_discard(sb, discard_block, count, GFP_NOFS, 0); } static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_free_data *entry) { struct ext4_buddy e4b; struct ext4_group_info *db; int err, count = 0; mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); spin_lock(&EXT4_SB(sb)->s_md_lock); EXT4_SB(sb)->s_mb_free_pending -= entry->efd_count; spin_unlock(&EXT4_SB(sb)->s_md_lock); db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ count += entry->efd_count; ext4_lock_group(sb, entry->efd_group); /* Take it out of per group rb tree */ rb_erase(&entry->efd_node, &(db->bb_free_root)); mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count); /* * Clear the trimmed flag for the group so that the next * ext4_trim_fs can trim it. */ EXT4_MB_GRP_CLEAR_TRIMMED(db); if (!db->bb_free_root.rb_node) { /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ folio_put(e4b.bd_buddy_folio); folio_put(e4b.bd_bitmap_folio); } ext4_unlock_group(sb, entry->efd_group); ext4_mb_unload_buddy(&e4b); mb_debug(sb, "freed %d blocks in 1 structures\n", count); } /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. */ void ext4_process_freed_data(struct super_block *sb, tid_t commit_tid) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_free_data *entry, *tmp; LIST_HEAD(freed_data_list); struct list_head *s_freed_head = &sbi->s_freed_data_list[commit_tid & 1]; bool wake; list_replace_init(s_freed_head, &freed_data_list); list_for_each_entry(entry, &freed_data_list, efd_list) ext4_free_data_in_buddy(sb, entry); if (test_opt(sb, DISCARD)) { spin_lock(&sbi->s_md_lock); wake = list_empty(&sbi->s_discard_list); list_splice_tail(&freed_data_list, &sbi->s_discard_list); spin_unlock(&sbi->s_md_lock); if (wake) queue_work(system_unbound_wq, &sbi->s_discard_work); } else { list_for_each_entry_safe(entry, tmp, &freed_data_list, efd_list) kmem_cache_free(ext4_free_data_cachep, entry); } } int __init ext4_init_mballoc(void) { ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); if (ext4_ac_cachep == NULL) goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) goto out_ac_free; return 0; out_ac_free: kmem_cache_destroy(ext4_ac_cachep); out_pa_free: kmem_cache_destroy(ext4_pspace_cachep); out: return -ENOMEM; } void ext4_exit_mballoc(void) { /* * Wait for completion of call_rcu()'s on ext4_pspace_cachep * before destroying the slab cache. */ rcu_barrier(); kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); ext4_groupinfo_destroy_slabs(); } #define EXT4_MB_BITMAP_MARKED_CHECK 0x0001 #define EXT4_MB_SYNC_UPDATE 0x0002 static int ext4_mb_mark_context(handle_t *handle, struct super_block *sb, bool state, ext4_group_t group, ext4_grpblk_t blkoff, ext4_grpblk_t len, int flags, ext4_grpblk_t *ret_changed) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bitmap_bh = NULL; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; int err; unsigned int i, already, changed = len; KUNIT_STATIC_STUB_REDIRECT(ext4_mb_mark_context, handle, sb, state, group, blkoff, len, flags, ret_changed); if (ret_changed) *ret_changed = 0; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) return PTR_ERR(bitmap_bh); if (handle) { BUFFER_TRACE(bitmap_bh, "getting write access"); err = ext4_journal_get_write_access(handle, sb, bitmap_bh, EXT4_JTR_NONE); if (err) goto out_err; } err = -EIO; gdp = ext4_get_group_desc(sb, group, &gdp_bh); if (!gdp) goto out_err; if (handle) { BUFFER_TRACE(gdp_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, gdp_bh, EXT4_JTR_NONE); if (err) goto out_err; } ext4_lock_group(sb, group); if (ext4_has_group_desc_csum(sb) && (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_group_clusters_set(sb, gdp, ext4_free_clusters_after_init(sb, group, gdp)); } if (flags & EXT4_MB_BITMAP_MARKED_CHECK) { already = 0; for (i = 0; i < len; i++) if (mb_test_bit(blkoff + i, bitmap_bh->b_data) == state) already++; changed = len - already; } if (state) { mb_set_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) - changed); } else { mb_clear_bits(bitmap_bh->b_data, blkoff, len); ext4_free_group_clusters_set(sb, gdp, ext4_free_group_clusters(sb, gdp) + changed); } ext4_block_bitmap_csum_set(sb, gdp, bitmap_bh); ext4_group_desc_csum_set(sb, group, gdp); ext4_unlock_group(sb, group); if (ret_changed) *ret_changed = changed; if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, group); struct flex_groups *fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group); if (state) atomic64_sub(changed, &fg->free_clusters); else atomic64_add(changed, &fg->free_clusters); } err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); if (err) goto out_err; if (flags & EXT4_MB_SYNC_UPDATE) { sync_dirty_buffer(bitmap_bh); sync_dirty_buffer(gdp_bh); } out_err: brelse(bitmap_bh); return err; } /* * Check quota and mark chosen space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_clstrs) { struct ext4_group_desc *gdp; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block; int err, len; int flags = 0; ext4_grpblk_t changed; BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_b_ex.fe_len <= 0); sb = ac->ac_sb; sbi = EXT4_SB(sb); gdp = ext4_get_group_desc(sb, ac->ac_b_ex.fe_group, NULL); if (!gdp) return -EIO; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, ext4_free_group_clusters(sb, gdp)); block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len); if (!ext4_inode_block_valid(ac->ac_inode, block, len)) { ext4_error(sb, "Allocating blocks %llu-%llu which overlap " "fs metadata", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and return EFSCORRUPTED * We leak some of the blocks here. */ err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, 0, NULL); if (!err) err = -EFSCORRUPTED; return err; } #ifdef AGGRESSIVE_CHECK flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, true, ac->ac_b_ex.fe_group, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len, flags, &changed); if (err && changed == 0) return err; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != ac->ac_b_ex.fe_len); #endif percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); return err; } /* * Idempotent helper for Ext4 fast commit replay path to set the state of * blocks in bitmaps and update counters. */ void ext4_mb_mark_bb(struct super_block *sb, ext4_fsblk_t block, int len, bool state) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ext4_grpblk_t blkoff; int err = 0; unsigned int clen, thisgrp_len; while (len > 0) { ext4_get_group_no_and_offset(sb, block, &group, &blkoff); /* * Check to see if we are freeing blocks across a group * boundary. * In case of flex_bg, this can happen that (block, len) may * span across more than one group. In that case we need to * get the corresponding group metadata to work with. * For this we have goto again loop. */ thisgrp_len = min_t(unsigned int, (unsigned int)len, EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff)); clen = EXT4_NUM_B2C(sbi, thisgrp_len); if (!ext4_sb_block_valid(sb, NULL, block, thisgrp_len)) { ext4_error(sb, "Marking blocks in system zone - " "Block = %llu, len = %u", block, thisgrp_len); break; } err = ext4_mb_mark_context(NULL, sb, state, group, blkoff, clen, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); if (err) break; block += thisgrp_len; len -= thisgrp_len; BUG_ON(len < 0); } } /* * here we normalize request for locality group * Group request are normalized to s_mb_group_prealloc, which goes to * s_strip if we set the same via mount option. * s_mb_group_prealloc can be configured via * /sys/fs/ext4/<partition>/mb_group_prealloc * * XXX: should we try to preallocate more than the group has now? */ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* * This function returns the next element to look at during inode * PA rbtree walk. We assume that we have held the inode PA rbtree lock * (ei->i_prealloc_lock) * * new_start The start of the range we want to compare * cur_start The existing start that we are comparing against * node The node of the rb_tree */ static inline struct rb_node* ext4_mb_pa_rb_next_iter(ext4_lblk_t new_start, ext4_lblk_t cur_start, struct rb_node *node) { if (new_start < cur_start) return node->rb_left; else return node->rb_right; } static inline void ext4_mb_pa_assert_overlap(struct ext4_allocation_context *ac, ext4_lblk_t start, loff_t end) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *tmp_pa; ext4_lblk_t tmp_pa_start; loff_t tmp_pa_end; struct rb_node *iter; read_lock(&ei->i_prealloc_lock); for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(start, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(start >= tmp_pa_end || end <= tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } read_unlock(&ei->i_prealloc_lock); } /* * Given an allocation context "ac" and a range "start", "end", check * and adjust boundaries if the range overlaps with any of the existing * preallocatoins stored in the corresponding inode of the allocation context. * * Parameters: * ac allocation context * start start of the new range * end end of the new range */ static inline void ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac, ext4_lblk_t *start, loff_t *end) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *tmp_pa = NULL, *left_pa = NULL, *right_pa = NULL; struct rb_node *iter; ext4_lblk_t new_start, tmp_pa_start, right_pa_start = -1; loff_t new_end, tmp_pa_end, left_pa_end = -1; new_start = *start; new_end = *end; /* * Adjust the normalized range so that it doesn't overlap with any * existing preallocated blocks(PAs). Make sure to hold the rbtree lock * so it doesn't change underneath us. */ read_lock(&ei->i_prealloc_lock); /* Step 1: find any one immediate neighboring PA of the normalized range */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa_start, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); tmp_pa_start = tmp_pa->pa_lstart; tmp_pa_end = pa_logical_end(sbi, tmp_pa); /* PA must not overlap original request */ spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) BUG_ON(!(ac->ac_o_ex.fe_logical >= tmp_pa_end || ac->ac_o_ex.fe_logical < tmp_pa_start)); spin_unlock(&tmp_pa->pa_lock); } /* * Step 2: check if the found PA is left or right neighbor and * get the other neighbor */ if (tmp_pa) { if (tmp_pa->pa_lstart < ac->ac_o_ex.fe_logical) { struct rb_node *tmp; left_pa = tmp_pa; tmp = rb_next(&left_pa->pa_node.inode_node); if (tmp) { right_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } else { struct rb_node *tmp; right_pa = tmp_pa; tmp = rb_prev(&right_pa->pa_node.inode_node); if (tmp) { left_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } } } /* Step 3: get the non deleted neighbors */ if (left_pa) { for (iter = &left_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { left_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); left_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (right_pa) { for (iter = &right_pa->pa_node.inode_node;; iter = rb_next(iter)) { if (!iter) { right_pa = NULL; break; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); right_pa = tmp_pa; spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { spin_unlock(&tmp_pa->pa_lock); break; } spin_unlock(&tmp_pa->pa_lock); } } if (left_pa) { left_pa_end = pa_logical_end(sbi, left_pa); BUG_ON(left_pa_end > ac->ac_o_ex.fe_logical); } if (right_pa) { right_pa_start = right_pa->pa_lstart; BUG_ON(right_pa_start <= ac->ac_o_ex.fe_logical); } /* Step 4: trim our normalized range to not overlap with the neighbors */ if (left_pa) { if (left_pa_end > new_start) new_start = left_pa_end; } if (right_pa) { if (right_pa_start < new_end) new_end = right_pa_start; } read_unlock(&ei->i_prealloc_lock); /* XXX: extra loop to check we really don't overlap preallocations */ ext4_mb_pa_assert_overlap(ac, new_start, new_end); *start = new_start; *end = new_end; } /* * Normalization means making request better in terms of * size and alignment */ static noinline_for_stack void ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_super_block *es = sbi->s_es; int bsbits, max; loff_t size, start_off, end; loff_t orig_size __maybe_unused; ext4_lblk_t start; /* do normalize only data requests, metadata requests do not need preallocation */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; /* sometime caller may want exact blocks */ if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; /* caller may indicate that preallocation isn't * required (it's a tail, for example) */ if (ac->ac_flags & EXT4_MB_HINT_NOPREALLOC) return; if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) { ext4_mb_normalize_group_request(ac); return ; } bsbits = ac->ac_sb->s_blocksize_bits; /* first, let's learn actual file size * given current request is allocated */ size = extent_logical_end(sbi, &ac->ac_o_ex); size = size << bsbits; if (size < i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); orig_size = size; /* max size of free chunks */ max = 2 << bsbits; #define NRL_CHECK_SIZE(req, size, max, chunk_size) \ (req <= (size) || max <= (chunk_size)) /* first, try to predict filesize */ /* XXX: should this table be tunable? */ start_off = 0; if (size <= 16 * 1024) { size = 16 * 1024; } else if (size <= 32 * 1024) { size = 32 * 1024; } else if (size <= 64 * 1024) { size = 64 * 1024; } else if (size <= 128 * 1024) { size = 128 * 1024; } else if (size <= 256 * 1024) { size = 256 * 1024; } else if (size <= 512 * 1024) { size = 512 * 1024; } else if (size <= 1024 * 1024) { size = 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (21 - bsbits)) << 21; size = 2 * 1024 * 1024; } else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (22 - bsbits)) << 22; size = 4 * 1024 * 1024; } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len), (8<<20)>>bsbits, max, 8 * 1024)) { start_off = ((loff_t)ac->ac_o_ex.fe_logical >> (23 - bsbits)) << 23; size = 8 * 1024 * 1024; } else { start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits; size = (loff_t) EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits; } size = size >> bsbits; start = start_off >> bsbits; /* * For tiny groups (smaller than 8MB) the chosen allocation * alignment may be larger than group size. Make sure the * alignment does not move allocation to a different group which * makes mballoc fail assertions later. */ start = max(start, rounddown(ac->ac_o_ex.fe_logical, (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb))); /* avoid unnecessary preallocation that may trigger assertions */ if (start + size > EXT_MAX_BLOCKS) size = EXT_MAX_BLOCKS - start; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { size -= ar->lleft + 1 - start; start = ar->lleft + 1; } if (ar->pright && start + size - 1 >= ar->lright) size -= start + size - ar->lright; /* * Trim allocation request for filesystems with artificially small * groups. */ if (size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)) size = EXT4_BLOCKS_PER_GROUP(ac->ac_sb); end = start + size; ext4_mb_pa_adjust_overlap(ac, &start, &end); size = end - start; /* * In this function "start" and "size" are normalized for better * alignment and length such that we could preallocate more blocks. * This normalization is done such that original request of * ac->ac_o_ex.fe_logical & fe_len should always lie within "start" and * "size" boundaries. * (Note fe_len can be relaxed since FS block allocation API does not * provide gurantee on number of contiguous blocks allocation since that * depends upon free space left, etc). * In case of inode pa, later we use the allocated blocks * [pa_pstart + fe_logical - pa_lstart, fe_len/size] from the preallocated * range of goal/best blocks [start, size] to put it at the * ac_o_ex.fe_logical extent of this inode. * (See ext4_mb_use_inode_pa() for more details) */ if (start + size <= ac->ac_o_ex.fe_logical || start > ac->ac_o_ex.fe_logical) { ext4_msg(ac->ac_sb, KERN_ERR, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); BUG(); } BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ /* XXX: is it better to align blocks WRT to logical * placement or satisfy big request as is */ ac->ac_g_ex.fe_logical = start; ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size); ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; /* define goal start in order to merge */ if (ar->pright && (ar->lright == (start + size)) && ar->pright >= size && ar->pright - size >= le32_to_cpu(es->s_first_data_block)) { /* merge to the right */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } if (ar->pleft && (ar->lleft + 1 == start) && ar->pleft + 1 < ext4_blocks_count(es)) { /* merge to the left */ ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1, &ac->ac_g_ex.fe_group, &ac->ac_g_ex.fe_start); ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); for (int i=0; i<EXT4_MB_NUM_CRS; i++) { atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]); } atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); /* did we allocate as much as normalizer originally wanted? */ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len) atomic_inc(&sbi->s_bal_len_goals); if (ac->ac_found > sbi->s_mb_max_to_scan) atomic_inc(&sbi->s_bal_breaks); } if (ac->ac_op == EXT4_MB_HISTORY_ALLOC) trace_ext4_mballoc_alloc(ac); else trace_ext4_mballoc_prealloc(ac); } /* * Called on failure; free up any blocks from the inode PA for this * context. We don't need this for MB_GROUP_PA because we only change * pa_free in ext4_mb_release_context(), but on failure, we've already * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed. */ static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; struct ext4_buddy e4b; int err; if (pa == NULL) { if (ac->ac_f_ex.fe_len == 0) return; err = ext4_mb_load_buddy(ac->ac_sb, ac->ac_f_ex.fe_group, &e4b); if (WARN_RATELIMIT(err, "ext4: mb_load_buddy failed (%d)", err)) /* * This should never happen since we pin the * pages in the ext4_allocation_context so * ext4_mb_load_buddy() should never fail. */ return; ext4_lock_group(ac->ac_sb, ac->ac_f_ex.fe_group); mb_free_blocks(ac->ac_inode, &e4b, ac->ac_f_ex.fe_start, ac->ac_f_ex.fe_len); ext4_unlock_group(ac->ac_sb, ac->ac_f_ex.fe_group); ext4_mb_unload_buddy(&e4b); return; } if (pa->pa_type == MB_INODE_PA) { spin_lock(&pa->pa_lock); pa->pa_free += ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); } } /* * use blocks preallocated to inode */ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; ext4_fsblk_t end; int len; /* found preallocated blocks, use them */ start = pa->pa_pstart + (ac->ac_o_ex.fe_logical - pa->pa_lstart); end = min(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len), start + EXT4_C2B(sbi, ac->ac_o_ex.fe_len)); len = EXT4_NUM_B2C(sbi, end - start); ext4_get_group_no_and_offset(ac->ac_sb, start, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; BUG_ON(start < pa->pa_pstart); BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len)); BUG_ON(pa->pa_free < len); BUG_ON(ac->ac_b_ex.fe_len <= 0); pa->pa_free -= len; mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* * use blocks preallocated to locality group */ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { unsigned int len = ac->ac_o_ex.fe_len; ext4_get_group_no_and_offset(ac->ac_sb, pa->pa_pstart, &ac->ac_b_ex.fe_group, &ac->ac_b_ex.fe_start); ac->ac_b_ex.fe_len = len; ac->ac_status = AC_STATUS_FOUND; ac->ac_pa = pa; /* we don't correct pa_pstart or pa_len here to avoid * possible race when the group is being loaded concurrently * instead we correct pa later, after blocks are marked * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", pa->pa_lstart, len, pa); } /* * Return the prealloc space that have minimal distance * from the goal block. @cpa is the prealloc * space that is having currently known minimal distance * from the goal block. */ static struct ext4_prealloc_space * ext4_mb_check_group_pa(ext4_fsblk_t goal_block, struct ext4_prealloc_space *pa, struct ext4_prealloc_space *cpa) { ext4_fsblk_t cur_distance, new_distance; if (cpa == NULL) { atomic_inc(&pa->pa_count); return pa; } cur_distance = abs(goal_block - cpa->pa_pstart); new_distance = abs(goal_block - pa->pa_pstart); if (cur_distance <= new_distance) return cpa; /* drop the previous reference */ atomic_dec(&cpa->pa_count); atomic_inc(&pa->pa_count); return pa; } /* * check if found pa meets EXT4_MB_HINT_GOAL_ONLY */ static bool ext4_mb_pa_goal_check(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); ext4_fsblk_t start; if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))) return true; /* * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted * in ext4_mb_normalize_request and will keep same with ac_o_ex * from ext4_mb_initialize_context. Choose ac_g_ex here to keep * consistent with ext4_mb_find_by_goal. */ start = pa->pa_pstart + (ac->ac_g_ex.fe_logical - pa->pa_lstart); if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start) return false; if (ac->ac_g_ex.fe_len > pa->pa_len - EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart)) return false; return true; } /* * search goal blocks in preallocated space */ static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *tmp_pa = NULL, *cpa = NULL; struct rb_node *iter; ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; /* * first, try per-file preallocation by searching the inode pa rbtree. * * Here, we can't do a direct traversal of the tree because * ext4_mb_discard_group_preallocation() can paralelly mark the pa * deleted and that can cause direct traversal to skip some entries. */ read_lock(&ei->i_prealloc_lock); if (RB_EMPTY_ROOT(&ei->i_prealloc_node)) { goto try_group_pa; } /* * Step 1: Find a pa with logical start immediately adjacent to the * original logical start. This could be on the left or right. * * (tmp_pa->pa_lstart never changes so we can skip locking for it). */ for (iter = ei->i_prealloc_node.rb_node; iter; iter = ext4_mb_pa_rb_next_iter(ac->ac_o_ex.fe_logical, tmp_pa->pa_lstart, iter)) { tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); } /* * Step 2: The adjacent pa might be to the right of logical start, find * the left adjacent pa. After this step we'd have a valid tmp_pa whose * logical start is towards the left of original request's logical start */ if (tmp_pa->pa_lstart > ac->ac_o_ex.fe_logical) { struct rb_node *tmp; tmp = rb_prev(&tmp_pa->pa_node.inode_node); if (tmp) { tmp_pa = rb_entry(tmp, struct ext4_prealloc_space, pa_node.inode_node); } else { /* * If there is no adjacent pa to the left then finding * an overlapping pa is not possible hence stop searching * inode pa tree */ goto try_group_pa; } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); /* * Step 3: If the left adjacent pa is deleted, keep moving left to find * the first non deleted adjacent pa. After this step we should have a * valid tmp_pa which is guaranteed to be non deleted. */ for (iter = &tmp_pa->pa_node.inode_node;; iter = rb_prev(iter)) { if (!iter) { /* * no non deleted left adjacent pa, so stop searching * inode pa tree */ goto try_group_pa; } tmp_pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0) { /* * We will keep holding the pa_lock from * this point on because we don't want group discard * to delete this pa underneath us. Since group * discard is anyways an ENOSPC operation it * should be okay for it to wait a few more cycles. */ break; } else { spin_unlock(&tmp_pa->pa_lock); } } BUG_ON(!(tmp_pa && tmp_pa->pa_lstart <= ac->ac_o_ex.fe_logical)); BUG_ON(tmp_pa->pa_deleted == 1); /* * Step 4: We now have the non deleted left adjacent pa. Only this * pa can possibly satisfy the request hence check if it overlaps * original logical start and stop searching if it doesn't. */ if (ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, tmp_pa)) { spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } /* non-extent files can't have physical blocks past 2^32 */ if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) && (tmp_pa->pa_pstart + EXT4_C2B(sbi, tmp_pa->pa_len) > EXT4_MAX_BLOCK_FILE_PHYS)) { /* * Since PAs don't overlap, we won't find any other PA to * satisfy this. */ spin_unlock(&tmp_pa->pa_lock); goto try_group_pa; } if (tmp_pa->pa_free && likely(ext4_mb_pa_goal_check(ac, tmp_pa))) { atomic_inc(&tmp_pa->pa_count); ext4_mb_use_inode_pa(ac, tmp_pa); spin_unlock(&tmp_pa->pa_lock); read_unlock(&ei->i_prealloc_lock); return true; } else { /* * We found a valid overlapping pa but couldn't use it because * it had no free blocks. This should ideally never happen * because: * * 1. When a new inode pa is added to rbtree it must have * pa_free > 0 since otherwise we won't actually need * preallocation. * * 2. An inode pa that is in the rbtree can only have it's * pa_free become zero when another thread calls: * ext4_mb_new_blocks * ext4_mb_use_preallocated * ext4_mb_use_inode_pa * * 3. Further, after the above calls make pa_free == 0, we will * immediately remove it from the rbtree in: * ext4_mb_new_blocks * ext4_mb_release_context * ext4_mb_put_pa * * 4. Since the pa_free becoming 0 and pa_free getting removed * from tree both happen in ext4_mb_new_blocks, which is always * called with i_data_sem held for data allocations, we can be * sure that another process will never see a pa in rbtree with * pa_free == 0. */ WARN_ON_ONCE(tmp_pa->pa_free == 0); } spin_unlock(&tmp_pa->pa_lock); try_group_pa: read_unlock(&ei->i_prealloc_lock); /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex); /* * search for the prealloc space that is having * minimal distance from the goal block. */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[i], pa_node.lg_list) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free >= ac->ac_o_ex.fe_len) { cpa = ext4_mb_check_group_pa(goal_block, tmp_pa, cpa); } spin_unlock(&tmp_pa->pa_lock); } rcu_read_unlock(); } if (cpa) { ext4_mb_use_group_pa(ac, cpa); return true; } return false; } /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock held */ static noinline_for_stack void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct ext4_prealloc_space *pa; struct list_head *cur; ext4_group_t groupnr; ext4_grpblk_t start; int preallocated = 0; int len; if (!grp) return; /* all form of preallocation discards first load group, * so the only competing code is preallocation use. * we don't need any locking here * notice we do NOT ignore preallocations with pa_deleted * otherwise we could leave used blocks available for * allocation in buddy when concurrent ext4_mb_put_pa() * is dropping preallocation */ list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &start); len = pa->pa_len; spin_unlock(&pa->pa_lock); if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); mb_set_bits(bitmap, start, len); preallocated += len; } mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_mark_pa_deleted(struct super_block *sb, struct ext4_prealloc_space *pa) { struct ext4_inode_info *ei; if (pa->pa_deleted) { ext4_warning(sb, "deleted pa, type:%d, pblk:%llu, lblk:%u, len:%d\n", pa->pa_type, pa->pa_pstart, pa->pa_lstart, pa->pa_len); return; } pa->pa_deleted = 1; if (pa->pa_type == MB_INODE_PA) { ei = EXT4_I(pa->pa_inode); atomic_dec(&ei->i_prealloc_active); } } static inline void ext4_mb_pa_free(struct ext4_prealloc_space *pa) { BUG_ON(!pa); BUG_ON(atomic_read(&pa->pa_count)); BUG_ON(pa->pa_deleted == 0); kmem_cache_free(ext4_pspace_cachep, pa); } static void ext4_mb_pa_callback(struct rcu_head *head) { struct ext4_prealloc_space *pa; pa = container_of(head, struct ext4_prealloc_space, u.pa_rcu); ext4_mb_pa_free(pa); } /* * drops a reference to preallocated space descriptor * if this was the last reference and the space is consumed */ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; ext4_fsblk_t grp_blk; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); /* in this short window concurrent discard can set pa_deleted */ spin_lock(&pa->pa_lock); if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) { spin_unlock(&pa->pa_lock); return; } if (pa->pa_deleted == 1) { spin_unlock(&pa->pa_lock); return; } ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); grp_blk = pa->pa_pstart; /* * If doing group-based preallocation, pa_pstart may be in the * next group when pa is used up */ if (pa->pa_type == MB_GROUP_PA) grp_blk--; grp = ext4_get_group_number(sb, grp_blk); /* * possible race: * * P1 (buddy init) P2 (regular allocation) * find block B in PA * copy on-disk bitmap to buddy * mark B in on-disk bitmap * drop PA from group * mark all PAs in buddy * * thus, P1 initializes buddy with B available. to prevent this * we make "copy" and "mark all PAs" atomic and serialize "drop PA" * against that pair */ ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); if (pa->pa_type == MB_INODE_PA) { write_lock(pa->pa_node_lock.inode_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); ext4_mb_pa_free(pa); } else { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } static void ext4_mb_pa_rb_insert(struct rb_root *root, struct rb_node *new) { struct rb_node **iter = &root->rb_node, *parent = NULL; struct ext4_prealloc_space *iter_pa, *new_pa; ext4_lblk_t iter_start, new_start; while (*iter) { iter_pa = rb_entry(*iter, struct ext4_prealloc_space, pa_node.inode_node); new_pa = rb_entry(new, struct ext4_prealloc_space, pa_node.inode_node); iter_start = iter_pa->pa_lstart; new_start = new_pa->pa_lstart; parent = *iter; if (new_start < iter_start) iter = &((*iter)->rb_left); else iter = &((*iter)->rb_right); } rb_link_node(new, parent, iter); rb_insert_color(new, root); } /* * creates new preallocated space for given inode */ static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_prealloc_space *pa; struct ext4_group_info *grp; struct ext4_inode_info *ei; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) { struct ext4_free_extent ex = { .fe_logical = ac->ac_g_ex.fe_logical, .fe_len = ac->ac_orig_goal_len, }; loff_t orig_goal_end = extent_logical_end(sbi, &ex); loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex); /* * We can't allocate as much as normalizer wants, so we try * to get proper lstart to cover the original request, except * when the goal doesn't cover the original request as below: * * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048 * best_ex:0/200(200) -> adjusted: 1848/2048(200) */ BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical); BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len); /* * Use the below logic for adjusting best extent as it keeps * fragmentation in check while ensuring logical range of best * extent doesn't overflow out of goal extent: * * 1. Check if best ex can be kept at end of goal (before * cr_best_avail trimmed it) and still cover original start * 2. Else, check if best ex can be kept at start of goal and * still cover original end * 3. Else, keep the best ex at start of original request. */ ex.fe_len = ac->ac_b_ex.fe_len; ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len); if (ac->ac_o_ex.fe_logical >= ex.fe_logical) goto adjust_bex; ex.fe_logical = ac->ac_g_ex.fe_logical; if (o_ex_end <= extent_logical_end(sbi, &ex)) goto adjust_bex; ex.fe_logical = ac->ac_o_ex.fe_logical; adjust_bex: ac->ac_b_ex.fe_logical = ex.fe_logical; BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical); BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end); } pa->pa_lstart = ac->ac_b_ex.fe_logical; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); atomic_add(pa->pa_free, &sbi->s_mb_preallocated); ext4_mb_use_inode_pa(ac, pa); ei = EXT4_I(ac->ac_inode); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; pa->pa_node_lock.inode_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); write_lock(pa->pa_node_lock.inode_lock); ext4_mb_pa_rb_insert(&ei->i_prealloc_node, &pa->pa_node.inode_node); write_unlock(pa->pa_node_lock.inode_lock); atomic_inc(&ei->i_prealloc_active); } /* * creates new preallocated space for locality group inodes belongs to */ static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; struct ext4_group_info *grp; /* preallocate only when found space is larger then requested */ BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(ac->ac_pa == NULL); pa = ac->ac_pa; pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_node.lg_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); grp = ext4_get_group_info(sb, ac->ac_b_ex.fe_group); if (!grp) return; lg = ac->ac_lg; BUG_ON(lg == NULL); pa->pa_node_lock.lg_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; list_add(&pa->pa_group_list, &grp->bb_prealloc_list); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ } static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) ext4_mb_new_group_pa(ac); else ext4_mb_new_inode_pa(ac); } /* * finds all unused blocks in on-disk bitmap, frees them in * in-core bitmap and buddy. * @pa must be unlinked from inode and group lists, so that * nobody else can find/use it. * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ static noinline_for_stack void ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned int end; unsigned int next; ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); grp_blk_start = pa->pa_pstart - EXT4_C2B(sbi, bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; while (bit < end) { bit = mb_find_next_zero_bit(bitmap_bh->b_data, end, bit); if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; trace_ext4_mballoc_discard(sb, NULL, group, bit, next - bit); trace_ext4_mb_release_inode_pa(pa, (grp_blk_start + EXT4_C2B(sbi, bit)), next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. */ } atomic_add(free, &sbi->s_mb_discarded); } static noinline_for_stack void ext4_mb_release_group_pa(struct ext4_buddy *e4b, struct ext4_prealloc_space *pa) { struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; trace_ext4_mb_release_group_pa(sb, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) { ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu", e4b->bd_group, group, pa->pa_pstart); return; } mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len); atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded); trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len); } /* * releases all preallocations in given group * * first, we need to decide discard policy: * - when do we discard * 1) ENOSPC * - how many do we discard * 1) how many requested */ static noinline_for_stack int ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int *busy) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; LIST_HEAD(list); struct ext4_buddy e4b; struct ext4_inode_info *ei; int err; int free = 0; if (!grp) return 0; mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); goto out_dbg; } ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { spin_unlock(&pa->pa_lock); *busy = 1; continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); if (!free) this_cpu_inc(discard_pa_seq); /* we can trust pa_free ... */ free += pa->pa_free; spin_unlock(&pa->pa_lock); list_del(&pa->pa_group_list); list_add(&pa->u.pa_tmp_list, &list); } /* now free all selected PAs */ list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { /* remove from object (inode or locality group) */ if (pa->pa_type == MB_GROUP_PA) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); } else { write_lock(pa->pa_node_lock.inode_lock); ei = EXT4_I(pa->pa_inode); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); write_unlock(pa->pa_node_lock.inode_lock); } list_del(&pa->u.pa_tmp_list); if (pa->pa_type == MB_GROUP_PA) { ext4_mb_release_group_pa(&e4b, pa); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } else { ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_mb_pa_free(pa); } } ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); out_dbg: mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", free, group, grp->bb_free); return free; } /* * releases all non-used preallocated blocks for given inode * * It's important to discard preallocations under i_data_sem * We don't want another block to be served from the prealloc * space when we are discarding the inode prealloc space. * * FIXME!! Make sure it is valid at all the call sites */ void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; ext4_group_t group = 0; LIST_HEAD(list); struct ext4_buddy e4b; struct rb_node *iter; int err; if (!S_ISREG(inode->i_mode)) return; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return; mb_debug(sb, "discard preallocation for inode %lu\n", inode->i_ino); trace_ext4_discard_preallocations(inode, atomic_read(&ei->i_prealloc_active)); repeat: /* first, collect all pa's in the inode */ write_lock(&ei->i_prealloc_lock); for (iter = rb_first(&ei->i_prealloc_node); iter; iter = rb_next(iter)) { pa = rb_entry(iter, struct ext4_prealloc_space, pa_node.inode_node); BUG_ON(pa->pa_node_lock.inode_lock != &ei->i_prealloc_lock); spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* this shouldn't happen often - nobody should * use preallocation while we're discarding it */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); ext4_msg(sb, KERN_ERR, "uh-oh! used pa while discarding"); WARN_ON(1); schedule_timeout_uninterruptible(HZ); goto repeat; } if (pa->pa_deleted == 0) { ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node); list_add(&pa->u.pa_tmp_list, &list); continue; } /* someone is deleting pa right now */ spin_unlock(&pa->pa_lock); write_unlock(&ei->i_prealloc_lock); /* we have to wait here because pa_deleted * doesn't mean pa is already unlinked from * the list. as we might be called from * ->clear_inode() the inode will get freed * and concurrent thread which is unlinking * pa from inode's list may access already * freed memory, bad-bad-bad */ /* XXX: if this happens too often, we can * add a flag to force wait only in case * of ->clear_inode(), but not in case of * regular truncate */ schedule_timeout_uninterruptible(HZ); goto repeat; } write_unlock(&ei->i_prealloc_lock); list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) { BUG_ON(pa->pa_type != MB_INODE_PA); group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { err = PTR_ERR(bitmap_bh); ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); ext4_mb_unload_buddy(&e4b); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); list_del(&pa->u.pa_tmp_list); ext4_mb_pa_free(pa); } } static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa; BUG_ON(ext4_pspace_cachep == NULL); pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); if (!pa) return -ENOMEM; atomic_set(&pa->pa_count, 1); ac->ac_pa = pa; return 0; } static void ext4_mb_pa_put_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); /* * current function is only called due to an error or due to * len of found blocks < len of requested blocks hence the PA has not * been added to grp->bb_prealloc_list. So we don't need to lock it */ pa->pa_deleted = 1; ext4_mb_pa_free(pa); } #ifdef CONFIG_EXT4_DEBUG static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); mb_debug(sb, "groups: "); for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; struct list_head *cur; if (!grp) continue; ext4_lock_group(sb, i); list_for_each(cur, &grp->bb_prealloc_list) { pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); spin_lock(&pa->pa_lock); ext4_get_group_no_and_offset(sb, pa->pa_pstart, NULL, &start); spin_unlock(&pa->pa_lock); mb_debug(sb, "PA:%u:%d:%d\n", i, start, pa->pa_len); } ext4_unlock_group(sb, i); mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, grp->bb_fragments); } } static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" " Allocation context details:"); mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); mb_debug(sb, "orig %lu/%lu/%lu@%lu, " "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, (unsigned long)ac->ac_o_ex.fe_len, (unsigned long)ac->ac_o_ex.fe_logical, (unsigned long)ac->ac_g_ex.fe_group, (unsigned long)ac->ac_g_ex.fe_start, (unsigned long)ac->ac_g_ex.fe_len, (unsigned long)ac->ac_g_ex.fe_logical, (unsigned long)ac->ac_b_ex.fe_group, (unsigned long)ac->ac_b_ex.fe_start, (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); mb_debug(sb, "%u found", ac->ac_found); mb_debug(sb, "used pa: %s, ", str_yes_no(ac->ac_pa)); if (ac->ac_pa) mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ? "group pa" : "inode pa"); ext4_mb_show_pa(sb); } #else static inline void ext4_mb_show_pa(struct super_block *sb) { } static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { ext4_mb_show_pa(ac->ac_sb); } #endif /* * We use locality group preallocation for small size file. The size of the * file is determined by the current size or the resulting size after * allocation which ever is larger * * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req */ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); int bsbits = ac->ac_sb->s_blocksize_bits; loff_t size, isize; bool inode_pa_eligible, group_pa_eligible; if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return; if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) return; group_pa_eligible = sbi->s_mb_group_prealloc > 0; inode_pa_eligible = true; size = extent_logical_end(sbi, &ac->ac_o_ex); isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) >> bsbits; /* No point in using inode preallocation for closed files */ if ((size == isize) && !ext4_fs_is_busy(sbi) && !inode_is_open_for_write(ac->ac_inode)) inode_pa_eligible = false; size = max(size, isize); /* Don't use group allocation for large files */ if (size > sbi->s_mb_stream_request) group_pa_eligible = false; if (!group_pa_eligible) { if (inode_pa_eligible) ac->ac_flags |= EXT4_MB_STREAM_ALLOC; else ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; return; } BUG_ON(ac->ac_lg != NULL); /* * locality group prealloc space are per cpu. The reason for having * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; /* serialize all allocations in the group */ mutex_lock(&ac->ac_lg->lg_mutex); } static noinline_for_stack void ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; unsigned int len; ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ len = ar->len; /* just a dirty hack to filter too big requests */ if (len >= EXT4_CLUSTERS_PER_GROUP(sb)) len = EXT4_CLUSTERS_PER_GROUP(sb); /* start searching from the goal */ goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ext4_get_group_no_and_offset(sb, goal, &group, &block); /* set up allocation goals */ ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical); ac->ac_status = AC_STATUS_CONTINUE; ac->ac_sb = sb; ac->ac_inode = ar->inode; ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical; ac->ac_o_ex.fe_group = group; ac->ac_o_ex.fe_start = block; ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_orig_goal_len = ac->ac_g_ex.fe_len; ac->ac_flags = ar->flags; /* we have to define context: we'll work with a file or * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, (unsigned) ar->lleft, (unsigned) ar->pleft, (unsigned) ar->lright, (unsigned) ar->pright, inode_is_open_for_write(ar->inode) ? "" : "non-"); } static noinline_for_stack void ext4_mb_discard_lg_preallocations(struct super_block *sb, struct ext4_locality_group *lg, int order, int total_entries) { ext4_group_t group = 0; struct ext4_buddy e4b; LIST_HEAD(discard_list); struct ext4_prealloc_space *pa, *tmp; mb_debug(sb, "discard locality group preallocation\n"); spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&pa->pa_lock); if (atomic_read(&pa->pa_count)) { /* * This is the pa that we just used * for block allocation. So don't * free that */ spin_unlock(&pa->pa_lock); continue; } if (pa->pa_deleted) { spin_unlock(&pa->pa_lock); continue; } /* only lg prealloc space */ BUG_ON(pa->pa_type != MB_GROUP_PA); /* seems this one can be freed ... */ ext4_mb_mark_pa_deleted(sb, pa); spin_unlock(&pa->pa_lock); list_del_rcu(&pa->pa_node.lg_list); list_add(&pa->u.pa_tmp_list, &discard_list); total_entries--; if (total_entries <= 5) { /* * we want to keep only 5 entries * allowing it to grow to 8. This * mak sure we don't call discard * soon for this list. */ break; } } spin_unlock(&lg->lg_prealloc_lock); list_for_each_entry_safe(pa, tmp, &discard_list, u.pa_tmp_list) { int err; group = ext4_get_group_number(sb, pa->pa_pstart); err = ext4_mb_load_buddy_gfp(sb, group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) { ext4_error_err(sb, -err, "Error %d loading buddy information for %u", err, group); continue; } ext4_lock_group(sb, group); list_del(&pa->pa_group_list); ext4_mb_release_group_pa(&e4b, pa); ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } } /* * We have incremented pa_count. So it cannot be freed at this * point. Also we hold lg_mutex. So no parallel allocation is * possible from this lg. That means pa_free cannot be updated. * * A parallel ext4_mb_discard_group_preallocations is possible. * which can cause the lg_prealloc_list to be updated. */ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac) { int order, added = 0, lg_prealloc_count = 1; struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg = ac->ac_lg; struct ext4_prealloc_space *tmp_pa, *pa = ac->ac_pa; order = fls(pa->pa_free) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; /* Add the prealloc space to lg */ spin_lock(&lg->lg_prealloc_lock); list_for_each_entry_rcu(tmp_pa, &lg->lg_prealloc_list[order], pa_node.lg_list, lockdep_is_held(&lg->lg_prealloc_lock)) { spin_lock(&tmp_pa->pa_lock); if (tmp_pa->pa_deleted) { spin_unlock(&tmp_pa->pa_lock); continue; } if (!added && pa->pa_free < tmp_pa->pa_free) { /* Add to the tail of the previous entry */ list_add_tail_rcu(&pa->pa_node.lg_list, &tmp_pa->pa_node.lg_list); added = 1; /* * we want to count the total * number of entries in the list */ } spin_unlock(&tmp_pa->pa_lock); lg_prealloc_count++; } if (!added) list_add_tail_rcu(&pa->pa_node.lg_list, &lg->lg_prealloc_list[order]); spin_unlock(&lg->lg_prealloc_lock); /* Now trim the list to be not more than 8 elements */ if (lg_prealloc_count > 8) ext4_mb_discard_lg_preallocations(sb, lg, order, lg_prealloc_count); } /* * release all resource we used in allocation */ static void ext4_mb_release_context(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_prealloc_space *pa = ac->ac_pa; if (pa) { if (pa->pa_type == MB_GROUP_PA) { /* see comment in ext4_mb_use_group_pa() */ spin_lock(&pa->pa_lock); pa->pa_pstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_lstart += EXT4_C2B(sbi, ac->ac_b_ex.fe_len); pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); /* * We want to add the pa to the right bucket. * Remove it from the list and while adding * make sure the list to which we are adding * doesn't grow big. */ if (likely(pa->pa_free)) { spin_lock(pa->pa_node_lock.lg_lock); list_del_rcu(&pa->pa_node.lg_list); spin_unlock(pa->pa_node_lock.lg_lock); ext4_mb_add_n_trim(ac); } } ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->ac_bitmap_folio) folio_put(ac->ac_bitmap_folio); if (ac->ac_buddy_folio) folio_put(ac->ac_buddy_folio); if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) mutex_unlock(&ac->ac_lg->lg_mutex); ext4_mb_collect_stats(ac); } static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0, busy = 0; int retry = 0; trace_ext4_mb_discard_preallocations(sb, needed); if (needed == 0) needed = EXT4_CLUSTERS_PER_GROUP(sb) + 1; repeat: for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, &busy); freed += ret; needed -= ret; cond_resched(); } if (needed > 0 && busy && ++retry < 3) { busy = 0; goto repeat; } return freed; } static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, struct ext4_allocation_context *ac, u64 *seq) { int freed; u64 seq_retry = 0; bool ret = false; freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); if (freed) { ret = true; goto out_dbg; } seq_retry = ext4_get_discard_pa_seq_sum(); if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { ac->ac_flags |= EXT4_MB_STRICT_CHECK; *seq = seq_retry; ret = true; } out_dbg: mb_debug(sb, "freed %d, retry ? %s\n", freed, str_yes_no(ret)); return ret; } /* * Simple allocator for Ext4 fast commit replay path. It searches for blocks * linearly starting at the goal block and also excludes the blocks which * are going to be in use after fast commit replay. */ static ext4_fsblk_t ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp) { struct buffer_head *bitmap_bh; struct super_block *sb = ar->inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group, nr; ext4_grpblk_t blkoff; ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb); ext4_grpblk_t i = 0; ext4_fsblk_t goal, block; struct ext4_super_block *es = sbi->s_es; goal = ar->goal; if (goal < le32_to_cpu(es->s_first_data_block) || goal >= ext4_blocks_count(es)) goal = le32_to_cpu(es->s_first_data_block); ar->len = 0; ext4_get_group_no_and_offset(sb, goal, &group, &blkoff); for (nr = ext4_get_groups_count(sb); nr > 0; nr--) { bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { *errp = PTR_ERR(bitmap_bh); pr_warn("Failed to read block bitmap\n"); return 0; } while (1) { i = mb_find_next_zero_bit(bitmap_bh->b_data, max, blkoff); if (i >= max) break; if (ext4_fc_replay_check_excluded(sb, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i))) { blkoff = i + 1; } else break; } brelse(bitmap_bh); if (i < max) break; if (++group >= ext4_get_groups_count(sb)) group = 0; blkoff = 0; } if (i >= max) { *errp = -ENOSPC; return 0; } block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i); ext4_mb_mark_bb(sb, block, 1, true); ar->len = 1; *errp = 0; return block; } /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; int retries = 0; u64 seq; might_sleep(); sb = ar->inode->i_sb; sbi = EXT4_SB(sb); trace_ext4_request_blocks(ar); if (sbi->s_mount_state & EXT4_FC_REPLAY) return ext4_mb_new_blocks_simple(ar, errp); /* Allow to use superuser reservation for quota file */ if (ext4_is_quota_file(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. */ while (ar->len && ext4_claim_free_clusters(sbi, ar->len, ar->flags)) { /* let others to free the space */ cond_resched(); ar->len = ar->len >> 1; } if (!ar->len) { ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } reserv_clstrs = ar->len; if (ar->flags & EXT4_MB_USE_ROOT_BLOCKS) { dquot_alloc_block_nofail(ar->inode, EXT4_C2B(sbi, ar->len)); } else { while (ar->len && dquot_alloc_block(ar->inode, EXT4_C2B(sbi, ar->len))) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; } } inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; goto out; } } ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS); if (!ac) { ar->len = 0; *errp = -ENOMEM; goto out; } ext4_mb_initialize_context(ac, ar); ac->ac_op = EXT4_MB_HISTORY_PREALLOC; seq = this_cpu_read(discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); *errp = ext4_mb_pa_alloc(ac); if (*errp) goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); /* * pa allocated above is added to grp->bb_prealloc_list only * when we were able to allocate some block i.e. when * ac->ac_status == AC_STATUS_FOUND. * And error from above mean ac->ac_status != AC_STATUS_FOUND * So we have to free this pa here itself. */ if (*errp) { ext4_mb_pa_put_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } if (ac->ac_status == AC_STATUS_FOUND && ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) ext4_mb_pa_put_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); if (*errp) { ext4_discard_allocated_blocks(ac); goto errout; } else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } } else { if (++retries < 3 && ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; /* * If block allocation fails then the pa allocated above * needs to be freed here itself. */ ext4_mb_pa_put_free(ac); *errp = -ENOSPC; } if (*errp) { errout: ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } ext4_mb_release_context(ac); kmem_cache_free(ext4_ac_cachep, ac); out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); } trace_ext4_allocate_blocks(ar, (unsigned long long)block); return block; } /* * We can merge two free data extents only if the physical blocks * are contiguous, AND the extents were freed by the same transaction, * AND the blocks are associated with the same group. */ static void ext4_try_merge_freed_extent(struct ext4_sb_info *sbi, struct ext4_free_data *entry, struct ext4_free_data *new_entry, struct rb_root *entry_rb_root) { if ((entry->efd_tid != new_entry->efd_tid) || (entry->efd_group != new_entry->efd_group)) return; if (entry->efd_start_cluster + entry->efd_count == new_entry->efd_start_cluster) { new_entry->efd_start_cluster = entry->efd_start_cluster; new_entry->efd_count += entry->efd_count; } else if (new_entry->efd_start_cluster + new_entry->efd_count == entry->efd_start_cluster) { new_entry->efd_count += entry->efd_count; } else return; spin_lock(&sbi->s_md_lock); list_del(&entry->efd_list); spin_unlock(&sbi->s_md_lock); rb_erase(&entry->efd_node, entry_rb_root); kmem_cache_free(ext4_free_data_cachep, entry); } static noinline_for_stack void ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { ext4_group_t group = e4b->bd_group; ext4_grpblk_t cluster; ext4_grpblk_t clusters = new_entry->efd_count; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_folio == NULL); BUG_ON(e4b->bd_buddy_folio == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; if (!*n) { /* first free block exent. We need to protect buddy cache from being freed, * otherwise we'll refresh it from * on-disk bitmap and lose not-yet-available * blocks */ folio_get(e4b->bd_buddy_folio); folio_get(e4b->bd_bitmap_folio); } while (*n) { parent = *n; entry = rb_entry(parent, struct ext4_free_data, efd_node); if (cluster < entry->efd_start_cluster) n = &(*n)->rb_left; else if (cluster >= (entry->efd_start_cluster + entry->efd_count)) n = &(*n)->rb_right; else { ext4_grp_locked_error(sb, group, 0, ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, cluster), "Block already on to-be-freed list"); kmem_cache_free(ext4_free_data_cachep, new_entry); return; } } rb_link_node(new_node, parent, n); rb_insert_color(new_node, &db->bb_free_root); /* Now try to see the extent can be merged to left and right */ node = rb_prev(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); ext4_try_merge_freed_extent(sbi, entry, new_entry, &(db->bb_free_root)); } node = rb_next(new_node); if (node) { entry = rb_entry(node, struct ext4_free_data, efd_node); ext4_try_merge_freed_extent(sbi, entry, new_entry, &(db->bb_free_root)); } spin_lock(&sbi->s_md_lock); list_add_tail(&new_entry->efd_list, &sbi->s_freed_data_list[new_entry->efd_tid & 1]); sbi->s_mb_free_pending += clusters; spin_unlock(&sbi->s_md_lock); } static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block, unsigned long count) { struct super_block *sb = inode->i_sb; ext4_group_t group; ext4_grpblk_t blkoff; ext4_get_group_no_and_offset(sb, block, &group, &blkoff); ext4_mb_mark_context(NULL, sb, false, group, blkoff, count, EXT4_MB_BITMAP_MARKED_CHECK | EXT4_MB_SYNC_UPDATE, NULL); } /** * ext4_mb_clear_bb() -- helper function for freeing blocks. * Used by ext4_free_blocks() * @handle: handle for this transaction * @inode: inode * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ static void ext4_mb_clear_bb(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; struct ext4_group_info *grp; unsigned int overflow; ext4_grpblk_t bit; ext4_group_t block_group; struct ext4_sb_info *sbi; struct ext4_buddy e4b; unsigned int count_clusters; int err = 0; int mark_flags = 0; ext4_grpblk_t changed; sbi = EXT4_SB(sb); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_out; } flags |= EXT4_FREE_BLOCKS_VALIDATED; do_more: overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); grp = ext4_get_group_info(sb, block_group); if (unlikely(!grp || EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) return; /* * Check to see if we are freeing blocks across a group * boundary. */ if (EXT4_C2B(sbi, bit) + count > EXT4_BLOCKS_PER_GROUP(sb)) { overflow = EXT4_C2B(sbi, bit) + count - EXT4_BLOCKS_PER_GROUP(sb); count -= overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } count_clusters = EXT4_NUM_B2C(sbi, count); trace_ext4_mballoc_free(sb, inode, block_group, bit, count_clusters); /* __GFP_NOFAIL: retry infinitely, ignore TIF_MEMDIE and memcg limit. */ err = ext4_mb_load_buddy_gfp(sb, block_group, &e4b, GFP_NOFS|__GFP_NOFAIL); if (err) goto error_out; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks in system zone - " "Block = %llu, count = %lu", block, count); /* err = 0. ext4_std_error should be a no op */ goto error_clean; } #ifdef AGGRESSIVE_CHECK mark_flags |= EXT4_MB_BITMAP_MARKED_CHECK; #endif err = ext4_mb_mark_context(handle, sb, false, block_group, bit, count_clusters, mark_flags, &changed); if (err && changed == 0) goto error_clean; #ifdef AGGRESSIVE_CHECK BUG_ON(changed != count_clusters); #endif /* * We need to make sure we don't reuse the freed block until after the * transaction is committed. We make an exception if the inode is to be * written in writeback mode since writeback mode has weak data * consistency guarantees. */ if (ext4_handle_valid(handle) && ((flags & EXT4_FREE_BLOCKS_METADATA) || !ext4_should_writeback_data(inode))) { struct ext4_free_data *new_entry; /* * We use __GFP_NOFAIL because ext4_free_blocks() is not allowed * to fail. */ new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS|__GFP_NOFAIL); new_entry->efd_start_cluster = bit; new_entry->efd_group = block_group; new_entry->efd_count = count_clusters; new_entry->efd_tid = handle->h_transaction->t_tid; ext4_lock_group(sb, block_group); ext4_mb_free_metadata(handle, &e4b, new_entry); } else { if (test_opt(sb, DISCARD)) { err = ext4_issue_discard(sb, block_group, bit, count_clusters); /* * Ignore EOPNOTSUPP error. This is consistent with * what happens when using journal. */ if (err == -EOPNOTSUPP) err = 0; if (err) ext4_msg(sb, KERN_WARNING, "discard request in" " group:%u block:%d count:%lu failed" " with %d", block_group, bit, count, err); } EXT4_MB_GRP_CLEAR_TRIMMED(e4b.bd_info); ext4_lock_group(sb, block_group); mb_free_blocks(inode, &e4b, bit, count_clusters); } ext4_unlock_group(sb, block_group); /* * on a bigalloc file system, defer the s_freeclusters_counter * update to the caller (ext4_remove_space and friends) so they * can determine if a cluster freed here should be rereserved */ if (!(flags & EXT4_FREE_BLOCKS_RERESERVE_CLUSTER)) { if (!(flags & EXT4_FREE_BLOCKS_NO_QUOT_UPDATE)) dquot_free_block(inode, EXT4_C2B(sbi, count_clusters)); percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters); } if (overflow && !err) { block += count; count = overflow; ext4_mb_unload_buddy(&e4b); /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; goto do_more; } error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); } /** * ext4_free_blocks() -- Free given blocks and update quota * @handle: handle for this transaction * @inode: inode * @bh: optional buffer of the block to be freed * @block: starting physical block to be freed * @count: number of blocks to be freed * @flags: flags used by ext4_free_blocks */ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t block, unsigned long count, int flags) { struct super_block *sb = inode->i_sb; unsigned int overflow; struct ext4_sb_info *sbi; sbi = EXT4_SB(sb); if (bh) { if (block) BUG_ON(block != bh->b_blocknr); else block = bh->b_blocknr; } if (sbi->s_mount_state & EXT4_FC_REPLAY) { ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count)); return; } might_sleep(); if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_inode_block_valid(inode, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " "block = %llu, count = %lu", block, count); return; } flags |= EXT4_FREE_BLOCKS_VALIDATED; ext4_debug("freeing block %llu\n", block); trace_ext4_free_blocks(inode, block, count, flags); if (bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { BUG_ON(count > 1); ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, inode, bh, block); } /* * If the extent to be freed does not begin on a cluster * boundary, we need to deal with partial clusters at the * beginning and end of the extent. Normally we will free * blocks at the beginning or the end unless we are explicitly * requested to avoid doing so. */ overflow = EXT4_PBLK_COFF(sbi, block); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER) { overflow = sbi->s_cluster_ratio - overflow; block += overflow; if (count > overflow) count -= overflow; else return; } else { block -= overflow; count += overflow; } /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } overflow = EXT4_LBLK_COFF(sbi, count); if (overflow) { if (flags & EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER) { if (count > overflow) count -= overflow; else return; } else count += sbi->s_cluster_ratio - overflow; /* The range changed so it's no longer validated */ flags &= ~EXT4_FREE_BLOCKS_VALIDATED; } if (!bh && (flags & EXT4_FREE_BLOCKS_FORGET)) { int i; int is_metadata = flags & EXT4_FREE_BLOCKS_METADATA; for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) bh = sb_find_get_block_nonatomic(inode->i_sb, block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } ext4_mb_clear_bb(handle, inode, block, count, flags); } /** * ext4_group_add_blocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block * @block: start physical block to add to the block group * @count: number of blocks to free * * This marks the blocks as free in the bitmap and buddy. */ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count) { ext4_group_t block_group; ext4_grpblk_t bit; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_buddy e4b; int err = 0; ext4_fsblk_t first_cluster = EXT4_B2C(sbi, block); ext4_fsblk_t last_cluster = EXT4_B2C(sbi, block + count - 1); unsigned long cluster_count = last_cluster - first_cluster + 1; ext4_grpblk_t changed; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); if (cluster_count == 0) return 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); /* * Check to see if we are freeing blocks across a group * boundary. */ if (bit + cluster_count > EXT4_CLUSTERS_PER_GROUP(sb)) { ext4_warning(sb, "too many blocks added to group %u", block_group); err = -EINVAL; goto error_out; } err = ext4_mb_load_buddy(sb, block_group, &e4b); if (err) goto error_out; if (!ext4_sb_block_valid(sb, NULL, block, count)) { ext4_error(sb, "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); err = -EINVAL; goto error_clean; } err = ext4_mb_mark_context(handle, sb, false, block_group, bit, cluster_count, EXT4_MB_BITMAP_MARKED_CHECK, &changed); if (err && changed == 0) goto error_clean; if (changed != cluster_count) ext4_error(sb, "bit already cleared in group %u", block_group); ext4_lock_group(sb, block_group); mb_free_blocks(NULL, &e4b, bit, cluster_count); ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeclusters_counter, changed); error_clean: ext4_mb_unload_buddy(&e4b); error_out: ext4_std_error(sb, err); return err; } /** * ext4_trim_extent -- function to TRIM one single free extent in the group * @sb: super block for the file system * @start: starting block of the free extent in the alloc. group * @count: number of blocks to TRIM * @e4b: ext4 buddy for the group * * Trim "count" blocks starting at "start" in the "group". To assure that no * one will allocate those blocks, mark it as used in buddy bitmap. This must * be called with under the group lock. */ static int ext4_trim_extent(struct super_block *sb, int start, int count, struct ext4_buddy *e4b) __releases(bitlock) __acquires(bitlock) { struct ext4_free_extent ex; ext4_group_t group = e4b->bd_group; int ret = 0; trace_ext4_trim_extent(sb, group, start, count); assert_spin_locked(ext4_group_lock_ptr(sb, group)); ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; /* * Mark blocks used, so no one can reuse them while * being trimmed. */ mb_mark_used(e4b, &ex); ext4_unlock_group(sb, group); ret = ext4_issue_discard(sb, group, start, count); ext4_lock_group(sb, group); mb_free_blocks(NULL, e4b, start, ex.fe_len); return ret; } static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb, ext4_group_t grp) { unsigned long nr_clusters_in_group; if (grp < (ext4_get_groups_count(sb) - 1)) nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb); else nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, grp)) >> EXT4_CLUSTER_BITS(sb); return nr_clusters_in_group - 1; } static bool ext4_trim_interrupted(void) { return fatal_signal_pending(current) || freezing(current); } static int ext4_try_to_trim_range(struct super_block *sb, struct ext4_buddy *e4b, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) __acquires(ext4_group_lock_ptr(sb, e4b->bd_group)) __releases(ext4_group_lock_ptr(sb, e4b->bd_group)) { ext4_grpblk_t next, count, free_count, last, origin_start; bool set_trimmed = false; void *bitmap; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) return 0; last = ext4_last_grp_cluster(sb, e4b->bd_group); bitmap = e4b->bd_bitmap; if (start == 0 && max >= last) set_trimmed = true; origin_start = start; start = max(e4b->bd_info->bb_first_free, start); count = 0; free_count = 0; while (start <= max) { start = mb_find_next_zero_bit(bitmap, max + 1, start); if (start > max) break; next = mb_find_next_bit(bitmap, last + 1, start); if (origin_start == 0 && next >= last) set_trimmed = true; if ((next - start) >= minblocks) { int ret = ext4_trim_extent(sb, start, next - start, e4b); if (ret && ret != -EOPNOTSUPP) return count; count += next - start; } free_count += next - start; start = next + 1; if (ext4_trim_interrupted()) return count; if (need_resched()) { ext4_unlock_group(sb, e4b->bd_group); cond_resched(); ext4_lock_group(sb, e4b->bd_group); } if ((e4b->bd_info->bb_free - free_count) < minblocks) break; } if (set_trimmed) EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info); return count; } /** * ext4_trim_all_free -- function to trim all free space in alloc. group * @sb: super block for file system * @group: group to be trimmed * @start: first group block to examine * @max: last group block to examine * @minblocks: minimum extent block count * * ext4_trim_all_free walks through group's block bitmap searching for free * extents. When the free extent is found, mark it as used in group buddy * bitmap. Then issue a TRIM command on this extent and free the extent in * the group buddy bitmap. */ static ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks) { struct ext4_buddy e4b; int ret; trace_ext4_trim_all_free(sb, group, start, max); ret = ext4_mb_load_buddy(sb, group, &e4b); if (ret) { ext4_warning(sb, "Error %d loading buddy information for %u", ret, group); return ret; } ext4_lock_group(sb, group); if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) || minblocks < EXT4_SB(sb)->s_last_trim_minblks) ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks); else ret = 0; ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); ext4_debug("trimmed %d blocks in the group %d\n", ret, group); return ret; } /** * ext4_trim_fs() -- trim ioctl handle function * @sb: superblock for filesystem * @range: fstrim_range structure * * start: First Byte to trim * len: number of Bytes to trim from start * minlen: minimum extent length in Bytes * ext4_trim_fs goes through all allocation groups containing Bytes from * start to start+len. For each such a group ext4_trim_all_free function * is invoked to trim all free space. */ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) { unsigned int discard_granularity = bdev_discard_granularity(sb->s_bdev); struct ext4_group_info *grp; ext4_group_t group, first_group, last_group; ext4_grpblk_t cnt = 0, first_cluster, last_cluster; uint64_t start, end, minlen, trimmed = 0; ext4_fsblk_t first_data_blk = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es); int ret = 0; start = range->start >> sb->s_blocksize_bits; end = start + (range->len >> sb->s_blocksize_bits) - 1; minlen = EXT4_NUM_B2C(EXT4_SB(sb), range->minlen >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb) || start >= max_blks || range->len < sb->s_blocksize) return -EINVAL; /* No point to try to trim less than discard granularity */ if (range->minlen < discard_granularity) { minlen = EXT4_NUM_B2C(EXT4_SB(sb), discard_granularity >> sb->s_blocksize_bits); if (minlen > EXT4_CLUSTERS_PER_GROUP(sb)) goto out; } if (end >= max_blks - 1) end = max_blks - 1; if (end <= first_data_blk) goto out; if (start < first_data_blk) start = first_data_blk; /* Determine first and last group to examine based on start and end */ ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start, &first_group, &first_cluster); ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end, &last_group, &last_cluster); /* end now represents the last cluster to discard in this group */ end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; for (group = first_group; group <= last_group; group++) { if (ext4_trim_interrupted()) break; grp = ext4_get_group_info(sb, group); if (!grp) continue; /* We only do this if the grp has never been initialized */ if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; } /* * For all the groups except the last one, last cluster will * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to * change it for the last group, note that last_cluster is * already computed earlier by ext4_get_group_no_and_offset() */ if (group == last_group) end = last_cluster; if (grp->bb_free >= minlen) { cnt = ext4_trim_all_free(sb, group, first_cluster, end, minlen); if (cnt < 0) { ret = cnt; break; } trimmed += cnt; } /* * For every group except the first one, we are sure * that the first cluster to discard will be cluster #0. */ first_cluster = 0; } if (!ret) EXT4_SB(sb)->s_last_trim_minblks = minlen; out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } /* Iterate all the free extents in the group. */ int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t group, ext4_grpblk_t first, ext4_grpblk_t end, ext4_mballoc_query_range_fn meta_formatter, ext4_mballoc_query_range_fn formatter, void *priv) { void *bitmap; ext4_grpblk_t start, next; struct ext4_buddy e4b; int error; error = ext4_mb_load_buddy(sb, group, &e4b); if (error) return error; bitmap = e4b.bd_bitmap; ext4_lock_group(sb, group); start = max(e4b.bd_info->bb_first_free, first); if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; if (meta_formatter && start != first) { if (start > end) start = end; ext4_unlock_group(sb, group); error = meta_formatter(sb, group, first, start - first, priv); if (error) goto out_unload; ext4_lock_group(sb, group); } while (start <= end) { start = mb_find_next_zero_bit(bitmap, end + 1, start); if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); start = next + 1; } ext4_unlock_group(sb, group); out_unload: ext4_mb_unload_buddy(&e4b); return error; } #ifdef CONFIG_EXT4_KUNIT_TESTS #include "mballoc-test.c" #endif |
| 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 | // SPDX-License-Identifier: GPL-2.0-or-later /* * drivers/net/macsec.c - MACsec device * * Copyright (c) 2015 Sabrina Dubroca <sd@queasysnail.net> */ #include <linux/types.h> #include <linux/skbuff.h> #include <linux/socket.h> #include <linux/module.h> #include <crypto/aead.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> #include <linux/refcount.h> #include <net/genetlink.h> #include <net/sock.h> #include <net/gro_cells.h> #include <net/macsec.h> #include <net/dst_metadata.h> #include <net/netdev_lock.h> #include <linux/phy.h> #include <linux/byteorder/generic.h> #include <linux/if_arp.h> #include <uapi/linux/if_macsec.h> /* SecTAG length = macsec_eth_header without the optional SCI */ #define MACSEC_TAG_LEN 6 struct macsec_eth_header { struct ethhdr eth; /* SecTAG */ u8 tci_an; #if defined(__LITTLE_ENDIAN_BITFIELD) u8 short_length:6, unused:2; #elif defined(__BIG_ENDIAN_BITFIELD) u8 unused:2, short_length:6; #else #error "Please fix <asm/byteorder.h>" #endif __be32 packet_number; u8 secure_channel_id[8]; /* optional */ } __packed; /* minimum secure data length deemed "not short", see IEEE 802.1AE-2006 9.7 */ #define MIN_NON_SHORT_LEN 48 #define GCM_AES_IV_LEN 12 #define for_each_rxsc(secy, sc) \ for (sc = rcu_dereference_bh(secy->rx_sc); \ sc; \ sc = rcu_dereference_bh(sc->next)) #define for_each_rxsc_rtnl(secy, sc) \ for (sc = rtnl_dereference(secy->rx_sc); \ sc; \ sc = rtnl_dereference(sc->next)) #define pn_same_half(pn1, pn2) (!(((pn1) >> 31) ^ ((pn2) >> 31))) struct gcm_iv_xpn { union { u8 short_secure_channel_id[4]; ssci_t ssci; }; __be64 pn; } __packed; struct gcm_iv { union { u8 secure_channel_id[8]; sci_t sci; }; __be32 pn; }; #define MACSEC_VALIDATE_DEFAULT MACSEC_VALIDATE_STRICT struct pcpu_secy_stats { struct macsec_dev_stats stats; struct u64_stats_sync syncp; }; /** * struct macsec_dev - private data * @secy: SecY config * @real_dev: pointer to underlying netdevice * @dev_tracker: refcount tracker for @real_dev reference * @stats: MACsec device stats * @secys: linked list of SecY's on the underlying device * @gro_cells: pointer to the Generic Receive Offload cell * @offload: status of offloading on the MACsec device * @insert_tx_tag: when offloading, device requires to insert an * additional tag */ struct macsec_dev { struct macsec_secy secy; struct net_device *real_dev; netdevice_tracker dev_tracker; struct pcpu_secy_stats __percpu *stats; struct list_head secys; struct gro_cells gro_cells; enum macsec_offload offload; bool insert_tx_tag; }; /** * struct macsec_rxh_data - rx_handler private argument * @secys: linked list of SecY's on this underlying device */ struct macsec_rxh_data { struct list_head secys; }; static struct macsec_dev *macsec_priv(const struct net_device *dev) { return (struct macsec_dev *)netdev_priv(dev); } static struct macsec_rxh_data *macsec_data_rcu(const struct net_device *dev) { return rcu_dereference_bh(dev->rx_handler_data); } static struct macsec_rxh_data *macsec_data_rtnl(const struct net_device *dev) { return rtnl_dereference(dev->rx_handler_data); } struct macsec_cb { struct aead_request *req; union { struct macsec_tx_sa *tx_sa; struct macsec_rx_sa *rx_sa; }; u8 assoc_num; bool valid; bool has_sci; }; static struct macsec_rx_sa *macsec_rxsa_get(struct macsec_rx_sa __rcu *ptr) { struct macsec_rx_sa *sa = rcu_dereference_bh(ptr); if (!sa || !sa->active) return NULL; if (!refcount_inc_not_zero(&sa->refcnt)) return NULL; return sa; } static void free_rx_sc_rcu(struct rcu_head *head) { struct macsec_rx_sc *rx_sc = container_of(head, struct macsec_rx_sc, rcu_head); free_percpu(rx_sc->stats); kfree(rx_sc); } static struct macsec_rx_sc *macsec_rxsc_get(struct macsec_rx_sc *sc) { return refcount_inc_not_zero(&sc->refcnt) ? sc : NULL; } static void macsec_rxsc_put(struct macsec_rx_sc *sc) { if (refcount_dec_and_test(&sc->refcnt)) call_rcu(&sc->rcu_head, free_rx_sc_rcu); } static void free_rxsa(struct rcu_head *head) { struct macsec_rx_sa *sa = container_of(head, struct macsec_rx_sa, rcu); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); kfree(sa); } static void macsec_rxsa_put(struct macsec_rx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) call_rcu(&sa->rcu, free_rxsa); } static struct macsec_tx_sa *macsec_txsa_get(struct macsec_tx_sa __rcu *ptr) { struct macsec_tx_sa *sa = rcu_dereference_bh(ptr); if (!sa || !sa->active) return NULL; if (!refcount_inc_not_zero(&sa->refcnt)) return NULL; return sa; } static void free_txsa(struct rcu_head *head) { struct macsec_tx_sa *sa = container_of(head, struct macsec_tx_sa, rcu); crypto_free_aead(sa->key.tfm); free_percpu(sa->stats); kfree(sa); } static void macsec_txsa_put(struct macsec_tx_sa *sa) { if (refcount_dec_and_test(&sa->refcnt)) call_rcu(&sa->rcu, free_txsa); } static struct macsec_cb *macsec_skb_cb(struct sk_buff *skb) { BUILD_BUG_ON(sizeof(struct macsec_cb) > sizeof(skb->cb)); return (struct macsec_cb *)skb->cb; } #define MACSEC_PORT_SCB (0x0000) #define MACSEC_UNDEF_SCI ((__force sci_t)0xffffffffffffffffULL) #define MACSEC_UNDEF_SSCI ((__force ssci_t)0xffffffff) #define MACSEC_GCM_AES_128_SAK_LEN 16 #define MACSEC_GCM_AES_256_SAK_LEN 32 #define DEFAULT_SAK_LEN MACSEC_GCM_AES_128_SAK_LEN #define DEFAULT_XPN false #define DEFAULT_SEND_SCI true #define DEFAULT_ENCRYPT false #define DEFAULT_ENCODING_SA 0 #define MACSEC_XPN_MAX_REPLAY_WINDOW (((1 << 30) - 1)) static sci_t make_sci(const u8 *addr, __be16 port) { sci_t sci; memcpy(&sci, addr, ETH_ALEN); memcpy(((char *)&sci) + ETH_ALEN, &port, sizeof(port)); return sci; } static sci_t macsec_frame_sci(struct macsec_eth_header *hdr, bool sci_present) { sci_t sci; if (sci_present) memcpy(&sci, hdr->secure_channel_id, sizeof(hdr->secure_channel_id)); else sci = make_sci(hdr->eth.h_source, MACSEC_PORT_ES); return sci; } static unsigned int macsec_sectag_len(bool sci_present) { return MACSEC_TAG_LEN + (sci_present ? MACSEC_SCI_LEN : 0); } static unsigned int macsec_hdr_len(bool sci_present) { return macsec_sectag_len(sci_present) + ETH_HLEN; } static unsigned int macsec_extra_len(bool sci_present) { return macsec_sectag_len(sci_present) + sizeof(__be16); } /* Fill SecTAG according to IEEE 802.1AE-2006 10.5.3 */ static void macsec_fill_sectag(struct macsec_eth_header *h, const struct macsec_secy *secy, u32 pn, bool sci_present) { const struct macsec_tx_sc *tx_sc = &secy->tx_sc; memset(&h->tci_an, 0, macsec_sectag_len(sci_present)); h->eth.h_proto = htons(ETH_P_MACSEC); if (sci_present) { h->tci_an |= MACSEC_TCI_SC; memcpy(&h->secure_channel_id, &secy->sci, sizeof(h->secure_channel_id)); } else { if (tx_sc->end_station) h->tci_an |= MACSEC_TCI_ES; if (tx_sc->scb) h->tci_an |= MACSEC_TCI_SCB; } h->packet_number = htonl(pn); /* with GCM, C/E clear for !encrypt, both set for encrypt */ if (tx_sc->encrypt) h->tci_an |= MACSEC_TCI_CONFID; else if (secy->icv_len != MACSEC_DEFAULT_ICV_LEN) h->tci_an |= MACSEC_TCI_C; h->tci_an |= tx_sc->encoding_sa; } static void macsec_set_shortlen(struct macsec_eth_header *h, size_t data_len) { if (data_len < MIN_NON_SHORT_LEN) h->short_length = data_len; } /* Checks if a MACsec interface is being offloaded to an hardware engine */ static bool macsec_is_offloaded(struct macsec_dev *macsec) { if (macsec->offload == MACSEC_OFFLOAD_MAC || macsec->offload == MACSEC_OFFLOAD_PHY) return true; return false; } /* Checks if underlying layers implement MACsec offloading functions. */ static bool macsec_check_offload(enum macsec_offload offload, struct macsec_dev *macsec) { if (!macsec || !macsec->real_dev) return false; if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev && macsec->real_dev->phydev->macsec_ops; else if (offload == MACSEC_OFFLOAD_MAC) return macsec->real_dev->features & NETIF_F_HW_MACSEC && macsec->real_dev->macsec_ops; return false; } static const struct macsec_ops *__macsec_get_ops(enum macsec_offload offload, struct macsec_dev *macsec, struct macsec_context *ctx) { if (ctx) { memset(ctx, 0, sizeof(*ctx)); ctx->offload = offload; if (offload == MACSEC_OFFLOAD_PHY) ctx->phydev = macsec->real_dev->phydev; else if (offload == MACSEC_OFFLOAD_MAC) ctx->netdev = macsec->real_dev; } if (offload == MACSEC_OFFLOAD_PHY) return macsec->real_dev->phydev->macsec_ops; else return macsec->real_dev->macsec_ops; } /* Returns a pointer to the MACsec ops struct if any and updates the MACsec * context device reference if provided. */ static const struct macsec_ops *macsec_get_ops(struct macsec_dev *macsec, struct macsec_context *ctx) { if (!macsec_check_offload(macsec->offload, macsec)) return NULL; return __macsec_get_ops(macsec->offload, macsec, ctx); } /* validate MACsec packet according to IEEE 802.1AE-2018 9.12 */ static bool macsec_validate_skb(struct sk_buff *skb, u16 icv_len, bool xpn) { struct macsec_eth_header *h = (struct macsec_eth_header *)skb->data; int len = skb->len - 2 * ETH_ALEN; int extra_len = macsec_extra_len(!!(h->tci_an & MACSEC_TCI_SC)) + icv_len; /* a) It comprises at least 17 octets */ if (skb->len <= 16) return false; /* b) MACsec EtherType: already checked */ /* c) V bit is clear */ if (h->tci_an & MACSEC_TCI_VERSION) return false; /* d) ES or SCB => !SC */ if ((h->tci_an & MACSEC_TCI_ES || h->tci_an & MACSEC_TCI_SCB) && (h->tci_an & MACSEC_TCI_SC)) return false; /* e) Bits 7 and 8 of octet 4 of the SecTAG are clear */ if (h->unused) return false; /* rx.pn != 0 if not XPN (figure 10-5 with 802.11AEbw-2013 amendment) */ if (!h->packet_number && !xpn) return false; /* length check, f) g) h) i) */ if (h->short_length) return len == extra_len + h->short_length; return len >= extra_len + MIN_NON_SHORT_LEN; } #define MACSEC_NEEDED_HEADROOM (macsec_extra_len(true)) #define MACSEC_NEEDED_TAILROOM MACSEC_STD_ICV_LEN static void macsec_fill_iv_xpn(unsigned char *iv, ssci_t ssci, u64 pn, salt_t salt) { struct gcm_iv_xpn *gcm_iv = (struct gcm_iv_xpn *)iv; gcm_iv->ssci = ssci ^ salt.ssci; gcm_iv->pn = cpu_to_be64(pn) ^ salt.pn; } static void macsec_fill_iv(unsigned char *iv, sci_t sci, u32 pn) { struct gcm_iv *gcm_iv = (struct gcm_iv *)iv; gcm_iv->sci = sci; gcm_iv->pn = htonl(pn); } static struct macsec_eth_header *macsec_ethhdr(struct sk_buff *skb) { return (struct macsec_eth_header *)skb_mac_header(skb); } static void __macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) { pr_debug("PN wrapped, transitioning to !oper\n"); tx_sa->active = false; if (secy->protect_frames) secy->operational = false; } void macsec_pn_wrapped(struct macsec_secy *secy, struct macsec_tx_sa *tx_sa) { spin_lock_bh(&tx_sa->lock); __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); } EXPORT_SYMBOL_GPL(macsec_pn_wrapped); static pn_t tx_sa_update_pn(struct macsec_tx_sa *tx_sa, struct macsec_secy *secy) { pn_t pn; spin_lock_bh(&tx_sa->lock); pn = tx_sa->next_pn_halves; if (secy->xpn) tx_sa->next_pn++; else tx_sa->next_pn_halves.lower++; if (tx_sa->next_pn == 0) __macsec_pn_wrapped(secy, tx_sa); spin_unlock_bh(&tx_sa->lock); return pn; } static void macsec_encrypt_finish(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = netdev_priv(dev); skb->dev = macsec->real_dev; skb_reset_mac_header(skb); skb->protocol = eth_hdr(skb)->h_proto; } static unsigned int macsec_msdu_len(struct sk_buff *skb) { struct macsec_dev *macsec = macsec_priv(skb->dev); struct macsec_secy *secy = &macsec->secy; bool sci_present = macsec_skb_cb(skb)->has_sci; return skb->len - macsec_hdr_len(sci_present) - secy->icv_len; } static void macsec_count_tx(struct sk_buff *skb, struct macsec_tx_sc *tx_sc, struct macsec_tx_sa *tx_sa) { unsigned int msdu_len = macsec_msdu_len(skb); struct pcpu_tx_sc_stats *txsc_stats = this_cpu_ptr(tx_sc->stats); u64_stats_update_begin(&txsc_stats->syncp); if (tx_sc->encrypt) { txsc_stats->stats.OutOctetsEncrypted += msdu_len; txsc_stats->stats.OutPktsEncrypted++; this_cpu_inc(tx_sa->stats->OutPktsEncrypted); } else { txsc_stats->stats.OutOctetsProtected += msdu_len; txsc_stats->stats.OutPktsProtected++; this_cpu_inc(tx_sa->stats->OutPktsProtected); } u64_stats_update_end(&txsc_stats->syncp); } static void count_tx(struct net_device *dev, int ret, int len) { if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) dev_sw_netstats_tx_add(dev, 1, len); } static void macsec_encrypt_done(void *data, int err) { struct sk_buff *skb = data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_tx_sa *sa = macsec_skb_cb(skb)->tx_sa; int len, ret; aead_request_free(macsec_skb_cb(skb)->req); rcu_read_lock_bh(); macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); /* packet is encrypted/protected so tx_bytes must be calculated */ len = macsec_msdu_len(skb) + 2 * ETH_ALEN; macsec_encrypt_finish(skb, dev); ret = dev_queue_xmit(skb); count_tx(dev, ret, len); rcu_read_unlock_bh(); macsec_txsa_put(sa); dev_put(dev); } static struct aead_request *macsec_alloc_req(struct crypto_aead *tfm, unsigned char **iv, struct scatterlist **sg, int num_frags) { size_t size, iv_offset, sg_offset; struct aead_request *req; void *tmp; size = sizeof(struct aead_request) + crypto_aead_reqsize(tfm); iv_offset = size; size += GCM_AES_IV_LEN; size = ALIGN(size, __alignof__(struct scatterlist)); sg_offset = size; size += sizeof(struct scatterlist) * num_frags; tmp = kmalloc(size, GFP_ATOMIC); if (!tmp) return NULL; *iv = (unsigned char *)(tmp + iv_offset); *sg = (struct scatterlist *)(tmp + sg_offset); req = tmp; aead_request_set_tfm(req, tfm); return req; } static struct sk_buff *macsec_encrypt(struct sk_buff *skb, struct net_device *dev) { int ret; struct scatterlist *sg; struct sk_buff *trailer; unsigned char *iv; struct ethhdr *eth; struct macsec_eth_header *hh; size_t unprotected_len; struct aead_request *req; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; struct macsec_dev *macsec = macsec_priv(dev); bool sci_present; pn_t pn; secy = &macsec->secy; tx_sc = &secy->tx_sc; /* 10.5.1 TX SA assignment */ tx_sa = macsec_txsa_get(tx_sc->sa[tx_sc->encoding_sa]); if (!tx_sa) { secy->operational = false; kfree_skb(skb); return ERR_PTR(-EINVAL); } if (unlikely(skb_headroom(skb) < MACSEC_NEEDED_HEADROOM || skb_tailroom(skb) < MACSEC_NEEDED_TAILROOM)) { struct sk_buff *nskb = skb_copy_expand(skb, MACSEC_NEEDED_HEADROOM, MACSEC_NEEDED_TAILROOM, GFP_ATOMIC); if (likely(nskb)) { consume_skb(skb); skb = nskb; } else { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOMEM); } } else { skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { macsec_txsa_put(tx_sa); return ERR_PTR(-ENOMEM); } } unprotected_len = skb->len; eth = eth_hdr(skb); sci_present = macsec_send_sci(secy); hh = skb_push(skb, macsec_extra_len(sci_present)); memmove(hh, eth, 2 * ETH_ALEN); pn = tx_sa_update_pn(tx_sa, secy); if (pn.full64 == 0) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOLINK); } macsec_fill_sectag(hh, secy, pn.lower, sci_present); macsec_set_shortlen(hh, unprotected_len - 2 * ETH_ALEN); skb_put(skb, secy->icv_len); if (skb->len - ETH_HLEN > macsec_priv(dev)->real_dev->mtu) { struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.OutPktsTooLong++; u64_stats_update_end(&secy_stats->syncp); macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-EINVAL); } ret = skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(ret); } req = macsec_alloc_req(tx_sa->key.tfm, &iv, &sg, ret); if (!req) { macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(-ENOMEM); } if (secy->xpn) macsec_fill_iv_xpn(iv, tx_sa->ssci, pn.full64, tx_sa->key.salt); else macsec_fill_iv(iv, secy->sci, pn.lower); sg_init_table(sg, ret); ret = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(ret < 0)) { aead_request_free(req); macsec_txsa_put(tx_sa); kfree_skb(skb); return ERR_PTR(ret); } if (tx_sc->encrypt) { int len = skb->len - macsec_hdr_len(sci_present) - secy->icv_len; aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(sci_present)); } else { aead_request_set_crypt(req, sg, sg, 0, iv); aead_request_set_ad(req, skb->len - secy->icv_len); } macsec_skb_cb(skb)->req = req; macsec_skb_cb(skb)->tx_sa = tx_sa; macsec_skb_cb(skb)->has_sci = sci_present; aead_request_set_callback(req, 0, macsec_encrypt_done, skb); dev_hold(skb->dev); ret = crypto_aead_encrypt(req); if (ret == -EINPROGRESS) { return ERR_PTR(ret); } else if (ret != 0) { dev_put(skb->dev); kfree_skb(skb); aead_request_free(req); macsec_txsa_put(tx_sa); return ERR_PTR(-EINVAL); } dev_put(skb->dev); aead_request_free(req); macsec_txsa_put(tx_sa); return skb; } static bool macsec_post_decrypt(struct sk_buff *skb, struct macsec_secy *secy, u32 pn) { struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct pcpu_rx_sc_stats *rxsc_stats = this_cpu_ptr(rx_sa->sc->stats); struct macsec_eth_header *hdr = macsec_ethhdr(skb); u32 lowest_pn = 0; spin_lock(&rx_sa->lock); if (rx_sa->next_pn_halves.lower >= secy->replay_window) lowest_pn = rx_sa->next_pn_halves.lower - secy->replay_window; /* Now perform replay protection check again * (see IEEE 802.1AE-2006 figure 10-5) */ if (secy->replay_protect && pn < lowest_pn && (!secy->xpn || pn_same_half(pn, lowest_pn))) { spin_unlock(&rx_sa->lock); u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_dropped); return false; } if (secy->validate_frames != MACSEC_VALIDATE_DISABLED) { unsigned int msdu_len = macsec_msdu_len(skb); u64_stats_update_begin(&rxsc_stats->syncp); if (hdr->tci_an & MACSEC_TCI_E) rxsc_stats->stats.InOctetsDecrypted += msdu_len; else rxsc_stats->stats.InOctetsValidated += msdu_len; u64_stats_update_end(&rxsc_stats->syncp); } if (!macsec_skb_cb(skb)->valid) { spin_unlock(&rx_sa->lock); /* 10.6.5 */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotValid++; u64_stats_update_end(&rxsc_stats->syncp); this_cpu_inc(rx_sa->stats->InPktsNotValid); DEV_STATS_INC(secy->netdev, rx_errors); return false; } u64_stats_update_begin(&rxsc_stats->syncp); if (secy->validate_frames == MACSEC_VALIDATE_CHECK) { rxsc_stats->stats.InPktsInvalid++; this_cpu_inc(rx_sa->stats->InPktsInvalid); } else if (pn < lowest_pn) { rxsc_stats->stats.InPktsDelayed++; } else { rxsc_stats->stats.InPktsUnchecked++; } u64_stats_update_end(&rxsc_stats->syncp); } else { u64_stats_update_begin(&rxsc_stats->syncp); if (pn < lowest_pn) { rxsc_stats->stats.InPktsDelayed++; } else { rxsc_stats->stats.InPktsOK++; this_cpu_inc(rx_sa->stats->InPktsOK); } u64_stats_update_end(&rxsc_stats->syncp); // Instead of "pn >=" - to support pn overflow in xpn if (pn + 1 > rx_sa->next_pn_halves.lower) { rx_sa->next_pn_halves.lower = pn + 1; } else if (secy->xpn && !pn_same_half(pn, rx_sa->next_pn_halves.lower)) { rx_sa->next_pn_halves.upper++; rx_sa->next_pn_halves.lower = pn + 1; } spin_unlock(&rx_sa->lock); } return true; } static void macsec_reset_skb(struct sk_buff *skb, struct net_device *dev) { skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, dev); skb_reset_network_header(skb); if (!skb_transport_header_was_set(skb)) skb_reset_transport_header(skb); skb_reset_mac_len(skb); } static void macsec_finalize_skb(struct sk_buff *skb, u8 icv_len, u8 hdr_len) { skb->ip_summed = CHECKSUM_NONE; memmove(skb->data + hdr_len, skb->data, 2 * ETH_ALEN); skb_pull(skb, hdr_len); pskb_trim_unique(skb, skb->len - icv_len); } static void count_rx(struct net_device *dev, int len) { dev_sw_netstats_rx_add(dev, len); } static void macsec_decrypt_done(void *data, int err) { struct sk_buff *skb = data; struct net_device *dev = skb->dev; struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rx_sa *rx_sa = macsec_skb_cb(skb)->rx_sa; struct macsec_rx_sc *rx_sc = rx_sa->sc; int len; u32 pn; aead_request_free(macsec_skb_cb(skb)->req); if (!err) macsec_skb_cb(skb)->valid = true; rcu_read_lock_bh(); pn = ntohl(macsec_ethhdr(skb)->packet_number); if (!macsec_post_decrypt(skb, &macsec->secy, pn)) { rcu_read_unlock_bh(); kfree_skb(skb); goto out; } macsec_finalize_skb(skb, macsec->secy.icv_len, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); len = skb->len; macsec_reset_skb(skb, macsec->secy.netdev); if (gro_cells_receive(&macsec->gro_cells, skb) == NET_RX_SUCCESS) count_rx(dev, len); rcu_read_unlock_bh(); out: macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); dev_put(dev); } static struct sk_buff *macsec_decrypt(struct sk_buff *skb, struct net_device *dev, struct macsec_rx_sa *rx_sa, sci_t sci, struct macsec_secy *secy) { int ret; struct scatterlist *sg; struct sk_buff *trailer; unsigned char *iv; struct aead_request *req; struct macsec_eth_header *hdr; u32 hdr_pn; u16 icv_len = secy->icv_len; macsec_skb_cb(skb)->valid = false; skb = skb_share_check(skb, GFP_ATOMIC); if (!skb) return ERR_PTR(-ENOMEM); ret = skb_cow_data(skb, 0, &trailer); if (unlikely(ret < 0)) { kfree_skb(skb); return ERR_PTR(ret); } req = macsec_alloc_req(rx_sa->key.tfm, &iv, &sg, ret); if (!req) { kfree_skb(skb); return ERR_PTR(-ENOMEM); } hdr = (struct macsec_eth_header *)skb->data; hdr_pn = ntohl(hdr->packet_number); if (secy->xpn) { pn_t recovered_pn = rx_sa->next_pn_halves; recovered_pn.lower = hdr_pn; if (hdr_pn < rx_sa->next_pn_halves.lower && !pn_same_half(hdr_pn, rx_sa->next_pn_halves.lower)) recovered_pn.upper++; macsec_fill_iv_xpn(iv, rx_sa->ssci, recovered_pn.full64, rx_sa->key.salt); } else { macsec_fill_iv(iv, sci, hdr_pn); } sg_init_table(sg, ret); ret = skb_to_sgvec(skb, sg, 0, skb->len); if (unlikely(ret < 0)) { aead_request_free(req); kfree_skb(skb); return ERR_PTR(ret); } if (hdr->tci_an & MACSEC_TCI_E) { /* confidentiality: ethernet + macsec header * authenticated, encrypted payload */ int len = skb->len - macsec_hdr_len(macsec_skb_cb(skb)->has_sci); aead_request_set_crypt(req, sg, sg, len, iv); aead_request_set_ad(req, macsec_hdr_len(macsec_skb_cb(skb)->has_sci)); skb = skb_unshare(skb, GFP_ATOMIC); if (!skb) { aead_request_free(req); return ERR_PTR(-ENOMEM); } } else { /* integrity only: all headers + data authenticated */ aead_request_set_crypt(req, sg, sg, icv_len, iv); aead_request_set_ad(req, skb->len - icv_len); } macsec_skb_cb(skb)->req = req; skb->dev = dev; aead_request_set_callback(req, 0, macsec_decrypt_done, skb); dev_hold(dev); ret = crypto_aead_decrypt(req); if (ret == -EINPROGRESS) { return ERR_PTR(ret); } else if (ret != 0) { /* decryption/authentication failed * 10.6 if validateFrames is disabled, deliver anyway */ if (ret != -EBADMSG) { kfree_skb(skb); skb = ERR_PTR(ret); } } else { macsec_skb_cb(skb)->valid = true; } dev_put(dev); aead_request_free(req); return skb; } static struct macsec_rx_sc *find_rx_sc(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc; for_each_rxsc(secy, rx_sc) { if (rx_sc->sci == sci) return rx_sc; } return NULL; } static struct macsec_rx_sc *find_rx_sc_rtnl(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc; for_each_rxsc_rtnl(secy, rx_sc) { if (rx_sc->sci == sci) return rx_sc; } return NULL; } static enum rx_handler_result handle_not_macsec(struct sk_buff *skb) { /* Deliver to the uncontrolled port by default */ enum rx_handler_result ret = RX_HANDLER_PASS; struct ethhdr *hdr = eth_hdr(skb); struct metadata_dst *md_dst; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; bool is_macsec_md_dst; rcu_read_lock(); rxd = macsec_data_rcu(skb->dev); md_dst = skb_metadata_dst(skb); is_macsec_md_dst = md_dst && md_dst->type == METADATA_MACSEC; list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct sk_buff *nskb; struct pcpu_secy_stats *secy_stats = this_cpu_ptr(macsec->stats); struct net_device *ndev = macsec->secy.netdev; /* If h/w offloading is enabled, HW decodes frames and strips * the SecTAG, so we have to deduce which port to deliver to. */ if (macsec_is_offloaded(macsec) && netif_running(ndev)) { const struct macsec_ops *ops; ops = macsec_get_ops(macsec, NULL); if (ops->rx_uses_md_dst && !is_macsec_md_dst) continue; if (is_macsec_md_dst) { struct macsec_rx_sc *rx_sc; /* All drivers that implement MACsec offload * support using skb metadata destinations must * indicate that they do so. */ DEBUG_NET_WARN_ON_ONCE(!ops->rx_uses_md_dst); rx_sc = find_rx_sc(&macsec->secy, md_dst->u.macsec_info.sci); if (!rx_sc) continue; /* device indicated macsec offload occurred */ skb->dev = ndev; skb->pkt_type = PACKET_HOST; eth_skb_pkt_type(skb, ndev); ret = RX_HANDLER_ANOTHER; goto out; } /* This datapath is insecure because it is unable to * enforce isolation of broadcast/multicast traffic and * unicast traffic with promiscuous mode on the macsec * netdev. Since the core stack has no mechanism to * check that the hardware did indeed receive MACsec * traffic, it is possible that the response handling * done by the MACsec port was to a plaintext packet. * This violates the MACsec protocol standard. */ if (ether_addr_equal_64bits(hdr->h_dest, ndev->dev_addr)) { /* exact match, divert skb to this port */ skb->dev = ndev; skb->pkt_type = PACKET_HOST; ret = RX_HANDLER_ANOTHER; goto out; } else if (is_multicast_ether_addr_64bits( hdr->h_dest)) { /* multicast frame, deliver on this port too */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; nskb->dev = ndev; eth_skb_pkt_type(nskb, ndev); __netif_rx(nskb); } else if (ndev->flags & IFF_PROMISC) { skb->dev = ndev; skb->pkt_type = PACKET_HOST; ret = RX_HANDLER_ANOTHER; goto out; } continue; } /* 10.6 If the management control validateFrames is not * Strict, frames without a SecTAG are received, counted, and * delivered to the Controlled Port */ if (macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoTag++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_dropped); continue; } /* deliver on this port */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; nskb->dev = ndev; if (__netif_rx(nskb) == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); } } out: rcu_read_unlock(); return ret; } static rx_handler_result_t macsec_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; struct net_device *dev = skb->dev; struct macsec_eth_header *hdr; struct macsec_secy *secy = NULL; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; struct macsec_rxh_data *rxd; struct macsec_dev *macsec; unsigned int len; sci_t sci; u32 hdr_pn; bool cbit; struct pcpu_rx_sc_stats *rxsc_stats; struct pcpu_secy_stats *secy_stats; bool pulled_sci; int ret; if (skb_headroom(skb) < ETH_HLEN) goto drop_direct; hdr = macsec_ethhdr(skb); if (hdr->eth.h_proto != htons(ETH_P_MACSEC)) return handle_not_macsec(skb); skb = skb_unshare(skb, GFP_ATOMIC); *pskb = skb; if (!skb) return RX_HANDLER_CONSUMED; pulled_sci = pskb_may_pull(skb, macsec_extra_len(true)); if (!pulled_sci) { if (!pskb_may_pull(skb, macsec_extra_len(false))) goto drop_direct; } hdr = macsec_ethhdr(skb); /* Frames with a SecTAG that has the TCI E bit set but the C * bit clear are discarded, as this reserved encoding is used * to identify frames with a SecTAG that are not to be * delivered to the Controlled Port. */ if ((hdr->tci_an & (MACSEC_TCI_C | MACSEC_TCI_E)) == MACSEC_TCI_E) return RX_HANDLER_PASS; /* now, pull the extra length */ if (hdr->tci_an & MACSEC_TCI_SC) { if (!pulled_sci) goto drop_direct; } /* ethernet header is part of crypto processing */ skb_push(skb, ETH_HLEN); macsec_skb_cb(skb)->has_sci = !!(hdr->tci_an & MACSEC_TCI_SC); macsec_skb_cb(skb)->assoc_num = hdr->tci_an & MACSEC_AN_MASK; sci = macsec_frame_sci(hdr, macsec_skb_cb(skb)->has_sci); rcu_read_lock(); rxd = macsec_data_rcu(skb->dev); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct macsec_rx_sc *sc = find_rx_sc(&macsec->secy, sci); sc = sc ? macsec_rxsc_get(sc) : NULL; if (sc) { secy = &macsec->secy; rx_sc = sc; break; } } if (!secy) goto nosci; dev = secy->netdev; macsec = macsec_priv(dev); secy_stats = this_cpu_ptr(macsec->stats); rxsc_stats = this_cpu_ptr(rx_sc->stats); if (!macsec_validate_skb(skb, secy->icv_len, secy->xpn)) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsBadTag++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } rx_sa = macsec_rxsa_get(rx_sc->sa[macsec_skb_cb(skb)->assoc_num]); if (!rx_sa) { /* 10.6.1 if the SA is not in use */ /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsNotUsingSA++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(secy->netdev, rx_errors); goto drop_nosa; } /* not Strict, the frame (with the SecTAG and ICV * removed) is delivered to the Controlled Port. */ u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsUnusedSA++; u64_stats_update_end(&rxsc_stats->syncp); goto deliver; } /* First, PN check to avoid decrypting obviously wrong packets */ hdr_pn = ntohl(hdr->packet_number); if (secy->replay_protect) { bool late; spin_lock(&rx_sa->lock); late = rx_sa->next_pn_halves.lower >= secy->replay_window && hdr_pn < (rx_sa->next_pn_halves.lower - secy->replay_window); if (secy->xpn) late = late && pn_same_half(rx_sa->next_pn_halves.lower, hdr_pn); spin_unlock(&rx_sa->lock); if (late) { u64_stats_update_begin(&rxsc_stats->syncp); rxsc_stats->stats.InPktsLate++; u64_stats_update_end(&rxsc_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_dropped); goto drop; } } macsec_skb_cb(skb)->rx_sa = rx_sa; /* Disabled && !changed text => skip validation */ if (hdr->tci_an & MACSEC_TCI_C || secy->validate_frames != MACSEC_VALIDATE_DISABLED) skb = macsec_decrypt(skb, dev, rx_sa, sci, secy); if (IS_ERR(skb)) { /* the decrypt callback needs the reference */ if (PTR_ERR(skb) != -EINPROGRESS) { macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); } rcu_read_unlock(); *pskb = NULL; return RX_HANDLER_CONSUMED; } if (!macsec_post_decrypt(skb, secy, hdr_pn)) goto drop; deliver: macsec_finalize_skb(skb, secy->icv_len, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); len = skb->len; macsec_reset_skb(skb, secy->netdev); if (rx_sa) macsec_rxsa_put(rx_sa); macsec_rxsc_put(rx_sc); skb_orphan(skb); ret = gro_cells_receive(&macsec->gro_cells, skb); if (ret == NET_RX_SUCCESS) count_rx(dev, len); else DEV_STATS_INC(macsec->secy.netdev, rx_dropped); rcu_read_unlock(); *pskb = NULL; return RX_HANDLER_CONSUMED; drop: macsec_rxsa_put(rx_sa); drop_nosa: macsec_rxsc_put(rx_sc); rcu_read_unlock(); drop_direct: kfree_skb(skb); *pskb = NULL; return RX_HANDLER_CONSUMED; nosci: /* 10.6.1 if the SC is not found */ cbit = !!(hdr->tci_an & MACSEC_TCI_C); if (!cbit) macsec_finalize_skb(skb, MACSEC_DEFAULT_ICV_LEN, macsec_extra_len(macsec_skb_cb(skb)->has_sci)); list_for_each_entry_rcu(macsec, &rxd->secys, secys) { struct sk_buff *nskb; secy_stats = this_cpu_ptr(macsec->stats); /* If validateFrames is Strict or the C bit in the * SecTAG is set, discard */ if (cbit || macsec->secy.validate_frames == MACSEC_VALIDATE_STRICT) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsNoSCI++; u64_stats_update_end(&secy_stats->syncp); DEV_STATS_INC(macsec->secy.netdev, rx_errors); continue; } /* not strict, the frame (with the SecTAG and ICV * removed) is delivered to the Controlled Port. */ nskb = skb_clone(skb, GFP_ATOMIC); if (!nskb) break; macsec_reset_skb(nskb, macsec->secy.netdev); ret = __netif_rx(nskb); if (ret == NET_RX_SUCCESS) { u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.InPktsUnknownSCI++; u64_stats_update_end(&secy_stats->syncp); } else { DEV_STATS_INC(macsec->secy.netdev, rx_dropped); } } rcu_read_unlock(); *pskb = skb; return RX_HANDLER_PASS; } static struct crypto_aead *macsec_alloc_tfm(char *key, int key_len, int icv_len) { struct crypto_aead *tfm; int ret; tfm = crypto_alloc_aead("gcm(aes)", 0, 0); if (IS_ERR(tfm)) return tfm; ret = crypto_aead_setkey(tfm, key, key_len); if (ret < 0) goto fail; ret = crypto_aead_setauthsize(tfm, icv_len); if (ret < 0) goto fail; return tfm; fail: crypto_free_aead(tfm); return ERR_PTR(ret); } static int init_rx_sa(struct macsec_rx_sa *rx_sa, char *sak, int key_len, int icv_len) { rx_sa->stats = alloc_percpu(struct macsec_rx_sa_stats); if (!rx_sa->stats) return -ENOMEM; rx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); if (IS_ERR(rx_sa->key.tfm)) { free_percpu(rx_sa->stats); return PTR_ERR(rx_sa->key.tfm); } rx_sa->ssci = MACSEC_UNDEF_SSCI; rx_sa->active = false; rx_sa->next_pn = 1; refcount_set(&rx_sa->refcnt, 1); spin_lock_init(&rx_sa->lock); return 0; } static void clear_rx_sa(struct macsec_rx_sa *rx_sa) { rx_sa->active = false; macsec_rxsa_put(rx_sa); } static void free_rx_sc(struct macsec_rx_sc *rx_sc) { int i; for (i = 0; i < MACSEC_NUM_AN; i++) { struct macsec_rx_sa *sa = rtnl_dereference(rx_sc->sa[i]); RCU_INIT_POINTER(rx_sc->sa[i], NULL); if (sa) clear_rx_sa(sa); } macsec_rxsc_put(rx_sc); } static struct macsec_rx_sc *del_rx_sc(struct macsec_secy *secy, sci_t sci) { struct macsec_rx_sc *rx_sc, __rcu **rx_scp; for (rx_scp = &secy->rx_sc, rx_sc = rtnl_dereference(*rx_scp); rx_sc; rx_scp = &rx_sc->next, rx_sc = rtnl_dereference(*rx_scp)) { if (rx_sc->sci == sci) { if (rx_sc->active) secy->n_rx_sc--; rcu_assign_pointer(*rx_scp, rx_sc->next); return rx_sc; } } return NULL; } static struct macsec_rx_sc *create_rx_sc(struct net_device *dev, sci_t sci, bool active) { struct macsec_rx_sc *rx_sc; struct macsec_dev *macsec; struct net_device *real_dev = macsec_priv(dev)->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); struct macsec_secy *secy; list_for_each_entry(macsec, &rxd->secys, secys) { if (find_rx_sc_rtnl(&macsec->secy, sci)) return ERR_PTR(-EEXIST); } rx_sc = kzalloc(sizeof(*rx_sc), GFP_KERNEL); if (!rx_sc) return ERR_PTR(-ENOMEM); rx_sc->stats = netdev_alloc_pcpu_stats(struct pcpu_rx_sc_stats); if (!rx_sc->stats) { kfree(rx_sc); return ERR_PTR(-ENOMEM); } rx_sc->sci = sci; rx_sc->active = active; refcount_set(&rx_sc->refcnt, 1); secy = &macsec_priv(dev)->secy; rcu_assign_pointer(rx_sc->next, secy->rx_sc); rcu_assign_pointer(secy->rx_sc, rx_sc); if (rx_sc->active) secy->n_rx_sc++; return rx_sc; } static int init_tx_sa(struct macsec_tx_sa *tx_sa, char *sak, int key_len, int icv_len) { tx_sa->stats = alloc_percpu(struct macsec_tx_sa_stats); if (!tx_sa->stats) return -ENOMEM; tx_sa->key.tfm = macsec_alloc_tfm(sak, key_len, icv_len); if (IS_ERR(tx_sa->key.tfm)) { free_percpu(tx_sa->stats); return PTR_ERR(tx_sa->key.tfm); } tx_sa->ssci = MACSEC_UNDEF_SSCI; tx_sa->active = false; refcount_set(&tx_sa->refcnt, 1); spin_lock_init(&tx_sa->lock); return 0; } static void clear_tx_sa(struct macsec_tx_sa *tx_sa) { tx_sa->active = false; macsec_txsa_put(tx_sa); } static struct genl_family macsec_fam; static struct net_device *get_dev_from_nl(struct net *net, struct nlattr **attrs) { int ifindex = nla_get_u32(attrs[MACSEC_ATTR_IFINDEX]); struct net_device *dev; dev = __dev_get_by_index(net, ifindex); if (!dev) return ERR_PTR(-ENODEV); if (!netif_is_macsec(dev)) return ERR_PTR(-ENODEV); return dev; } static enum macsec_offload nla_get_offload(const struct nlattr *nla) { return (__force enum macsec_offload)nla_get_u8(nla); } static sci_t nla_get_sci(const struct nlattr *nla) { return (__force sci_t)nla_get_u64(nla); } static int nla_put_sci(struct sk_buff *skb, int attrtype, sci_t value, int padattr) { return nla_put_u64_64bit(skb, attrtype, (__force u64)value, padattr); } static ssci_t nla_get_ssci(const struct nlattr *nla) { return (__force ssci_t)nla_get_u32(nla); } static int nla_put_ssci(struct sk_buff *skb, int attrtype, ssci_t value) { return nla_put_u32(skb, attrtype, (__force u64)value); } static struct macsec_tx_sa *get_txsa_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_sa, struct net_device **devp, struct macsec_secy **secyp, struct macsec_tx_sc **scp, u8 *assoc_num) { struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; if (!tb_sa[MACSEC_SA_ATTR_AN]) return ERR_PTR(-EINVAL); *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); dev = get_dev_from_nl(net, attrs); if (IS_ERR(dev)) return ERR_CAST(dev); if (*assoc_num >= MACSEC_NUM_AN) return ERR_PTR(-EINVAL); secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; tx_sa = rtnl_dereference(tx_sc->sa[*assoc_num]); if (!tx_sa) return ERR_PTR(-ENODEV); *devp = dev; *scp = tx_sc; *secyp = secy; return tx_sa; } static struct macsec_rx_sc *get_rxsc_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_rxsc, struct net_device **devp, struct macsec_secy **secyp) { struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; sci_t sci; dev = get_dev_from_nl(net, attrs); if (IS_ERR(dev)) return ERR_CAST(dev); secy = &macsec_priv(dev)->secy; if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) return ERR_PTR(-EINVAL); sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); rx_sc = find_rx_sc_rtnl(secy, sci); if (!rx_sc) return ERR_PTR(-ENODEV); *secyp = secy; *devp = dev; return rx_sc; } static struct macsec_rx_sa *get_rxsa_from_nl(struct net *net, struct nlattr **attrs, struct nlattr **tb_rxsc, struct nlattr **tb_sa, struct net_device **devp, struct macsec_secy **secyp, struct macsec_rx_sc **scp, u8 *assoc_num) { struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; if (!tb_sa[MACSEC_SA_ATTR_AN]) return ERR_PTR(-EINVAL); *assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (*assoc_num >= MACSEC_NUM_AN) return ERR_PTR(-EINVAL); rx_sc = get_rxsc_from_nl(net, attrs, tb_rxsc, devp, secyp); if (IS_ERR(rx_sc)) return ERR_CAST(rx_sc); rx_sa = rtnl_dereference(rx_sc->sa[*assoc_num]); if (!rx_sa) return ERR_PTR(-ENODEV); *scp = rx_sc; return rx_sa; } static const struct nla_policy macsec_genl_policy[NUM_MACSEC_ATTR] = { [MACSEC_ATTR_IFINDEX] = { .type = NLA_U32 }, [MACSEC_ATTR_RXSC_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_SA_CONFIG] = { .type = NLA_NESTED }, [MACSEC_ATTR_OFFLOAD] = { .type = NLA_NESTED }, }; static const struct nla_policy macsec_genl_rxsc_policy[NUM_MACSEC_RXSC_ATTR] = { [MACSEC_RXSC_ATTR_SCI] = { .type = NLA_U64 }, [MACSEC_RXSC_ATTR_ACTIVE] = { .type = NLA_U8 }, }; static const struct nla_policy macsec_genl_sa_policy[NUM_MACSEC_SA_ATTR] = { [MACSEC_SA_ATTR_AN] = { .type = NLA_U8 }, [MACSEC_SA_ATTR_ACTIVE] = { .type = NLA_U8 }, [MACSEC_SA_ATTR_PN] = NLA_POLICY_MIN_LEN(4), [MACSEC_SA_ATTR_KEYID] = { .type = NLA_BINARY, .len = MACSEC_KEYID_LEN, }, [MACSEC_SA_ATTR_KEY] = { .type = NLA_BINARY, .len = MACSEC_MAX_KEY_LEN, }, [MACSEC_SA_ATTR_SSCI] = { .type = NLA_U32 }, [MACSEC_SA_ATTR_SALT] = { .type = NLA_BINARY, .len = MACSEC_SALT_LEN, }, }; static const struct nla_policy macsec_genl_offload_policy[NUM_MACSEC_OFFLOAD_ATTR] = { [MACSEC_OFFLOAD_ATTR_TYPE] = { .type = NLA_U8 }, }; /* Offloads an operation to a device driver */ static int macsec_offload(int (* const func)(struct macsec_context *), struct macsec_context *ctx) { int ret; if (unlikely(!func)) return 0; if (ctx->offload == MACSEC_OFFLOAD_PHY) mutex_lock(&ctx->phydev->lock); ret = (*func)(ctx); if (ctx->offload == MACSEC_OFFLOAD_PHY) mutex_unlock(&ctx->phydev->lock); return ret; } static int parse_sa_config(struct nlattr **attrs, struct nlattr **tb_sa) { if (!attrs[MACSEC_ATTR_SA_CONFIG]) return -EINVAL; if (nla_parse_nested_deprecated(tb_sa, MACSEC_SA_ATTR_MAX, attrs[MACSEC_ATTR_SA_CONFIG], macsec_genl_sa_policy, NULL)) return -EINVAL; return 0; } static int parse_rxsc_config(struct nlattr **attrs, struct nlattr **tb_rxsc) { if (!attrs[MACSEC_ATTR_RXSC_CONFIG]) return -EINVAL; if (nla_parse_nested_deprecated(tb_rxsc, MACSEC_RXSC_ATTR_MAX, attrs[MACSEC_ATTR_RXSC_CONFIG], macsec_genl_rxsc_policy, NULL)) return -EINVAL; return 0; } static bool validate_add_rxsa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || !attrs[MACSEC_SA_ATTR_KEY] || !attrs[MACSEC_SA_ATTR_KEYID]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) return false; return true; } static int macsec_add_rxsa(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct nlattr **attrs = info->attrs; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; unsigned char assoc_num; int pn_len; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsa(tb_sa)) return -EINVAL; rtnl_lock(); rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { pr_notice("macsec: nl: add_rxsa: bad key length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); rtnl_unlock(); return -EINVAL; } pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (tb_sa[MACSEC_SA_ATTR_PN] && nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } if (secy->xpn) { if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { rtnl_unlock(); return -EINVAL; } if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_rxsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } } rx_sa = rtnl_dereference(rx_sc->sa[assoc_num]); if (rx_sa) { rtnl_unlock(); return -EBUSY; } rx_sa = kmalloc(sizeof(*rx_sa), GFP_KERNEL); if (!rx_sa) { rtnl_unlock(); return -ENOMEM; } err = init_rx_sa(rx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len, secy->icv_len); if (err < 0) { kfree(rx_sa); rtnl_unlock(); return err; } if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); rx_sa->sc = rx_sc; if (secy->xpn) { rx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); nla_memcpy(rx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], MACSEC_SALT_LEN); } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); err = macsec_offload(ops->mdo_add_rxsa, &ctx); memzero_explicit(ctx.sa.key, secy->key_len); if (err) goto cleanup; } nla_memcpy(rx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(rx_sc->sa[assoc_num], rx_sa); rtnl_unlock(); return 0; cleanup: macsec_rxsa_put(rx_sa); rtnl_unlock(); return err; } static bool validate_add_rxsc(struct nlattr **attrs) { if (!attrs[MACSEC_RXSC_ATTR_SCI]) return false; if (attrs[MACSEC_RXSC_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_RXSC_ATTR_ACTIVE]) > 1) return false; } return true; } static int macsec_add_rxsc(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; sci_t sci = MACSEC_UNDEF_SCI; struct nlattr **attrs = info->attrs; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct macsec_secy *secy; bool active = true; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsc(tb_rxsc)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) active = nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); rx_sc = create_rx_sc(dev, sci, active); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_add_rxsc, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: del_rx_sc(secy, sci); free_rx_sc(rx_sc); rtnl_unlock(); return ret; } static bool validate_add_txsa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || !attrs[MACSEC_SA_ATTR_PN] || !attrs[MACSEC_SA_ATTR_KEY] || !attrs[MACSEC_SA_ATTR_KEYID]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } if (nla_len(attrs[MACSEC_SA_ATTR_KEYID]) != MACSEC_KEYID_LEN) return false; return true; } static int macsec_add_txsa(struct sk_buff *skb, struct genl_info *info) { struct net_device *dev; struct nlattr **attrs = info->attrs; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; unsigned char assoc_num; int pn_len; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_operational; int err; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_add_txsa(tb_sa)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; assoc_num = nla_get_u8(tb_sa[MACSEC_SA_ATTR_AN]); if (nla_len(tb_sa[MACSEC_SA_ATTR_KEY]) != secy->key_len) { pr_notice("macsec: nl: add_txsa: bad key length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); rtnl_unlock(); return -EINVAL; } pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: add_txsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } if (secy->xpn) { if (!tb_sa[MACSEC_SA_ATTR_SSCI] || !tb_sa[MACSEC_SA_ATTR_SALT]) { rtnl_unlock(); return -EINVAL; } if (nla_len(tb_sa[MACSEC_SA_ATTR_SALT]) != MACSEC_SALT_LEN) { pr_notice("macsec: nl: add_txsa: bad salt length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_SALT]), MACSEC_SALT_LEN); rtnl_unlock(); return -EINVAL; } } tx_sa = rtnl_dereference(tx_sc->sa[assoc_num]); if (tx_sa) { rtnl_unlock(); return -EBUSY; } tx_sa = kmalloc(sizeof(*tx_sa), GFP_KERNEL); if (!tx_sa) { rtnl_unlock(); return -ENOMEM; } err = init_tx_sa(tx_sa, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len, secy->icv_len); if (err < 0) { kfree(tx_sa); rtnl_unlock(); return err; } spin_lock_bh(&tx_sa->lock); tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = !!nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa && tx_sa->active) secy->operational = true; if (secy->xpn) { tx_sa->ssci = nla_get_ssci(tb_sa[MACSEC_SA_ATTR_SSCI]); nla_memcpy(tx_sa->key.salt.bytes, tb_sa[MACSEC_SA_ATTR_SALT], MACSEC_SALT_LEN); } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.secy = secy; memcpy(ctx.sa.key, nla_data(tb_sa[MACSEC_SA_ATTR_KEY]), secy->key_len); err = macsec_offload(ops->mdo_add_txsa, &ctx); memzero_explicit(ctx.sa.key, secy->key_len); if (err) goto cleanup; } nla_memcpy(tx_sa->key.id, tb_sa[MACSEC_SA_ATTR_KEYID], MACSEC_KEYID_LEN); rcu_assign_pointer(tx_sc->sa[assoc_num], tx_sa); rtnl_unlock(); return 0; cleanup: secy->operational = was_operational; macsec_txsa_put(tx_sa); rtnl_unlock(); return err; } static int macsec_del_rxsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; rtnl_lock(); rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, &dev, &secy, &rx_sc, &assoc_num); if (IS_ERR(rx_sa)) { rtnl_unlock(); return PTR_ERR(rx_sa); } if (rx_sa->active) { rtnl_unlock(); return -EBUSY; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_rxsa, &ctx); if (ret) goto cleanup; } RCU_INIT_POINTER(rx_sc->sa[assoc_num], NULL); clear_rx_sa(rx_sa); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static int macsec_del_rxsc(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; sci_t sci; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!tb_rxsc[MACSEC_RXSC_ATTR_SCI]) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), info->attrs); if (IS_ERR(dev)) { rtnl_unlock(); return PTR_ERR(dev); } secy = &macsec_priv(dev)->secy; sci = nla_get_sci(tb_rxsc[MACSEC_RXSC_ATTR_SCI]); rx_sc = del_rx_sc(secy, sci); if (!rx_sc) { rtnl_unlock(); return -ENODEV; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_rxsc, &ctx); if (ret) goto cleanup; } free_rx_sc(rx_sc); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static int macsec_del_txsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; rtnl_lock(); tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, &dev, &secy, &tx_sc, &assoc_num); if (IS_ERR(tx_sa)) { rtnl_unlock(); return PTR_ERR(tx_sa); } if (tx_sa->active) { rtnl_unlock(); return -EBUSY; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.secy = secy; ret = macsec_offload(ops->mdo_del_txsa, &ctx); if (ret) goto cleanup; } RCU_INIT_POINTER(tx_sc->sa[assoc_num], NULL); clear_tx_sa(tx_sa); rtnl_unlock(); return 0; cleanup: rtnl_unlock(); return ret; } static bool validate_upd_sa(struct nlattr **attrs) { if (!attrs[MACSEC_SA_ATTR_AN] || attrs[MACSEC_SA_ATTR_KEY] || attrs[MACSEC_SA_ATTR_KEYID] || attrs[MACSEC_SA_ATTR_SSCI] || attrs[MACSEC_SA_ATTR_SALT]) return false; if (nla_get_u8(attrs[MACSEC_SA_ATTR_AN]) >= MACSEC_NUM_AN) return false; if (attrs[MACSEC_SA_ATTR_PN] && nla_get_u64(attrs[MACSEC_SA_ATTR_PN]) == 0) return false; if (attrs[MACSEC_SA_ATTR_ACTIVE]) { if (nla_get_u8(attrs[MACSEC_SA_ATTR_ACTIVE]) > 1) return false; } return true; } static int macsec_upd_txsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; struct macsec_tx_sa *tx_sa; u8 assoc_num; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_operational, was_active; pn_t prev_pn; int ret = 0; prev_pn.full64 = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_upd_sa(tb_sa)) return -EINVAL; rtnl_lock(); tx_sa = get_txsa_from_nl(genl_info_net(info), attrs, tb_sa, &dev, &secy, &tx_sc, &assoc_num); if (IS_ERR(tx_sa)) { rtnl_unlock(); return PTR_ERR(tx_sa); } if (tb_sa[MACSEC_SA_ATTR_PN]) { int pn_len; pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: upd_txsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } spin_lock_bh(&tx_sa->lock); prev_pn = tx_sa->next_pn_halves; tx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&tx_sa->lock); } was_active = tx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) tx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); was_operational = secy->operational; if (assoc_num == tx_sc->encoding_sa) secy->operational = tx_sa->active; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.tx_sa = tx_sa; ctx.sa.update_pn = !!prev_pn.full64; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_txsa, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&tx_sa->lock); tx_sa->next_pn_halves = prev_pn; spin_unlock_bh(&tx_sa->lock); } tx_sa->active = was_active; secy->operational = was_operational; rtnl_unlock(); return ret; } static int macsec_upd_rxsa(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct macsec_rx_sa *rx_sa; u8 assoc_num; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; struct nlattr *tb_sa[MACSEC_SA_ATTR_MAX + 1]; bool was_active; pn_t prev_pn; int ret = 0; prev_pn.full64 = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (parse_sa_config(attrs, tb_sa)) return -EINVAL; if (!validate_upd_sa(tb_sa)) return -EINVAL; rtnl_lock(); rx_sa = get_rxsa_from_nl(genl_info_net(info), attrs, tb_rxsc, tb_sa, &dev, &secy, &rx_sc, &assoc_num); if (IS_ERR(rx_sa)) { rtnl_unlock(); return PTR_ERR(rx_sa); } if (tb_sa[MACSEC_SA_ATTR_PN]) { int pn_len; pn_len = secy->xpn ? MACSEC_XPN_PN_LEN : MACSEC_DEFAULT_PN_LEN; if (nla_len(tb_sa[MACSEC_SA_ATTR_PN]) != pn_len) { pr_notice("macsec: nl: upd_rxsa: bad pn length: %d != %d\n", nla_len(tb_sa[MACSEC_SA_ATTR_PN]), pn_len); rtnl_unlock(); return -EINVAL; } spin_lock_bh(&rx_sa->lock); prev_pn = rx_sa->next_pn_halves; rx_sa->next_pn = nla_get_u64(tb_sa[MACSEC_SA_ATTR_PN]); spin_unlock_bh(&rx_sa->lock); } was_active = rx_sa->active; if (tb_sa[MACSEC_SA_ATTR_ACTIVE]) rx_sa->active = nla_get_u8(tb_sa[MACSEC_SA_ATTR_ACTIVE]); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.sa.assoc_num = assoc_num; ctx.sa.rx_sa = rx_sa; ctx.sa.update_pn = !!prev_pn.full64; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_rxsa, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: if (tb_sa[MACSEC_SA_ATTR_PN]) { spin_lock_bh(&rx_sa->lock); rx_sa->next_pn_halves = prev_pn; spin_unlock_bh(&rx_sa->lock); } rx_sa->active = was_active; rtnl_unlock(); return ret; } static int macsec_upd_rxsc(struct sk_buff *skb, struct genl_info *info) { struct nlattr **attrs = info->attrs; struct net_device *dev; struct macsec_secy *secy; struct macsec_rx_sc *rx_sc; struct nlattr *tb_rxsc[MACSEC_RXSC_ATTR_MAX + 1]; unsigned int prev_n_rx_sc; bool was_active; int ret; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (parse_rxsc_config(attrs, tb_rxsc)) return -EINVAL; if (!validate_add_rxsc(tb_rxsc)) return -EINVAL; rtnl_lock(); rx_sc = get_rxsc_from_nl(genl_info_net(info), attrs, tb_rxsc, &dev, &secy); if (IS_ERR(rx_sc)) { rtnl_unlock(); return PTR_ERR(rx_sc); } was_active = rx_sc->active; prev_n_rx_sc = secy->n_rx_sc; if (tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]) { bool new = !!nla_get_u8(tb_rxsc[MACSEC_RXSC_ATTR_ACTIVE]); if (rx_sc->active != new) secy->n_rx_sc += new ? 1 : -1; rx_sc->active = new; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(netdev_priv(dev))) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.rx_sc = rx_sc; ctx.secy = secy; ret = macsec_offload(ops->mdo_upd_rxsc, &ctx); if (ret) goto cleanup; } rtnl_unlock(); return 0; cleanup: secy->n_rx_sc = prev_n_rx_sc; rx_sc->active = was_active; rtnl_unlock(); return ret; } static bool macsec_is_configured(struct macsec_dev *macsec) { struct macsec_secy *secy = &macsec->secy; struct macsec_tx_sc *tx_sc = &secy->tx_sc; int i; if (secy->rx_sc) return true; for (i = 0; i < MACSEC_NUM_AN; i++) if (tx_sc->sa[i]) return true; return false; } static bool macsec_needs_tx_tag(struct macsec_dev *macsec, const struct macsec_ops *ops) { return macsec->offload == MACSEC_OFFLOAD_PHY && ops->mdo_insert_tx_tag; } static void macsec_set_head_tail_room(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int needed_headroom, needed_tailroom; const struct macsec_ops *ops; ops = macsec_get_ops(macsec, NULL); if (ops) { needed_headroom = ops->needed_headroom; needed_tailroom = ops->needed_tailroom; } else { needed_headroom = MACSEC_NEEDED_HEADROOM; needed_tailroom = MACSEC_NEEDED_TAILROOM; } dev->needed_headroom = real_dev->needed_headroom + needed_headroom; dev->needed_tailroom = real_dev->needed_tailroom + needed_tailroom; } static void macsec_inherit_tso_max(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); /* if macsec is offloaded, we need to follow the lower * device's capabilities. otherwise, we can ignore them. */ if (macsec_is_offloaded(macsec)) netif_inherit_tso_max(dev, macsec->real_dev); } static int macsec_update_offload(struct net_device *dev, enum macsec_offload offload) { enum macsec_offload prev_offload; const struct macsec_ops *ops; struct macsec_context ctx; struct macsec_dev *macsec; int ret = 0; macsec = macsec_priv(dev); /* Check if the offloading mode is supported by the underlying layers */ if (offload != MACSEC_OFFLOAD_OFF && !macsec_check_offload(offload, macsec)) return -EOPNOTSUPP; /* Check if the net device is busy. */ if (netif_running(dev)) return -EBUSY; /* Check if the device already has rules configured: we do not support * rules migration. */ if (macsec_is_configured(macsec)) return -EBUSY; prev_offload = macsec->offload; ops = __macsec_get_ops(offload == MACSEC_OFFLOAD_OFF ? prev_offload : offload, macsec, &ctx); if (!ops) return -EOPNOTSUPP; macsec->offload = offload; ctx.secy = &macsec->secy; ret = offload == MACSEC_OFFLOAD_OFF ? macsec_offload(ops->mdo_del_secy, &ctx) : macsec_offload(ops->mdo_add_secy, &ctx); if (ret) { macsec->offload = prev_offload; return ret; } macsec_set_head_tail_room(dev); macsec->insert_tx_tag = macsec_needs_tx_tag(macsec, ops); macsec_inherit_tso_max(dev); netdev_update_features(dev); return ret; } static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb_offload[MACSEC_OFFLOAD_ATTR_MAX + 1]; struct nlattr **attrs = info->attrs; enum macsec_offload offload; struct macsec_dev *macsec; struct net_device *dev; int ret = 0; if (!attrs[MACSEC_ATTR_IFINDEX]) return -EINVAL; if (!attrs[MACSEC_ATTR_OFFLOAD]) return -EINVAL; if (nla_parse_nested_deprecated(tb_offload, MACSEC_OFFLOAD_ATTR_MAX, attrs[MACSEC_ATTR_OFFLOAD], macsec_genl_offload_policy, NULL)) return -EINVAL; rtnl_lock(); dev = get_dev_from_nl(genl_info_net(info), attrs); if (IS_ERR(dev)) { ret = PTR_ERR(dev); goto out; } macsec = macsec_priv(dev); if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]) { ret = -EINVAL; goto out; } offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]); if (macsec->offload != offload) ret = macsec_update_offload(dev, offload); out: rtnl_unlock(); return ret; } static void get_tx_sa_stats(struct net_device *dev, int an, struct macsec_tx_sa *tx_sa, struct macsec_tx_sa_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.sa.assoc_num = an; ctx.sa.tx_sa = tx_sa; ctx.stats.tx_sa_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_tx_sa_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct macsec_tx_sa_stats *stats = per_cpu_ptr(tx_sa->stats, cpu); sum->OutPktsProtected += stats->OutPktsProtected; sum->OutPktsEncrypted += stats->OutPktsEncrypted; } } static int copy_tx_sa_stats(struct sk_buff *skb, struct macsec_tx_sa_stats *sum) { if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_PROTECTED, sum->OutPktsProtected) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_OUT_PKTS_ENCRYPTED, sum->OutPktsEncrypted)) return -EMSGSIZE; return 0; } static void get_rx_sa_stats(struct net_device *dev, struct macsec_rx_sc *rx_sc, int an, struct macsec_rx_sa *rx_sa, struct macsec_rx_sa_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.sa.assoc_num = an; ctx.sa.rx_sa = rx_sa; ctx.stats.rx_sa_stats = sum; ctx.secy = &macsec_priv(dev)->secy; ctx.rx_sc = rx_sc; macsec_offload(ops->mdo_get_rx_sa_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct macsec_rx_sa_stats *stats = per_cpu_ptr(rx_sa->stats, cpu); sum->InPktsOK += stats->InPktsOK; sum->InPktsInvalid += stats->InPktsInvalid; sum->InPktsNotValid += stats->InPktsNotValid; sum->InPktsNotUsingSA += stats->InPktsNotUsingSA; sum->InPktsUnusedSA += stats->InPktsUnusedSA; } } static int copy_rx_sa_stats(struct sk_buff *skb, struct macsec_rx_sa_stats *sum) { if (nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA) || nla_put_u32(skb, MACSEC_SA_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA)) return -EMSGSIZE; return 0; } static void get_rx_sc_stats(struct net_device *dev, struct macsec_rx_sc *rx_sc, struct macsec_rx_sc_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.rx_sc_stats = sum; ctx.secy = &macsec_priv(dev)->secy; ctx.rx_sc = rx_sc; macsec_offload(ops->mdo_get_rx_sc_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_rx_sc_stats *stats; struct macsec_rx_sc_stats tmp; unsigned int start; stats = per_cpu_ptr(rx_sc->stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry(&stats->syncp, start)); sum->InOctetsValidated += tmp.InOctetsValidated; sum->InOctetsDecrypted += tmp.InOctetsDecrypted; sum->InPktsUnchecked += tmp.InPktsUnchecked; sum->InPktsDelayed += tmp.InPktsDelayed; sum->InPktsOK += tmp.InPktsOK; sum->InPktsInvalid += tmp.InPktsInvalid; sum->InPktsLate += tmp.InPktsLate; sum->InPktsNotValid += tmp.InPktsNotValid; sum->InPktsNotUsingSA += tmp.InPktsNotUsingSA; sum->InPktsUnusedSA += tmp.InPktsUnusedSA; } } static int copy_rx_sc_stats(struct sk_buff *skb, struct macsec_rx_sc_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_VALIDATED, sum->InOctetsValidated, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_OCTETS_DECRYPTED, sum->InOctetsDecrypted, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNCHECKED, sum->InPktsUnchecked, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_DELAYED, sum->InPktsDelayed, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_OK, sum->InPktsOK, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_INVALID, sum->InPktsInvalid, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_LATE, sum->InPktsLate, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_VALID, sum->InPktsNotValid, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_NOT_USING_SA, sum->InPktsNotUsingSA, MACSEC_RXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_RXSC_STATS_ATTR_IN_PKTS_UNUSED_SA, sum->InPktsUnusedSA, MACSEC_RXSC_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static void get_tx_sc_stats(struct net_device *dev, struct macsec_tx_sc_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.tx_sc_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_tx_sc_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_tx_sc_stats *stats; struct macsec_tx_sc_stats tmp; unsigned int start; stats = per_cpu_ptr(macsec_priv(dev)->secy.tx_sc.stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry(&stats->syncp, start)); sum->OutPktsProtected += tmp.OutPktsProtected; sum->OutPktsEncrypted += tmp.OutPktsEncrypted; sum->OutOctetsProtected += tmp.OutOctetsProtected; sum->OutOctetsEncrypted += tmp.OutOctetsEncrypted; } } static int copy_tx_sc_stats(struct sk_buff *skb, struct macsec_tx_sc_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_PROTECTED, sum->OutPktsProtected, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_PKTS_ENCRYPTED, sum->OutPktsEncrypted, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_PROTECTED, sum->OutOctetsProtected, MACSEC_TXSC_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_TXSC_STATS_ATTR_OUT_OCTETS_ENCRYPTED, sum->OutOctetsEncrypted, MACSEC_TXSC_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static void get_secy_stats(struct net_device *dev, struct macsec_dev_stats *sum) { struct macsec_dev *macsec = macsec_priv(dev); int cpu; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.stats.dev_stats = sum; ctx.secy = &macsec_priv(dev)->secy; macsec_offload(ops->mdo_get_dev_stats, &ctx); } return; } for_each_possible_cpu(cpu) { const struct pcpu_secy_stats *stats; struct macsec_dev_stats tmp; unsigned int start; stats = per_cpu_ptr(macsec_priv(dev)->stats, cpu); do { start = u64_stats_fetch_begin(&stats->syncp); memcpy(&tmp, &stats->stats, sizeof(tmp)); } while (u64_stats_fetch_retry(&stats->syncp, start)); sum->OutPktsUntagged += tmp.OutPktsUntagged; sum->InPktsUntagged += tmp.InPktsUntagged; sum->OutPktsTooLong += tmp.OutPktsTooLong; sum->InPktsNoTag += tmp.InPktsNoTag; sum->InPktsBadTag += tmp.InPktsBadTag; sum->InPktsUnknownSCI += tmp.InPktsUnknownSCI; sum->InPktsNoSCI += tmp.InPktsNoSCI; sum->InPktsOverrun += tmp.InPktsOverrun; } } static int copy_secy_stats(struct sk_buff *skb, struct macsec_dev_stats *sum) { if (nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_UNTAGGED, sum->OutPktsUntagged, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNTAGGED, sum->InPktsUntagged, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_OUT_PKTS_TOO_LONG, sum->OutPktsTooLong, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_TAG, sum->InPktsNoTag, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_BAD_TAG, sum->InPktsBadTag, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_UNKNOWN_SCI, sum->InPktsUnknownSCI, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_NO_SCI, sum->InPktsNoSCI, MACSEC_SECY_STATS_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_STATS_ATTR_IN_PKTS_OVERRUN, sum->InPktsOverrun, MACSEC_SECY_STATS_ATTR_PAD)) return -EMSGSIZE; return 0; } static int nla_put_secy(struct macsec_secy *secy, struct sk_buff *skb) { struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *secy_nest = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY); u64 csid; if (!secy_nest) return 1; switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; break; default: goto cancel; } if (nla_put_sci(skb, MACSEC_SECY_ATTR_SCI, secy->sci, MACSEC_SECY_ATTR_PAD) || nla_put_u64_64bit(skb, MACSEC_SECY_ATTR_CIPHER_SUITE, csid, MACSEC_SECY_ATTR_PAD) || nla_put_u8(skb, MACSEC_SECY_ATTR_ICV_LEN, secy->icv_len) || nla_put_u8(skb, MACSEC_SECY_ATTR_OPER, secy->operational) || nla_put_u8(skb, MACSEC_SECY_ATTR_PROTECT, secy->protect_frames) || nla_put_u8(skb, MACSEC_SECY_ATTR_REPLAY, secy->replay_protect) || nla_put_u8(skb, MACSEC_SECY_ATTR_VALIDATE, secy->validate_frames) || nla_put_u8(skb, MACSEC_SECY_ATTR_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, MACSEC_SECY_ATTR_INC_SCI, tx_sc->send_sci) || nla_put_u8(skb, MACSEC_SECY_ATTR_ES, tx_sc->end_station) || nla_put_u8(skb, MACSEC_SECY_ATTR_SCB, tx_sc->scb) || nla_put_u8(skb, MACSEC_SECY_ATTR_ENCODING_SA, tx_sc->encoding_sa)) goto cancel; if (secy->replay_protect) { if (nla_put_u32(skb, MACSEC_SECY_ATTR_WINDOW, secy->replay_window)) goto cancel; } nla_nest_end(skb, secy_nest); return 0; cancel: nla_nest_cancel(skb, secy_nest); return 1; } static noinline_for_stack int dump_secy(struct macsec_secy *secy, struct net_device *dev, struct sk_buff *skb, struct netlink_callback *cb) { struct macsec_tx_sc_stats tx_sc_stats = {0, }; struct macsec_tx_sa_stats tx_sa_stats = {0, }; struct macsec_rx_sc_stats rx_sc_stats = {0, }; struct macsec_rx_sa_stats rx_sa_stats = {0, }; struct macsec_dev *macsec = netdev_priv(dev); struct macsec_dev_stats dev_stats = {0, }; struct macsec_tx_sc *tx_sc = &secy->tx_sc; struct nlattr *txsa_list, *rxsc_list; struct macsec_rx_sc *rx_sc; struct nlattr *attr; void *hdr; int i, j; hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &macsec_fam, NLM_F_MULTI, MACSEC_CMD_GET_TXSC); if (!hdr) return -EMSGSIZE; genl_dump_check_consistent(cb, hdr); if (nla_put_u32(skb, MACSEC_ATTR_IFINDEX, dev->ifindex)) goto nla_put_failure; attr = nla_nest_start_noflag(skb, MACSEC_ATTR_OFFLOAD); if (!attr) goto nla_put_failure; if (nla_put_u8(skb, MACSEC_OFFLOAD_ATTR_TYPE, macsec->offload)) goto nla_put_failure; nla_nest_end(skb, attr); if (nla_put_secy(secy, skb)) goto nla_put_failure; attr = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSC_STATS); if (!attr) goto nla_put_failure; get_tx_sc_stats(dev, &tx_sc_stats); if (copy_tx_sc_stats(skb, &tx_sc_stats)) { nla_nest_cancel(skb, attr); goto nla_put_failure; } nla_nest_end(skb, attr); attr = nla_nest_start_noflag(skb, MACSEC_ATTR_SECY_STATS); if (!attr) goto nla_put_failure; get_secy_stats(dev, &dev_stats); if (copy_secy_stats(skb, &dev_stats)) { nla_nest_cancel(skb, attr); goto nla_put_failure; } nla_nest_end(skb, attr); txsa_list = nla_nest_start_noflag(skb, MACSEC_ATTR_TXSA_LIST); if (!txsa_list) goto nla_put_failure; for (i = 0, j = 1; i < MACSEC_NUM_AN; i++) { struct macsec_tx_sa *tx_sa = rtnl_dereference(tx_sc->sa[i]); struct nlattr *txsa_nest; u64 pn; int pn_len; if (!tx_sa) continue; txsa_nest = nla_nest_start_noflag(skb, j++); if (!txsa_nest) { nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } memset(&tx_sa_stats, 0, sizeof(tx_sa_stats)); get_tx_sa_stats(dev, i, tx_sa, &tx_sa_stats); if (copy_tx_sa_stats(skb, &tx_sa_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } nla_nest_end(skb, attr); if (secy->xpn) { pn = tx_sa->next_pn; pn_len = MACSEC_XPN_PN_LEN; } else { pn = tx_sa->next_pn_halves.lower; pn_len = MACSEC_DEFAULT_PN_LEN; } if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, tx_sa->key.id) || (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, tx_sa->ssci)) || nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, tx_sa->active)) { nla_nest_cancel(skb, txsa_nest); nla_nest_cancel(skb, txsa_list); goto nla_put_failure; } nla_nest_end(skb, txsa_nest); } nla_nest_end(skb, txsa_list); rxsc_list = nla_nest_start_noflag(skb, MACSEC_ATTR_RXSC_LIST); if (!rxsc_list) goto nla_put_failure; j = 1; for_each_rxsc_rtnl(secy, rx_sc) { int k; struct nlattr *rxsa_list; struct nlattr *rxsc_nest = nla_nest_start_noflag(skb, j++); if (!rxsc_nest) { nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } if (nla_put_u8(skb, MACSEC_RXSC_ATTR_ACTIVE, rx_sc->active) || nla_put_sci(skb, MACSEC_RXSC_ATTR_SCI, rx_sc->sci, MACSEC_RXSC_ATTR_PAD)) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } memset(&rx_sc_stats, 0, sizeof(rx_sc_stats)); get_rx_sc_stats(dev, rx_sc, &rx_sc_stats); if (copy_rx_sc_stats(skb, &rx_sc_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, attr); rxsa_list = nla_nest_start_noflag(skb, MACSEC_RXSC_ATTR_SA_LIST); if (!rxsa_list) { nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } for (i = 0, k = 1; i < MACSEC_NUM_AN; i++) { struct macsec_rx_sa *rx_sa = rtnl_dereference(rx_sc->sa[i]); struct nlattr *rxsa_nest; u64 pn; int pn_len; if (!rx_sa) continue; rxsa_nest = nla_nest_start_noflag(skb, k++); if (!rxsa_nest) { nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } attr = nla_nest_start_noflag(skb, MACSEC_SA_ATTR_STATS); if (!attr) { nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } memset(&rx_sa_stats, 0, sizeof(rx_sa_stats)); get_rx_sa_stats(dev, rx_sc, i, rx_sa, &rx_sa_stats); if (copy_rx_sa_stats(skb, &rx_sa_stats)) { nla_nest_cancel(skb, attr); nla_nest_cancel(skb, rxsa_list); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, attr); if (secy->xpn) { pn = rx_sa->next_pn; pn_len = MACSEC_XPN_PN_LEN; } else { pn = rx_sa->next_pn_halves.lower; pn_len = MACSEC_DEFAULT_PN_LEN; } if (nla_put_u8(skb, MACSEC_SA_ATTR_AN, i) || nla_put(skb, MACSEC_SA_ATTR_PN, pn_len, &pn) || nla_put(skb, MACSEC_SA_ATTR_KEYID, MACSEC_KEYID_LEN, rx_sa->key.id) || (secy->xpn && nla_put_ssci(skb, MACSEC_SA_ATTR_SSCI, rx_sa->ssci)) || nla_put_u8(skb, MACSEC_SA_ATTR_ACTIVE, rx_sa->active)) { nla_nest_cancel(skb, rxsa_nest); nla_nest_cancel(skb, rxsc_nest); nla_nest_cancel(skb, rxsc_list); goto nla_put_failure; } nla_nest_end(skb, rxsa_nest); } nla_nest_end(skb, rxsa_list); nla_nest_end(skb, rxsc_nest); } nla_nest_end(skb, rxsc_list); genlmsg_end(skb, hdr); return 0; nla_put_failure: genlmsg_cancel(skb, hdr); return -EMSGSIZE; } static int macsec_generation = 1; /* protected by RTNL */ static int macsec_dump_txsc(struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct net_device *dev; int dev_idx, d; dev_idx = cb->args[0]; d = 0; rtnl_lock(); cb->seq = macsec_generation; for_each_netdev(net, dev) { struct macsec_secy *secy; if (d < dev_idx) goto next; if (!netif_is_macsec(dev)) goto next; secy = &macsec_priv(dev)->secy; if (dump_secy(secy, dev, skb, cb) < 0) goto done; next: d++; } done: rtnl_unlock(); cb->args[0] = d; return skb->len; } static const struct genl_small_ops macsec_genl_ops[] = { { .cmd = MACSEC_CMD_GET_TXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .dumpit = macsec_dump_txsc, }, { .cmd = MACSEC_CMD_ADD_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_RXSC, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_rxsc, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_ADD_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_TXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_txsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_ADD_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_add_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_DEL_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_del_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_RXSA, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_rxsa, .flags = GENL_ADMIN_PERM, }, { .cmd = MACSEC_CMD_UPD_OFFLOAD, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = macsec_upd_offload, .flags = GENL_ADMIN_PERM, }, }; static struct genl_family macsec_fam __ro_after_init = { .name = MACSEC_GENL_NAME, .hdrsize = 0, .version = MACSEC_GENL_VERSION, .maxattr = MACSEC_ATTR_MAX, .policy = macsec_genl_policy, .netnsok = true, .module = THIS_MODULE, .small_ops = macsec_genl_ops, .n_small_ops = ARRAY_SIZE(macsec_genl_ops), .resv_start_op = MACSEC_CMD_UPD_OFFLOAD + 1, }; static struct sk_buff *macsec_insert_tx_tag(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); const struct macsec_ops *ops; struct phy_device *phydev; struct macsec_context ctx; int skb_final_len; int err; ops = macsec_get_ops(macsec, &ctx); skb_final_len = skb->len - ETH_HLEN + ops->needed_headroom + ops->needed_tailroom; if (unlikely(skb_final_len > macsec->real_dev->mtu)) { err = -EINVAL; goto cleanup; } phydev = macsec->real_dev->phydev; err = skb_ensure_writable_head_tail(skb, dev); if (unlikely(err < 0)) goto cleanup; err = ops->mdo_insert_tx_tag(phydev, skb); if (unlikely(err)) goto cleanup; return skb; cleanup: kfree_skb(skb); return ERR_PTR(err); } static netdev_tx_t macsec_start_xmit(struct sk_buff *skb, struct net_device *dev) { struct macsec_dev *macsec = netdev_priv(dev); struct macsec_secy *secy = &macsec->secy; struct pcpu_secy_stats *secy_stats; int ret, len; if (macsec_is_offloaded(netdev_priv(dev))) { struct metadata_dst *md_dst = secy->tx_sc.md_dst; skb_dst_drop(skb); dst_hold(&md_dst->dst); skb_dst_set(skb, &md_dst->dst); if (macsec->insert_tx_tag) { skb = macsec_insert_tx_tag(skb, dev); if (IS_ERR(skb)) { DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } } skb->dev = macsec->real_dev; return dev_queue_xmit(skb); } /* 10.5 */ if (!secy->protect_frames) { secy_stats = this_cpu_ptr(macsec->stats); u64_stats_update_begin(&secy_stats->syncp); secy_stats->stats.OutPktsUntagged++; u64_stats_update_end(&secy_stats->syncp); skb->dev = macsec->real_dev; len = skb->len; ret = dev_queue_xmit(skb); count_tx(dev, ret, len); return ret; } if (!secy->operational) { kfree_skb(skb); DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } len = skb->len; skb = macsec_encrypt(skb, dev); if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EINPROGRESS) DEV_STATS_INC(dev, tx_dropped); return NETDEV_TX_OK; } macsec_count_tx(skb, &macsec->secy.tx_sc, macsec_skb_cb(skb)->tx_sa); macsec_encrypt_finish(skb, dev); ret = dev_queue_xmit(skb); count_tx(dev, ret, len); return ret; } #define MACSEC_FEATURES \ (NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST) #define MACSEC_OFFLOAD_FEATURES \ (MACSEC_FEATURES | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES | \ NETIF_F_LRO | NETIF_F_RXHASH | NETIF_F_CSUM_MASK | NETIF_F_RXCSUM) static int macsec_dev_init(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int err; err = gro_cells_init(&macsec->gro_cells, dev); if (err) return err; macsec_inherit_tso_max(dev); dev->hw_features = real_dev->hw_features & MACSEC_OFFLOAD_FEATURES; dev->hw_features |= NETIF_F_GSO_SOFTWARE; dev->features = real_dev->features & MACSEC_OFFLOAD_FEATURES; dev->features |= NETIF_F_GSO_SOFTWARE; dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; macsec_set_head_tail_room(dev); if (is_zero_ether_addr(dev->dev_addr)) eth_hw_addr_inherit(dev, real_dev); if (is_zero_ether_addr(dev->broadcast)) memcpy(dev->broadcast, real_dev->broadcast, dev->addr_len); /* Get macsec's reference to real_dev */ netdev_hold(real_dev, &macsec->dev_tracker, GFP_KERNEL); return 0; } static void macsec_dev_uninit(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); gro_cells_destroy(&macsec->gro_cells); } static netdev_features_t macsec_fix_features(struct net_device *dev, netdev_features_t features) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; netdev_features_t mask; mask = macsec_is_offloaded(macsec) ? MACSEC_OFFLOAD_FEATURES : MACSEC_FEATURES; features &= (real_dev->features & mask) | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; return features; } static int macsec_dev_open(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; int err; err = dev_uc_add(real_dev, dev->dev_addr); if (err < 0) return err; if (dev->flags & IFF_ALLMULTI) { err = dev_set_allmulti(real_dev, 1); if (err < 0) goto del_unicast; } if (dev->flags & IFF_PROMISC) { err = dev_set_promiscuity(real_dev, 1); if (err < 0) goto clear_allmulti; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { err = -EOPNOTSUPP; goto clear_allmulti; } ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_dev_open, &ctx); if (err) goto clear_allmulti; } if (netif_carrier_ok(real_dev)) netif_carrier_on(dev); return 0; clear_allmulti: if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); del_unicast: dev_uc_del(real_dev, dev->dev_addr); netif_carrier_off(dev); return err; } static int macsec_dev_stop(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; netif_carrier_off(dev); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_dev_stop, &ctx); } } dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) dev_set_allmulti(real_dev, -1); if (dev->flags & IFF_PROMISC) dev_set_promiscuity(real_dev, -1); dev_uc_del(real_dev, dev->dev_addr); return 0; } static void macsec_dev_change_rx_flags(struct net_device *dev, int change) { struct net_device *real_dev = macsec_priv(dev)->real_dev; if (!(dev->flags & IFF_UP)) return; if (change & IFF_ALLMULTI) dev_set_allmulti(real_dev, dev->flags & IFF_ALLMULTI ? 1 : -1); if (change & IFF_PROMISC) dev_set_promiscuity(real_dev, dev->flags & IFF_PROMISC ? 1 : -1); } static void macsec_dev_set_rx_mode(struct net_device *dev) { struct net_device *real_dev = macsec_priv(dev)->real_dev; dev_mc_sync(real_dev, dev); dev_uc_sync(real_dev, dev); } static int macsec_set_mac_address(struct net_device *dev, void *p) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; struct sockaddr *addr = p; u8 old_addr[ETH_ALEN]; int err; if (!is_valid_ether_addr(addr->sa_data)) return -EADDRNOTAVAIL; if (dev->flags & IFF_UP) { err = dev_uc_add(real_dev, addr->sa_data); if (err < 0) return err; } ether_addr_copy(old_addr, dev->dev_addr); eth_hw_addr_set(dev, addr->sa_data); /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (!ops) { err = -EOPNOTSUPP; goto restore_old_addr; } ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_upd_secy, &ctx); if (err) goto restore_old_addr; } if (dev->flags & IFF_UP) dev_uc_del(real_dev, old_addr); return 0; restore_old_addr: if (dev->flags & IFF_UP) dev_uc_del(real_dev, addr->sa_data); eth_hw_addr_set(dev, old_addr); return err; } static int macsec_change_mtu(struct net_device *dev, int new_mtu) { struct macsec_dev *macsec = macsec_priv(dev); unsigned int extra = macsec->secy.icv_len + macsec_extra_len(true); if (macsec->real_dev->mtu - extra < new_mtu) return -ERANGE; WRITE_ONCE(dev->mtu, new_mtu); return 0; } static void macsec_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *s) { if (!dev->tstats) return; dev_fetch_sw_netstats(s, dev->tstats); s->rx_dropped = DEV_STATS_READ(dev, rx_dropped); s->tx_dropped = DEV_STATS_READ(dev, tx_dropped); s->rx_errors = DEV_STATS_READ(dev, rx_errors); } static int macsec_get_iflink(const struct net_device *dev) { return READ_ONCE(macsec_priv(dev)->real_dev->ifindex); } static const struct net_device_ops macsec_netdev_ops = { .ndo_init = macsec_dev_init, .ndo_uninit = macsec_dev_uninit, .ndo_open = macsec_dev_open, .ndo_stop = macsec_dev_stop, .ndo_fix_features = macsec_fix_features, .ndo_change_mtu = macsec_change_mtu, .ndo_set_rx_mode = macsec_dev_set_rx_mode, .ndo_change_rx_flags = macsec_dev_change_rx_flags, .ndo_set_mac_address = macsec_set_mac_address, .ndo_start_xmit = macsec_start_xmit, .ndo_get_stats64 = macsec_get_stats64, .ndo_get_iflink = macsec_get_iflink, }; static const struct device_type macsec_type = { .name = "macsec", }; static const struct nla_policy macsec_rtnl_policy[IFLA_MACSEC_MAX + 1] = { [IFLA_MACSEC_SCI] = { .type = NLA_U64 }, [IFLA_MACSEC_PORT] = { .type = NLA_U16 }, [IFLA_MACSEC_ICV_LEN] = { .type = NLA_U8 }, [IFLA_MACSEC_CIPHER_SUITE] = { .type = NLA_U64 }, [IFLA_MACSEC_WINDOW] = { .type = NLA_U32 }, [IFLA_MACSEC_ENCODING_SA] = { .type = NLA_U8 }, [IFLA_MACSEC_ENCRYPT] = { .type = NLA_U8 }, [IFLA_MACSEC_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_INC_SCI] = { .type = NLA_U8 }, [IFLA_MACSEC_ES] = { .type = NLA_U8 }, [IFLA_MACSEC_SCB] = { .type = NLA_U8 }, [IFLA_MACSEC_REPLAY_PROTECT] = { .type = NLA_U8 }, [IFLA_MACSEC_VALIDATION] = { .type = NLA_U8 }, [IFLA_MACSEC_OFFLOAD] = { .type = NLA_U8 }, }; static void macsec_free_netdev(struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); dst_release(&macsec->secy.tx_sc.md_dst->dst); free_percpu(macsec->stats); free_percpu(macsec->secy.tx_sc.stats); /* Get rid of the macsec's reference to real_dev */ netdev_put(macsec->real_dev, &macsec->dev_tracker); } static void macsec_setup(struct net_device *dev) { ether_setup(dev); dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &macsec_netdev_ops; dev->needs_free_netdev = true; dev->priv_destructor = macsec_free_netdev; SET_NETDEV_DEVTYPE(dev, &macsec_type); eth_zero_addr(dev->broadcast); } static int macsec_changelink_common(struct net_device *dev, struct nlattr *data[]) { struct macsec_secy *secy; struct macsec_tx_sc *tx_sc; secy = &macsec_priv(dev)->secy; tx_sc = &secy->tx_sc; if (data[IFLA_MACSEC_ENCODING_SA]) { struct macsec_tx_sa *tx_sa; tx_sc->encoding_sa = nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]); tx_sa = rtnl_dereference(tx_sc->sa[tx_sc->encoding_sa]); secy->operational = tx_sa && tx_sa->active; } if (data[IFLA_MACSEC_ENCRYPT]) tx_sc->encrypt = !!nla_get_u8(data[IFLA_MACSEC_ENCRYPT]); if (data[IFLA_MACSEC_PROTECT]) secy->protect_frames = !!nla_get_u8(data[IFLA_MACSEC_PROTECT]); if (data[IFLA_MACSEC_INC_SCI]) tx_sc->send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); if (data[IFLA_MACSEC_ES]) tx_sc->end_station = !!nla_get_u8(data[IFLA_MACSEC_ES]); if (data[IFLA_MACSEC_SCB]) tx_sc->scb = !!nla_get_u8(data[IFLA_MACSEC_SCB]); if (data[IFLA_MACSEC_REPLAY_PROTECT]) secy->replay_protect = !!nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT]); if (data[IFLA_MACSEC_VALIDATION]) secy->validate_frames = nla_get_u8(data[IFLA_MACSEC_VALIDATION]); if (data[IFLA_MACSEC_CIPHER_SUITE]) { switch (nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE])) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_DEFAULT_CIPHER_ID: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; secy->xpn = false; break; case MACSEC_CIPHER_ID_GCM_AES_256: secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; secy->xpn = false; break; case MACSEC_CIPHER_ID_GCM_AES_XPN_128: secy->key_len = MACSEC_GCM_AES_128_SAK_LEN; secy->xpn = true; break; case MACSEC_CIPHER_ID_GCM_AES_XPN_256: secy->key_len = MACSEC_GCM_AES_256_SAK_LEN; secy->xpn = true; break; default: return -EINVAL; } } if (data[IFLA_MACSEC_WINDOW]) { secy->replay_window = nla_get_u32(data[IFLA_MACSEC_WINDOW]); /* IEEE 802.1AEbw-2013 10.7.8 - maximum replay window * for XPN cipher suites */ if (secy->xpn && secy->replay_window > MACSEC_XPN_MAX_REPLAY_WINDOW) return -EINVAL; } return 0; } static int macsec_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { struct macsec_dev *macsec = macsec_priv(dev); bool macsec_offload_state_change = false; enum macsec_offload offload; struct macsec_tx_sc tx_sc; struct macsec_secy secy; int ret; if (!data) return 0; if (data[IFLA_MACSEC_CIPHER_SUITE] || data[IFLA_MACSEC_ICV_LEN] || data[IFLA_MACSEC_SCI] || data[IFLA_MACSEC_PORT]) return -EINVAL; /* Keep a copy of unmodified secy and tx_sc, in case the offload * propagation fails, to revert macsec_changelink_common. */ memcpy(&secy, &macsec->secy, sizeof(secy)); memcpy(&tx_sc, &macsec->secy.tx_sc, sizeof(tx_sc)); ret = macsec_changelink_common(dev, data); if (ret) goto cleanup; if (data[IFLA_MACSEC_OFFLOAD]) { offload = nla_get_u8(data[IFLA_MACSEC_OFFLOAD]); if (macsec->offload != offload) { macsec_offload_state_change = true; ret = macsec_update_offload(dev, offload); if (ret) goto cleanup; } } /* If h/w offloading is available, propagate to the device */ if (!macsec_offload_state_change && macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (!ops) { ret = -EOPNOTSUPP; goto cleanup; } ctx.secy = &macsec->secy; ret = macsec_offload(ops->mdo_upd_secy, &ctx); if (ret) goto cleanup; } return 0; cleanup: memcpy(&macsec->secy.tx_sc, &tx_sc, sizeof(tx_sc)); memcpy(&macsec->secy, &secy, sizeof(secy)); return ret; } static void macsec_del_dev(struct macsec_dev *macsec) { int i; while (macsec->secy.rx_sc) { struct macsec_rx_sc *rx_sc = rtnl_dereference(macsec->secy.rx_sc); rcu_assign_pointer(macsec->secy.rx_sc, rx_sc->next); free_rx_sc(rx_sc); } for (i = 0; i < MACSEC_NUM_AN; i++) { struct macsec_tx_sa *sa = rtnl_dereference(macsec->secy.tx_sc.sa[i]); if (sa) { RCU_INIT_POINTER(macsec->secy.tx_sc.sa[i], NULL); clear_tx_sa(sa); } } } static void macsec_common_dellink(struct net_device *dev, struct list_head *head) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(netdev_priv(dev), &ctx); if (ops) { ctx.secy = &macsec->secy; macsec_offload(ops->mdo_del_secy, &ctx); } } unregister_netdevice_queue(dev, head); list_del_rcu(&macsec->secys); macsec_del_dev(macsec); netdev_upper_dev_unlink(real_dev, dev); macsec_generation++; } static void macsec_dellink(struct net_device *dev, struct list_head *head) { struct macsec_dev *macsec = macsec_priv(dev); struct net_device *real_dev = macsec->real_dev; struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); macsec_common_dellink(dev, head); if (list_empty(&rxd->secys)) { netdev_rx_handler_unregister(real_dev); kfree(rxd); } } static int register_macsec_dev(struct net_device *real_dev, struct net_device *dev) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_rxh_data *rxd = macsec_data_rtnl(real_dev); if (!rxd) { int err; rxd = kmalloc(sizeof(*rxd), GFP_KERNEL); if (!rxd) return -ENOMEM; INIT_LIST_HEAD(&rxd->secys); err = netdev_rx_handler_register(real_dev, macsec_handle_frame, rxd); if (err < 0) { kfree(rxd); return err; } } list_add_tail_rcu(&macsec->secys, &rxd->secys); return 0; } static bool sci_exists(struct net_device *dev, sci_t sci) { struct macsec_rxh_data *rxd = macsec_data_rtnl(dev); struct macsec_dev *macsec; list_for_each_entry(macsec, &rxd->secys, secys) { if (macsec->secy.sci == sci) return true; } return false; } static sci_t dev_to_sci(struct net_device *dev, __be16 port) { return make_sci(dev->dev_addr, port); } static int macsec_add_dev(struct net_device *dev, sci_t sci, u8 icv_len) { struct macsec_dev *macsec = macsec_priv(dev); struct macsec_secy *secy = &macsec->secy; macsec->stats = netdev_alloc_pcpu_stats(struct pcpu_secy_stats); if (!macsec->stats) return -ENOMEM; secy->tx_sc.stats = netdev_alloc_pcpu_stats(struct pcpu_tx_sc_stats); if (!secy->tx_sc.stats) return -ENOMEM; secy->tx_sc.md_dst = metadata_dst_alloc(0, METADATA_MACSEC, GFP_KERNEL); if (!secy->tx_sc.md_dst) /* macsec and secy percpu stats will be freed when unregistering * net_device in macsec_free_netdev() */ return -ENOMEM; if (sci == MACSEC_UNDEF_SCI) sci = dev_to_sci(dev, MACSEC_PORT_ES); secy->netdev = dev; secy->operational = true; secy->key_len = DEFAULT_SAK_LEN; secy->icv_len = icv_len; secy->validate_frames = MACSEC_VALIDATE_DEFAULT; secy->protect_frames = true; secy->replay_protect = false; secy->xpn = DEFAULT_XPN; secy->sci = sci; secy->tx_sc.md_dst->u.macsec_info.sci = sci; secy->tx_sc.active = true; secy->tx_sc.encoding_sa = DEFAULT_ENCODING_SA; secy->tx_sc.encrypt = DEFAULT_ENCRYPT; secy->tx_sc.send_sci = DEFAULT_SEND_SCI; secy->tx_sc.end_station = false; secy->tx_sc.scb = false; return 0; } static struct lock_class_key macsec_netdev_addr_lock_key; static int macsec_newlink(struct net_device *dev, struct rtnl_newlink_params *params, struct netlink_ext_ack *extack) { struct net *link_net = rtnl_newlink_link_net(params); struct macsec_dev *macsec = macsec_priv(dev); struct nlattr **data = params->data; struct nlattr **tb = params->tb; rx_handler_func_t *rx_handler; u8 icv_len = MACSEC_DEFAULT_ICV_LEN; struct net_device *real_dev; int err, mtu; sci_t sci; if (!tb[IFLA_LINK]) return -EINVAL; real_dev = __dev_get_by_index(link_net, nla_get_u32(tb[IFLA_LINK])); if (!real_dev) return -ENODEV; if (real_dev->type != ARPHRD_ETHER) return -EINVAL; dev->priv_flags |= IFF_MACSEC; macsec->real_dev = real_dev; if (data && data[IFLA_MACSEC_OFFLOAD]) macsec->offload = nla_get_offload(data[IFLA_MACSEC_OFFLOAD]); else /* MACsec offloading is off by default */ macsec->offload = MACSEC_OFFLOAD_OFF; /* Check if the offloading mode is supported by the underlying layers */ if (macsec->offload != MACSEC_OFFLOAD_OFF && !macsec_check_offload(macsec->offload, macsec)) return -EOPNOTSUPP; /* send_sci must be set to true when transmit sci explicitly is set */ if ((data && data[IFLA_MACSEC_SCI]) && (data && data[IFLA_MACSEC_INC_SCI])) { u8 send_sci = !!nla_get_u8(data[IFLA_MACSEC_INC_SCI]); if (!send_sci) return -EINVAL; } if (data && data[IFLA_MACSEC_ICV_LEN]) icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); mtu = real_dev->mtu - icv_len - macsec_extra_len(true); if (mtu < 0) dev->mtu = 0; else dev->mtu = mtu; rx_handler = rtnl_dereference(real_dev->rx_handler); if (rx_handler && rx_handler != macsec_handle_frame) return -EBUSY; err = register_netdevice(dev); if (err < 0) return err; netdev_lockdep_set_classes(dev); lockdep_set_class(&dev->addr_list_lock, &macsec_netdev_addr_lock_key); err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; /* need to be already registered so that ->init has run and * the MAC addr is set */ if (data && data[IFLA_MACSEC_SCI]) sci = nla_get_sci(data[IFLA_MACSEC_SCI]); else if (data && data[IFLA_MACSEC_PORT]) sci = dev_to_sci(dev, nla_get_be16(data[IFLA_MACSEC_PORT])); else sci = dev_to_sci(dev, MACSEC_PORT_ES); if (rx_handler && sci_exists(real_dev, sci)) { err = -EBUSY; goto unlink; } err = macsec_add_dev(dev, sci, icv_len); if (err) goto unlink; if (data) { err = macsec_changelink_common(dev, data); if (err) goto del_dev; } /* If h/w offloading is available, propagate to the device */ if (macsec_is_offloaded(macsec)) { const struct macsec_ops *ops; struct macsec_context ctx; ops = macsec_get_ops(macsec, &ctx); if (ops) { ctx.secy = &macsec->secy; err = macsec_offload(ops->mdo_add_secy, &ctx); if (err) goto del_dev; macsec->insert_tx_tag = macsec_needs_tx_tag(macsec, ops); } } err = register_macsec_dev(real_dev, dev); if (err < 0) goto del_dev; netif_stacked_transfer_operstate(real_dev, dev); linkwatch_fire_event(dev); macsec_generation++; return 0; del_dev: macsec_del_dev(macsec); unlink: netdev_upper_dev_unlink(real_dev, dev); unregister: unregister_netdevice(dev); return err; } static int macsec_validate_attr(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) { u64 csid = MACSEC_DEFAULT_CIPHER_ID; u8 icv_len = MACSEC_DEFAULT_ICV_LEN; int flag; bool es, scb, sci; if (!data) return 0; if (data[IFLA_MACSEC_CIPHER_SUITE]) csid = nla_get_u64(data[IFLA_MACSEC_CIPHER_SUITE]); if (data[IFLA_MACSEC_ICV_LEN]) { icv_len = nla_get_u8(data[IFLA_MACSEC_ICV_LEN]); if (icv_len != MACSEC_DEFAULT_ICV_LEN) { char dummy_key[DEFAULT_SAK_LEN] = { 0 }; struct crypto_aead *dummy_tfm; dummy_tfm = macsec_alloc_tfm(dummy_key, DEFAULT_SAK_LEN, icv_len); if (IS_ERR(dummy_tfm)) return PTR_ERR(dummy_tfm); crypto_free_aead(dummy_tfm); } } switch (csid) { case MACSEC_CIPHER_ID_GCM_AES_128: case MACSEC_CIPHER_ID_GCM_AES_256: case MACSEC_CIPHER_ID_GCM_AES_XPN_128: case MACSEC_CIPHER_ID_GCM_AES_XPN_256: case MACSEC_DEFAULT_CIPHER_ID: if (icv_len < MACSEC_MIN_ICV_LEN || icv_len > MACSEC_STD_ICV_LEN) return -EINVAL; break; default: return -EINVAL; } if (data[IFLA_MACSEC_ENCODING_SA]) { if (nla_get_u8(data[IFLA_MACSEC_ENCODING_SA]) >= MACSEC_NUM_AN) return -EINVAL; } for (flag = IFLA_MACSEC_ENCODING_SA + 1; flag < IFLA_MACSEC_VALIDATION; flag++) { if (data[flag]) { if (nla_get_u8(data[flag]) > 1) return -EINVAL; } } es = nla_get_u8_default(data[IFLA_MACSEC_ES], false); sci = nla_get_u8_default(data[IFLA_MACSEC_INC_SCI], false); scb = nla_get_u8_default(data[IFLA_MACSEC_SCB], false); if ((sci && (scb || es)) || (scb && es)) return -EINVAL; if (data[IFLA_MACSEC_VALIDATION] && nla_get_u8(data[IFLA_MACSEC_VALIDATION]) > MACSEC_VALIDATE_MAX) return -EINVAL; if ((data[IFLA_MACSEC_REPLAY_PROTECT] && nla_get_u8(data[IFLA_MACSEC_REPLAY_PROTECT])) && !data[IFLA_MACSEC_WINDOW]) return -EINVAL; return 0; } static struct net *macsec_get_link_net(const struct net_device *dev) { return dev_net(macsec_priv(dev)->real_dev); } struct net_device *macsec_get_real_dev(const struct net_device *dev) { return macsec_priv(dev)->real_dev; } EXPORT_SYMBOL_GPL(macsec_get_real_dev); bool macsec_netdev_is_offloaded(struct net_device *dev) { return macsec_is_offloaded(macsec_priv(dev)); } EXPORT_SYMBOL_GPL(macsec_netdev_is_offloaded); static size_t macsec_get_size(const struct net_device *dev) { return nla_total_size_64bit(8) + /* IFLA_MACSEC_SCI */ nla_total_size(1) + /* IFLA_MACSEC_ICV_LEN */ nla_total_size_64bit(8) + /* IFLA_MACSEC_CIPHER_SUITE */ nla_total_size(4) + /* IFLA_MACSEC_WINDOW */ nla_total_size(1) + /* IFLA_MACSEC_ENCODING_SA */ nla_total_size(1) + /* IFLA_MACSEC_ENCRYPT */ nla_total_size(1) + /* IFLA_MACSEC_PROTECT */ nla_total_size(1) + /* IFLA_MACSEC_INC_SCI */ nla_total_size(1) + /* IFLA_MACSEC_ES */ nla_total_size(1) + /* IFLA_MACSEC_SCB */ nla_total_size(1) + /* IFLA_MACSEC_REPLAY_PROTECT */ nla_total_size(1) + /* IFLA_MACSEC_VALIDATION */ nla_total_size(1) + /* IFLA_MACSEC_OFFLOAD */ 0; } static int macsec_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct macsec_tx_sc *tx_sc; struct macsec_dev *macsec; struct macsec_secy *secy; u64 csid; macsec = macsec_priv(dev); secy = &macsec->secy; tx_sc = &secy->tx_sc; switch (secy->key_len) { case MACSEC_GCM_AES_128_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_128 : MACSEC_DEFAULT_CIPHER_ID; break; case MACSEC_GCM_AES_256_SAK_LEN: csid = secy->xpn ? MACSEC_CIPHER_ID_GCM_AES_XPN_256 : MACSEC_CIPHER_ID_GCM_AES_256; break; default: goto nla_put_failure; } if (nla_put_sci(skb, IFLA_MACSEC_SCI, secy->sci, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ICV_LEN, secy->icv_len) || nla_put_u64_64bit(skb, IFLA_MACSEC_CIPHER_SUITE, csid, IFLA_MACSEC_PAD) || nla_put_u8(skb, IFLA_MACSEC_ENCODING_SA, tx_sc->encoding_sa) || nla_put_u8(skb, IFLA_MACSEC_ENCRYPT, tx_sc->encrypt) || nla_put_u8(skb, IFLA_MACSEC_PROTECT, secy->protect_frames) || nla_put_u8(skb, IFLA_MACSEC_INC_SCI, tx_sc->send_sci) || nla_put_u8(skb, IFLA_MACSEC_ES, tx_sc->end_station) || nla_put_u8(skb, IFLA_MACSEC_SCB, tx_sc->scb) || nla_put_u8(skb, IFLA_MACSEC_REPLAY_PROTECT, secy->replay_protect) || nla_put_u8(skb, IFLA_MACSEC_VALIDATION, secy->validate_frames) || nla_put_u8(skb, IFLA_MACSEC_OFFLOAD, macsec->offload) || 0) goto nla_put_failure; if (secy->replay_protect) { if (nla_put_u32(skb, IFLA_MACSEC_WINDOW, secy->replay_window)) goto nla_put_failure; } return 0; nla_put_failure: return -EMSGSIZE; } static struct rtnl_link_ops macsec_link_ops __read_mostly = { .kind = "macsec", .priv_size = sizeof(struct macsec_dev), .maxtype = IFLA_MACSEC_MAX, .policy = macsec_rtnl_policy, .setup = macsec_setup, .validate = macsec_validate_attr, .newlink = macsec_newlink, .changelink = macsec_changelink, .dellink = macsec_dellink, .get_size = macsec_get_size, .fill_info = macsec_fill_info, .get_link_net = macsec_get_link_net, }; static bool is_macsec_master(struct net_device *dev) { return rcu_access_pointer(dev->rx_handler) == macsec_handle_frame; } static int macsec_notify(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *real_dev = netdev_notifier_info_to_dev(ptr); struct macsec_rxh_data *rxd; struct macsec_dev *m, *n; LIST_HEAD(head); if (!is_macsec_master(real_dev)) return NOTIFY_DONE; rxd = macsec_data_rtnl(real_dev); switch (event) { case NETDEV_DOWN: case NETDEV_UP: case NETDEV_CHANGE: list_for_each_entry_safe(m, n, &rxd->secys, secys) { struct net_device *dev = m->secy.netdev; netif_stacked_transfer_operstate(real_dev, dev); } break; case NETDEV_UNREGISTER: list_for_each_entry_safe(m, n, &rxd->secys, secys) { macsec_common_dellink(m->secy.netdev, &head); } netdev_rx_handler_unregister(real_dev); kfree(rxd); unregister_netdevice_many(&head); break; case NETDEV_CHANGEMTU: list_for_each_entry(m, &rxd->secys, secys) { struct net_device *dev = m->secy.netdev; unsigned int mtu = real_dev->mtu - (m->secy.icv_len + macsec_extra_len(true)); if (dev->mtu > mtu) dev_set_mtu(dev, mtu); } break; case NETDEV_FEAT_CHANGE: list_for_each_entry(m, &rxd->secys, secys) { macsec_inherit_tso_max(m->secy.netdev); netdev_update_features(m->secy.netdev); } break; } return NOTIFY_OK; } static struct notifier_block macsec_notifier = { .notifier_call = macsec_notify, }; static int __init macsec_init(void) { int err; pr_info("MACsec IEEE 802.1AE\n"); err = register_netdevice_notifier(&macsec_notifier); if (err) return err; err = rtnl_link_register(&macsec_link_ops); if (err) goto notifier; err = genl_register_family(&macsec_fam); if (err) goto rtnl; return 0; rtnl: rtnl_link_unregister(&macsec_link_ops); notifier: unregister_netdevice_notifier(&macsec_notifier); return err; } static void __exit macsec_exit(void) { genl_unregister_family(&macsec_fam); rtnl_link_unregister(&macsec_link_ops); unregister_netdevice_notifier(&macsec_notifier); rcu_barrier(); } module_init(macsec_init); module_exit(macsec_exit); MODULE_ALIAS_RTNL_LINK("macsec"); MODULE_ALIAS_GENL_FAMILY("macsec"); MODULE_DESCRIPTION("MACsec IEEE 802.1AE"); MODULE_LICENSE("GPL v2"); |
| 11 10 11 11 3 3 3 9 9 9 6 11 11 11 11 1 1 12 8 13 9 12 3 10 11 3 10 3 8 10 12 12 11 2 2 2 2 12 12 3 3 10 10 12 12 12 1 8 2 11 10 6 6 5 6 6 4 5 4 3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 | /* * PCM Plug-In shared (kernel/library) code * Copyright (c) 1999 by Jaroslav Kysela <perex@perex.cz> * Copyright (c) 2000 by Abramo Bagnara <abramo@alsa-project.org> * * * This library is free software; you can redistribute it and/or modify * it under the terms of the GNU Library General Public License as * published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * */ #if 0 #define PLUGIN_DEBUG #endif #include <linux/slab.h> #include <linux/time.h> #include <linux/vmalloc.h> #include <sound/core.h> #include <sound/pcm.h> #include <sound/pcm_params.h> #include "pcm_plugin.h" #define snd_pcm_plug_first(plug) ((plug)->runtime->oss.plugin_first) #define snd_pcm_plug_last(plug) ((plug)->runtime->oss.plugin_last) /* * because some cards might have rates "very close", we ignore * all "resampling" requests within +-5% */ static int rate_match(unsigned int src_rate, unsigned int dst_rate) { unsigned int low = (src_rate * 95) / 100; unsigned int high = (src_rate * 105) / 100; return dst_rate >= low && dst_rate <= high; } static int snd_pcm_plugin_alloc(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames) { struct snd_pcm_plugin_format *format; ssize_t width; size_t size; unsigned int channel; struct snd_pcm_plugin_channel *c; if (plugin->stream == SNDRV_PCM_STREAM_PLAYBACK) { format = &plugin->src_format; } else { format = &plugin->dst_format; } width = snd_pcm_format_physical_width(format->format); if (width < 0) return width; size = array3_size(frames, format->channels, width); /* check for too large period size once again */ if (size > 1024 * 1024) return -ENOMEM; if (snd_BUG_ON(size % 8)) return -ENXIO; size /= 8; if (plugin->buf_frames < frames) { kvfree(plugin->buf); plugin->buf = kvzalloc(size, GFP_KERNEL); plugin->buf_frames = frames; } if (!plugin->buf) { plugin->buf_frames = 0; return -ENOMEM; } c = plugin->buf_channels; if (plugin->access == SNDRV_PCM_ACCESS_RW_INTERLEAVED) { for (channel = 0; channel < format->channels; channel++, c++) { c->frames = frames; c->enabled = 1; c->wanted = 0; c->area.addr = plugin->buf; c->area.first = channel * width; c->area.step = format->channels * width; } } else if (plugin->access == SNDRV_PCM_ACCESS_RW_NONINTERLEAVED) { if (snd_BUG_ON(size % format->channels)) return -EINVAL; size /= format->channels; for (channel = 0; channel < format->channels; channel++, c++) { c->frames = frames; c->enabled = 1; c->wanted = 0; c->area.addr = plugin->buf + (channel * size); c->area.first = 0; c->area.step = width; } } else return -EINVAL; return 0; } int snd_pcm_plug_alloc(struct snd_pcm_substream *plug, snd_pcm_uframes_t frames) { int err; if (snd_BUG_ON(!snd_pcm_plug_first(plug))) return -ENXIO; if (snd_pcm_plug_stream(plug) == SNDRV_PCM_STREAM_PLAYBACK) { struct snd_pcm_plugin *plugin = snd_pcm_plug_first(plug); while (plugin->next) { if (plugin->dst_frames) frames = plugin->dst_frames(plugin, frames); if ((snd_pcm_sframes_t)frames <= 0) return -ENXIO; plugin = plugin->next; err = snd_pcm_plugin_alloc(plugin, frames); if (err < 0) return err; } } else { struct snd_pcm_plugin *plugin = snd_pcm_plug_last(plug); while (plugin->prev) { if (plugin->src_frames) frames = plugin->src_frames(plugin, frames); if ((snd_pcm_sframes_t)frames <= 0) return -ENXIO; plugin = plugin->prev; err = snd_pcm_plugin_alloc(plugin, frames); if (err < 0) return err; } } return 0; } snd_pcm_sframes_t snd_pcm_plugin_client_channels(struct snd_pcm_plugin *plugin, snd_pcm_uframes_t frames, struct snd_pcm_plugin_channel **channels) { *channels = plugin->buf_channels; return frames; } int snd_pcm_plugin_build(struct snd_pcm_substream *plug, const char *name, struct snd_pcm_plugin_format *src_format, struct snd_pcm_plugin_format *dst_format, size_t extra, struct snd_pcm_plugin **ret) { struct snd_pcm_plugin *plugin; unsigned int channels; if (snd_BUG_ON(!plug)) return -ENXIO; if (snd_BUG_ON(!src_format || !dst_format)) return -ENXIO; plugin = kzalloc(sizeof(*plugin) + extra, GFP_KERNEL); if (plugin == NULL) return -ENOMEM; plugin->name = name; plugin->plug = plug; plugin->stream = snd_pcm_plug_stream(plug); plugin->access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; plugin->src_format = *src_format; plugin->src_width = snd_pcm_format_physical_width(src_format->format); snd_BUG_ON(plugin->src_width <= 0); plugin->dst_format = *dst_format; plugin->dst_width = snd_pcm_format_physical_width(dst_format->format); snd_BUG_ON(plugin->dst_width <= 0); if (plugin->stream == SNDRV_PCM_STREAM_PLAYBACK) channels = src_format->channels; else channels = dst_format->channels; plugin->buf_channels = kcalloc(channels, sizeof(*plugin->buf_channels), GFP_KERNEL); if (plugin->buf_channels == NULL) { snd_pcm_plugin_free(plugin); return -ENOMEM; } plugin->client_channels = snd_pcm_plugin_client_channels; *ret = plugin; return 0; } int snd_pcm_plugin_free(struct snd_pcm_plugin *plugin) { if (! plugin) return 0; if (plugin->private_free) plugin->private_free(plugin); kfree(plugin->buf_channels); kvfree(plugin->buf); kfree(plugin); return 0; } static snd_pcm_sframes_t calc_dst_frames(struct snd_pcm_substream *plug, snd_pcm_sframes_t frames, bool check_size) { struct snd_pcm_plugin *plugin, *plugin_next; plugin = snd_pcm_plug_first(plug); while (plugin && frames > 0) { plugin_next = plugin->next; if (check_size && plugin->buf_frames && frames > plugin->buf_frames) frames = plugin->buf_frames; if (plugin->dst_frames) { frames = plugin->dst_frames(plugin, frames); if (frames < 0) return frames; } plugin = plugin_next; } return frames; } static snd_pcm_sframes_t calc_src_frames(struct snd_pcm_substream *plug, snd_pcm_sframes_t frames, bool check_size) { struct snd_pcm_plugin *plugin, *plugin_prev; plugin = snd_pcm_plug_last(plug); while (plugin && frames > 0) { plugin_prev = plugin->prev; if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames); if (frames < 0) return frames; } if (check_size && plugin->buf_frames && frames > plugin->buf_frames) frames = plugin->buf_frames; plugin = plugin_prev; } return frames; } snd_pcm_sframes_t snd_pcm_plug_client_size(struct snd_pcm_substream *plug, snd_pcm_uframes_t drv_frames) { if (snd_BUG_ON(!plug)) return -ENXIO; switch (snd_pcm_plug_stream(plug)) { case SNDRV_PCM_STREAM_PLAYBACK: return calc_src_frames(plug, drv_frames, false); case SNDRV_PCM_STREAM_CAPTURE: return calc_dst_frames(plug, drv_frames, false); default: snd_BUG(); return -EINVAL; } } snd_pcm_sframes_t snd_pcm_plug_slave_size(struct snd_pcm_substream *plug, snd_pcm_uframes_t clt_frames) { if (snd_BUG_ON(!plug)) return -ENXIO; switch (snd_pcm_plug_stream(plug)) { case SNDRV_PCM_STREAM_PLAYBACK: return calc_dst_frames(plug, clt_frames, false); case SNDRV_PCM_STREAM_CAPTURE: return calc_src_frames(plug, clt_frames, false); default: snd_BUG(); return -EINVAL; } } static int snd_pcm_plug_formats(const struct snd_mask *mask, snd_pcm_format_t format) { struct snd_mask formats = *mask; u64 linfmts = (SNDRV_PCM_FMTBIT_U8 | SNDRV_PCM_FMTBIT_S8 | SNDRV_PCM_FMTBIT_U16_LE | SNDRV_PCM_FMTBIT_S16_LE | SNDRV_PCM_FMTBIT_U16_BE | SNDRV_PCM_FMTBIT_S16_BE | SNDRV_PCM_FMTBIT_U24_LE | SNDRV_PCM_FMTBIT_S24_LE | SNDRV_PCM_FMTBIT_U24_BE | SNDRV_PCM_FMTBIT_S24_BE | SNDRV_PCM_FMTBIT_U24_3LE | SNDRV_PCM_FMTBIT_S24_3LE | SNDRV_PCM_FMTBIT_U24_3BE | SNDRV_PCM_FMTBIT_S24_3BE | SNDRV_PCM_FMTBIT_U32_LE | SNDRV_PCM_FMTBIT_S32_LE | SNDRV_PCM_FMTBIT_U32_BE | SNDRV_PCM_FMTBIT_S32_BE); snd_mask_set(&formats, (__force int)SNDRV_PCM_FORMAT_MU_LAW); if (formats.bits[0] & lower_32_bits(linfmts)) formats.bits[0] |= lower_32_bits(linfmts); if (formats.bits[1] & upper_32_bits(linfmts)) formats.bits[1] |= upper_32_bits(linfmts); return snd_mask_test(&formats, (__force int)format); } static const snd_pcm_format_t preferred_formats[] = { SNDRV_PCM_FORMAT_S16_LE, SNDRV_PCM_FORMAT_S16_BE, SNDRV_PCM_FORMAT_U16_LE, SNDRV_PCM_FORMAT_U16_BE, SNDRV_PCM_FORMAT_S24_3LE, SNDRV_PCM_FORMAT_S24_3BE, SNDRV_PCM_FORMAT_U24_3LE, SNDRV_PCM_FORMAT_U24_3BE, SNDRV_PCM_FORMAT_S24_LE, SNDRV_PCM_FORMAT_S24_BE, SNDRV_PCM_FORMAT_U24_LE, SNDRV_PCM_FORMAT_U24_BE, SNDRV_PCM_FORMAT_S32_LE, SNDRV_PCM_FORMAT_S32_BE, SNDRV_PCM_FORMAT_U32_LE, SNDRV_PCM_FORMAT_U32_BE, SNDRV_PCM_FORMAT_S8, SNDRV_PCM_FORMAT_U8 }; snd_pcm_format_t snd_pcm_plug_slave_format(snd_pcm_format_t format, const struct snd_mask *format_mask) { int i; if (snd_mask_test(format_mask, (__force int)format)) return format; if (!snd_pcm_plug_formats(format_mask, format)) return (__force snd_pcm_format_t)-EINVAL; if (snd_pcm_format_linear(format)) { unsigned int width = snd_pcm_format_width(format); int unsignd = snd_pcm_format_unsigned(format) > 0; int big = snd_pcm_format_big_endian(format) > 0; unsigned int badness, best = -1; snd_pcm_format_t best_format = (__force snd_pcm_format_t)-1; for (i = 0; i < ARRAY_SIZE(preferred_formats); i++) { snd_pcm_format_t f = preferred_formats[i]; unsigned int w; if (!snd_mask_test(format_mask, (__force int)f)) continue; w = snd_pcm_format_width(f); if (w >= width) badness = w - width; else badness = width - w + 32; badness += snd_pcm_format_unsigned(f) != unsignd; badness += snd_pcm_format_big_endian(f) != big; if (badness < best) { best_format = f; best = badness; } } if ((__force int)best_format >= 0) return best_format; else return (__force snd_pcm_format_t)-EINVAL; } else { switch (format) { case SNDRV_PCM_FORMAT_MU_LAW: for (i = 0; i < ARRAY_SIZE(preferred_formats); ++i) { snd_pcm_format_t format1 = preferred_formats[i]; if (snd_mask_test(format_mask, (__force int)format1)) return format1; } fallthrough; default: return (__force snd_pcm_format_t)-EINVAL; } } } int snd_pcm_plug_format_plugins(struct snd_pcm_substream *plug, struct snd_pcm_hw_params *params, struct snd_pcm_hw_params *slave_params) { struct snd_pcm_plugin_format tmpformat; struct snd_pcm_plugin_format dstformat; struct snd_pcm_plugin_format srcformat; snd_pcm_access_t src_access, dst_access; struct snd_pcm_plugin *plugin = NULL; int err; int stream = snd_pcm_plug_stream(plug); int slave_interleaved = (params_channels(slave_params) == 1 || params_access(slave_params) == SNDRV_PCM_ACCESS_RW_INTERLEAVED); switch (stream) { case SNDRV_PCM_STREAM_PLAYBACK: dstformat.format = params_format(slave_params); dstformat.rate = params_rate(slave_params); dstformat.channels = params_channels(slave_params); srcformat.format = params_format(params); srcformat.rate = params_rate(params); srcformat.channels = params_channels(params); src_access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; dst_access = (slave_interleaved ? SNDRV_PCM_ACCESS_RW_INTERLEAVED : SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); break; case SNDRV_PCM_STREAM_CAPTURE: dstformat.format = params_format(params); dstformat.rate = params_rate(params); dstformat.channels = params_channels(params); srcformat.format = params_format(slave_params); srcformat.rate = params_rate(slave_params); srcformat.channels = params_channels(slave_params); src_access = (slave_interleaved ? SNDRV_PCM_ACCESS_RW_INTERLEAVED : SNDRV_PCM_ACCESS_RW_NONINTERLEAVED); dst_access = SNDRV_PCM_ACCESS_RW_INTERLEAVED; break; default: snd_BUG(); return -EINVAL; } tmpformat = srcformat; pdprintf("srcformat: format=%i, rate=%i, channels=%i\n", srcformat.format, srcformat.rate, srcformat.channels); pdprintf("dstformat: format=%i, rate=%i, channels=%i\n", dstformat.format, dstformat.rate, dstformat.channels); /* Format change (linearization) */ if (! rate_match(srcformat.rate, dstformat.rate) && ! snd_pcm_format_linear(srcformat.format)) { if (srcformat.format != SNDRV_PCM_FORMAT_MU_LAW) return -EINVAL; tmpformat.format = SNDRV_PCM_FORMAT_S16; err = snd_pcm_plugin_build_mulaw(plug, &srcformat, &tmpformat, &plugin); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* channels reduction */ if (srcformat.channels > dstformat.channels) { tmpformat.channels = dstformat.channels; err = snd_pcm_plugin_build_route(plug, &srcformat, &tmpformat, &plugin); pdprintf("channels reduction: src=%i, dst=%i returns %i\n", srcformat.channels, tmpformat.channels, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* rate resampling */ if (!rate_match(srcformat.rate, dstformat.rate)) { if (srcformat.format != SNDRV_PCM_FORMAT_S16) { /* convert to S16 for resampling */ tmpformat.format = SNDRV_PCM_FORMAT_S16; err = snd_pcm_plugin_build_linear(plug, &srcformat, &tmpformat, &plugin); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } tmpformat.rate = dstformat.rate; err = snd_pcm_plugin_build_rate(plug, &srcformat, &tmpformat, &plugin); pdprintf("rate down resampling: src=%i, dst=%i returns %i\n", srcformat.rate, tmpformat.rate, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* format change */ if (srcformat.format != dstformat.format) { tmpformat.format = dstformat.format; if (srcformat.format == SNDRV_PCM_FORMAT_MU_LAW || tmpformat.format == SNDRV_PCM_FORMAT_MU_LAW) { err = snd_pcm_plugin_build_mulaw(plug, &srcformat, &tmpformat, &plugin); } else if (snd_pcm_format_linear(srcformat.format) && snd_pcm_format_linear(tmpformat.format)) { err = snd_pcm_plugin_build_linear(plug, &srcformat, &tmpformat, &plugin); } else return -EINVAL; pdprintf("format change: src=%i, dst=%i returns %i\n", srcformat.format, tmpformat.format, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* channels extension */ if (srcformat.channels < dstformat.channels) { tmpformat.channels = dstformat.channels; err = snd_pcm_plugin_build_route(plug, &srcformat, &tmpformat, &plugin); pdprintf("channels extension: src=%i, dst=%i returns %i\n", srcformat.channels, tmpformat.channels, err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } srcformat = tmpformat; src_access = dst_access; } /* de-interleave */ if (src_access != dst_access) { err = snd_pcm_plugin_build_copy(plug, &srcformat, &tmpformat, &plugin); pdprintf("interleave change (copy: returns %i)\n", err); if (err < 0) return err; err = snd_pcm_plugin_append(plugin); if (err < 0) { snd_pcm_plugin_free(plugin); return err; } } return 0; } snd_pcm_sframes_t snd_pcm_plug_client_channels_buf(struct snd_pcm_substream *plug, char *buf, snd_pcm_uframes_t count, struct snd_pcm_plugin_channel **channels) { struct snd_pcm_plugin *plugin; struct snd_pcm_plugin_channel *v; struct snd_pcm_plugin_format *format; int width, nchannels, channel; int stream = snd_pcm_plug_stream(plug); if (snd_BUG_ON(!buf)) return -ENXIO; if (stream == SNDRV_PCM_STREAM_PLAYBACK) { plugin = snd_pcm_plug_first(plug); format = &plugin->src_format; } else { plugin = snd_pcm_plug_last(plug); format = &plugin->dst_format; } v = plugin->buf_channels; *channels = v; width = snd_pcm_format_physical_width(format->format); if (width < 0) return width; nchannels = format->channels; if (snd_BUG_ON(plugin->access != SNDRV_PCM_ACCESS_RW_INTERLEAVED && format->channels > 1)) return -ENXIO; for (channel = 0; channel < nchannels; channel++, v++) { v->frames = count; v->enabled = 1; v->wanted = (stream == SNDRV_PCM_STREAM_CAPTURE); v->area.addr = buf; v->area.first = channel * width; v->area.step = nchannels * width; } return count; } snd_pcm_sframes_t snd_pcm_plug_write_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *src_channels, snd_pcm_uframes_t size) { struct snd_pcm_plugin *plugin, *next; struct snd_pcm_plugin_channel *dst_channels; int err; snd_pcm_sframes_t frames = size; plugin = snd_pcm_plug_first(plug); while (plugin) { if (frames <= 0) return frames; next = plugin->next; if (next) { snd_pcm_sframes_t frames1 = frames; if (plugin->dst_frames) { frames1 = plugin->dst_frames(plugin, frames); if (frames1 <= 0) return frames1; } err = next->client_channels(next, frames1, &dst_channels); if (err < 0) return err; if (err != frames1) { frames = err; if (plugin->src_frames) { frames = plugin->src_frames(plugin, frames1); if (frames <= 0) return frames; } } } else dst_channels = NULL; pdprintf("write plugin: %s, %li\n", plugin->name, frames); frames = plugin->transfer(plugin, src_channels, dst_channels, frames); if (frames < 0) return frames; src_channels = dst_channels; plugin = next; } return calc_src_frames(plug, frames, true); } snd_pcm_sframes_t snd_pcm_plug_read_transfer(struct snd_pcm_substream *plug, struct snd_pcm_plugin_channel *dst_channels_final, snd_pcm_uframes_t size) { struct snd_pcm_plugin *plugin, *next; struct snd_pcm_plugin_channel *src_channels, *dst_channels; snd_pcm_sframes_t frames = size; int err; frames = calc_src_frames(plug, frames, true); if (frames < 0) return frames; src_channels = NULL; plugin = snd_pcm_plug_first(plug); while (plugin && frames > 0) { next = plugin->next; if (next) { err = plugin->client_channels(plugin, frames, &dst_channels); if (err < 0) return err; frames = err; } else { dst_channels = dst_channels_final; } pdprintf("read plugin: %s, %li\n", plugin->name, frames); frames = plugin->transfer(plugin, src_channels, dst_channels, frames); if (frames < 0) return frames; plugin = next; src_channels = dst_channels; } return frames; } int snd_pcm_area_silence(const struct snd_pcm_channel_area *dst_area, size_t dst_offset, size_t samples, snd_pcm_format_t format) { /* FIXME: sub byte resolution and odd dst_offset */ unsigned char *dst; unsigned int dst_step; int width; const unsigned char *silence; if (!dst_area->addr) return 0; dst = dst_area->addr + (dst_area->first + dst_area->step * dst_offset) / 8; width = snd_pcm_format_physical_width(format); if (width <= 0) return -EINVAL; if (dst_area->step == (unsigned int) width && width >= 8) return snd_pcm_format_set_silence(format, dst, samples); silence = snd_pcm_format_silence_64(format); if (! silence) return -EINVAL; dst_step = dst_area->step / 8; if (width == 4) { /* Ima ADPCM */ int dstbit = dst_area->first % 8; int dstbit_step = dst_area->step % 8; while (samples-- > 0) { if (dstbit) *dst &= 0xf0; else *dst &= 0x0f; dst += dst_step; dstbit += dstbit_step; if (dstbit == 8) { dst++; dstbit = 0; } } } else { width /= 8; while (samples-- > 0) { memcpy(dst, silence, width); dst += dst_step; } } return 0; } int snd_pcm_area_copy(const struct snd_pcm_channel_area *src_area, size_t src_offset, const struct snd_pcm_channel_area *dst_area, size_t dst_offset, size_t samples, snd_pcm_format_t format) { /* FIXME: sub byte resolution and odd dst_offset */ char *src, *dst; int width; int src_step, dst_step; src = src_area->addr + (src_area->first + src_area->step * src_offset) / 8; if (!src_area->addr) return snd_pcm_area_silence(dst_area, dst_offset, samples, format); dst = dst_area->addr + (dst_area->first + dst_area->step * dst_offset) / 8; if (!dst_area->addr) return 0; width = snd_pcm_format_physical_width(format); if (width <= 0) return -EINVAL; if (src_area->step == (unsigned int) width && dst_area->step == (unsigned int) width && width >= 8) { size_t bytes = samples * width / 8; memcpy(dst, src, bytes); return 0; } src_step = src_area->step / 8; dst_step = dst_area->step / 8; if (width == 4) { /* Ima ADPCM */ int srcbit = src_area->first % 8; int srcbit_step = src_area->step % 8; int dstbit = dst_area->first % 8; int dstbit_step = dst_area->step % 8; while (samples-- > 0) { unsigned char srcval; if (srcbit) srcval = *src & 0x0f; else srcval = (*src & 0xf0) >> 4; if (dstbit) *dst = (*dst & 0xf0) | srcval; else *dst = (*dst & 0x0f) | (srcval << 4); src += src_step; srcbit += srcbit_step; if (srcbit == 8) { src++; srcbit = 0; } dst += dst_step; dstbit += dstbit_step; if (dstbit == 8) { dst++; dstbit = 0; } } } else { width /= 8; while (samples-- > 0) { memcpy(dst, src, width); src += src_step; dst += dst_step; } } return 0; } |
| 1 1 7 1 3 1 1 1 1 1 11 1 1 1 1 7 1 2 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2020-2024 Oracle. All Rights Reserved. * Author: Darrick J. Wong <djwong@kernel.org> */ #include "xfs.h" #include "xfs_shared.h" #include "xfs_format.h" #include "xfs_log_format.h" #include "xfs_trans_resv.h" #include "xfs_mount.h" #include "xfs_defer.h" #include "xfs_inode.h" #include "xfs_trans.h" #include "xfs_quota.h" #include "xfs_bmap_util.h" #include "xfs_reflink.h" #include "xfs_trace.h" #include "xfs_exchrange.h" #include "xfs_exchmaps.h" #include "xfs_sb.h" #include "xfs_icache.h" #include "xfs_log.h" #include "xfs_rtbitmap.h" #include <linux/fsnotify.h> /* Lock (and optionally join) two inodes for a file range exchange. */ void xfs_exchrange_ilock( struct xfs_trans *tp, struct xfs_inode *ip1, struct xfs_inode *ip2) { if (ip1 != ip2) xfs_lock_two_inodes(ip1, XFS_ILOCK_EXCL, ip2, XFS_ILOCK_EXCL); else xfs_ilock(ip1, XFS_ILOCK_EXCL); if (tp) { xfs_trans_ijoin(tp, ip1, 0); if (ip2 != ip1) xfs_trans_ijoin(tp, ip2, 0); } } /* Unlock two inodes after a file range exchange operation. */ void xfs_exchrange_iunlock( struct xfs_inode *ip1, struct xfs_inode *ip2) { if (ip2 != ip1) xfs_iunlock(ip2, XFS_ILOCK_EXCL); xfs_iunlock(ip1, XFS_ILOCK_EXCL); } /* * Estimate the resource requirements to exchange file contents between the two * files. The caller is required to hold the IOLOCK and the MMAPLOCK and to * have flushed both inodes' pagecache and active direct-ios. */ int xfs_exchrange_estimate( struct xfs_exchmaps_req *req) { int error; xfs_exchrange_ilock(NULL, req->ip1, req->ip2); error = xfs_exchmaps_estimate(req); xfs_exchrange_iunlock(req->ip1, req->ip2); return error; } /* * Check that file2's metadata agree with the snapshot that we took for the * range commit request. * * This should be called after the filesystem has locked /all/ inode metadata * against modification. */ STATIC int xfs_exchrange_check_freshness( const struct xfs_exchrange *fxr, struct xfs_inode *ip2) { struct inode *inode2 = VFS_I(ip2); struct timespec64 ctime = inode_get_ctime(inode2); struct timespec64 mtime = inode_get_mtime(inode2); trace_xfs_exchrange_freshness(fxr, ip2); /* Check that file2 hasn't otherwise been modified. */ if (fxr->file2_ino != ip2->i_ino || fxr->file2_gen != inode2->i_generation || !timespec64_equal(&fxr->file2_ctime, &ctime) || !timespec64_equal(&fxr->file2_mtime, &mtime)) return -EBUSY; return 0; } #define QRETRY_IP1 (0x1) #define QRETRY_IP2 (0x2) /* * Obtain a quota reservation to make sure we don't hit EDQUOT. We can skip * this if quota enforcement is disabled or if both inodes' dquots are the * same. The qretry structure must be initialized to zeroes before the first * call to this function. */ STATIC int xfs_exchrange_reserve_quota( struct xfs_trans *tp, const struct xfs_exchmaps_req *req, unsigned int *qretry) { int64_t ddelta, rdelta; int ip1_error = 0; int error; ASSERT(!xfs_is_metadir_inode(req->ip1)); ASSERT(!xfs_is_metadir_inode(req->ip2)); /* * Don't bother with a quota reservation if we're not enforcing them * or the two inodes have the same dquots. */ if (!XFS_IS_QUOTA_ON(tp->t_mountp) || req->ip1 == req->ip2 || (req->ip1->i_udquot == req->ip2->i_udquot && req->ip1->i_gdquot == req->ip2->i_gdquot && req->ip1->i_pdquot == req->ip2->i_pdquot)) return 0; *qretry = 0; /* * For each file, compute the net gain in the number of regular blocks * that will be mapped into that file and reserve that much quota. The * quota counts must be able to absorb at least that much space. */ ddelta = req->ip2_bcount - req->ip1_bcount; rdelta = req->ip2_rtbcount - req->ip1_rtbcount; if (ddelta > 0 || rdelta > 0) { error = xfs_trans_reserve_quota_nblks(tp, req->ip1, ddelta > 0 ? ddelta : 0, rdelta > 0 ? rdelta : 0, false); if (error == -EDQUOT || error == -ENOSPC) { /* * Save this error and see what happens if we try to * reserve quota for ip2. Then report both. */ *qretry |= QRETRY_IP1; ip1_error = error; error = 0; } if (error) return error; } if (ddelta < 0 || rdelta < 0) { error = xfs_trans_reserve_quota_nblks(tp, req->ip2, ddelta < 0 ? -ddelta : 0, rdelta < 0 ? -rdelta : 0, false); if (error == -EDQUOT || error == -ENOSPC) *qretry |= QRETRY_IP2; if (error) return error; } if (ip1_error) return ip1_error; /* * For each file, forcibly reserve the gross gain in mapped blocks so * that we don't trip over any quota block reservation assertions. * We must reserve the gross gain because the quota code subtracts from * bcount the number of blocks that we unmap; it does not add that * quantity back to the quota block reservation. */ error = xfs_trans_reserve_quota_nblks(tp, req->ip1, req->ip1_bcount, req->ip1_rtbcount, true); if (error) return error; return xfs_trans_reserve_quota_nblks(tp, req->ip2, req->ip2_bcount, req->ip2_rtbcount, true); } /* Exchange the mappings (and hence the contents) of two files' forks. */ STATIC int xfs_exchrange_mappings( const struct xfs_exchrange *fxr, struct xfs_inode *ip1, struct xfs_inode *ip2) { struct xfs_mount *mp = ip1->i_mount; struct xfs_exchmaps_req req = { .ip1 = ip1, .ip2 = ip2, .startoff1 = XFS_B_TO_FSBT(mp, fxr->file1_offset), .startoff2 = XFS_B_TO_FSBT(mp, fxr->file2_offset), .blockcount = XFS_B_TO_FSB(mp, fxr->length), }; struct xfs_trans *tp; unsigned int qretry; bool retried = false; int error; trace_xfs_exchrange_mappings(fxr, ip1, ip2); if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) req.flags |= XFS_EXCHMAPS_SET_SIZES; if (fxr->flags & XFS_EXCHANGE_RANGE_FILE1_WRITTEN) req.flags |= XFS_EXCHMAPS_INO1_WRITTEN; /* * Round the request length up to the nearest file allocation unit. * The prep function already checked that the request offsets and * length in @fxr are safe to round up. */ if (xfs_inode_has_bigrtalloc(ip2)) req.blockcount = xfs_blen_roundup_rtx(mp, req.blockcount); error = xfs_exchrange_estimate(&req); if (error) return error; retry: /* Allocate the transaction, lock the inodes, and join them. */ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, req.resblks, 0, XFS_TRANS_RES_FDBLKS, &tp); if (error) return error; xfs_exchrange_ilock(tp, ip1, ip2); trace_xfs_exchrange_before(ip2, 2); trace_xfs_exchrange_before(ip1, 1); error = xfs_exchmaps_check_forks(mp, &req); if (error) goto out_trans_cancel; /* * Reserve ourselves some quota if any of them are in enforcing mode. * In theory we only need enough to satisfy the change in the number * of blocks between the two ranges being remapped. */ error = xfs_exchrange_reserve_quota(tp, &req, &qretry); if ((error == -EDQUOT || error == -ENOSPC) && !retried) { xfs_trans_cancel(tp); xfs_exchrange_iunlock(ip1, ip2); if (qretry & QRETRY_IP1) xfs_blockgc_free_quota(ip1, 0); if (qretry & QRETRY_IP2) xfs_blockgc_free_quota(ip2, 0); retried = true; goto retry; } if (error) goto out_trans_cancel; /* If we got this far on a dry run, all parameters are ok. */ if (fxr->flags & XFS_EXCHANGE_RANGE_DRY_RUN) goto out_trans_cancel; /* Update the mtime and ctime of both files. */ if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME1) xfs_trans_ichgtime(tp, ip1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); if (fxr->flags & __XFS_EXCHANGE_RANGE_UPD_CMTIME2) xfs_trans_ichgtime(tp, ip2, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_exchange_mappings(tp, &req); /* * Force the log to persist metadata updates if the caller or the * administrator requires this. The generic prep function already * flushed the relevant parts of the page cache. */ if (xfs_has_wsync(mp) || (fxr->flags & XFS_EXCHANGE_RANGE_DSYNC)) xfs_trans_set_sync(tp); error = xfs_trans_commit(tp); trace_xfs_exchrange_after(ip2, 2); trace_xfs_exchrange_after(ip1, 1); if (error) goto out_unlock; /* * If the caller wanted us to exchange the contents of two complete * files of unequal length, exchange the incore sizes now. This should * be safe because we flushed both files' page caches, exchanged all * the mappings, and updated the ondisk sizes. */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { loff_t temp; temp = i_size_read(VFS_I(ip2)); i_size_write(VFS_I(ip2), i_size_read(VFS_I(ip1))); i_size_write(VFS_I(ip1), temp); } out_unlock: xfs_exchrange_iunlock(ip1, ip2); return error; out_trans_cancel: xfs_trans_cancel(tp); goto out_unlock; } /* * Generic code for exchanging ranges of two files via XFS_IOC_EXCHANGE_RANGE. * This part deals with struct file objects and byte ranges and does not deal * with XFS-specific data structures such as xfs_inodes and block ranges. This * separation may some day facilitate porting to another filesystem. * * The goal is to exchange fxr.length bytes starting at fxr.file1_offset in * file1 with the same number of bytes starting at fxr.file2_offset in file2. * Implementations must call xfs_exchange_range_prep to prepare the two * files prior to taking locks; and they must update the inode change and mod * times of both files as part of the metadata update. The timestamp update * and freshness checks must be done atomically as part of the data exchange * operation to ensure correctness of the freshness check. * xfs_exchange_range_finish must be called after the operation completes * successfully but before locks are dropped. */ /* * Performs necessary checks before doing a range exchange, having stabilized * mutable inode attributes via i_rwsem. */ static inline int xfs_exchange_range_checks( struct xfs_exchrange *fxr, unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); loff_t size1 = i_size_read(inode1); struct inode *inode2 = file_inode(fxr->file2); loff_t size2 = i_size_read(inode2); uint64_t allocmask = alloc_unit - 1; int64_t test_len; uint64_t blen; loff_t tmp; int error; /* Don't touch certain kinds of inodes */ if (IS_IMMUTABLE(inode1) || IS_IMMUTABLE(inode2)) return -EPERM; if (IS_SWAPFILE(inode1) || IS_SWAPFILE(inode2)) return -ETXTBSY; /* Ranges cannot start after EOF. */ if (fxr->file1_offset > size1 || fxr->file2_offset > size2) return -EINVAL; if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) { /* * If the caller said to exchange to EOF, we set the length of * the request large enough to cover everything to the end of * both files. */ fxr->length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); } else { /* * Otherwise we require both ranges to end within EOF. */ if (fxr->file1_offset + fxr->length > size1 || fxr->file2_offset + fxr->length > size2) return -EINVAL; } /* * The start of both ranges must be aligned to the file allocation * unit. */ if (!IS_ALIGNED(fxr->file1_offset, alloc_unit) || !IS_ALIGNED(fxr->file2_offset, alloc_unit)) return -EINVAL; /* Ensure offsets don't wrap. */ if (check_add_overflow(fxr->file1_offset, fxr->length, &tmp) || check_add_overflow(fxr->file2_offset, fxr->length, &tmp)) return -EINVAL; /* * Make sure we don't hit any file size limits. If we hit any size * limits such that test_length was adjusted, we abort the whole * operation. */ test_len = fxr->length; error = generic_write_check_limits(fxr->file2, fxr->file2_offset, &test_len); if (error) return error; error = generic_write_check_limits(fxr->file1, fxr->file1_offset, &test_len); if (error) return error; if (test_len != fxr->length) return -EINVAL; /* * If the user wanted us to exchange up to the infile's EOF, round up * to the next allocation unit boundary for this check. Do the same * for the outfile. * * Otherwise, reject the range length if it's not aligned to an * allocation unit. */ if (fxr->file1_offset + fxr->length == size1) blen = ALIGN(size1, alloc_unit) - fxr->file1_offset; else if (fxr->file2_offset + fxr->length == size2) blen = ALIGN(size2, alloc_unit) - fxr->file2_offset; else if (!IS_ALIGNED(fxr->length, alloc_unit)) return -EINVAL; else blen = fxr->length; /* Don't allow overlapped exchanges within the same file. */ if (inode1 == inode2 && fxr->file2_offset + blen > fxr->file1_offset && fxr->file1_offset + blen > fxr->file2_offset) return -EINVAL; /* * Ensure that we don't exchange a partial EOF block into the middle of * another file. */ if ((fxr->length & allocmask) == 0) return 0; blen = fxr->length; if (fxr->file2_offset + blen < size2) blen &= ~allocmask; if (fxr->file1_offset + blen < size1) blen &= ~allocmask; return blen == fxr->length ? 0 : -EINVAL; } /* * Check that the two inodes are eligible for range exchanges, the ranges make * sense, and then flush all dirty data. Caller must ensure that the inodes * have been locked against any other modifications. */ static inline int xfs_exchange_range_prep( struct xfs_exchrange *fxr, unsigned int alloc_unit) { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); bool same_inode = (inode1 == inode2); int error; /* Check that we don't violate system file offset limits. */ error = xfs_exchange_range_checks(fxr, alloc_unit); if (error || fxr->length == 0) return error; /* Wait for the completion of any pending IOs on both files */ inode_dio_wait(inode1); if (!same_inode) inode_dio_wait(inode2); error = filemap_write_and_wait_range(inode1->i_mapping, fxr->file1_offset, fxr->file1_offset + fxr->length - 1); if (error) return error; error = filemap_write_and_wait_range(inode2->i_mapping, fxr->file2_offset, fxr->file2_offset + fxr->length - 1); if (error) return error; /* * If the files or inodes involved require synchronous writes, amend * the request to force the filesystem to flush all data and metadata * to disk after the operation completes. */ if (((fxr->file1->f_flags | fxr->file2->f_flags) & O_SYNC) || IS_SYNC(inode1) || IS_SYNC(inode2)) fxr->flags |= XFS_EXCHANGE_RANGE_DSYNC; return 0; } /* * Finish a range exchange operation, if it was successful. Caller must ensure * that the inodes are still locked against any other modifications. */ static inline int xfs_exchange_range_finish( struct xfs_exchrange *fxr) { int error; error = file_remove_privs(fxr->file1); if (error) return error; if (file_inode(fxr->file1) == file_inode(fxr->file2)) return 0; return file_remove_privs(fxr->file2); } /* * Check the alignment of an exchange request when the allocation unit size * isn't a power of two. The generic file-level helpers use (fast) * bitmask-based alignment checks, but here we have to use slow long division. */ static int xfs_exchrange_check_rtalign( const struct xfs_exchrange *fxr, struct xfs_inode *ip1, struct xfs_inode *ip2, unsigned int alloc_unit) { uint64_t length = fxr->length; uint64_t blen; loff_t size1, size2; size1 = i_size_read(VFS_I(ip1)); size2 = i_size_read(VFS_I(ip2)); /* The start of both ranges must be aligned to a rt extent. */ if (!isaligned_64(fxr->file1_offset, alloc_unit) || !isaligned_64(fxr->file2_offset, alloc_unit)) return -EINVAL; if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) length = max_t(int64_t, size1 - fxr->file1_offset, size2 - fxr->file2_offset); /* * If the user wanted us to exchange up to the infile's EOF, round up * to the next rt extent boundary for this check. Do the same for the * outfile. * * Otherwise, reject the range length if it's not rt extent aligned. * We already confirmed the starting offsets' rt extent block * alignment. */ if (fxr->file1_offset + length == size1) blen = roundup_64(size1, alloc_unit) - fxr->file1_offset; else if (fxr->file2_offset + length == size2) blen = roundup_64(size2, alloc_unit) - fxr->file2_offset; else if (!isaligned_64(length, alloc_unit)) return -EINVAL; else blen = length; /* Don't allow overlapped exchanges within the same file. */ if (ip1 == ip2 && fxr->file2_offset + blen > fxr->file1_offset && fxr->file1_offset + blen > fxr->file2_offset) return -EINVAL; /* * Ensure that we don't exchange a partial EOF rt extent into the * middle of another file. */ if (isaligned_64(length, alloc_unit)) return 0; blen = length; if (fxr->file2_offset + length < size2) blen = rounddown_64(blen, alloc_unit); if (fxr->file1_offset + blen < size1) blen = rounddown_64(blen, alloc_unit); return blen == length ? 0 : -EINVAL; } /* Prepare two files to have their data exchanged. */ STATIC int xfs_exchrange_prep( struct xfs_exchrange *fxr, struct xfs_inode *ip1, struct xfs_inode *ip2) { struct xfs_mount *mp = ip2->i_mount; unsigned int alloc_unit = xfs_inode_alloc_unitsize(ip2); int error; trace_xfs_exchrange_prep(fxr, ip1, ip2); /* Verify both files are either real-time or non-realtime */ if (XFS_IS_REALTIME_INODE(ip1) != XFS_IS_REALTIME_INODE(ip2)) return -EINVAL; /* Check non-power of two alignment issues, if necessary. */ if (!is_power_of_2(alloc_unit)) { error = xfs_exchrange_check_rtalign(fxr, ip1, ip2, alloc_unit); if (error) return error; /* * Do the generic file-level checks with the regular block * alignment. */ alloc_unit = mp->m_sb.sb_blocksize; } error = xfs_exchange_range_prep(fxr, alloc_unit); if (error || fxr->length == 0) return error; if (fxr->flags & __XFS_EXCHANGE_RANGE_CHECK_FRESH2) { error = xfs_exchrange_check_freshness(fxr, ip2); if (error) return error; } /* Attach dquots to both inodes before changing block maps. */ error = xfs_qm_dqattach(ip2); if (error) return error; error = xfs_qm_dqattach(ip1); if (error) return error; trace_xfs_exchrange_flush(fxr, ip1, ip2); /* Flush the relevant ranges of both files. */ error = xfs_flush_unmap_range(ip2, fxr->file2_offset, fxr->length); if (error) return error; error = xfs_flush_unmap_range(ip1, fxr->file1_offset, fxr->length); if (error) return error; /* * Cancel CoW fork preallocations for the ranges of both files. The * prep function should have flushed all the dirty data, so the only * CoW mappings remaining should be speculative. */ if (xfs_inode_has_cow_data(ip1)) { error = xfs_reflink_cancel_cow_range(ip1, fxr->file1_offset, fxr->length, true); if (error) return error; } if (xfs_inode_has_cow_data(ip2)) { error = xfs_reflink_cancel_cow_range(ip2, fxr->file2_offset, fxr->length, true); if (error) return error; } return 0; } /* * Exchange contents of files. This is the binding between the generic * file-level concepts and the XFS inode-specific implementation. */ STATIC int xfs_exchrange_contents( struct xfs_exchrange *fxr) { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); struct xfs_inode *ip1 = XFS_I(inode1); struct xfs_inode *ip2 = XFS_I(inode2); struct xfs_mount *mp = ip1->i_mount; int error; if (!xfs_has_exchange_range(mp)) return -EOPNOTSUPP; if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | XFS_EXCHANGE_RANGE_PRIV_FLAGS)) return -EINVAL; if (xfs_is_shutdown(mp)) return -EIO; /* Lock both files against IO */ error = xfs_ilock2_io_mmap(ip1, ip2); if (error) goto out_err; /* Prepare and then exchange file contents. */ error = xfs_exchrange_prep(fxr, ip1, ip2); if (error) goto out_unlock; error = xfs_exchrange_mappings(fxr, ip1, ip2); if (error) goto out_unlock; /* * Finish the exchange by removing special file privileges like any * other file write would do. This may involve turning on support for * logged xattrs if either file has security capabilities. */ error = xfs_exchange_range_finish(fxr); if (error) goto out_unlock; out_unlock: xfs_iunlock2_io_mmap(ip1, ip2); out_err: if (error) trace_xfs_exchrange_error(ip2, error, _RET_IP_); return error; } /* Exchange parts of two files. */ static int xfs_exchange_range( struct xfs_exchrange *fxr) { struct inode *inode1 = file_inode(fxr->file1); struct inode *inode2 = file_inode(fxr->file2); loff_t check_len = fxr->length; int ret; BUILD_BUG_ON(XFS_EXCHANGE_RANGE_ALL_FLAGS & XFS_EXCHANGE_RANGE_PRIV_FLAGS); /* Both files must be on the same mount/filesystem. */ if (fxr->file1->f_path.mnt != fxr->file2->f_path.mnt) return -EXDEV; if (fxr->flags & ~(XFS_EXCHANGE_RANGE_ALL_FLAGS | __XFS_EXCHANGE_RANGE_CHECK_FRESH2)) return -EINVAL; /* Userspace requests only honored for regular files. */ if (S_ISDIR(inode1->i_mode) || S_ISDIR(inode2->i_mode)) return -EISDIR; if (!S_ISREG(inode1->i_mode) || !S_ISREG(inode2->i_mode)) return -EINVAL; /* Both files must be opened for read and write. */ if (!(fxr->file1->f_mode & FMODE_READ) || !(fxr->file1->f_mode & FMODE_WRITE) || !(fxr->file2->f_mode & FMODE_READ) || !(fxr->file2->f_mode & FMODE_WRITE)) return -EBADF; /* Neither file can be opened append-only. */ if ((fxr->file1->f_flags & O_APPEND) || (fxr->file2->f_flags & O_APPEND)) return -EBADF; /* * If we're exchanging to EOF we can't calculate the length until taking * the iolock. Pass a 0 length to remap_verify_area similar to the * FICLONE and FICLONERANGE ioctls that support cloning to EOF as well. */ if (fxr->flags & XFS_EXCHANGE_RANGE_TO_EOF) check_len = 0; ret = remap_verify_area(fxr->file1, fxr->file1_offset, check_len, true); if (ret) return ret; ret = remap_verify_area(fxr->file2, fxr->file2_offset, check_len, true); if (ret) return ret; /* Update cmtime if the fd/inode don't forbid it. */ if (!(fxr->file1->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode1)) fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME1; if (!(fxr->file2->f_mode & FMODE_NOCMTIME) && !IS_NOCMTIME(inode2)) fxr->flags |= __XFS_EXCHANGE_RANGE_UPD_CMTIME2; file_start_write(fxr->file2); ret = xfs_exchrange_contents(fxr); file_end_write(fxr->file2); if (ret) return ret; fsnotify_modify(fxr->file1); if (fxr->file2 != fxr->file1) fsnotify_modify(fxr->file2); return 0; } /* Collect exchange-range arguments from userspace. */ long xfs_ioc_exchange_range( struct file *file, struct xfs_exchange_range __user *argp) { struct xfs_exchrange fxr = { .file2 = file, }; struct xfs_exchange_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; if (memchr_inv(&args.pad, 0, sizeof(args.pad))) return -EINVAL; if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) return -EINVAL; fxr.file1_offset = args.file1_offset; fxr.file2_offset = args.file2_offset; fxr.length = args.length; fxr.flags = args.flags; CLASS(fd, file1)(args.file1_fd); if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); return xfs_exchange_range(&fxr); } /* Opaque freshness blob for XFS_IOC_COMMIT_RANGE */ struct xfs_commit_range_fresh { xfs_fsid_t fsid; /* m_fixedfsid */ __u64 file2_ino; /* inode number */ __s64 file2_mtime; /* modification time */ __s64 file2_ctime; /* change time */ __s32 file2_mtime_nsec; /* mod time, nsec */ __s32 file2_ctime_nsec; /* change time, nsec */ __u32 file2_gen; /* inode generation */ __u32 magic; /* zero */ }; #define XCR_FRESH_MAGIC 0x444F524B /* DORK */ /* Set up a commitrange operation by sampling file2's write-related attrs */ long xfs_ioc_start_commit( struct file *file, struct xfs_commit_range __user *argp) { struct xfs_commit_range args = { }; struct kstat kstat = { }; struct xfs_commit_range_fresh *kern_f; struct xfs_commit_range_fresh __user *user_f; struct inode *inode2 = file_inode(file); struct xfs_inode *ip2 = XFS_I(inode2); const unsigned int lockflags = XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED; BUILD_BUG_ON(sizeof(struct xfs_commit_range_fresh) != sizeof(args.file2_freshness)); kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; memcpy(&kern_f->fsid, ip2->i_mount->m_fixedfsid, sizeof(xfs_fsid_t)); xfs_ilock(ip2, lockflags); /* Force writing of a distinct ctime if any writes happen. */ fill_mg_cmtime(&kstat, STATX_CTIME | STATX_MTIME, inode2); kern_f->file2_ctime = kstat.ctime.tv_sec; kern_f->file2_ctime_nsec = kstat.ctime.tv_nsec; kern_f->file2_mtime = kstat.mtime.tv_sec; kern_f->file2_mtime_nsec = kstat.mtime.tv_nsec; kern_f->file2_ino = ip2->i_ino; kern_f->file2_gen = inode2->i_generation; kern_f->magic = XCR_FRESH_MAGIC; xfs_iunlock(ip2, lockflags); user_f = (struct xfs_commit_range_fresh __user *)&argp->file2_freshness; if (copy_to_user(user_f, kern_f, sizeof(*kern_f))) return -EFAULT; return 0; } /* * Exchange file1 and file2 contents if file2 has not been written since the * start commit operation. */ long xfs_ioc_commit_range( struct file *file, struct xfs_commit_range __user *argp) { struct xfs_exchrange fxr = { .file2 = file, }; struct xfs_commit_range args; struct xfs_commit_range_fresh *kern_f; struct xfs_inode *ip2 = XFS_I(file_inode(file)); struct xfs_mount *mp = ip2->i_mount; kern_f = (struct xfs_commit_range_fresh *)&args.file2_freshness; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; if (args.flags & ~XFS_EXCHANGE_RANGE_ALL_FLAGS) return -EINVAL; if (kern_f->magic != XCR_FRESH_MAGIC) return -EBUSY; if (memcmp(&kern_f->fsid, mp->m_fixedfsid, sizeof(xfs_fsid_t))) return -EBUSY; fxr.file1_offset = args.file1_offset; fxr.file2_offset = args.file2_offset; fxr.length = args.length; fxr.flags = args.flags | __XFS_EXCHANGE_RANGE_CHECK_FRESH2; fxr.file2_ino = kern_f->file2_ino; fxr.file2_gen = kern_f->file2_gen; fxr.file2_mtime.tv_sec = kern_f->file2_mtime; fxr.file2_mtime.tv_nsec = kern_f->file2_mtime_nsec; fxr.file2_ctime.tv_sec = kern_f->file2_ctime; fxr.file2_ctime.tv_nsec = kern_f->file2_ctime_nsec; CLASS(fd, file1)(args.file1_fd); if (fd_empty(file1)) return -EBADF; fxr.file1 = fd_file(file1); return xfs_exchange_range(&fxr); } |
| 35 36 26 6 1 1 33 54 54 6 6 6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2007 Jens Axboe <jens.axboe@oracle.com> * * Scatterlist handling helpers. */ #include <linux/export.h> #include <linux/slab.h> #include <linux/scatterlist.h> #include <linux/highmem.h> #include <linux/kmemleak.h> #include <linux/bvec.h> #include <linux/uio.h> #include <linux/folio_queue.h> /** * sg_next - return the next scatterlist entry in a list * @sg: The current sg entry * * Description: * Usually the next entry will be @sg@ + 1, but if this sg element is part * of a chained scatterlist, it could jump to the start of a new * scatterlist array. * **/ struct scatterlist *sg_next(struct scatterlist *sg) { if (sg_is_last(sg)) return NULL; sg++; if (unlikely(sg_is_chain(sg))) sg = sg_chain_ptr(sg); return sg; } EXPORT_SYMBOL(sg_next); /** * sg_nents - return total count of entries in scatterlist * @sg: The scatterlist * * Description: * Allows to know how many entries are in sg, taking into account * chaining as well * **/ int sg_nents(struct scatterlist *sg) { int nents; for (nents = 0; sg; sg = sg_next(sg)) nents++; return nents; } EXPORT_SYMBOL(sg_nents); /** * sg_nents_for_len - return total count of entries in scatterlist * needed to satisfy the supplied length * @sg: The scatterlist * @len: The total required length * * Description: * Determines the number of entries in sg that are required to meet * the supplied length, taking into account chaining as well * * Returns: * the number of sg entries needed, negative error on failure * **/ int sg_nents_for_len(struct scatterlist *sg, u64 len) { int nents; u64 total; if (!len) return 0; for (nents = 0, total = 0; sg; sg = sg_next(sg)) { nents++; total += sg->length; if (total >= len) return nents; } return -EINVAL; } EXPORT_SYMBOL(sg_nents_for_len); /** * sg_last - return the last scatterlist entry in a list * @sgl: First entry in the scatterlist * @nents: Number of entries in the scatterlist * * Description: * Should only be used casually, it (currently) scans the entire list * to get the last entry. * * Note that the @sgl@ pointer passed in need not be the first one, * the important bit is that @nents@ denotes the number of entries that * exist from @sgl@. * **/ struct scatterlist *sg_last(struct scatterlist *sgl, unsigned int nents) { struct scatterlist *sg, *ret = NULL; unsigned int i; for_each_sg(sgl, sg, nents, i) ret = sg; BUG_ON(!sg_is_last(ret)); return ret; } EXPORT_SYMBOL(sg_last); /** * sg_init_table - Initialize SG table * @sgl: The SG table * @nents: Number of entries in table * * Notes: * If this is part of a chained sg table, sg_mark_end() should be * used only on the last table part. * **/ void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); sg_init_marker(sgl, nents); } EXPORT_SYMBOL(sg_init_table); /** * sg_init_one - Initialize a single entry sg list * @sg: SG entry * @buf: Virtual address for IO * @buflen: IO length * **/ void sg_init_one(struct scatterlist *sg, const void *buf, unsigned int buflen) { sg_init_table(sg, 1); sg_set_buf(sg, buf, buflen); } EXPORT_SYMBOL(sg_init_one); /* * The default behaviour of sg_alloc_table() is to use these kmalloc/kfree * helpers. */ static struct scatterlist *sg_kmalloc(unsigned int nents, gfp_t gfp_mask) { if (nents == SG_MAX_SINGLE_ALLOC) { /* * Kmemleak doesn't track page allocations as they are not * commonly used (in a raw form) for kernel data structures. * As we chain together a list of pages and then a normal * kmalloc (tracked by kmemleak), in order to for that last * allocation not to become decoupled (and thus a * false-positive) we need to inform kmemleak of all the * intermediate allocations. */ void *ptr = (void *) __get_free_page(gfp_mask); kmemleak_alloc(ptr, PAGE_SIZE, 1, gfp_mask); return ptr; } else return kmalloc_array(nents, sizeof(struct scatterlist), gfp_mask); } static void sg_kfree(struct scatterlist *sg, unsigned int nents) { if (nents == SG_MAX_SINGLE_ALLOC) { kmemleak_free(sg); free_page((unsigned long) sg); } else kfree(sg); } /** * __sg_free_table - Free a previously mapped sg table * @table: The sg table header to use * @max_ents: The maximum number of entries per single scatterlist * @nents_first_chunk: Number of entries int the (preallocated) first * scatterlist chunk, 0 means no such preallocated first chunk * @free_fn: Free function * @num_ents: Number of entries in the table * * Description: * Free an sg table previously allocated and setup with * __sg_alloc_table(). The @max_ents value must be identical to * that previously used with __sg_alloc_table(). * **/ void __sg_free_table(struct sg_table *table, unsigned int max_ents, unsigned int nents_first_chunk, sg_free_fn *free_fn, unsigned int num_ents) { struct scatterlist *sgl, *next; unsigned curr_max_ents = nents_first_chunk ?: max_ents; if (unlikely(!table->sgl)) return; sgl = table->sgl; while (num_ents) { unsigned int alloc_size = num_ents; unsigned int sg_size; /* * If we have more than max_ents segments left, * then assign 'next' to the sg table after the current one. * sg_size is then one less than alloc size, since the last * element is the chain pointer. */ if (alloc_size > curr_max_ents) { next = sg_chain_ptr(&sgl[curr_max_ents - 1]); alloc_size = curr_max_ents; sg_size = alloc_size - 1; } else { sg_size = alloc_size; next = NULL; } num_ents -= sg_size; if (nents_first_chunk) nents_first_chunk = 0; else free_fn(sgl, alloc_size); sgl = next; curr_max_ents = max_ents; } table->sgl = NULL; } EXPORT_SYMBOL(__sg_free_table); /** * sg_free_append_table - Free a previously allocated append sg table. * @table: The mapped sg append table header * **/ void sg_free_append_table(struct sg_append_table *table) { __sg_free_table(&table->sgt, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->total_nents); } EXPORT_SYMBOL(sg_free_append_table); /** * sg_free_table - Free a previously allocated sg table * @table: The mapped sg table header * **/ void sg_free_table(struct sg_table *table) { __sg_free_table(table, SG_MAX_SINGLE_ALLOC, 0, sg_kfree, table->orig_nents); } EXPORT_SYMBOL(sg_free_table); /** * __sg_alloc_table - Allocate and initialize an sg table with given allocator * @table: The sg table header to use * @nents: Number of entries in sg list * @max_ents: The maximum number of entries the allocator returns per call * @first_chunk: first SGL if preallocated (may be %NULL) * @nents_first_chunk: Number of entries in the (preallocated) first * scatterlist chunk, 0 means no such preallocated chunk provided by user * @gfp_mask: GFP allocation mask * @alloc_fn: Allocator to use * * Description: * This function returns a @table @nents long. The allocator is * defined to return scatterlist chunks of maximum size @max_ents. * Thus if @nents is bigger than @max_ents, the scatterlists will be * chained in units of @max_ents. * * Notes: * If this function returns non-0 (eg failure), the caller must call * __sg_free_table() to cleanup any leftover allocations. * **/ int __sg_alloc_table(struct sg_table *table, unsigned int nents, unsigned int max_ents, struct scatterlist *first_chunk, unsigned int nents_first_chunk, gfp_t gfp_mask, sg_alloc_fn *alloc_fn) { struct scatterlist *sg, *prv; unsigned int left; unsigned curr_max_ents = nents_first_chunk ?: max_ents; unsigned prv_max_ents; memset(table, 0, sizeof(*table)); if (nents == 0) return -EINVAL; #ifdef CONFIG_ARCH_NO_SG_CHAIN if (WARN_ON_ONCE(nents > max_ents)) return -EINVAL; #endif left = nents; prv = NULL; do { unsigned int sg_size, alloc_size = left; if (alloc_size > curr_max_ents) { alloc_size = curr_max_ents; sg_size = alloc_size - 1; } else sg_size = alloc_size; left -= sg_size; if (first_chunk) { sg = first_chunk; first_chunk = NULL; } else { sg = alloc_fn(alloc_size, gfp_mask); } if (unlikely(!sg)) { /* * Adjust entry count to reflect that the last * entry of the previous table won't be used for * linkage. Without this, sg_kfree() may get * confused. */ if (prv) table->nents = ++table->orig_nents; return -ENOMEM; } sg_init_table(sg, alloc_size); table->nents = table->orig_nents += sg_size; /* * If this is the first mapping, assign the sg table header. * If this is not the first mapping, chain previous part. */ if (prv) sg_chain(prv, prv_max_ents, sg); else table->sgl = sg; /* * If no more entries after this one, mark the end */ if (!left) sg_mark_end(&sg[sg_size - 1]); prv = sg; prv_max_ents = curr_max_ents; curr_max_ents = max_ents; } while (left); return 0; } EXPORT_SYMBOL(__sg_alloc_table); /** * sg_alloc_table - Allocate and initialize an sg table * @table: The sg table header to use * @nents: Number of entries in sg list * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table. If @nents@ is larger than * SG_MAX_SINGLE_ALLOC a chained sg table will be setup. * **/ int sg_alloc_table(struct sg_table *table, unsigned int nents, gfp_t gfp_mask) { int ret; ret = __sg_alloc_table(table, nents, SG_MAX_SINGLE_ALLOC, NULL, 0, gfp_mask, sg_kmalloc); if (unlikely(ret)) sg_free_table(table); return ret; } EXPORT_SYMBOL(sg_alloc_table); static struct scatterlist *get_next_sg(struct sg_append_table *table, struct scatterlist *cur, unsigned long needed_sges, gfp_t gfp_mask) { struct scatterlist *new_sg, *next_sg; unsigned int alloc_size; if (cur) { next_sg = sg_next(cur); /* Check if last entry should be keeped for chainning */ if (!sg_is_last(next_sg) || needed_sges == 1) return next_sg; } alloc_size = min_t(unsigned long, needed_sges, SG_MAX_SINGLE_ALLOC); new_sg = sg_kmalloc(alloc_size, gfp_mask); if (!new_sg) return ERR_PTR(-ENOMEM); sg_init_table(new_sg, alloc_size); if (cur) { table->total_nents += alloc_size - 1; __sg_chain(next_sg, new_sg); } else { table->sgt.sgl = new_sg; table->total_nents = alloc_size; } return new_sg; } static bool pages_are_mergeable(struct page *a, struct page *b) { if (page_to_pfn(a) != page_to_pfn(b) + 1) return false; if (!zone_device_pages_have_same_pgmap(a, b)) return false; return true; } /** * sg_alloc_append_table_from_pages - Allocate and initialize an append sg * table from an array of pages * @sgt_append: The sg append table to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @max_segment: Maximum size of a scatterlist element in bytes * @left_pages: Left pages caller have to set after this call * @gfp_mask: GFP allocation mask * * Description: * In the first call it allocate and initialize an sg table from a list of * pages, else reuse the scatterlist from sgt_append. Contiguous ranges of * the pages are squashed into a single scatterlist entry up to the maximum * size specified in @max_segment. A user may provide an offset at a start * and a size of valid data in a buffer specified by the page array. The * returned sg table is released by sg_free_append_table * * Returns: * 0 on success, negative error on failure * * Notes: * If this function returns non-0 (eg failure), the caller must call * sg_free_append_table() to cleanup any leftover allocations. * * In the fist call, sgt_append must by initialized. */ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, unsigned int left_pages, gfp_t gfp_mask) { unsigned int chunks, cur_page, seg_len, i, prv_len = 0; unsigned int added_nents = 0; struct scatterlist *s = sgt_append->prv; struct page *last_pg; /* * The algorithm below requires max_segment to be aligned to PAGE_SIZE * otherwise it can overshoot. */ max_segment = ALIGN_DOWN(max_segment, PAGE_SIZE); if (WARN_ON(max_segment < PAGE_SIZE)) return -EINVAL; if (IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN) && sgt_append->prv) return -EOPNOTSUPP; if (sgt_append->prv) { unsigned long next_pfn; if (WARN_ON(offset)) return -EINVAL; /* Merge contiguous pages into the last SG */ prv_len = sgt_append->prv->length; next_pfn = (sg_phys(sgt_append->prv) + prv_len) / PAGE_SIZE; if (page_to_pfn(pages[0]) == next_pfn) { last_pg = pfn_to_page(next_pfn - 1); while (n_pages && pages_are_mergeable(pages[0], last_pg)) { if (sgt_append->prv->length + PAGE_SIZE > max_segment) break; sgt_append->prv->length += PAGE_SIZE; last_pg = pages[0]; pages++; n_pages--; } if (!n_pages) goto out; } } /* compute number of contiguous chunks */ chunks = 1; seg_len = 0; for (i = 1; i < n_pages; i++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || !pages_are_mergeable(pages[i], pages[i - 1])) { chunks++; seg_len = 0; } } /* merging chunks and putting them into the scatterlist */ cur_page = 0; for (i = 0; i < chunks; i++) { unsigned int j, chunk_size; /* look for the end of the current chunk */ seg_len = 0; for (j = cur_page + 1; j < n_pages; j++) { seg_len += PAGE_SIZE; if (seg_len >= max_segment || !pages_are_mergeable(pages[j], pages[j - 1])) break; } /* Pass how many chunks might be left */ s = get_next_sg(sgt_append, s, chunks - i + left_pages, gfp_mask); if (IS_ERR(s)) { /* * Adjust entry length to be as before function was * called. */ if (sgt_append->prv) sgt_append->prv->length = prv_len; return PTR_ERR(s); } chunk_size = ((j - cur_page) << PAGE_SHIFT) - offset; sg_set_page(s, pages[cur_page], min_t(unsigned long, size, chunk_size), offset); added_nents++; size -= chunk_size; offset = 0; cur_page = j; } sgt_append->sgt.nents += added_nents; sgt_append->sgt.orig_nents = sgt_append->sgt.nents; sgt_append->prv = s; out: if (!left_pages) sg_mark_end(s); return 0; } EXPORT_SYMBOL(sg_alloc_append_table_from_pages); /** * sg_alloc_table_from_pages_segment - Allocate and initialize an sg table from * an array of pages and given maximum * segment. * @sgt: The sg table header to use * @pages: Pointer to an array of page pointers * @n_pages: Number of pages in the pages array * @offset: Offset from start of the first page to the start of a buffer * @size: Number of valid bytes in the buffer (after offset) * @max_segment: Maximum size of a scatterlist element in bytes * @gfp_mask: GFP allocation mask * * Description: * Allocate and initialize an sg table from a list of pages. Contiguous * ranges of the pages are squashed into a single scatterlist node up to the * maximum size specified in @max_segment. A user may provide an offset at a * start and a size of valid data in a buffer specified by the page array. * * The returned sg table is released by sg_free_table. * * Returns: * 0 on success, negative error on failure */ int sg_alloc_table_from_pages_segment(struct sg_table *sgt, struct page **pages, unsigned int n_pages, unsigned int offset, unsigned long size, unsigned int max_segment, gfp_t gfp_mask) { struct sg_append_table append = {}; int err; err = sg_alloc_append_table_from_pages(&append, pages, n_pages, offset, size, max_segment, 0, gfp_mask); if (err) { sg_free_append_table(&append); return err; } memcpy(sgt, &append.sgt, sizeof(*sgt)); WARN_ON(append.total_nents != sgt->orig_nents); return 0; } EXPORT_SYMBOL(sg_alloc_table_from_pages_segment); #ifdef CONFIG_SGL_ALLOC /** * sgl_alloc_order - allocate a scatterlist and its pages * @length: Length in bytes of the scatterlist. Must be at least one * @order: Second argument for alloc_pages() * @chainable: Whether or not to allocate an extra element in the scatterlist * for scatterlist chaining purposes * @gfp: Memory allocation flags * @nent_p: [out] Number of entries in the scatterlist that have pages * * Returns: A pointer to an initialized scatterlist or %NULL upon failure. */ struct scatterlist *sgl_alloc_order(unsigned long long length, unsigned int order, bool chainable, gfp_t gfp, unsigned int *nent_p) { struct scatterlist *sgl, *sg; struct page *page; unsigned int nent, nalloc; u32 elem_len; nent = round_up(length, PAGE_SIZE << order) >> (PAGE_SHIFT + order); /* Check for integer overflow */ if (length > (nent << (PAGE_SHIFT + order))) return NULL; nalloc = nent; if (chainable) { /* Check for integer overflow */ if (nalloc + 1 < nalloc) return NULL; nalloc++; } sgl = kmalloc_array(nalloc, sizeof(struct scatterlist), gfp & ~GFP_DMA); if (!sgl) return NULL; sg_init_table(sgl, nalloc); sg = sgl; while (length) { elem_len = min_t(u64, length, PAGE_SIZE << order); page = alloc_pages(gfp, order); if (!page) { sgl_free_order(sgl, order); return NULL; } sg_set_page(sg, page, elem_len, 0); length -= elem_len; sg = sg_next(sg); } WARN_ONCE(length, "length = %lld\n", length); if (nent_p) *nent_p = nent; return sgl; } EXPORT_SYMBOL(sgl_alloc_order); /** * sgl_alloc - allocate a scatterlist and its pages * @length: Length in bytes of the scatterlist * @gfp: Memory allocation flags * @nent_p: [out] Number of entries in the scatterlist * * Returns: A pointer to an initialized scatterlist or %NULL upon failure. */ struct scatterlist *sgl_alloc(unsigned long long length, gfp_t gfp, unsigned int *nent_p) { return sgl_alloc_order(length, 0, false, gfp, nent_p); } EXPORT_SYMBOL(sgl_alloc); /** * sgl_free_n_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements * @nents: Maximum number of elements to free * @order: Second argument for __free_pages() * * Notes: * - If several scatterlists have been chained and each chain element is * freed separately then it's essential to set nents correctly to avoid that a * page would get freed twice. * - All pages in a chained scatterlist can be freed at once by setting @nents * to a high number. */ void sgl_free_n_order(struct scatterlist *sgl, int nents, int order) { struct scatterlist *sg; struct page *page; int i; for_each_sg(sgl, sg, nents, i) { if (!sg) break; page = sg_page(sg); if (page) __free_pages(page, order); } kfree(sgl); } EXPORT_SYMBOL(sgl_free_n_order); /** * sgl_free_order - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements * @order: Second argument for __free_pages() */ void sgl_free_order(struct scatterlist *sgl, int order) { sgl_free_n_order(sgl, INT_MAX, order); } EXPORT_SYMBOL(sgl_free_order); /** * sgl_free - free a scatterlist and its pages * @sgl: Scatterlist with one or more elements */ void sgl_free(struct scatterlist *sgl) { sgl_free_order(sgl, 0); } EXPORT_SYMBOL(sgl_free); #endif /* CONFIG_SGL_ALLOC */ void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset) { piter->__pg_advance = 0; piter->__nents = nents; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } EXPORT_SYMBOL(__sg_page_iter_start); static int sg_page_count(struct scatterlist *sg) { return PAGE_ALIGN(sg->offset + sg->length) >> PAGE_SHIFT; } bool __sg_page_iter_next(struct sg_page_iter *piter) { if (!piter->__nents || !piter->sg) return false; piter->sg_pgoffset += piter->__pg_advance; piter->__pg_advance = 1; while (piter->sg_pgoffset >= sg_page_count(piter->sg)) { piter->sg_pgoffset -= sg_page_count(piter->sg); piter->sg = sg_next(piter->sg); if (!--piter->__nents || !piter->sg) return false; } return true; } EXPORT_SYMBOL(__sg_page_iter_next); static int sg_dma_page_count(struct scatterlist *sg) { return PAGE_ALIGN(sg->offset + sg_dma_len(sg)) >> PAGE_SHIFT; } bool __sg_page_iter_dma_next(struct sg_dma_page_iter *dma_iter) { struct sg_page_iter *piter = &dma_iter->base; if (!piter->__nents || !piter->sg) return false; piter->sg_pgoffset += piter->__pg_advance; piter->__pg_advance = 1; while (piter->sg_pgoffset >= sg_dma_page_count(piter->sg)) { piter->sg_pgoffset -= sg_dma_page_count(piter->sg); piter->sg = sg_next(piter->sg); if (!--piter->__nents || !piter->sg) return false; } return true; } EXPORT_SYMBOL(__sg_page_iter_dma_next); /** * sg_miter_start - start mapping iteration over a sg list * @miter: sg mapping iter to be started * @sgl: sg list to iterate over * @nents: number of sg entries * @flags: sg iterator flags * * Description: * Starts mapping iterator @miter. * * Context: * Don't care. */ void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, unsigned int nents, unsigned int flags) { memset(miter, 0, sizeof(struct sg_mapping_iter)); __sg_page_iter_start(&miter->piter, sgl, nents, 0); WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG))); miter->__flags = flags; } EXPORT_SYMBOL(sg_miter_start); static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) { if (!miter->__remaining) { struct scatterlist *sg; if (!__sg_page_iter_next(&miter->piter)) return false; sg = miter->piter.sg; miter->__offset = miter->piter.sg_pgoffset ? 0 : sg->offset; miter->piter.sg_pgoffset += miter->__offset >> PAGE_SHIFT; miter->__offset &= PAGE_SIZE - 1; miter->__remaining = sg->offset + sg->length - (miter->piter.sg_pgoffset << PAGE_SHIFT) - miter->__offset; miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } return true; } /** * sg_miter_skip - reposition mapping iterator * @miter: sg mapping iter to be skipped * @offset: number of bytes to plus the current location * * Description: * Sets the offset of @miter to its current location plus @offset bytes. * If mapping iterator @miter has been proceeded by sg_miter_next(), this * stops @miter. * * Context: * Don't care. * * Returns: * true if @miter contains the valid mapping. false if end of sg * list is reached. */ bool sg_miter_skip(struct sg_mapping_iter *miter, off_t offset) { sg_miter_stop(miter); while (offset) { off_t consumed; if (!sg_miter_get_next_page(miter)) return false; consumed = min_t(off_t, offset, miter->__remaining); miter->__offset += consumed; miter->__remaining -= consumed; offset -= consumed; } return true; } EXPORT_SYMBOL(sg_miter_skip); /** * sg_miter_next - proceed mapping iterator to the next mapping * @miter: sg mapping iter to proceed * * Description: * Proceeds @miter to the next mapping. @miter should have been started * using sg_miter_start(). On successful return, @miter->page, * @miter->addr and @miter->length point to the current mapping. * * Context: * May sleep if !SG_MITER_ATOMIC && !SG_MITER_LOCAL. * * Returns: * true if @miter contains the next mapping. false if end of sg * list is reached. */ bool sg_miter_next(struct sg_mapping_iter *miter) { sg_miter_stop(miter); /* * Get to the next page if necessary. * __remaining, __offset is adjusted by sg_miter_stop */ if (!sg_miter_get_next_page(miter)) return false; miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) miter->addr = kmap_atomic(miter->page) + miter->__offset; else if (miter->__flags & SG_MITER_LOCAL) miter->addr = kmap_local_page(miter->page) + miter->__offset; else miter->addr = kmap(miter->page) + miter->__offset; return true; } EXPORT_SYMBOL(sg_miter_next); /** * sg_miter_stop - stop mapping iteration * @miter: sg mapping iter to be stopped * * Description: * Stops mapping iterator @miter. @miter should have been started * using sg_miter_start(). A stopped iteration can be resumed by * calling sg_miter_next() on it. This is useful when resources (kmap) * need to be released during iteration. * * Context: * Don't care otherwise. */ void sg_miter_stop(struct sg_mapping_iter *miter) { WARN_ON(miter->consumed > miter->length); /* drop resources from the last iteration */ if (miter->addr) { miter->__offset += miter->consumed; miter->__remaining -= miter->consumed; if (miter->__flags & SG_MITER_TO_SG) flush_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else if (miter->__flags & SG_MITER_LOCAL) kunmap_local(miter->addr); else kunmap(miter->page); miter->page = NULL; miter->addr = NULL; miter->length = 0; miter->consumed = 0; } } EXPORT_SYMBOL(sg_miter_stop); /** * sg_copy_buffer - Copy data between a linear buffer and an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * @to_buffer: transfer direction (true == from an sg list to a * buffer, false == from a buffer to an sg list) * * Returns the number of copied bytes. * **/ size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip, bool to_buffer) { unsigned int offset = 0; struct sg_mapping_iter miter; unsigned int sg_flags = SG_MITER_LOCAL; if (to_buffer) sg_flags |= SG_MITER_FROM_SG; else sg_flags |= SG_MITER_TO_SG; sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) return 0; while ((offset < buflen) && sg_miter_next(&miter)) { unsigned int len; len = min(miter.length, buflen - offset); if (to_buffer) memcpy(buf + offset, miter.addr, len); else memcpy(miter.addr, buf + offset, len); offset += len; } sg_miter_stop(&miter); return offset; } EXPORT_SYMBOL(sg_copy_buffer); /** * sg_copy_from_buffer - Copy from a linear buffer to an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * * Returns the number of copied bytes. * **/ size_t sg_copy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen) { return sg_copy_buffer(sgl, nents, (void *)buf, buflen, 0, false); } EXPORT_SYMBOL(sg_copy_from_buffer); /** * sg_copy_to_buffer - Copy from an SG list to a linear buffer * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to * @buflen: The number of bytes to copy * * Returns the number of copied bytes. * **/ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen) { return sg_copy_buffer(sgl, nents, buf, buflen, 0, true); } EXPORT_SYMBOL(sg_copy_to_buffer); /** * sg_pcopy_from_buffer - Copy from a linear buffer to an SG list * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy from * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_from_buffer(struct scatterlist *sgl, unsigned int nents, const void *buf, size_t buflen, off_t skip) { return sg_copy_buffer(sgl, nents, (void *)buf, buflen, skip, false); } EXPORT_SYMBOL(sg_pcopy_from_buffer); /** * sg_pcopy_to_buffer - Copy from an SG list to a linear buffer * @sgl: The SG list * @nents: Number of SG entries * @buf: Where to copy to * @buflen: The number of bytes to copy * @skip: Number of bytes to skip before copying * * Returns the number of copied bytes. * **/ size_t sg_pcopy_to_buffer(struct scatterlist *sgl, unsigned int nents, void *buf, size_t buflen, off_t skip) { return sg_copy_buffer(sgl, nents, buf, buflen, skip, true); } EXPORT_SYMBOL(sg_pcopy_to_buffer); /** * sg_zero_buffer - Zero-out a part of a SG list * @sgl: The SG list * @nents: Number of SG entries * @buflen: The number of bytes to zero out * @skip: Number of bytes to skip before zeroing * * Returns the number of bytes zeroed. **/ size_t sg_zero_buffer(struct scatterlist *sgl, unsigned int nents, size_t buflen, off_t skip) { unsigned int offset = 0; struct sg_mapping_iter miter; unsigned int sg_flags = SG_MITER_LOCAL | SG_MITER_TO_SG; sg_miter_start(&miter, sgl, nents, sg_flags); if (!sg_miter_skip(&miter, skip)) return false; while (offset < buflen && sg_miter_next(&miter)) { unsigned int len; len = min(miter.length, buflen - offset); memset(miter.addr, 0, len); offset += len; } sg_miter_stop(&miter); return offset; } EXPORT_SYMBOL(sg_zero_buffer); /* * Extract and pin a list of up to sg_max pages from UBUF- or IOVEC-class * iterators, and add them to the scatterlist. */ static ssize_t extract_user_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { struct scatterlist *sg = sgtable->sgl + sgtable->nents; struct page **pages; unsigned int npages; ssize_t ret = 0, res; size_t len, off; /* We decant the page list into the tail of the scatterlist */ pages = (void *)sgtable->sgl + array_size(sg_max, sizeof(struct scatterlist)); pages -= sg_max; do { res = iov_iter_extract_pages(iter, &pages, maxsize, sg_max, extraction_flags, &off); if (res <= 0) goto failed; len = res; maxsize -= len; ret += len; npages = DIV_ROUND_UP(off + len, PAGE_SIZE); sg_max -= npages; for (; npages > 0; npages--) { struct page *page = *pages; size_t seg = min_t(size_t, PAGE_SIZE - off, len); *pages++ = NULL; sg_set_page(sg, page, seg, off); sgtable->nents++; sg++; len -= seg; off = 0; } } while (maxsize > 0 && sg_max > 0); return ret; failed: while (sgtable->nents > sgtable->orig_nents) unpin_user_page(sg_page(&sgtable->sgl[--sgtable->nents])); return res; } /* * Extract up to sg_max pages from a BVEC-type iterator and add them to the * scatterlist. The pages are not pinned. */ static ssize_t extract_bvec_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct bio_vec *bv = iter->bvec; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned long start = iter->iov_offset; unsigned int i; ssize_t ret = 0; for (i = 0; i < iter->nr_segs; i++) { size_t off, len; len = bv[i].bv_len; if (start >= len) { start -= len; continue; } len = min_t(size_t, maxsize, len - start); off = bv[i].bv_offset + start; sg_set_page(sg, bv[i].bv_page, len, off); sgtable->nents++; sg++; sg_max--; ret += len; maxsize -= len; if (maxsize <= 0 || sg_max == 0) break; start = 0; } if (ret > 0) iov_iter_advance(iter, ret); return ret; } /* * Extract up to sg_max pages from a KVEC-type iterator and add them to the * scatterlist. This can deal with vmalloc'd buffers as well as kmalloc'd or * static buffers. The pages are not pinned. */ static ssize_t extract_kvec_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct kvec *kv = iter->kvec; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned long start = iter->iov_offset; unsigned int i; ssize_t ret = 0; for (i = 0; i < iter->nr_segs; i++) { struct page *page; unsigned long kaddr; size_t off, len, seg; len = kv[i].iov_len; if (start >= len) { start -= len; continue; } kaddr = (unsigned long)kv[i].iov_base + start; off = kaddr & ~PAGE_MASK; len = min_t(size_t, maxsize, len - start); kaddr &= PAGE_MASK; maxsize -= len; ret += len; do { seg = min_t(size_t, len, PAGE_SIZE - off); if (is_vmalloc_or_module_addr((void *)kaddr)) page = vmalloc_to_page((void *)kaddr); else page = virt_to_page((void *)kaddr); sg_set_page(sg, page, len, off); sgtable->nents++; sg++; sg_max--; len -= seg; kaddr += PAGE_SIZE; off = 0; } while (len > 0 && sg_max > 0); if (maxsize <= 0 || sg_max == 0) break; start = 0; } if (ret > 0) iov_iter_advance(iter, ret); return ret; } /* * Extract up to sg_max folios from an FOLIOQ-type iterator and add them to * the scatterlist. The pages are not pinned. */ static ssize_t extract_folioq_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { const struct folio_queue *folioq = iter->folioq; struct scatterlist *sg = sgtable->sgl + sgtable->nents; unsigned int slot = iter->folioq_slot; ssize_t ret = 0; size_t offset = iter->iov_offset; BUG_ON(!folioq); if (slot >= folioq_nr_slots(folioq)) { folioq = folioq->next; if (WARN_ON_ONCE(!folioq)) return 0; slot = 0; } do { struct folio *folio = folioq_folio(folioq, slot); size_t fsize = folioq_folio_size(folioq, slot); if (offset < fsize) { size_t part = umin(maxsize - ret, fsize - offset); sg_set_page(sg, folio_page(folio, 0), part, offset); sgtable->nents++; sg++; sg_max--; offset += part; ret += part; } if (offset >= fsize) { offset = 0; slot++; if (slot >= folioq_nr_slots(folioq)) { if (!folioq->next) { WARN_ON_ONCE(ret < iter->count); break; } folioq = folioq->next; slot = 0; } } } while (sg_max > 0 && ret < maxsize); iter->folioq = folioq; iter->folioq_slot = slot; iter->iov_offset = offset; iter->count -= ret; return ret; } /* * Extract up to sg_max folios from an XARRAY-type iterator and add them to * the scatterlist. The pages are not pinned. */ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, ssize_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { struct scatterlist *sg = sgtable->sgl + sgtable->nents; struct xarray *xa = iter->xarray; struct folio *folio; loff_t start = iter->xarray_start + iter->iov_offset; pgoff_t index = start / PAGE_SIZE; ssize_t ret = 0; size_t offset, len; XA_STATE(xas, xa, index); rcu_read_lock(); xas_for_each(&xas, folio, ULONG_MAX) { if (xas_retry(&xas, folio)) continue; if (WARN_ON(xa_is_value(folio))) break; if (WARN_ON(folio_test_hugetlb(folio))) break; offset = offset_in_folio(folio, start); len = min_t(size_t, maxsize, folio_size(folio) - offset); sg_set_page(sg, folio_page(folio, 0), len, offset); sgtable->nents++; sg++; sg_max--; maxsize -= len; ret += len; if (maxsize <= 0 || sg_max == 0) break; } rcu_read_unlock(); if (ret > 0) iov_iter_advance(iter, ret); return ret; } /** * extract_iter_to_sg - Extract pages from an iterator and add to an sglist * @iter: The iterator to extract from * @maxsize: The amount of iterator to copy * @sgtable: The scatterlist table to fill in * @sg_max: Maximum number of elements in @sgtable that may be filled * @extraction_flags: Flags to qualify the request * * Extract the page fragments from the given amount of the source iterator and * add them to a scatterlist that refers to all of those bits, to a maximum * addition of @sg_max elements. * * The pages referred to by UBUF- and IOVEC-type iterators are extracted and * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't * pinned; DISCARD-type is not supported. * * No end mark is placed on the scatterlist; that's left to the caller. * * @extraction_flags can have ITER_ALLOW_P2PDMA set to request peer-to-peer DMA * be allowed on the pages extracted. * * If successful, @sgtable->nents is updated to include the number of elements * added and the number of bytes added is returned. @sgtable->orig_nents is * left unaltered. * * The iov_iter_extract_mode() function should be used to query how cleanup * should be performed. */ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, struct sg_table *sgtable, unsigned int sg_max, iov_iter_extraction_t extraction_flags) { if (maxsize == 0) return 0; switch (iov_iter_type(iter)) { case ITER_UBUF: case ITER_IOVEC: return extract_user_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_BVEC: return extract_bvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_KVEC: return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_FOLIOQ: return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); case ITER_XARRAY: return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); default: pr_err("%s(%u) unsupported\n", __func__, iov_iter_type(iter)); WARN_ON_ONCE(1); return -EIO; } } EXPORT_SYMBOL_GPL(extract_iter_to_sg); |
| 22 22 15 15 19 19 11 10 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Squashfs - a compressed read only filesystem for Linux * * Copyright (c) 2010 LG Electronics * Chan Jeong <chan.jeong@lge.com> * * lzo_wrapper.c */ #include <linux/mutex.h> #include <linux/bio.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/lzo.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" #include "squashfs.h" #include "decompressor.h" #include "page_actor.h" struct squashfs_lzo { void *input; void *output; }; static void *lzo_init(struct squashfs_sb_info *msblk, void *buff) { int block_size = max_t(int, msblk->block_size, SQUASHFS_METADATA_SIZE); struct squashfs_lzo *stream = kzalloc(sizeof(*stream), GFP_KERNEL); if (stream == NULL) goto failed; stream->input = vmalloc(block_size); if (stream->input == NULL) goto failed; stream->output = vmalloc(block_size); if (stream->output == NULL) goto failed2; return stream; failed2: vfree(stream->input); failed: ERROR("Failed to allocate lzo workspace\n"); kfree(stream); return ERR_PTR(-ENOMEM); } static void lzo_free(void *strm) { struct squashfs_lzo *stream = strm; if (stream) { vfree(stream->input); vfree(stream->output); } kfree(stream); } static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm, struct bio *bio, int offset, int length, struct squashfs_page_actor *output) { struct bvec_iter_all iter_all = {}; struct bio_vec *bvec = bvec_init_iter_all(&iter_all); struct squashfs_lzo *stream = strm; void *buff = stream->input, *data; int bytes = length, res; size_t out_len = output->length; while (bio_next_segment(bio, &iter_all)) { int avail = min(bytes, ((int)bvec->bv_len) - offset); data = bvec_virt(bvec); memcpy(buff, data + offset, avail); buff += avail; bytes -= avail; offset = 0; } res = lzo1x_decompress_safe(stream->input, (size_t)length, stream->output, &out_len); if (res != LZO_E_OK) goto failed; res = bytes = (int)out_len; data = squashfs_first_page(output); buff = stream->output; while (data) { if (bytes <= PAGE_SIZE) { if (!IS_ERR(data)) memcpy(data, buff, bytes); break; } else { if (!IS_ERR(data)) memcpy(data, buff, PAGE_SIZE); buff += PAGE_SIZE; bytes -= PAGE_SIZE; data = squashfs_next_page(output); } } squashfs_finish_page(output); return res; failed: return -EIO; } const struct squashfs_decompressor squashfs_lzo_comp_ops = { .init = lzo_init, .free = lzo_free, .decompress = lzo_uncompress, .id = LZO_COMPRESSION, .name = "lzo", .alloc_buffer = 0, .supported = 1 }; |
| 1 1 5 9 8 9 11 11 3 8 11 13 1 1 11 11 2 9 8 16 1 12 2 13 2 8 7 5 10 10 10 10 10 18 18 28 30 28 1 1 7 1 1 1 1 1 2 11 5 1 1 1 1 1 1 1 1 20 3 17 20 20 20 18 18 18 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) 2020, Microsoft Corporation. * * Author(s): Steve French <stfrench@microsoft.com> * David Howells <dhowells@redhat.com> */ /* #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/magic.h> #include <linux/security.h> #include <net/net_namespace.h> #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif */ #include <linux/ctype.h> #include <linux/fs_context.h> #include <linux/fs_parser.h> #include <linux/fs.h> #include <linux/mount.h> #include <linux/parser.h> #include <linux/utsname.h> #include "cifsfs.h" #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_unicode.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "ntlmssp.h" #include "nterr.h" #include "rfc1002pdu.h" #include "fs_context.h" DEFINE_MUTEX(cifs_mount_mutex); static const match_table_t cifs_smb_version_tokens = { { Smb_1, SMB1_VERSION_STRING }, { Smb_20, SMB20_VERSION_STRING}, { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, { Smb_302, ALT_SMB302_VERSION_STRING }, { Smb_311, SMB311_VERSION_STRING }, { Smb_311, ALT_SMB311_VERSION_STRING }, { Smb_3any, SMB3ANY_VERSION_STRING }, { Smb_default, SMBDEFAULT_VERSION_STRING }, { Smb_version_err, NULL } }; static const match_table_t cifs_secflavor_tokens = { { Opt_sec_krb5, "krb5" }, { Opt_sec_krb5i, "krb5i" }, { Opt_sec_krb5p, "krb5p" }, { Opt_sec_ntlmsspi, "ntlmsspi" }, { Opt_sec_ntlmssp, "ntlmssp" }, { Opt_sec_ntlmv2, "nontlm" }, { Opt_sec_ntlmv2, "ntlmv2" }, { Opt_sec_ntlmv2i, "ntlmv2i" }, { Opt_sec_none, "none" }, { Opt_sec_err, NULL } }; static const match_table_t cifs_upcall_target = { { Opt_upcall_target_mount, "mount" }, { Opt_upcall_target_application, "app" }, { Opt_upcall_target_err, NULL } }; const struct fs_parameter_spec smb3_fs_parameters[] = { /* Mount options that take no arguments */ fsparam_flag_no("user_xattr", Opt_user_xattr), fsparam_flag_no("forceuid", Opt_forceuid), fsparam_flag_no("multichannel", Opt_multichannel), fsparam_flag_no("forcegid", Opt_forcegid), fsparam_flag("noblocksend", Opt_noblocksend), fsparam_flag("noautotune", Opt_noautotune), fsparam_flag("nolease", Opt_nolease), fsparam_flag_no("hard", Opt_hard), fsparam_flag_no("soft", Opt_soft), fsparam_flag_no("perm", Opt_perm), fsparam_flag("nodelete", Opt_nodelete), fsparam_flag_no("mapposix", Opt_mapposix), fsparam_flag("mapchars", Opt_mapchars), fsparam_flag("nomapchars", Opt_nomapchars), fsparam_flag_no("sfu", Opt_sfu), fsparam_flag("nodfs", Opt_nodfs), fsparam_flag_no("posixpaths", Opt_posixpaths), fsparam_flag_no("unix", Opt_unix), fsparam_flag_no("linux", Opt_unix), fsparam_flag_no("posix", Opt_unix), fsparam_flag("nocase", Opt_nocase), fsparam_flag("ignorecase", Opt_nocase), fsparam_flag_no("brl", Opt_brl), fsparam_flag_no("handlecache", Opt_handlecache), fsparam_flag("forcemandatorylock", Opt_forcemandatorylock), fsparam_flag("forcemand", Opt_forcemandatorylock), fsparam_flag("setuidfromacl", Opt_setuidfromacl), fsparam_flag("idsfromsid", Opt_setuidfromacl), fsparam_flag_no("setuids", Opt_setuids), fsparam_flag_no("dynperm", Opt_dynperm), fsparam_flag_no("intr", Opt_intr), fsparam_flag_no("strictsync", Opt_strictsync), fsparam_flag_no("serverino", Opt_serverino), fsparam_flag("rwpidforward", Opt_rwpidforward), fsparam_flag("cifsacl", Opt_cifsacl), fsparam_flag_no("acl", Opt_acl), fsparam_flag("locallease", Opt_locallease), fsparam_flag("sign", Opt_sign), fsparam_flag("ignore_signature", Opt_ignore_signature), fsparam_flag("signloosely", Opt_ignore_signature), fsparam_flag("seal", Opt_seal), fsparam_flag("noac", Opt_noac), fsparam_flag("fsc", Opt_fsc), fsparam_flag("mfsymlinks", Opt_mfsymlinks), fsparam_flag("multiuser", Opt_multiuser), fsparam_flag("sloppy", Opt_sloppy), fsparam_flag("nosharesock", Opt_nosharesock), fsparam_flag_no("persistenthandles", Opt_persistent), fsparam_flag_no("resilienthandles", Opt_resilient), fsparam_flag_no("tcpnodelay", Opt_tcp_nodelay), fsparam_flag("nosparse", Opt_nosparse), fsparam_flag("domainauto", Opt_domainauto), fsparam_flag("rdma", Opt_rdma), fsparam_flag("modesid", Opt_modesid), fsparam_flag("modefromsid", Opt_modesid), fsparam_flag("rootfs", Opt_rootfs), fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), fsparam_flag_no("nativesocket", Opt_nativesocket), fsparam_flag_no("unicode", Opt_unicode), fsparam_flag_no("nbsessinit", Opt_nbsessinit), /* Mount options which take uid or gid */ fsparam_uid("backupuid", Opt_backupuid), fsparam_gid("backupgid", Opt_backupgid), fsparam_uid("uid", Opt_uid), fsparam_uid("cruid", Opt_cruid), fsparam_gid("gid", Opt_gid), /* Mount options which take numeric value */ fsparam_u32("file_mode", Opt_file_mode), fsparam_u32("dirmode", Opt_dirmode), fsparam_u32("dir_mode", Opt_dirmode), fsparam_u32("port", Opt_port), fsparam_u32("min_enc_offload", Opt_min_enc_offload), fsparam_u32("retrans", Opt_retrans), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), fsparam_u32("rasize", Opt_rasize), fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), fsparam_u32("acdirmax", Opt_acdirmax), fsparam_u32("acregmax", Opt_acregmax), fsparam_u32("closetimeo", Opt_closetimeo), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("max_cached_dirs", Opt_max_cached_dirs), fsparam_u32("handletimeout", Opt_handletimeout), fsparam_u64("snapshot", Opt_snapshot), fsparam_u32("max_channels", Opt_max_channels), /* Mount options which take string value */ fsparam_string("source", Opt_source), fsparam_string("user", Opt_user), fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), fsparam_string("pass2", Opt_pass2), fsparam_string("password2", Opt_pass2), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), fsparam_string("domain", Opt_domain), fsparam_string("dom", Opt_domain), fsparam_string("srcaddr", Opt_srcaddr), fsparam_string("iocharset", Opt_iocharset), fsparam_string("netbiosname", Opt_netbiosname), fsparam_string("servern", Opt_servern), fsparam_string("ver", Opt_ver), fsparam_string("vers", Opt_vers), fsparam_string("sec", Opt_sec), fsparam_string("cache", Opt_cache), fsparam_string("reparse", Opt_reparse), fsparam_string("upcall_target", Opt_upcalltarget), fsparam_string("symlink", Opt_symlink), fsparam_string("symlinkroot", Opt_symlinkroot), /* Arguments that should be ignored */ fsparam_flag("guest", Opt_ignore), fsparam_flag("noatime", Opt_ignore), fsparam_flag("relatime", Opt_ignore), fsparam_flag("_netdev", Opt_ignore), fsparam_flag_no("suid", Opt_ignore), fsparam_flag_no("exec", Opt_ignore), fsparam_flag_no("dev", Opt_ignore), fsparam_flag_no("mand", Opt_ignore), fsparam_flag_no("auto", Opt_ignore), fsparam_string("cred", Opt_ignore), fsparam_string("credentials", Opt_ignore), /* * UNC and prefixpath is now extracted from Opt_source * in the new mount API so we can just ignore them going forward. */ fsparam_string("unc", Opt_ignore), fsparam_string("prefixpath", Opt_ignore), {} }; static int cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; /* * With mount options, the last one should win. Reset any existing * settings back to default. */ ctx->sectype = Unspecified; ctx->sign = false; switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; fallthrough; case Opt_sec_krb5: ctx->sectype = Kerberos; break; case Opt_sec_ntlmsspi: ctx->sign = true; fallthrough; case Opt_sec_ntlmssp: ctx->sectype = RawNTLMSSP; break; case Opt_sec_ntlmv2i: ctx->sign = true; fallthrough; case Opt_sec_ntlmv2: ctx->sectype = NTLMv2; break; case Opt_sec_none: ctx->nullauth = 1; kfree(ctx->username); ctx->username = NULL; break; default: cifs_errorf(fc, "bad security option: %s\n", value); return 1; } return 0; } static int cifs_parse_upcall_target(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; ctx->upcall_target = UPTARGET_UNSPECIFIED; switch (match_token(value, cifs_upcall_target, args)) { case Opt_upcall_target_mount: ctx->upcall_target = UPTARGET_MOUNT; break; case Opt_upcall_target_application: ctx->upcall_target = UPTARGET_APP; break; default: cifs_errorf(fc, "bad upcall target: %s\n", value); return 1; } return 0; } static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_loose, "loose" }, { Opt_cache_strict, "strict" }, { Opt_cache_none, "none" }, { Opt_cache_ro, "ro" }, { Opt_cache_rw, "singleclient" }, { Opt_cache_err, NULL } }; static int cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, cifs_cacheflavor_tokens, args)) { case Opt_cache_loose: ctx->direct_io = false; ctx->strict_io = false; ctx->cache_ro = false; ctx->cache_rw = false; break; case Opt_cache_strict: ctx->direct_io = false; ctx->strict_io = true; ctx->cache_ro = false; ctx->cache_rw = false; break; case Opt_cache_none: ctx->direct_io = true; ctx->strict_io = false; ctx->cache_ro = false; ctx->cache_rw = false; break; case Opt_cache_ro: ctx->direct_io = false; ctx->strict_io = false; ctx->cache_ro = true; ctx->cache_rw = false; break; case Opt_cache_rw: ctx->direct_io = false; ctx->strict_io = false; ctx->cache_ro = false; ctx->cache_rw = true; break; default: cifs_errorf(fc, "bad cache= option: %s\n", value); return 1; } return 0; } static const match_table_t reparse_flavor_tokens = { { Opt_reparse_default, "default" }, { Opt_reparse_none, "none" }, { Opt_reparse_nfs, "nfs" }, { Opt_reparse_wsl, "wsl" }, { Opt_reparse_err, NULL }, }; static int parse_reparse_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, reparse_flavor_tokens, args)) { case Opt_reparse_default: ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; break; case Opt_reparse_none: ctx->reparse_type = CIFS_REPARSE_TYPE_NONE; break; case Opt_reparse_nfs: ctx->reparse_type = CIFS_REPARSE_TYPE_NFS; break; case Opt_reparse_wsl: ctx->reparse_type = CIFS_REPARSE_TYPE_WSL; break; default: cifs_errorf(fc, "bad reparse= option: %s\n", value); return 1; } return 0; } static const match_table_t symlink_flavor_tokens = { { Opt_symlink_default, "default" }, { Opt_symlink_none, "none" }, { Opt_symlink_native, "native" }, { Opt_symlink_unix, "unix" }, { Opt_symlink_mfsymlinks, "mfsymlinks" }, { Opt_symlink_sfu, "sfu" }, { Opt_symlink_nfs, "nfs" }, { Opt_symlink_wsl, "wsl" }, { Opt_symlink_err, NULL }, }; static int parse_symlink_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, symlink_flavor_tokens, args)) { case Opt_symlink_default: ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; break; case Opt_symlink_none: ctx->symlink_type = CIFS_SYMLINK_TYPE_NONE; break; case Opt_symlink_native: ctx->symlink_type = CIFS_SYMLINK_TYPE_NATIVE; break; case Opt_symlink_unix: ctx->symlink_type = CIFS_SYMLINK_TYPE_UNIX; break; case Opt_symlink_mfsymlinks: ctx->symlink_type = CIFS_SYMLINK_TYPE_MFSYMLINKS; break; case Opt_symlink_sfu: ctx->symlink_type = CIFS_SYMLINK_TYPE_SFU; break; case Opt_symlink_nfs: ctx->symlink_type = CIFS_SYMLINK_TYPE_NFS; break; case Opt_symlink_wsl: ctx->symlink_type = CIFS_SYMLINK_TYPE_WSL; break; default: cifs_errorf(fc, "bad symlink= option: %s\n", value); return 1; } return 0; } #define DUP_CTX_STR(field) \ do { \ if (ctx->field) { \ new_ctx->field = kstrdup(ctx->field, GFP_ATOMIC); \ if (new_ctx->field == NULL) { \ smb3_cleanup_fs_context_contents(new_ctx); \ return -ENOMEM; \ } \ } \ } while (0) int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx) { memcpy(new_ctx, ctx, sizeof(*ctx)); new_ctx->prepath = NULL; new_ctx->nodename = NULL; new_ctx->username = NULL; new_ctx->password = NULL; new_ctx->password2 = NULL; new_ctx->server_hostname = NULL; new_ctx->domainname = NULL; new_ctx->UNC = NULL; new_ctx->source = NULL; new_ctx->iocharset = NULL; new_ctx->leaf_fullpath = NULL; new_ctx->dns_dom = NULL; new_ctx->symlinkroot = NULL; /* * Make sure to stay in sync with smb3_cleanup_fs_context_contents() */ DUP_CTX_STR(prepath); DUP_CTX_STR(username); DUP_CTX_STR(password); DUP_CTX_STR(password2); DUP_CTX_STR(server_hostname); DUP_CTX_STR(UNC); DUP_CTX_STR(source); DUP_CTX_STR(domainname); DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); DUP_CTX_STR(leaf_fullpath); DUP_CTX_STR(dns_dom); DUP_CTX_STR(symlinkroot); return 0; } static int cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_context *ctx, bool is_smb3) { substring_t args[MAX_OPT_ARGS]; switch (match_token(value, cifs_smb_version_tokens, args)) { #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case Smb_1: if (disable_legacy_dialects) { cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { cifs_errorf(fc, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } cifs_errorf(fc, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); ctx->ops = &smb1_operations; ctx->vals = &smb1_values; break; case Smb_20: if (disable_legacy_dialects) { cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { cifs_errorf(fc, "vers=2.0 not permitted when mounting with smb3\n"); return 1; } ctx->ops = &smb20_operations; ctx->vals = &smb20_values; break; #else case Smb_1: cifs_errorf(fc, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); return 1; case Smb_20: cifs_errorf(fc, "vers=2.0 mount not permitted when legacy dialects disabled\n"); return 1; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ case Smb_21: ctx->ops = &smb21_operations; ctx->vals = &smb21_values; break; case Smb_30: ctx->ops = &smb30_operations; ctx->vals = &smb30_values; break; case Smb_302: ctx->ops = &smb30_operations; /* currently identical with 3.0 */ ctx->vals = &smb302_values; break; case Smb_311: ctx->ops = &smb311_operations; ctx->vals = &smb311_values; break; case Smb_3any: ctx->ops = &smb30_operations; /* currently identical with 3.0 */ ctx->vals = &smb3any_values; break; case Smb_default: ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; break; default: cifs_errorf(fc, "Unknown vers= option specified: %s\n", value); return 1; } return 0; } int smb3_parse_opt(const char *options, const char *key, char **val) { int rc = -ENOENT; char *opts, *orig, *p; orig = opts = kstrdup(options, GFP_KERNEL); if (!opts) return -ENOMEM; while ((p = strsep(&opts, ","))) { char *nval; if (!*p) continue; if (strncasecmp(p, key, strlen(key))) continue; nval = strchr(p, '='); if (nval) { if (nval == p) continue; *nval++ = 0; *val = kstrdup(nval, GFP_KERNEL); rc = !*val ? -ENOMEM : 0; goto out; } } out: kfree(orig); return rc; } /* * Remove duplicate path delimiters. Windows is supposed to do that * but there are some bugs that prevent rename from working if there are * multiple delimiters. * * Return a sanitized duplicate of @path or NULL for empty prefix paths. * Otherwise, return ERR_PTR. * * @gfp indicates the GFP_* flags for kstrdup. * The caller is responsible for freeing the original. */ #define IS_DELIM(c) ((c) == '/' || (c) == '\\') char *cifs_sanitize_prepath(char *prepath, gfp_t gfp) { char *cursor1 = prepath, *cursor2 = prepath; char *s; /* skip all prepended delimiters */ while (IS_DELIM(*cursor1)) cursor1++; /* copy the first letter */ *cursor2 = *cursor1; /* copy the remainder... */ while (*(cursor1++)) { /* ... skipping all duplicated delimiters */ if (IS_DELIM(*cursor1) && IS_DELIM(*cursor2)) continue; *(++cursor2) = *cursor1; } /* if the last character is a delimiter, skip it */ if (IS_DELIM(*(cursor2 - 1))) cursor2--; *cursor2 = '\0'; if (!*prepath) return NULL; s = kstrdup(prepath, gfp); if (!s) return ERR_PTR(-ENOMEM); return s; } /* * Return full path based on the values of @ctx->{UNC,prepath}. * * It is assumed that both values were already parsed by smb3_parse_devname(). */ char *smb3_fs_context_fullpath(const struct smb3_fs_context *ctx, char dirsep) { size_t ulen, plen; char *s; ulen = strlen(ctx->UNC); plen = ctx->prepath ? strlen(ctx->prepath) + 1 : 0; s = kmalloc(ulen + plen + 1, GFP_KERNEL); if (!s) return ERR_PTR(-ENOMEM); memcpy(s, ctx->UNC, ulen); if (plen) { s[ulen] = dirsep; memcpy(s + ulen + 1, ctx->prepath, plen); } s[ulen + plen] = '\0'; convert_delimiter(s, dirsep); return s; } /* * Parse a devname into substrings and populate the ctx->UNC and ctx->prepath * fields with the result. Returns 0 on success and an error otherwise * (e.g. ENOMEM or EINVAL) */ int smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) { char *pos; const char *delims = "/\\"; size_t len; int rc; if (unlikely(!devname || !*devname)) { cifs_dbg(VFS, "Device name not specified\n"); return -EINVAL; } /* make sure we have a valid UNC double delimiter prefix */ len = strspn(devname, delims); if (len != 2) return -EINVAL; /* find delimiter between host and sharename */ pos = strpbrk(devname + 2, delims); if (!pos) return -EINVAL; /* record the server hostname */ kfree(ctx->server_hostname); ctx->server_hostname = kstrndup(devname + 2, pos - devname - 2, GFP_KERNEL); if (!ctx->server_hostname) return -ENOMEM; /* skip past delimiter */ ++pos; /* now go until next delimiter or end of string */ len = strcspn(pos, delims); if (!len) return -EINVAL; /* move "pos" up to delimiter or NULL */ pos += len; kfree(ctx->UNC); ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); if (!ctx->UNC) return -ENOMEM; convert_delimiter(ctx->UNC, '\\'); /* skip any delimiter */ if (*pos == '/' || *pos == '\\') pos++; kfree(ctx->prepath); ctx->prepath = NULL; /* If pos is NULL then no prepath */ if (!*pos) return 0; ctx->prepath = cifs_sanitize_prepath(pos, GFP_KERNEL); if (IS_ERR(ctx->prepath)) { rc = PTR_ERR(ctx->prepath); ctx->prepath = NULL; return rc; } return 0; } static void smb3_fs_context_free(struct fs_context *fc); static int smb3_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param); static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data); static int smb3_get_tree(struct fs_context *fc); static int smb3_reconfigure(struct fs_context *fc); static const struct fs_context_operations smb3_fs_context_ops = { .free = smb3_fs_context_free, .parse_param = smb3_fs_context_parse_param, .parse_monolithic = smb3_fs_context_parse_monolithic, .get_tree = smb3_get_tree, .reconfigure = smb3_reconfigure, }; /* * Parse a monolithic block of data from sys_mount(). * smb3_fs_context_parse_monolithic - Parse key[=val][,key[=val]]* mount data * @ctx: The superblock configuration to fill in. * @data: The data to parse * * Parse a blob of data that's in key[=val][,key[=val]]* form. This can be * called from the ->monolithic_mount_data() fs_context operation. * * Returns 0 on success or the error returned by the ->parse_option() fs_context * operation on failure. */ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, void *data) { char *options = data, *key; int ret = 0; if (!options) return 0; ret = security_sb_eat_lsm_opts(options, &fc->security); if (ret) return ret; /* BB Need to add support for sep= here TBD */ while ((key = strsep(&options, ",")) != NULL) { size_t len; char *value; if (*key == 0) break; /* Check if following character is the deliminator If yes, * we have encountered a double deliminator reset the NULL * character to the deliminator */ while (options && options[0] == ',') { len = strlen(key); strcpy(key + len, options); options = strchr(options, ','); if (options) *options++ = 0; } len = 0; value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; len = strlen(value); } ret = vfs_parse_fs_string(fc, key, value, len); if (ret < 0) break; } return ret; } /* * Validate the preparsed information in the config. */ static int smb3_fs_context_validate(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) { cifs_errorf(fc, "SMB Direct requires Version >=3.0\n"); return -EOPNOTSUPP; } #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (ctx->multiuser) { cifs_errorf(fc, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); return -1; } #endif if (ctx->got_version == false) pr_warn_once("No dialect specified on mount. Default has changed to a more secure dialect, SMB2.1 or later (e.g. SMB3.1.1), from CIFS (SMB1). To use the less secure SMB1 dialect to access old servers which do not support SMB3.1.1 (or even SMB3 or SMB2.1) specify vers=1.0 on mount.\n"); if (!ctx->UNC) { cifs_errorf(fc, "CIFS mount error: No usable UNC path provided in device string!\n"); return -1; } /* make sure UNC has a share name */ if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) { cifs_errorf(fc, "Malformed UNC. Unable to find share name.\n"); return -ENOENT; } if (!ctx->got_ip) { int len; const char *slash; /* No ip= option specified? Try to get it from UNC */ /* Use the address part of the UNC. */ slash = strchr(&ctx->UNC[2], '\\'); len = slash - &ctx->UNC[2]; if (!cifs_convert_address((struct sockaddr *)&ctx->dstaddr, &ctx->UNC[2], len)) { pr_err("Unable to determine destination address\n"); return -EHOSTUNREACH; } } /* set the port that we got earlier */ cifs_set_port((struct sockaddr *)&ctx->dstaddr, ctx->port); if (ctx->uid_specified && !ctx->forceuid_specified) { ctx->override_uid = 1; pr_notice("enabling forceuid mount option implicitly because uid= option is specified\n"); } if (ctx->gid_specified && !ctx->forcegid_specified) { ctx->override_gid = 1; pr_notice("enabling forcegid mount option implicitly because gid= option is specified\n"); } if (ctx->override_uid && !ctx->uid_specified) { ctx->override_uid = 0; pr_notice("ignoring forceuid mount option specified with no uid= option\n"); } if (ctx->override_gid && !ctx->gid_specified) { ctx->override_gid = 0; pr_notice("ignoring forcegid mount option specified with no gid= option\n"); } return 0; } static int smb3_get_tree_common(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); struct dentry *root; int rc = 0; root = cifs_smb3_do_mount(fc->fs_type, 0, ctx); if (IS_ERR(root)) return PTR_ERR(root); fc->root = root; return rc; } /* * Create an SMB3 superblock from the parameters passed. */ static int smb3_get_tree(struct fs_context *fc) { int err = smb3_fs_context_validate(fc); int ret; if (err) return err; cifs_mount_lock(); ret = smb3_get_tree_common(fc); cifs_mount_unlock(); return ret; } static void smb3_fs_context_free(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); smb3_cleanup_fs_context(ctx); } /* * Compare the old and new proposed context during reconfigure * and check if the changes are compatible. */ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, struct smb3_fs_context *new_ctx, struct smb3_fs_context *old_ctx, bool need_recon) { if (new_ctx->posix_paths != old_ctx->posix_paths) { cifs_errorf(fc, "can not change posixpaths during remount\n"); return -EINVAL; } if (new_ctx->sectype != old_ctx->sectype) { cifs_errorf(fc, "can not change sec during remount\n"); return -EINVAL; } if (new_ctx->multiuser != old_ctx->multiuser) { cifs_errorf(fc, "can not change multiuser during remount\n"); return -EINVAL; } if (new_ctx->UNC && (!old_ctx->UNC || strcmp(new_ctx->UNC, old_ctx->UNC))) { cifs_errorf(fc, "can not change UNC during remount\n"); return -EINVAL; } if (new_ctx->username && (!old_ctx->username || strcmp(new_ctx->username, old_ctx->username))) { cifs_errorf(fc, "can not change username during remount\n"); return -EINVAL; } if (new_ctx->password && (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) { if (need_recon == false) { cifs_errorf(fc, "can not change password of active session during remount\n"); return -EINVAL; } else if (old_ctx->sectype == Kerberos) { cifs_errorf(fc, "can not change password for Kerberos via remount\n"); return -EINVAL; } } if (new_ctx->domainname && (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) { cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } if (strcmp(new_ctx->workstation_name, old_ctx->workstation_name)) { cifs_errorf(fc, "can not change workstation_name during remount\n"); return -EINVAL; } if (new_ctx->nodename && (!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) { cifs_errorf(fc, "can not change nodename during remount\n"); return -EINVAL; } if (new_ctx->iocharset && (!old_ctx->iocharset || strcmp(new_ctx->iocharset, old_ctx->iocharset))) { cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } if (new_ctx->unicode != old_ctx->unicode) { cifs_errorf(fc, "can not change unicode during remount\n"); return -EINVAL; } if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) { cifs_errorf(fc, "can not change nbsessinit during remount\n"); return -EINVAL; } return 0; } #define STEAL_STRING(cifs_sb, ctx, field) \ do { \ kfree(ctx->field); \ ctx->field = cifs_sb->ctx->field; \ cifs_sb->ctx->field = NULL; \ } while (0) #define STEAL_STRING_SENSITIVE(cifs_sb, ctx, field) \ do { \ kfree_sensitive(ctx->field); \ ctx->field = cifs_sb->ctx->field; \ cifs_sb->ctx->field = NULL; \ } while (0) int smb3_sync_session_ctx_passwords(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) { if (ses->password && cifs_sb->ctx->password && strcmp(ses->password, cifs_sb->ctx->password)) { kfree_sensitive(cifs_sb->ctx->password); cifs_sb->ctx->password = kstrdup(ses->password, GFP_KERNEL); if (!cifs_sb->ctx->password) return -ENOMEM; } if (ses->password2 && cifs_sb->ctx->password2 && strcmp(ses->password2, cifs_sb->ctx->password2)) { kfree_sensitive(cifs_sb->ctx->password2); cifs_sb->ctx->password2 = kstrdup(ses->password2, GFP_KERNEL); if (!cifs_sb->ctx->password2) { kfree_sensitive(cifs_sb->ctx->password); cifs_sb->ctx->password = NULL; return -ENOMEM; } } return 0; } static int smb3_reconfigure(struct fs_context *fc) { struct smb3_fs_context *ctx = smb3_fc2context(fc); struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; char *new_password = NULL, *new_password2 = NULL; bool need_recon = false; int rc; if (ses->expired_pwd) need_recon = true; rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx, need_recon); if (rc) return rc; /* * We can not change UNC/username/password/domainname/ * workstation_name/nodename/iocharset * during reconnect so ignore what we have in the new context and * just use what we already have in cifs_sb->ctx. */ STEAL_STRING(cifs_sb, ctx, UNC); STEAL_STRING(cifs_sb, ctx, source); STEAL_STRING(cifs_sb, ctx, username); if (need_recon == false) STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); else { if (ctx->password) { new_password = kstrdup(ctx->password, GFP_KERNEL); if (!new_password) return -ENOMEM; } else STEAL_STRING_SENSITIVE(cifs_sb, ctx, password); } /* * if a new password2 has been specified, then reset it's value * inside the ses struct */ if (ctx->password2) { new_password2 = kstrdup(ctx->password2, GFP_KERNEL); if (!new_password2) { kfree_sensitive(new_password); return -ENOMEM; } } else STEAL_STRING_SENSITIVE(cifs_sb, ctx, password2); /* * we may update the passwords in the ses struct below. Make sure we do * not race with smb2_reconnect */ mutex_lock(&ses->session_mutex); /* * smb2_reconnect may swap password and password2 in case session setup * failed. First get ctx passwords in sync with ses passwords. It should * be okay to do this even if this function were to return an error at a * later stage */ rc = smb3_sync_session_ctx_passwords(cifs_sb, ses); if (rc) { mutex_unlock(&ses->session_mutex); return rc; } /* * now that allocations for passwords are done, commit them */ if (new_password) { kfree_sensitive(ses->password); ses->password = new_password; } if (new_password2) { kfree_sensitive(ses->password2); ses->password2 = new_password2; } mutex_unlock(&ses->session_mutex); STEAL_STRING(cifs_sb, ctx, domainname); STEAL_STRING(cifs_sb, ctx, nodename); STEAL_STRING(cifs_sb, ctx, iocharset); /* if rsize or wsize not passed in on remount, use previous values */ if (ctx->rsize == 0) ctx->rsize = cifs_sb->ctx->rsize; if (ctx->wsize == 0) ctx->wsize = cifs_sb->ctx->wsize; smb3_cleanup_fs_context_contents(cifs_sb->ctx); rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); smb3_update_mnt_flags(cifs_sb); #ifdef CONFIG_CIFS_DFS_UPCALL if (!rc) rc = dfs_cache_remount_fs(cifs_sb); #endif return rc; } static int smb3_fs_context_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct fs_parse_result result; struct smb3_fs_context *ctx = smb3_fc2context(fc); int i, opt; bool is_smb3 = !strcmp(fc->fs_type->name, "smb3"); bool skip_parsing = false; char *hostname; cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key); /* * fs_parse can not handle string options with an empty value so * we will need special handling of them. */ if (param->type == fs_value_is_string && param->string[0] == 0) { if (!strcmp("pass", param->key) || !strcmp("password", param->key)) { skip_parsing = true; opt = Opt_pass; } else if (!strcmp("user", param->key) || !strcmp("username", param->key)) { skip_parsing = true; opt = Opt_user; } else if (!strcmp("pass2", param->key) || !strcmp("password2", param->key)) { skip_parsing = true; opt = Opt_pass2; } } if (!skip_parsing) { opt = fs_parse(fc, smb3_fs_parameters, param, &result); if (opt < 0) return ctx->sloppy ? 1 : opt; } switch (opt) { case Opt_compress: if (!IS_ENABLED(CONFIG_CIFS_COMPRESSION)) { cifs_errorf(fc, "CONFIG_CIFS_COMPRESSION kernel config option is unset\n"); goto cifs_parse_mount_err; } ctx->compress = true; cifs_dbg(VFS, "SMB3 compression support is experimental\n"); break; case Opt_nodfs: ctx->nodfs = 1; break; case Opt_hard: if (result.negated) { if (ctx->retry == 1) cifs_dbg(VFS, "conflicting hard vs. soft mount options\n"); ctx->retry = 0; } else ctx->retry = 1; break; case Opt_soft: if (result.negated) ctx->retry = 1; else { if (ctx->retry == 1) cifs_dbg(VFS, "conflicting hard vs soft mount options\n"); ctx->retry = 0; } break; case Opt_mapposix: if (result.negated) ctx->remap = false; else { ctx->remap = true; ctx->sfu_remap = false; /* disable SFU mapping */ } break; case Opt_mapchars: if (result.negated) ctx->sfu_remap = false; else { ctx->sfu_remap = true; ctx->remap = false; /* disable SFM (mapposix) mapping */ } break; case Opt_user_xattr: if (result.negated) ctx->no_xattr = 1; else ctx->no_xattr = 0; break; case Opt_forceuid: if (result.negated) ctx->override_uid = 0; else ctx->override_uid = 1; ctx->forceuid_specified = true; break; case Opt_forcegid: if (result.negated) ctx->override_gid = 0; else ctx->override_gid = 1; ctx->forcegid_specified = true; break; case Opt_perm: if (result.negated) ctx->noperm = 1; else ctx->noperm = 0; break; case Opt_dynperm: if (result.negated) ctx->dynperm = 0; else ctx->dynperm = 1; break; case Opt_sfu: if (result.negated) ctx->sfu_emul = 0; else ctx->sfu_emul = 1; break; case Opt_noblocksend: ctx->noblocksnd = 1; break; case Opt_noautotune: ctx->noautotune = 1; break; case Opt_nolease: ctx->no_lease = 1; break; case Opt_nosparse: ctx->no_sparse = 1; break; case Opt_nodelete: ctx->nodelete = 1; break; case Opt_multichannel: if (result.negated) { ctx->multichannel = false; ctx->max_channels = 1; } else { ctx->multichannel = true; /* if number of channels not specified, default to 2 */ if (ctx->max_channels < 2) ctx->max_channels = 2; } break; case Opt_uid: ctx->linux_uid = result.uid; ctx->uid_specified = true; break; case Opt_cruid: ctx->cred_uid = result.uid; ctx->cruid_specified = true; break; case Opt_backupuid: ctx->backupuid = result.uid; ctx->backupuid_specified = true; break; case Opt_backupgid: ctx->backupgid = result.gid; ctx->backupgid_specified = true; break; case Opt_gid: ctx->linux_gid = result.gid; ctx->gid_specified = true; break; case Opt_port: ctx->port = result.uint_32; break; case Opt_file_mode: ctx->file_mode = result.uint_32; break; case Opt_dirmode: ctx->dir_mode = result.uint_32; break; case Opt_min_enc_offload: ctx->min_offload = result.uint_32; break; case Opt_retrans: ctx->retrans = result.uint_32; break; case Opt_blocksize: /* * inode blocksize realistically should never need to be * less than 16K or greater than 16M and default is 1MB. * Note that small inode block sizes (e.g. 64K) can lead * to very poor performance of common tools like cp and scp */ if ((result.uint_32 < CIFS_MAX_MSGSIZE) || (result.uint_32 > (4 * SMB3_DEFAULT_IOSIZE))) { cifs_errorf(fc, "%s: Invalid blocksize\n", __func__); goto cifs_parse_mount_err; } ctx->bsize = result.uint_32; ctx->got_bsize = true; break; case Opt_rasize: /* * readahead size realistically should never need to be * less than 1M (CIFS_DEFAULT_IOSIZE) or greater than 32M * (perhaps an exception should be considered in the * for the case of a large number of channels * when multichannel is negotiated) since that would lead * to plenty of parallel I/O in flight to the server. * Note that smaller read ahead sizes would * hurt performance of common tools like cp and scp * which often trigger sequential i/o with read ahead */ if ((result.uint_32 > (8 * SMB3_DEFAULT_IOSIZE)) || (result.uint_32 < CIFS_DEFAULT_IOSIZE)) { cifs_errorf(fc, "%s: Invalid rasize %d vs. %d\n", __func__, result.uint_32, SMB3_DEFAULT_IOSIZE); goto cifs_parse_mount_err; } ctx->rasize = result.uint_32; break; case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; ctx->vol_rsize = ctx->rsize; break; case Opt_wsize: ctx->wsize = result.uint_32; ctx->got_wsize = true; if (ctx->wsize % PAGE_SIZE != 0) { ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); if (ctx->wsize == 0) { ctx->wsize = PAGE_SIZE; cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); } else { cifs_dbg(VFS, "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", ctx->wsize, PAGE_SIZE); } } ctx->vol_wsize = ctx->wsize; break; case Opt_acregmax: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { cifs_errorf(fc, "acregmax too large\n"); goto cifs_parse_mount_err; } ctx->acregmax = HZ * result.uint_32; break; case Opt_acdirmax: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { cifs_errorf(fc, "acdirmax too large\n"); goto cifs_parse_mount_err; } ctx->acdirmax = HZ * result.uint_32; break; case Opt_actimeo: if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { cifs_errorf(fc, "timeout too large\n"); goto cifs_parse_mount_err; } if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || (ctx->acregmax != CIFS_DEF_ACTIMEO)) { cifs_errorf(fc, "actimeo ignored since acregmax or acdirmax specified\n"); break; } ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; break; case Opt_closetimeo: if (result.uint_32 > SMB3_MAX_DCLOSETIMEO / HZ) { cifs_errorf(fc, "closetimeo too large\n"); goto cifs_parse_mount_err; } ctx->closetimeo = HZ * result.uint_32; break; case Opt_echo_interval: if (result.uint_32 < SMB_ECHO_INTERVAL_MIN || result.uint_32 > SMB_ECHO_INTERVAL_MAX) { cifs_errorf(fc, "echo interval is out of bounds\n"); goto cifs_parse_mount_err; } ctx->echo_interval = result.uint_32; break; case Opt_snapshot: ctx->snapshot_time = result.uint_64; break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { cifs_errorf(fc, "%s: Invalid max_credits value\n", __func__); goto cifs_parse_mount_err; } ctx->max_credits = result.uint_32; break; case Opt_max_channels: if (result.uint_32 < 1 || result.uint_32 > CIFS_MAX_CHANNELS) { cifs_errorf(fc, "%s: Invalid max_channels value, needs to be 1-%d\n", __func__, CIFS_MAX_CHANNELS); goto cifs_parse_mount_err; } ctx->max_channels = result.uint_32; /* If more than one channel requested ... they want multichan */ if (result.uint_32 > 1) ctx->multichannel = true; break; case Opt_max_cached_dirs: if (result.uint_32 < 1) { cifs_errorf(fc, "%s: Invalid max_cached_dirs, needs to be 1 or more\n", __func__); goto cifs_parse_mount_err; } ctx->max_cached_dirs = result.uint_32; break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { cifs_errorf(fc, "Invalid handle cache timeout, longer than 16 minutes\n"); goto cifs_parse_mount_err; } break; case Opt_source: kfree(ctx->UNC); ctx->UNC = NULL; switch (smb3_parse_devname(param->string, ctx)) { case 0: break; case -ENOMEM: cifs_errorf(fc, "Unable to allocate memory for devname\n"); goto cifs_parse_mount_err; case -EINVAL: cifs_errorf(fc, "Malformed UNC in devname\n"); goto cifs_parse_mount_err; default: cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } ctx->source = smb3_fs_context_fullpath(ctx, '/'); if (IS_ERR(ctx->source)) { ctx->source = NULL; cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } fc->source = kstrdup(ctx->source, GFP_KERNEL); if (fc->source == NULL) { cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } hostname = extract_hostname(ctx->UNC); if (IS_ERR(hostname)) { cifs_errorf(fc, "Cannot extract hostname from UNC string\n"); goto cifs_parse_mount_err; } /* last byte, type, is 0x20 for servr type */ memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); for (i = 0; i < RFC1001_NAME_LEN && hostname[i] != 0; i++) ctx->target_rfc1001_name[i] = toupper(hostname[i]); kfree(hostname); break; case Opt_user: kfree(ctx->username); ctx->username = NULL; if (ctx->nullauth) break; if (strlen(param->string) == 0) { /* null user, ie. anonymous authentication */ ctx->nullauth = 1; break; } if (strnlen(param->string, CIFS_MAX_USERNAME_LEN) > CIFS_MAX_USERNAME_LEN) { pr_warn("username too long\n"); goto cifs_parse_mount_err; } ctx->username = kstrdup(param->string, GFP_KERNEL); if (ctx->username == NULL) { cifs_errorf(fc, "OOM when copying username string\n"); goto cifs_parse_mount_err; } break; case Opt_pass: kfree_sensitive(ctx->password); ctx->password = NULL; if (strlen(param->string) == 0) break; ctx->password = kstrdup(param->string, GFP_KERNEL); if (ctx->password == NULL) { cifs_errorf(fc, "OOM when copying password string\n"); goto cifs_parse_mount_err; } break; case Opt_pass2: kfree_sensitive(ctx->password2); ctx->password2 = NULL; if (strlen(param->string) == 0) break; ctx->password2 = kstrdup(param->string, GFP_KERNEL); if (ctx->password2 == NULL) { cifs_errorf(fc, "OOM when copying password2 string\n"); goto cifs_parse_mount_err; } break; case Opt_ip: if (strlen(param->string) == 0) { ctx->got_ip = false; break; } if (!cifs_convert_address((struct sockaddr *)&ctx->dstaddr, param->string, strlen(param->string))) { pr_err("bad ip= option (%s)\n", param->string); goto cifs_parse_mount_err; } ctx->got_ip = true; break; case Opt_domain: if (strnlen(param->string, CIFS_MAX_DOMAINNAME_LEN) == CIFS_MAX_DOMAINNAME_LEN) { pr_warn("domain name too long\n"); goto cifs_parse_mount_err; } kfree(ctx->domainname); ctx->domainname = kstrdup(param->string, GFP_KERNEL); if (ctx->domainname == NULL) { cifs_errorf(fc, "OOM when copying domainname string\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); break; case Opt_srcaddr: if (!cifs_convert_address( (struct sockaddr *)&ctx->srcaddr, param->string, strlen(param->string))) { pr_warn("Could not parse srcaddr: %s\n", param->string); goto cifs_parse_mount_err; } break; case Opt_iocharset: if (strnlen(param->string, 1024) >= 65) { pr_warn("iocharset name too long\n"); goto cifs_parse_mount_err; } if (strncasecmp(param->string, "default", 7) != 0) { kfree(ctx->iocharset); ctx->iocharset = kstrdup(param->string, GFP_KERNEL); if (ctx->iocharset == NULL) { cifs_errorf(fc, "OOM when copying iocharset string\n"); goto cifs_parse_mount_err; } } /* if iocharset not set then load_nls_default * is used by caller */ cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); break; case Opt_netbiosname: memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); /* * FIXME: are there cases in which a comma can * be valid in workstation netbios name (and * need special handling)? */ for (i = 0; i < RFC1001_NAME_LEN; i++) { /* don't ucase netbiosname for user */ if (param->string[i] == 0) break; ctx->source_rfc1001_name[i] = param->string[i]; } /* The string has 16th byte zero still from * set at top of the function */ if (i == RFC1001_NAME_LEN && param->string[i] != 0) pr_warn("netbiosname longer than 15 truncated\n"); break; case Opt_servern: /* last byte, type, is 0x20 for servr type */ memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); /* * BB are there cases in which a comma can be valid in this * workstation netbios name (and need special handling)? */ /* user or mount helper must uppercase the netbios name */ for (i = 0; i < 15; i++) { if (param->string[i] == 0) break; ctx->target_rfc1001_name[i] = param->string[i]; } /* The string has 16th byte zero still from set at top of function */ if (i == RFC1001_NAME_LEN && param->string[i] != 0) pr_warn("server netbiosname longer than 15 truncated\n"); break; case Opt_nbsessinit: ctx->rfc1001_sessinit = !result.negated; cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit); break; case Opt_ver: /* version of mount userspace tools, not dialect */ /* If interface changes in mount.cifs bump to new ver */ if (strncasecmp(param->string, "1", 1) == 0) { if (strlen(param->string) > 1) { pr_warn("Bad mount helper ver=%s. Did you want SMB1 (CIFS) dialect and mean to type vers=1.0 instead?\n", param->string); goto cifs_parse_mount_err; } /* This is the default */ break; } /* For all other value, error */ pr_warn("Invalid mount helper version specified\n"); goto cifs_parse_mount_err; case Opt_vers: /* protocol version (dialect) */ if (cifs_parse_smb_version(fc, param->string, ctx, is_smb3) != 0) goto cifs_parse_mount_err; ctx->got_version = true; break; case Opt_sec: if (cifs_parse_security_flavors(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_upcalltarget: if (cifs_parse_upcall_target(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_cache: if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_witness: #ifndef CONFIG_CIFS_SWN_UPCALL cifs_errorf(fc, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); goto cifs_parse_mount_err; #endif ctx->witness = true; pr_warn_once("Witness protocol support is experimental\n"); break; case Opt_unicode: ctx->unicode = !result.negated; cifs_dbg(FYI, "unicode set to %d\n", ctx->unicode); break; case Opt_rootfs: #ifndef CONFIG_CIFS_ROOT cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n"); goto cifs_parse_mount_err; #endif ctx->rootfs = true; break; case Opt_posixpaths: if (result.negated) ctx->posix_paths = 0; else ctx->posix_paths = 1; break; case Opt_unix: if (result.negated) { if (ctx->linux_ext == 1) pr_warn_once("conflicting posix mount options specified\n"); ctx->linux_ext = 0; ctx->no_linux_ext = 1; } else { if (ctx->no_linux_ext == 1) pr_warn_once("conflicting posix mount options specified\n"); ctx->linux_ext = 1; ctx->no_linux_ext = 0; } break; case Opt_nocase: ctx->nocase = 1; break; case Opt_brl: if (result.negated) { /* * turn off mandatory locking in mode * if remote locking is turned off since the * local vfs will do advisory */ if (ctx->file_mode == (S_IALLUGO & ~(S_ISUID | S_IXGRP))) ctx->file_mode = S_IALLUGO; ctx->nobrl = 1; } else ctx->nobrl = 0; break; case Opt_handlecache: if (result.negated) ctx->nohandlecache = 1; else ctx->nohandlecache = 0; break; case Opt_forcemandatorylock: ctx->mand_lock = 1; break; case Opt_setuids: ctx->setuids = result.negated; break; case Opt_intr: ctx->intr = !result.negated; break; case Opt_setuidfromacl: ctx->setuidfromacl = 1; break; case Opt_strictsync: ctx->nostrictsync = result.negated; break; case Opt_serverino: ctx->server_ino = !result.negated; break; case Opt_rwpidforward: ctx->rwpidforward = 1; break; case Opt_modesid: ctx->mode_ace = 1; break; case Opt_cifsacl: ctx->cifs_acl = !result.negated; break; case Opt_acl: ctx->no_psx_acl = result.negated; break; case Opt_locallease: ctx->local_lease = 1; break; case Opt_sign: ctx->sign = true; break; case Opt_ignore_signature: ctx->sign = true; ctx->ignore_signature = true; break; case Opt_seal: /* we do not do the following in secFlags because seal * is a per tree connection (mount) not a per socket * or per-smb connection option in the protocol * vol->secFlg |= CIFSSEC_MUST_SEAL; */ ctx->seal = 1; break; case Opt_noac: pr_warn("Mount option noac not supported. Instead set /proc/fs/cifs/LookupCacheEnabled to 0\n"); break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE cifs_errorf(fc, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif ctx->fsc = true; break; case Opt_mfsymlinks: ctx->mfsymlinks = true; break; case Opt_multiuser: ctx->multiuser = true; break; case Opt_sloppy: ctx->sloppy = true; break; case Opt_nosharesock: ctx->nosharesock = true; break; case Opt_persistent: if (result.negated) { ctx->nopersistent = true; if (ctx->persistent) { cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } else { ctx->persistent = true; if ((ctx->nopersistent) || (ctx->resilient)) { cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } break; case Opt_resilient: if (result.negated) { ctx->resilient = false; /* already the default */ } else { ctx->resilient = true; if (ctx->persistent) { cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } break; case Opt_tcp_nodelay: /* tcp nodelay should not usually be needed since we CORK/UNCORK the socket */ if (result.negated) ctx->sockopt_tcp_nodelay = false; else ctx->sockopt_tcp_nodelay = true; break; case Opt_domainauto: ctx->domainauto = true; break; case Opt_rdma: ctx->rdma = true; break; case Opt_reparse: if (parse_reparse_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; case Opt_nativesocket: ctx->nonativesocket = result.negated; break; case Opt_symlink: if (parse_symlink_flavor(fc, param->string, ctx)) goto cifs_parse_mount_err; break; case Opt_symlinkroot: if (param->string[0] != '/') { cifs_errorf(fc, "symlinkroot mount options must be absolute path\n"); goto cifs_parse_mount_err; } kfree(ctx->symlinkroot); ctx->symlinkroot = kstrdup(param->string, GFP_KERNEL); if (!ctx->symlinkroot) goto cifs_parse_mount_err; break; } /* case Opt_ignore: - is ignored as expected ... */ if (ctx->multiuser && ctx->upcall_target == UPTARGET_MOUNT) { cifs_errorf(fc, "multiuser mount option not supported with upcalltarget set as 'mount'\n"); goto cifs_parse_mount_err; } /* * By default resolve all native absolute symlinks relative to "/mnt/". * Same default has drvfs driver running in WSL for resolving SMB shares. */ if (!ctx->symlinkroot) ctx->symlinkroot = kstrdup("/mnt/", GFP_KERNEL); return 0; cifs_parse_mount_err: kfree_sensitive(ctx->password); ctx->password = NULL; kfree_sensitive(ctx->password2); ctx->password2 = NULL; return -EINVAL; } enum cifs_symlink_type get_cifs_symlink_type(struct cifs_sb_info *cifs_sb) { if (cifs_sb->ctx->symlink_type == CIFS_SYMLINK_TYPE_DEFAULT) { if (cifs_sb->ctx->mfsymlinks) return CIFS_SYMLINK_TYPE_MFSYMLINKS; else if (cifs_sb->ctx->sfu_emul) return CIFS_SYMLINK_TYPE_SFU; else if (cifs_sb->ctx->linux_ext && !cifs_sb->ctx->no_linux_ext) return CIFS_SYMLINK_TYPE_UNIX; else if (cifs_sb->ctx->reparse_type != CIFS_REPARSE_TYPE_NONE) return CIFS_SYMLINK_TYPE_NATIVE; else return CIFS_SYMLINK_TYPE_NONE; } else { return cifs_sb->ctx->symlink_type; } } int smb3_init_fs_context(struct fs_context *fc) { struct smb3_fs_context *ctx; char *nodename = utsname()->nodename; int i; ctx = kzalloc(sizeof(struct smb3_fs_context), GFP_KERNEL); if (unlikely(!ctx)) return -ENOMEM; strscpy(ctx->workstation_name, nodename, sizeof(ctx->workstation_name)); /* * does not have to be perfect mapping since field is * informational, only used for servers that do not support * port 445 and it can be overridden at mount time */ memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) ctx->source_rfc1001_name[i] = toupper(nodename[i]); ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0; /* * null target name indicates to use *SMBSERVR default called name * if we end up sending RFC1001 session initialize */ ctx->target_rfc1001_name[0] = 0; ctx->rfc1001_sessinit = -1; /* autodetect based on port number */ ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); /* By default 4MB read ahead size, 1MB block size */ ctx->bsize = CIFS_DEFAULT_IOSIZE; /* can improve cp performance significantly */ ctx->rasize = 0; /* 0 = use default (ie negotiated rsize) for read ahead pages */ /* * default to SFM style remapping of seven reserved characters * unless user overrides it or we negotiate CIFS POSIX where * it is unnecessary. Can not simultaneously use more than one mapping * since then readdir could list files that open could not open */ ctx->remap = true; /* default to only allowing write access to owner of the mount */ ctx->dir_mode = ctx->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; /* ctx->retry default is 0 (i.e. "soft" limited retry not hard retry) */ /* default is always to request posix paths. */ ctx->posix_paths = 1; /* default to using server inode numbers where available */ ctx->server_ino = 1; /* default is to use strict cifs caching semantics */ ctx->strict_io = true; ctx->acregmax = CIFS_DEF_ACTIMEO; ctx->acdirmax = CIFS_DEF_ACTIMEO; ctx->closetimeo = SMB3_DEF_DCLOSETIMEO; ctx->max_cached_dirs = MAX_CACHED_FIDS; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ /* offer SMB2.1 and later (SMB3 etc). Secure and widely accepted */ ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; ctx->echo_interval = SMB_ECHO_INTERVAL_DEFAULT; /* default to no multichannel (single server connection) */ ctx->multichannel = false; ctx->max_channels = 1; ctx->backupuid_specified = false; /* no backup intent for a user */ ctx->backupgid_specified = false; /* no backup intent for a group */ ctx->retrans = 1; ctx->reparse_type = CIFS_REPARSE_TYPE_DEFAULT; ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; ctx->nonativesocket = 0; ctx->unicode = -1; /* autodetect, but prefer UNICODE mode */ /* * short int override_uid = -1; * short int override_gid = -1; * char *nodename = strdup(utsname()->nodename); * struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; */ fc->fs_private = ctx; fc->ops = &smb3_fs_context_ops; return 0; } void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx) { if (ctx == NULL) return; /* * Make sure this stays in sync with smb3_fs_context_dup() */ kfree(ctx->username); ctx->username = NULL; kfree_sensitive(ctx->password); ctx->password = NULL; kfree_sensitive(ctx->password2); ctx->password2 = NULL; kfree(ctx->server_hostname); ctx->server_hostname = NULL; kfree(ctx->UNC); ctx->UNC = NULL; kfree(ctx->source); ctx->source = NULL; kfree(ctx->domainname); ctx->domainname = NULL; kfree(ctx->nodename); ctx->nodename = NULL; kfree(ctx->iocharset); ctx->iocharset = NULL; kfree(ctx->prepath); ctx->prepath = NULL; kfree(ctx->leaf_fullpath); ctx->leaf_fullpath = NULL; kfree(ctx->dns_dom); ctx->dns_dom = NULL; kfree(ctx->symlinkroot); ctx->symlinkroot = NULL; } void smb3_cleanup_fs_context(struct smb3_fs_context *ctx) { if (!ctx) return; smb3_cleanup_fs_context_contents(ctx); kfree(ctx); } void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) { struct smb3_fs_context *ctx = cifs_sb->ctx; if (ctx->nodfs) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_DFS; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_DFS; if (ctx->noperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_PERM; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_PERM; if (ctx->setuids) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SET_UID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SET_UID; if (ctx->setuidfromacl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UID_FROM_ACL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_UID_FROM_ACL; if (ctx->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM; if (ctx->remap) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MAP_SFM_CHR; if (ctx->sfu_remap) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MAP_SPECIAL_CHR; if (ctx->no_xattr) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_XATTR; if (ctx->sfu_emul) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_UNX_EMUL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_UNX_EMUL; if (ctx->nobrl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_BRL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_BRL; if (ctx->nohandlecache) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_HANDLE_CACHE; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NO_HANDLE_CACHE; if (ctx->nostrictsync) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOSSYNC; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NOSSYNC; if (ctx->mand_lock) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NOPOSIXBRL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_NOPOSIXBRL; if (ctx->rwpidforward) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_RWPIDFORWARD; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_RWPIDFORWARD; if (ctx->mode_ace) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MODE_FROM_SID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MODE_FROM_SID; if (ctx->cifs_acl) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_ACL; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_ACL; if (ctx->backupuid_specified) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPUID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_BACKUPUID; if (ctx->backupgid_specified) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_CIFS_BACKUPGID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_CIFS_BACKUPGID; if (ctx->override_uid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_UID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_OVERR_UID; if (ctx->override_gid) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_OVERR_GID; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_OVERR_GID; if (ctx->dynperm) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DYNPERM; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_DYNPERM; if (ctx->fsc) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_FSCACHE; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_FSCACHE; if (ctx->multiuser) cifs_sb->mnt_cifs_flags |= (CIFS_MOUNT_MULTIUSER | CIFS_MOUNT_NO_PERM); else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MULTIUSER; if (ctx->strict_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_STRICT_IO; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_STRICT_IO; if (ctx->direct_io) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_DIRECT_IO; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_DIRECT_IO; if (ctx->mfsymlinks) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; else cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_MF_SYMLINKS; if (ctx->mfsymlinks) { if (ctx->sfu_emul) { /* * Our SFU ("Services for Unix") emulation allows now * creating new and reading existing SFU symlinks. * Older Linux kernel versions were not able to neither * read existing nor create new SFU symlinks. But * creating and reading SFU style mknod and FIFOs was * supported for long time. When "mfsymlinks" and * "sfu" are both enabled at the same time, it allows * reading both types of symlinks, but will only create * them with mfsymlinks format. This allows better * Apple compatibility, compatibility with older Linux * kernel clients (probably better for Samba too) * while still recognizing old Windows style symlinks. */ cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); } } cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN; return; } |
| 369 370 4673 4673 4672 4676 3884 3868 4676 58 4563 4555 4555 4553 49 4550 370 370 370 370 4290 4286 4290 4283 442 73 442 4 29 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H #include <linux/atomic.h> #include <linux/huge_mm.h> #include <linux/mm_types.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/userfaultfd_k.h> #include <linux/swapops.h> /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? * @folio: The folio to test. * * We would like to get this info without a page flag, but the state * needs to survive until the folio is last deleted from the LRU, which * could be as far down as __page_cache_release. * * Return: An integer (not a boolean!) used to sort a folio onto the * right LRU list and to account folios correctly. * 1 if @folio is a regular filesystem backed page cache folio * or a lazily freed anonymous folio (e.g. via MADV_FREE). * 0 if @folio is a normal anonymous folio, a tmpfs folio or otherwise * ram or swap backed folio. */ static inline int folio_is_file_lru(struct folio *folio) { return !folio_test_swapbacked(folio); } static inline int page_is_file_lru(struct page *page) { return folio_is_file_lru(page_folio(page)); } static __always_inline void __update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { struct pglist_data *pgdat = lruvec_pgdat(lruvec); lockdep_assert_held(&lruvec->lru_lock); WARN_ON_ONCE(nr_pages != (int)nr_pages); __mod_lruvec_state(lruvec, NR_LRU_BASE + lru, nr_pages); __mod_zone_page_state(&pgdat->node_zones[zid], NR_ZONE_LRU_BASE + lru, nr_pages); } static __always_inline void update_lru_size(struct lruvec *lruvec, enum lru_list lru, enum zone_type zid, long nr_pages) { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } /** * __folio_clear_lru_flags - Clear page lru flags before releasing a page. * @folio: The folio that was on lru and now has a zero reference. */ static __always_inline void __folio_clear_lru_flags(struct folio *folio) { VM_BUG_ON_FOLIO(!folio_test_lru(folio), folio); __folio_clear_lru(folio); /* this shouldn't happen, so leave the flags to bad_page() */ if (folio_test_active(folio) && folio_test_unevictable(folio)) return; __folio_clear_active(folio); __folio_clear_unevictable(folio); } /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. * * Return: The LRU list a folio should be on, as an index * into the array of LRU lists. */ static __always_inline enum lru_list folio_lru_list(struct folio *folio) { enum lru_list lru; VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio); if (folio_test_unevictable(folio)) return LRU_UNEVICTABLE; lru = folio_is_file_lru(folio) ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON; if (folio_test_active(folio)) lru += LRU_ACTIVE; return lru; } #ifdef CONFIG_LRU_GEN #ifdef CONFIG_LRU_GEN_ENABLED static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_TRUE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_likely(&lru_gen_caps[LRU_GEN_CORE]); } #else static inline bool lru_gen_enabled(void) { DECLARE_STATIC_KEY_FALSE(lru_gen_caps[NR_LRU_GEN_CAPS]); return static_branch_unlikely(&lru_gen_caps[LRU_GEN_CORE]); } #endif static inline bool lru_gen_in_fault(void) { return current->in_lru_fault; } static inline int lru_gen_from_seq(unsigned long seq) { return seq % MAX_NR_GENS; } static inline int lru_hist_from_seq(unsigned long seq) { return seq % NR_HIST_GENS; } static inline int lru_tier_from_refs(int refs, bool workingset) { VM_WARN_ON_ONCE(refs > BIT(LRU_REFS_WIDTH)); /* see the comment on MAX_NR_TIERS */ return workingset ? MAX_NR_TIERS - 1 : order_base_2(refs); } static inline int folio_lru_refs(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); if (!(flags & BIT(PG_referenced))) return 0; /* * Return the total number of accesses including PG_referenced. Also see * the comment on LRU_REFS_FLAGS. */ return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + 1; } static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; } static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen) { unsigned long max_seq = lruvec->lrugen.max_seq; VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); /* see the comment on MIN_NR_GENS */ return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1); } static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *folio, int old_gen, int new_gen) { int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(old_gen == -1 && new_gen == -1); if (old_gen >= 0) WRITE_ONCE(lrugen->nr_pages[old_gen][type][zone], lrugen->nr_pages[old_gen][type][zone] - delta); if (new_gen >= 0) WRITE_ONCE(lrugen->nr_pages[new_gen][type][zone], lrugen->nr_pages[new_gen][type][zone] + delta); /* addition */ if (old_gen < 0) { if (lru_gen_is_active(lruvec, new_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, delta); return; } /* deletion */ if (new_gen < 0) { if (lru_gen_is_active(lruvec, old_gen)) lru += LRU_ACTIVE; __update_lru_size(lruvec, lru, zone, -delta); return; } /* promotion */ if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) { __update_lru_size(lruvec, lru, zone, -delta); __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta); } /* demotion requires isolation, e.g., lru_deactivate_fn() */ VM_WARN_ON_ONCE(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen)); } static inline unsigned long lru_gen_folio_seq(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { int gen; int type = folio_is_file_lru(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; /* * +-----------------------------------+-----------------------------------+ * | Accessed through page tables and | Accessed through file descriptors | * | promoted by folio_update_gen() | and protected by folio_inc_gen() | * +-----------------------------------+-----------------------------------+ * | PG_active (set while isolated) | | * +-----------------+-----------------+-----------------+-----------------+ * | PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS | * +-----------------------------------+-----------------------------------+ * |<---------- MIN_NR_GENS ---------->| | * |<---------------------------- MAX_NR_GENS ---------------------------->| */ if (folio_test_active(folio)) gen = MIN_NR_GENS - folio_test_workingset(folio); else if (reclaiming) gen = MAX_NR_GENS; else if ((!folio_is_file_lru(folio) && !folio_test_swapcache(folio)) || (folio_test_reclaim(folio) && (folio_test_dirty(folio) || folio_test_writeback(folio)))) gen = MIN_NR_GENS; else gen = MAX_NR_GENS - folio_test_workingset(folio); return max(READ_ONCE(lrugen->max_seq) - gen + 1, READ_ONCE(lrugen->min_seq[type])); } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long seq; unsigned long flags; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); if (folio_test_unevictable(folio) || !lrugen->enabled) return false; seq = lru_gen_folio_seq(lruvec, folio, reclaiming); gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { unsigned long flags; int gen = folio_lru_gen(folio); if (gen < 0) return false; VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); /* for folio_migrate_flags() */ flags = !reclaiming && lru_gen_is_active(lruvec, gen) ? BIT(PG_active) : 0; flags = set_mask_bits(&folio->flags, LRU_GEN_MASK, flags); gen = ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; lru_gen_update_size(lruvec, folio, gen, -1); list_del(&folio->lru); return true; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; set_mask_bits(&new->flags, LRU_REFS_MASK, refs); } #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) { return false; } static inline bool lru_gen_in_fault(void) { return false; } static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, bool reclaiming) { return false; } static inline void folio_migrate_refs(struct folio *new, struct folio *old) { } #endif /* CONFIG_LRU_GEN */ static __always_inline void lruvec_add_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, false)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); if (lru != LRU_UNEVICTABLE) list_add(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_add_folio(lruvec, folio, true)) return; update_lru_size(lruvec, lru, folio_zonenum(folio), folio_nr_pages(folio)); /* This is not expected to be used on LRU_UNEVICTABLE */ list_add_tail(&folio->lru, &lruvec->lists[lru]); } static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { enum lru_list lru = folio_lru_list(folio); if (lru_gen_del_folio(lruvec, folio, false)) return; if (lru != LRU_UNEVICTABLE) list_del(&folio->lru); update_lru_size(lruvec, lru, folio_zonenum(folio), -folio_nr_pages(folio)); } #ifdef CONFIG_ANON_VMA_NAME /* mmap_lock should be read-locked */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) { if (anon_name) kref_get(&anon_name->kref); } static inline void anon_vma_name_put(struct anon_vma_name *anon_name) { if (anon_name) kref_put(&anon_name->kref, anon_vma_name_free); } static inline struct anon_vma_name *anon_vma_name_reuse(struct anon_vma_name *anon_name) { /* Prevent anon_name refcount saturation early on */ if (kref_read(&anon_name->kref) < REFCOUNT_MAX) { anon_vma_name_get(anon_name); return anon_name; } return anon_vma_name_alloc(anon_name->name); } static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) { struct anon_vma_name *anon_name = anon_vma_name(orig_vma); if (anon_name) new_vma->anon_name = anon_vma_name_reuse(anon_name); } static inline void free_anon_vma_name(struct vm_area_struct *vma) { /* * Not using anon_vma_name because it generates a warning if mmap_lock * is not held, which might be the case here. */ anon_vma_name_put(vma->anon_name); } static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { if (anon_name1 == anon_name2) return true; return anon_name1 && anon_name2 && !strcmp(anon_name1->name, anon_name2->name); } #else /* CONFIG_ANON_VMA_NAME */ static inline void anon_vma_name_get(struct anon_vma_name *anon_name) {} static inline void anon_vma_name_put(struct anon_vma_name *anon_name) {} static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma, struct vm_area_struct *new_vma) {} static inline void free_anon_vma_name(struct vm_area_struct *vma) {} static inline bool anon_vma_name_eq(struct anon_vma_name *anon_name1, struct anon_vma_name *anon_name2) { return true; } #endif /* CONFIG_ANON_VMA_NAME */ static inline void init_tlb_flush_pending(struct mm_struct *mm) { atomic_set(&mm->tlb_flush_pending, 0); } static inline void inc_tlb_flush_pending(struct mm_struct *mm) { atomic_inc(&mm->tlb_flush_pending); /* * The only time this value is relevant is when there are indeed pages * to flush. And we'll only flush pages after changing them, which * requires the PTL. * * So the ordering here is: * * atomic_inc(&mm->tlb_flush_pending); * spin_lock(&ptl); * ... * set_pte_at(); * spin_unlock(&ptl); * * spin_lock(&ptl) * mm_tlb_flush_pending(); * .... * spin_unlock(&ptl); * * flush_tlb_range(); * atomic_dec(&mm->tlb_flush_pending); * * Where the increment if constrained by the PTL unlock, it thus * ensures that the increment is visible if the PTE modification is * visible. After all, if there is no PTE modification, nobody cares * about TLB flushes either. * * This very much relies on users (mm_tlb_flush_pending() and * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc * locks (PPC) the unlock of one doesn't order against the lock of * another PTL. * * The decrement is ordered by the flush_tlb_range(), such that * mm_tlb_flush_pending() will not return false unless all flushes have * completed. */ } static inline void dec_tlb_flush_pending(struct mm_struct *mm) { /* * See inc_tlb_flush_pending(). * * This cannot be smp_mb__before_atomic() because smp_mb() simply does * not order against TLB invalidate completion, which is what we need. * * Therefore we must rely on tlb_flush_*() to guarantee order. */ atomic_dec(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_pending(struct mm_struct *mm) { /* * Must be called after having acquired the PTL; orders against that * PTLs release and therefore ensures that if we observe the modified * PTE we must also observe the increment from inc_tlb_flush_pending(). * * That is, it only guarantees to return true if there is a flush * pending for _this_ PTL. */ return atomic_read(&mm->tlb_flush_pending); } static inline bool mm_tlb_flush_nested(struct mm_struct *mm) { /* * Similar to mm_tlb_flush_pending(), we must have acquired the PTL * for which there is a TLB flush pending in order to guarantee * we've seen both that PTE modification and the increment. * * (no requirement on actually still holding the PTL, that is irrelevant) */ return atomic_read(&mm->tlb_flush_pending) > 1; } #ifdef CONFIG_MMU /* * Computes the pte marker to copy from the given source entry into dst_vma. * If no marker should be copied, returns 0. * The caller should insert a new pte created with make_pte_marker(). */ static inline pte_marker copy_pte_marker( swp_entry_t entry, struct vm_area_struct *dst_vma) { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. */ pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) dstm |= PTE_MARKER_UFFD_WP; return dstm; } #endif /* * If this pte is wr-protected by uffd-wp in any form, arm the special pte to * replace a none pte. NOTE! This should only be called when *pte is already * cleared so we will never accidentally replace something valuable. Meanwhile * none pte also means we are not demoting the pte so tlb flushed is not needed. * E.g., when pte cleared the caller should have taken care of the tlb flush. * * Must be called with pgtable lock held so that no thread will see the none * pte, and if they see it, they'll fault and serialize at the pgtable lock. * * Returns true if an uffd-wp pte was installed, false otherwise. */ static inline bool pte_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, pte_t pteval) { #ifdef CONFIG_PTE_MARKER_UFFD_WP bool arm_uffd_pte = false; /* The current status of the pte should be "cleared" before calling */ WARN_ON_ONCE(!pte_none(ptep_get(pte))); /* * NOTE: userfaultfd_wp_unpopulated() doesn't need this whole * thing, because when zapping either it means it's dropping the * page, or in TTU where the present pte will be quickly replaced * with a swap pte. There's no way of leaking the bit. */ if (vma_is_anonymous(vma) || !userfaultfd_wp(vma)) return false; /* A uffd-wp wr-protected normal pte */ if (unlikely(pte_present(pteval) && pte_uffd_wp(pteval))) arm_uffd_pte = true; /* * A uffd-wp wr-protected swap pte. Note: this should even cover an * existing pte marker with uffd-wp bit set. */ if (unlikely(pte_swp_uffd_wp_any(pteval))) arm_uffd_pte = true; if (unlikely(arm_uffd_pte)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); return true; } #endif return false; } static inline bool vma_has_recency(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_SEQ_READ | VM_RAND_READ)) return false; if (vma->vm_file && (vma->vm_file->f_mode & FMODE_NOREUSE)) return false; return true; } #endif |
| 379 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 | /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H #include <linux/minmax.h> #include <linux/types.h> #include <uapi/linux/blk-crypto.h> enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, BLK_ENCRYPTION_MODE_SM4_XTS, BLK_ENCRYPTION_MODE_MAX, }; /* * Supported types of keys. Must be bitflags due to their use in * blk_crypto_profile::key_types_supported. */ enum blk_crypto_key_type { /* * Raw keys (i.e. "software keys"). These keys are simply kept in raw, * plaintext form in kernel memory. */ BLK_CRYPTO_KEY_TYPE_RAW = 0x1, /* * Hardware-wrapped keys. These keys are only present in kernel memory * in ephemerally-wrapped form, and they can only be unwrapped by * dedicated hardware. For details, see the "Hardware-wrapped keys" * section of Documentation/block/inline-encryption.rst. */ BLK_CRYPTO_KEY_TYPE_HW_WRAPPED = 0x2, }; /* * Currently the maximum raw key size is 64 bytes, as that is the key size of * BLK_ENCRYPTION_MODE_AES_256_XTS which takes the longest key. * * The maximum hardware-wrapped key size depends on the hardware's key wrapping * algorithm, which is a hardware implementation detail, so it isn't precisely * specified. But currently 128 bytes is plenty in practice. Implementations * are recommended to wrap a 32-byte key for the hardware KDF with AES-256-GCM, * which should result in a size closer to 64 bytes than 128. * * Both of these values can trivially be increased if ever needed. */ #define BLK_CRYPTO_MAX_RAW_KEY_SIZE 64 #define BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE 128 #define BLK_CRYPTO_MAX_ANY_KEY_SIZE \ MAX(BLK_CRYPTO_MAX_RAW_KEY_SIZE, BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE) /* * Size of the "software secret" which can be derived from a hardware-wrapped * key. This is currently always 32 bytes. Note, the choice of 32 bytes * assumes that the software secret is only used directly for algorithms that * don't require more than a 256-bit key to get the desired security strength. * If it were to be used e.g. directly as an AES-256-XTS key, then this would * need to be increased (which is possible if hardware supports it, but care * would need to be taken to avoid breaking users who need exactly 32 bytes). */ #define BLK_CRYPTO_SW_SECRET_SIZE 32 /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for * @data_unit_size: the data unit size for all encryption/decryptions with this * key. This is the size in bytes of each individual plaintext and * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key * @key_type: the type of this key -- either raw or hardware-wrapped */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; enum blk_crypto_key_type key_type; }; /** * struct blk_crypto_key - an inline encryption key * @crypto_cfg: the crypto mode, data unit size, key type, and other * characteristics of this key and how it will be used * @data_unit_size_bits: log2 of data_unit_size * @size: size of this key in bytes. The size of a raw key is fixed for a given * crypto mode, but the size of a hardware-wrapped key can vary. * @bytes: the bytes of this key. Only the first @size bytes are significant. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed * and it has been evicted from all devices on which it may have been used. */ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; u8 bytes[BLK_CRYPTO_MAX_ANY_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 #define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) /** * struct bio_crypt_ctx - an inline encryption context * @bc_key: the key, algorithm, and data unit size to use * @bc_dun: the data unit number (starting IV) to use * * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for * write requests) or decrypted (for read requests) inline by the storage device * or controller, or by the crypto API fallback. */ struct bio_crypt_ctx { const struct blk_crypto_key *bc_key; u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; }; #include <linux/blk_types.h> #include <linux/blkdev.h> #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) { return bio->bi_crypt_context; } void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *key_bytes, size_t key_size, enum blk_crypto_key_type key_type, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); int blk_crypto_start_using_key(struct block_device *bdev, const struct blk_crypto_key *key); void blk_crypto_evict_key(struct block_device *bdev, const struct blk_crypto_key *key); bool blk_crypto_config_supported_natively(struct block_device *bdev, const struct blk_crypto_config *cfg); bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg); int blk_crypto_derive_sw_secret(struct block_device *bdev, const u8 *eph_key, size_t eph_key_size, u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context * @dst: destination bio * @src: source bio * @gfp_mask: memory allocation flags * * If @src has an encryption context, clone it to @dst. * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { if (bio_has_crypt_ctx(src)) return __bio_crypt_clone(dst, src, gfp_mask); return 0; } #endif /* __LINUX_BLK_CRYPTO_H */ |
| 35 36 28 20 28 64 64 61 27 8 8 8 64 28 28 28 63 64 64 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 | // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" #include "debug.h" #include "extents.h" #include "extent_update.h" /* * This counts the number of iterators to the alloc & ec btrees we'll need * inserting/removing this extent: */ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; unsigned ret = 0, lru = 0; bkey_extent_entry_for_each(ptrs, entry) { switch (__extent_entry_type(entry)) { case BCH_EXTENT_ENTRY_ptr: /* Might also be updating LRU btree */ if (entry->ptr.cached) lru++; fallthrough; case BCH_EXTENT_ENTRY_stripe_ptr: ret++; } } /* * Updating keys in the alloc btree may also update keys in the * freespace or discard btrees: */ return lru + ret * 2; } static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, struct bpos *end, unsigned *nr_iters, unsigned max_iters) { int ret = 0, ret2 = 0; if (*nr_iters >= max_iters) { *end = bpos_min(*end, k.k->p); ret = 1; } switch (k.k->type) { case KEY_TYPE_extent: case KEY_TYPE_reflink_v: *nr_iters += bch2_bkey_nr_alloc_ptrs(k); if (*nr_iters >= max_iters) { *end = bpos_min(*end, k.k->p); ret = 1; } break; case KEY_TYPE_reflink_p: { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); u64 idx = REFLINK_P_IDX(p.v); unsigned sectors = bpos_min(*end, p.k->p).offset - bkey_start_offset(p.k); struct btree_iter iter; struct bkey_s_c r_k; for_each_btree_key_norestart(trans, iter, BTREE_ID_reflink, POS(0, idx + offset), BTREE_ITER_slots, r_k, ret2) { if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) break; /* extent_update_to_keys(), for the reflink_v update */ *nr_iters += 1; *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); if (*nr_iters >= max_iters) { struct bpos pos = bkey_start_pos(k.k); pos.offset += min_t(u64, k.k->size, r_k.k->p.offset - idx); *end = bpos_min(*end, pos); ret = 1; break; } } bch2_trans_iter_exit(trans, &iter); break; } } return ret2 ?: ret; } #define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *insert, struct bpos *end) { struct btree_iter copy; struct bkey_s_c k; unsigned nr_iters = 0; int ret; ret = bch2_btree_iter_traverse(trans, iter); if (ret) return ret; *end = insert->k.p; /* extent_update_to_keys(): */ nr_iters += 1; ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, &nr_iters, EXTENT_ITERS_MAX / 2); if (ret < 0) return ret; bch2_trans_copy_iter(trans, ©, iter); for_each_btree_key_max_continue_norestart(trans, copy, insert->k.p, 0, k, ret) { unsigned offset = 0; if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) offset = bkey_start_offset(&insert->k) - bkey_start_offset(k.k); /* extent_handle_overwrites(): */ switch (bch2_extent_overlap(&insert->k, k.k)) { case BCH_EXTENT_OVERLAP_ALL: case BCH_EXTENT_OVERLAP_FRONT: nr_iters += 1; break; case BCH_EXTENT_OVERLAP_BACK: case BCH_EXTENT_OVERLAP_MIDDLE: nr_iters += 2; break; } ret = count_iters_for_insert(trans, k, offset, end, &nr_iters, EXTENT_ITERS_MAX); if (ret) break; } bch2_trans_iter_exit(trans, ©); return ret < 0 ? ret : 0; } int bch2_extent_trim_atomic(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k) { struct bpos end; int ret; ret = bch2_extent_atomic_end(trans, iter, k, &end); if (ret) return ret; bch2_cut_back(end, k); return 0; } |
| 4 4 4 13 4 4 4 4 4 4 4 4 3 1 4 4 4 4 2 2 2 2 45 2 2 2 2 2 2 2 2 2 2 2 2 2 2 7 7 7 5 2 7 7 2 54 17 1 18 30 13 13 13 13 13 13 6 13 8 6 6 8 2 5 9 5 6 8 13 13 12 1 12 2 13 13 7 6 13 30 5 43 10 10 10 10 10 1 10 10 53 53 186 184 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 | // SPDX-License-Identifier: GPL-2.0 /* * Some low level IO code, and hacks for various block layer limitations * * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com> * Copyright 2012 Google, Inc. */ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" #include "clock.h" #include "compress.h" #include "data_update.h" #include "disk_groups.h" #include "ec.h" #include "error.h" #include "io_read.h" #include "io_misc.h" #include "io_write.h" #include "reflink.h" #include "subvolume.h" #include "trace.h" #include <linux/random.h> #include <linux/sched/mm.h> #ifdef CONFIG_BCACHEFS_DEBUG static unsigned bch2_read_corrupt_ratio; module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); MODULE_PARM_DESC(read_corrupt_ratio, ""); #endif #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) { const struct bch_devs_mask *devs; unsigned d, nr = 0, total = 0; u64 now = local_clock(), last; s64 congested; struct bch_dev *ca; if (!target) return false; rcu_read_lock(); devs = bch2_target_to_mask(c, target) ?: &c->rw_devs[BCH_DATA_user]; for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ca = rcu_dereference(c->devs[d]); if (!ca) continue; congested = atomic_read(&ca->congested); last = READ_ONCE(ca->congested_last); if (time_after64(now, last)) congested -= (now - last) >> 12; total += max(congested, 0LL); nr++; } rcu_read_unlock(); return get_random_u32_below(nr * CONGESTED_MAX) < total; } #else static bool bch2_target_congested(struct bch_fs *c, u16 target) { return false; } #endif /* Cache promotion on read */ struct promote_op { struct rcu_head rcu; u64 start_time; struct rhash_head hash; struct bpos pos; struct work_struct work; struct data_update write; struct bio_vec bi_inline_vecs[]; /* must be last */ }; static const struct rhashtable_params bch_promote_params = { .head_offset = offsetof(struct promote_op, hash), .key_offset = offsetof(struct promote_op, pos), .key_len = sizeof(struct bpos), .automatic_shrinking = true, }; static inline bool have_io_error(struct bch_io_failures *failed) { return failed && failed->nr; } static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) { EBUG_ON(rbio->split); return rbio->data_update ? container_of(rbio, struct data_update, rbio) : NULL; } static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) { struct data_update *u = rbio_data_update(orig); if (!u) return false; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { if (ptr->dev == dev && u->data_opts.rewrite_ptrs & BIT(i)) return true; i++; } return false; } static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, unsigned flags, struct bch_io_failures *failed) { if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); if (!(flags & BCH_READ_may_promote)) return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) return -BCH_ERR_nopromote_already_promoted; if (bkey_extent_is_unwritten(k)) return -BCH_ERR_nopromote_unwritten; if (bch2_target_congested(c, opts.promote_target)) return -BCH_ERR_nopromote_congested; } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) return -BCH_ERR_nopromote_in_flight; return 0; } static noinline void promote_free(struct bch_read_bio *rbio) { struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); struct bch_fs *c = rbio->c; int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params); BUG_ON(ret); bch2_data_update_exit(&op->write); bch2_write_ref_put(c, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } static void promote_done(struct bch_write_op *wop) { struct promote_op *op = container_of(wop, struct promote_op, write.op); struct bch_fs *c = op->write.rbio.c; bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); promote_free(&op->write.rbio); } static void promote_start_work(struct work_struct *work) { struct promote_op *op = container_of(work, struct promote_op, work); bch2_data_update_read_done(&op->write); } static noinline void promote_start(struct bch_read_bio *rbio) { struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); INIT_WORK(&op->work, promote_start_work); queue_work(rbio->c->write_ref_wq, &op->work); } static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, enum btree_id btree_id, struct bkey_s_c k, struct bpos pos, struct extent_ptr_decoded *pick, unsigned sectors, struct bch_read_bio *orig, struct bch_io_failures *failed) { struct bch_fs *c = trans->c; int ret; struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; if (!have_io_error(failed)) { update_opts.target = orig->opts.promote_target; update_opts.extra_replicas = 1; update_opts.write_flags |= BCH_WRITE_cached; update_opts.write_flags |= BCH_WRITE_only_specified_devs; } else { update_opts.target = orig->opts.foreground_target; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned ptr_bit = 1; bkey_for_each_ptr(ptrs, ptr) { if (bch2_dev_io_failures(failed, ptr->dev) && !ptr_being_rewritten(orig, ptr->dev)) update_opts.rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } if (!update_opts.rewrite_ptrs) return NULL; } if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) return ERR_PTR(-BCH_ERR_nopromote_no_writes); struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { ret = -BCH_ERR_nopromote_enomem; goto err_put; } op->start_time = local_clock(); op->pos = pos; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { ret = -BCH_ERR_nopromote_in_flight; goto err; } ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), &orig->opts, update_opts, btree_id, k); op->write.type = BCH_DATA_UPDATE_promote; /* * possible errors: -BCH_ERR_nocow_lock_blocked, * -BCH_ERR_ENOSPC_disk_reservation: */ if (ret) goto err_remove_hash; rbio_init_fragment(&op->write.rbio.bio, orig); op->write.rbio.bounce = true; op->write.rbio.promote = true; op->write.op.end_io = promote_done; return &op->write.rbio; err_remove_hash: BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, bch_promote_params)); err: bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); err_put: bch2_write_ref_put(c, BCH_WRITE_REF_promote); return ERR_PTR(ret); } noinline static struct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, unsigned flags, struct bch_read_bio *orig, bool *bounce, bool *read_full, struct bch_io_failures *failed) { struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're * recovering from an io/checksum error */ bool promote_full = (have_io_error(failed) || *read_full || READ_ONCE(c->opts.promote_whole_extents)); /* data might have to be decompressed in the write path: */ unsigned sectors = promote_full ? max(pick->crc.compressed_size, pick->crc.live_size) : bvec_iter_sectors(iter); struct bpos pos = promote_full ? bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); int ret; ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; struct bch_read_bio *promote = __promote_alloc(trans, k.k->type == KEY_TYPE_reflink_v ? BTREE_ID_reflink : BTREE_ID_extents, k, pos, pick, sectors, orig, failed); if (!promote) return NULL; ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; *bounce = true; *read_full = promote_full; return promote; nopromote: trace_io_read_nopromote(c, ret); return NULL; } /* Read */ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { int ret = lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, out, (subvol_inum) { rbio->subvol, read_pos.inode }, read_pos.offset << 9)); if (ret) return ret; if (rbio->data_update) prt_str(out, "(internal move) "); return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, RBIO_CONTEXT_UNBOUND, }; static inline struct bch_read_bio * bch2_rbio_parent(struct bch_read_bio *rbio) { return rbio->split ? rbio->parent : rbio; } __always_inline static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, enum rbio_context context, struct workqueue_struct *wq) { if (context <= rbio->context) { fn(&rbio->work); } else { rbio->work.func = fn; rbio->context = context; queue_work(wq, &rbio->work); } } static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); if (rbio->have_ioref) { struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); percpu_ref_put(&ca->io_ref[READ]); } if (rbio->split) { struct bch_read_bio *parent = rbio->parent; if (unlikely(rbio->promote)) { if (!rbio->bio.bi_status) promote_start(rbio); else promote_free(rbio); } else { if (rbio->bounce) bch2_bio_free_pages_pool(rbio->c, &rbio->bio); bio_put(&rbio->bio); } rbio = parent; } return rbio; } /* * Only called on a top level bch_read_bio to complete an entire read request, * not a split: */ static void bch2_rbio_done(struct bch_read_bio *rbio) { if (rbio->start_time) bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], rbio->start_time); bio_endio(&rbio->bio); } static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, struct bch_io_failures *failed, unsigned flags) { struct data_update *u = container_of(rbio, struct data_update, rbio); retry: bch2_trans_begin(trans); struct btree_iter iter; struct bkey_s_c k; int ret = lockrestart_do(trans, bkey_err(k = bch2_bkey_get_iter(trans, &iter, u->btree_id, bkey_start_pos(&u->k.k->k), 0))); if (ret) goto err; if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ rbio->ret = -BCH_ERR_data_read_key_overwritten; goto err; } ret = __bch2_read_extent(trans, rbio, bvec_iter, bkey_start_pos(&u->k.k->k), u->btree_id, bkey_i_to_s_c(u->k.k), 0, failed, flags, -1); err: bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) goto retry; if (ret) { rbio->bio.bi_status = BLK_STS_IOERR; rbio->ret = ret; } BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); return ret; } static void bch2_rbio_retry(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct bvec_iter iter = rbio->bvec_iter; unsigned flags = rbio->flags; subvol_inum inum = { .subvol = rbio->subvol, .inum = rbio->read_pos.inode, }; struct bch_io_failures failed = { .nr = 0 }; int orig_error = rbio->ret; struct btree_trans *trans = bch2_trans_get(c); trace_io_read_retry(&rbio->bio); this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], bvec_iter_sectors(rbio->bvec_iter)); if (bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(&failed, &rbio->pick, rbio->ret == -BCH_ERR_data_read_retry_csum_err); if (!rbio->split) { rbio->bio.bi_status = 0; rbio->ret = 0; } unsigned subvol = rbio->subvol; struct bpos read_pos = rbio->read_pos; rbio = bch2_rbio_free(rbio); flags |= BCH_READ_in_retry; flags &= ~BCH_READ_may_promote; flags &= ~BCH_READ_last_fragment; flags |= BCH_READ_must_clone; int ret = rbio->data_update ? bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) : __bch2_read(trans, rbio, iter, inum, &failed, flags); if (ret) { rbio->ret = ret; rbio->bio.bi_status = BLK_STS_IOERR; } else if (orig_error != -BCH_ERR_data_read_retry_csum_err_maybe_userspace && orig_error != -BCH_ERR_data_read_ptr_stale_race && !failed.nr) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, (subvol_inum) { subvol, read_pos.inode }, read_pos.offset << 9)); if (rbio->data_update) prt_str(&buf, "(internal move) "); prt_str(&buf, "successful retry"); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } bch2_rbio_done(rbio); bch2_trans_put(trans); } static void bch2_rbio_error(struct bch_read_bio *rbio, int ret, blk_status_t blk_error) { BUG_ON(ret >= 0); rbio->ret = ret; rbio->bio.bi_status = blk_error; bch2_rbio_parent(rbio)->saw_error = true; if (rbio->flags & BCH_READ_in_retry) return; if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { bch2_rbio_punt(rbio, bch2_rbio_retry, RBIO_CONTEXT_UNBOUND, system_unbound_wq); } else { rbio = bch2_rbio_free(rbio); rbio->ret = ret; rbio->bio.bi_status = blk_error; bch2_rbio_done(rbio); } } static void bch2_read_io_err(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bio *bio = &rbio->bio; struct bch_fs *c = rbio->c; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct printbuf buf = PRINTBUF; bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); if (ca) bch_err_ratelimited(ca, "%s", buf.buf); else bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, struct bch_read_bio *rbio) { struct bch_fs *c = rbio->c; u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; struct bch_extent_crc_unpacked new_crc; struct btree_iter iter; struct bkey_i *new; struct bkey_s_c k; int ret = 0; if (crc_is_compressed(rbio->pick.crc)) return 0; k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, BTREE_ITER_slots|BTREE_ITER_intent); if ((ret = bkey_err(k))) goto out; if (bversion_cmp(k.k->bversion, rbio->version) || !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) goto out; /* Extent was merged? */ if (bkey_start_offset(k.k) < data_offset || k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) goto out; if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, rbio->pick.crc, NULL, &new_crc, bkey_start_offset(k.k) - data_offset, k.k->size, rbio->pick.crc.csum_type)) { bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ret = 0; goto out; } /* * going to be temporarily appending another checksum entry: */ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(struct bch_extent_crc128)); if ((ret = PTR_ERR_OR_ZERO(new))) goto out; bkey_reassemble(new, k); if (!bch2_bkey_narrow_crcs(new, new_crc)) goto out; ret = bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node); out: bch2_trans_iter_exit(trans, &iter); return ret; } static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) { bch2_trans_commit_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_rbio_narrow_crcs(trans, rbio)); } static void bch2_read_csum_err(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct bio *src = &rbio->bio; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); struct printbuf buf = PRINTBUF; bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_str(&buf, "data "); bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; if (ca) bch_err_ratelimited(ca, "%s", buf.buf); else bch_err_ratelimited(c, "%s", buf.buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); printbuf_exit(&buf); } static void bch2_read_decompress_err(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct printbuf buf = PRINTBUF; bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_str(&buf, "decompression error"); struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; if (ca) bch_err_ratelimited(ca, "%s", buf.buf); else bch_err_ratelimited(c, "%s", buf.buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } static void bch2_read_decrypt_err(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct printbuf buf = PRINTBUF; bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); prt_str(&buf, "decrypt error"); struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; if (ca) bch_err_ratelimited(ca, "%s", buf.buf); else bch_err_ratelimited(c, "%s", buf.buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } /* Inner part that may run in process context */ static void __bch2_read_endio(struct work_struct *work) { struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct bch_read_bio *parent = bch2_rbio_parent(rbio); struct bio *src = &rbio->bio; struct bio *dst = &parent->bio; struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; struct bch_csum csum; int ret; nofs_flags = memalloc_nofs_save(); /* Reset iterator for checksumming and copying bounced data: */ if (rbio->bounce) { src->bi_iter.bi_size = crc.compressed_size << 9; src->bi_iter.bi_idx = 0; src->bi_iter.bi_bvec_done = 0; } else { src->bi_iter = rbio->bvec_iter; } bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; /* * Checksum error: if the bio wasn't bounced, we may have been * reading into buffers owned by userspace (that userspace can * scribble over) - retry the read, bouncing it this time: */ if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { rbio->flags |= BCH_READ_must_bounce; bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, BLK_STS_IOERR); goto out; } bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); if (!csum_good) goto csum_err; /* * XXX * We need to rework the narrow_crcs path to deliver the read completion * first, and then punt to a different workqueue, otherwise we're * holding up reads while doing btree updates which is bad for memory * reclaim. */ if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); if (likely(!parent->data_update)) { /* Adjust crc to point to subset of data we want: */ crc.offset += rbio->offset_into_extent; crc.live_size = bvec_iter_sectors(rbio->bvec_iter); if (crc_is_compressed(crc)) { ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && !c->opts.no_data_io) goto decompression_err; } else { /* don't need to decrypt the entire bio: */ nonce = nonce_add(nonce, crc.offset << 9); bio_advance(src, crc.offset << 9); BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); src->bi_iter.bi_size = dst_iter.bi_size; ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } } } else { if (rbio->split) rbio->parent->pick = rbio->pick; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; bio_copy_data_iter(dst, &dst_iter, src, &src_iter); } } if (rbio->promote) { /* * Re encrypt data we decrypted, so it's consistent with * rbio->crc: */ ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; } if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } out: memalloc_nofs_restore(nofs_flags); return; csum_err: bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decompression_err: bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; decrypt_err: bch2_rbio_punt(rbio, bch2_read_decrypt_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); goto out; } static void bch2_read_endio(struct bio *bio) { struct bch_read_bio *rbio = container_of(bio, struct bch_read_bio, bio); struct bch_fs *c = rbio->c; struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, rbio->submit_time, !bio->bi_status); if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; if (unlikely(bio->bi_status)) { bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); return; } if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { trace_and_count(c, io_read_reuse_race, &rbio->bio); if (rbio->flags & BCH_READ_retry_if_stale) bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } if (rbio->narrow_crcs || rbio->promote || crc_is_compressed(rbio->pick.crc) || bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; else if (rbio->pick.crc.csum_type) context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); } static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, struct bch_extent_ptr ptr) { struct bch_fs *c = trans->c; struct btree_iter iter; struct printbuf buf = PRINTBUF; int ret; bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, PTR_BUCKET_POS(ca, &ptr), BTREE_ITER_cached); int gen = bucket_gen_get(ca, iter.pos.offset); if (gen >= 0) { prt_printf(&buf, "Attempting to read from stale dirty pointer:\n"); printbuf_indent_add(&buf, 2); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); prt_printf(&buf, "memory gen: %u", gen); ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, &iter))); if (!ret) { prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); } } else { prt_printf(&buf, "Attempting to read from invalid bucket %llu:%llu:\n", iter.pos.inode, iter.pos.offset); printbuf_indent_add(&buf, 2); prt_printf(&buf, "first bucket %u nbuckets %llu\n", ca->mi.first_bucket, ca->mi.nbuckets); bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); } bch2_fs_inconsistent(c, "%s", buf.buf); bch2_trans_iter_exit(trans, &iter); printbuf_exit(&buf); } int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, struct bch_io_failures *failed, unsigned flags, int dev) { struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); struct data_update *u = rbio_data_update(orig); int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, bkey_inline_data_bytes(k.k)); swap(iter.bi_size, bytes); memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], bvec_iter_sectors(iter)); goto out_read_done; } retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ if (!ret) goto hole; if (unlikely(ret < 0)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); goto err; } if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20_key_set) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); ret = -BCH_ERR_data_read_no_encryption_key; goto err; } struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); /* * Stale dirty pointers are treated as IO errors, but @failed isn't * allocated unless we're in the retry path - so if we're not in the * retry path, don't check here, it'll be caught in bch2_read_endio() * and we'll end up in the retry path: */ if ((flags & BCH_READ_in_retry) && !pick.ptr.cached && ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); bch2_mark_io_failure(failed, &pick, false); percpu_ref_put(&ca->io_ref[READ]); goto retry_pick; } if (likely(!u)) { if (!(flags & BCH_READ_last_fragment) || bio_flagged(&orig->bio, BIO_CHAIN)) flags |= BCH_READ_must_clone; narrow_crcs = !(flags & BCH_READ_in_retry) && bch2_can_narrow_extent_crcs(k, pick.crc); if (narrow_crcs && (flags & BCH_READ_user_mapped)) flags |= BCH_READ_must_bounce; EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); if (crc_is_compressed(pick.crc) || (pick.crc.csum_type != BCH_CSUM_none && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || (bch2_csum_type_is_encryption(pick.crc.csum_type) && (flags & BCH_READ_user_mapped)) || (flags & BCH_READ_must_bounce)))) { read_full = true; bounce = true; } } else { /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) percpu_ref_put(&ca->io_ref[READ]); rbio->ret = -BCH_ERR_data_read_buffer_too_small; goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; read_full = true; } if (orig->opts.promote_target || have_io_error(failed)) rbio = promote_alloc(trans, iter, k, &pick, flags, orig, &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); EBUG_ON(pick.crc.csum_type && (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || bvec_iter_sectors(iter) != pick.crc.live_size || pick.crc.offset || offset_into_extent)); data_pos.offset += offset_into_extent; pick.ptr.offset += pick.crc.offset + offset_into_extent; offset_into_extent = 0; pick.crc.compressed_size = bvec_iter_sectors(iter); pick.crc.uncompressed_size = bvec_iter_sectors(iter); pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); } if (rbio) { /* * promote already allocated bounce rbio: * promote needs to allocate a bio big enough for uncompressing * data in the write path, but we're not going to use it all * here: */ EBUG_ON(rbio->bio.bi_iter.bi_size < pick.crc.compressed_size << 9); rbio->bio.bi_iter.bi_size = pick.crc.compressed_size << 9; } else if (bounce) { unsigned sectors = pick.crc.compressed_size; rbio = rbio_init_fragment(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, GFP_NOFS, &c->bio_read_split), orig); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't * work, when it reports the error to its parent (us) we don't * know if the error was from our bio, and we should retry, or * from the whole bio, in which case we don't want to retry and * lose the error) */ rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), orig); rbio->bio.bi_iter = iter; } else { rbio = orig; rbio->bio.bi_iter = iter; EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); } EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); rbio->submit_time = local_clock(); if (!rbio->split) rbio->end_io = orig->bio.bi_end_io; rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; rbio->ret = 0; rbio->context = 0; rbio->pick = pick; rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; rbio->version = k.k->bversion; INIT_WORK(&rbio->work, NULL); rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; if (rbio->bounce) trace_and_count(c, io_read_bounce, &rbio->bio); if (!u) this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); else this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* * If it's being moved internally, we don't want to flag it as a cache * hit: */ if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); trace_and_count(c, io_read_split, &orig->bio); } /* * Unlock the iterator while the btree node's lock is still in * cache, before doing the IO: */ if (!(flags & BCH_READ_in_retry)) bch2_trans_unlock(trans); else bch2_trans_unlock_long(trans); if (likely(!rbio->pick.do_ec_reconstruct)) { if (unlikely(!rbio->have_ioref)) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); prt_printf(&buf, "no device to read from:\n "); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_device_offline, BLK_STS_IOERR); goto out; } this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], bio_sectors(&rbio->bio)); bio_set_dev(&rbio->bio, ca->disk_sb.bdev); if (unlikely(c->opts.no_data_io)) { if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } else { if (likely(!(flags & BCH_READ_in_retry))) submit_bio(&rbio->bio); else submit_bio_wait(&rbio->bio); } /* * We just submitted IO which may block, we expect relock fail * events and shouldn't count them: */ trans->notrace_relock_fail = true; } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, BLK_STS_IOERR); goto out; } if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } out: if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { bch2_trans_unlock(trans); int ret; rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); ret = rbio->ret; rbio = bch2_rbio_free(rbio); if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) bch2_mark_io_failure(failed, &pick, ret == -BCH_ERR_data_read_retry_csum_err); return ret; } err: if (flags & BCH_READ_in_retry) return ret; orig->bio.bi_status = BLK_STS_IOERR; orig->ret = ret; goto out_read_done; hole: this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], bvec_iter_sectors(iter)); /* * won't normally happen in the data update (bch2_move_extent()) path, * but if we retry and the extent we wanted to read no longer exists we * have to signal that: */ if (u) orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: if ((flags & BCH_READ_last_fragment) && !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, struct bvec_iter bvec_iter, subvol_inum inum, struct bch_io_failures *failed, unsigned flags) { struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; int ret; EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(inum.inum, bvec_iter.bi_sector), BTREE_ITER_slots); while (1) { enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); u32 snapshot; ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); if (ret) goto err; bch2_btree_iter_set_snapshot(trans, &iter, snapshot); bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, bvec_iter.bi_sector)); k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); unsigned sectors = k.k->size - offset_into_extent; bch2_bkey_buf_reassemble(&sk, c, k); ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, &sk); if (ret) goto err; k = bkey_i_to_s_c(sk.k); /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: */ sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); unsigned bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) flags |= BCH_READ_last_fragment; ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, offset_into_extent, failed, flags, -1); swap(bvec_iter.bi_size, bytes); if (ret) goto err; if (flags & BCH_READ_last_fragment) break; bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) flags |= BCH_READ_must_bounce; if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } bch2_trans_iter_exit(trans, &iter); if (unlikely(ret)) { if (ret != -BCH_ERR_extent_poisoned) { struct printbuf buf = PRINTBUF; lockrestart_do(trans, bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9)); prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); } rbio->bio.bi_status = BLK_STS_IOERR; rbio->ret = ret; if (!(flags & BCH_READ_in_retry)) bch2_rbio_done(rbio); } bch2_bkey_buf_exit(&sk, c); return ret; } void bch2_fs_io_read_exit(struct bch_fs *c) { if (c->promote_table.tbl) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); } int bch2_fs_io_read_init(struct bch_fs *c) { if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_read_init; if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_read_split_init; if (rhashtable_init(&c->promote_table, &bch_promote_params)) return -BCH_ERR_ENOMEM_promote_table_init; return 0; } |
| 154 154 163 32 137 163 163 162 162 163 21 21 21 21 162 163 186 186 185 184 220 221 19 19 218 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 | #include <linux/atomic.h> #include <linux/export.h> #include <linux/generic-radix-tree.h> #include <linux/gfp.h> #include <linux/kmemleak.h> /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { return __genradix_ptr_inlined(radix, offset); } EXPORT_SYMBOL(__genradix_ptr); /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, struct genradix_node **preallocated, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; if (preallocated) swap(new_node, *preallocated); /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (n && ilog2(offset) < genradix_depth_shift(level)) break; if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } new_node->children[0] = n; new_root = ((struct genradix_root *) ((unsigned long) new_node | (n ? level + 1 : 0))); if ((v = cmpxchg_release(&radix->root, r, new_root)) == r) { v = new_root; new_node = NULL; } else { new_node->children[0] = NULL; } } while (level--) { struct genradix_node **p = &n->children[offset >> genradix_depth_shift(level)]; offset &= genradix_depth_size(level) - 1; n = READ_ONCE(*p); if (!n) { if (!new_node) { new_node = genradix_alloc_node(gfp_mask); if (!new_node) return NULL; } if (!(n = cmpxchg_release(p, NULL, new_node))) swap(n, new_node); } } if (new_node) genradix_free_node(new_node); return &n->data[offset]; } EXPORT_SYMBOL(__genradix_ptr_alloc); void *__genradix_iter_peek(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) return NULL; while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); if (iter->offset + objs_per_ptr < iter->offset) { iter->offset = SIZE_MAX; iter->pos = SIZE_MAX; return NULL; } i++; iter->offset = round_down(iter->offset + objs_per_ptr, objs_per_ptr); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) goto restart; } n = n->children[i]; } return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek); void *__genradix_iter_peek_prev(struct genradix_iter *iter, struct __genradix *radix, size_t objs_per_page, size_t obj_size_plus_page_remainder) { struct genradix_root *r; struct genradix_node *n; unsigned level, i; if (iter->offset == SIZE_MAX) return NULL; restart: r = READ_ONCE(radix->root); if (!r) return NULL; n = genradix_root_to_node(r); level = genradix_root_to_depth(r); if (ilog2(iter->offset) >= genradix_depth_shift(level)) { iter->offset = genradix_depth_size(level); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; iter->offset -= obj_size_plus_page_remainder; iter->pos--; } while (level) { level--; i = (iter->offset >> genradix_depth_shift(level)) & (GENRADIX_ARY - 1); while (!n->children[i]) { size_t objs_per_ptr = genradix_depth_size(level); iter->offset = round_down(iter->offset, objs_per_ptr); iter->pos = (iter->offset >> GENRADIX_NODE_SHIFT) * objs_per_page; if (!iter->offset) return NULL; iter->offset -= obj_size_plus_page_remainder; iter->pos--; if (!i) goto restart; --i; } n = n->children[i]; } return &n->data[iter->offset & (GENRADIX_NODE_SIZE - 1)]; } EXPORT_SYMBOL(__genradix_iter_peek_prev); static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { unsigned i; for (i = 0; i < GENRADIX_ARY; i++) if (n->children[i]) genradix_free_recurse(n->children[i], level - 1); } genradix_free_node(n); } int __genradix_prealloc(struct __genradix *radix, size_t size, gfp_t gfp_mask) { size_t offset; for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask)) return -ENOMEM; return 0; } EXPORT_SYMBOL(__genradix_prealloc); void __genradix_free(struct __genradix *radix) { struct genradix_root *r = xchg(&radix->root, NULL); genradix_free_recurse(genradix_root_to_node(r), genradix_root_to_depth(r)); } EXPORT_SYMBOL(__genradix_free); |
| 124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_DEBUG_H #define _BCACHEFS_DEBUG_H #include "bcachefs.h" struct bio; struct btree; struct bch_fs; void __bch2_btree_verify(struct bch_fs *, struct btree *); void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, const struct btree *); static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { if (bch2_verify_btree_ondisk) __bch2_btree_verify(c, b); } #ifdef CONFIG_DEBUG_FS void bch2_fs_debug_exit(struct bch_fs *); void bch2_fs_debug_init(struct bch_fs *); #else static inline void bch2_fs_debug_exit(struct bch_fs *c) {} static inline void bch2_fs_debug_init(struct bch_fs *c) {} #endif void bch2_debug_exit(void); int bch2_debug_init(void); #endif /* _BCACHEFS_DEBUG_H */ |
| 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 | // SPDX-License-Identifier: GPL-2.0-only /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * Copyright (C) 2018-2024 Intel Corporation */ #include <linux/module.h> #include <linux/init.h> #include <linux/etherdevice.h> #include <linux/netdevice.h> #include <linux/types.h> #include <linux/slab.h> #include <linux/skbuff.h> #include <linux/if_arp.h> #include <linux/timer.h> #include <linux/rtnetlink.h> #include <net/codel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "rate.h" #include "sta_info.h" #include "debugfs_sta.h" #include "mesh.h" #include "wme.h" /** * DOC: STA information lifetime rules * * STA info structures (&struct sta_info) are managed in a hash table * for faster lookup and a list for iteration. They are managed using * RCU, i.e. access to the list and hash table is protected by RCU. * * Upon allocating a STA info structure with sta_info_alloc(), the caller * owns that structure. It must then insert it into the hash table using * either sta_info_insert() or sta_info_insert_rcu(); only in the latter * case (which acquires an rcu read section but must not be called from * within one) will the pointer still be valid after the call. Note that * the caller may not do much with the STA info before inserting it; in * particular, it may not start any mesh peer link management or add * encryption keys. * * When the insertion fails (sta_info_insert()) returns non-zero), the * structure will have been freed by sta_info_insert()! * * Station entries are added by mac80211 when you establish a link with a * peer. This means different things for the different type of interfaces * we support. For a regular station this mean we add the AP sta when we * receive an association response from the AP. For IBSS this occurs when * get to know about a peer on the same IBSS. For WDS we add the sta for * the peer immediately upon device open. When using AP mode we add stations * for each respective station upon request from userspace through nl80211. * * In order to remove a STA info structure, various sta_info_destroy_*() * calls are available. * * There is no concept of ownership on a STA entry; each structure is * owned by the global hash table/list until it is removed. All users of * the structure need to be RCU protected so that the structure won't be * freed before they are done using it. */ struct sta_link_alloc { struct link_sta_info info; struct ieee80211_link_sta sta; struct rcu_head rcu_head; }; static const struct rhashtable_params sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct sta_info, hash_node), .key_offset = offsetof(struct sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static const struct rhashtable_params link_sta_rht_params = { .nelem_hint = 3, /* start small */ .automatic_shrinking = true, .head_offset = offsetof(struct link_sta_info, link_hash_node), .key_offset = offsetof(struct link_sta_info, addr), .key_len = ETH_ALEN, .max_size = CONFIG_MAC80211_STA_HASH_MAX_SIZE, }; static int sta_info_hash_del(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_remove(&local->sta_hash, &sta->hash_node, sta_rht_params); } static int link_sta_info_hash_add(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_insert(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } static int link_sta_info_hash_del(struct ieee80211_local *local, struct link_sta_info *link_sta) { lockdep_assert_wiphy(local->hw.wiphy); return rhltable_remove(&local->link_sta_hash, &link_sta->link_hash_node, link_sta_rht_params); } void ieee80211_purge_sta_txqs(struct sta_info *sta) { struct ieee80211_local *local = sta->sdata->local; int i; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txqi; if (!sta->sta.txq[i]) continue; txqi = to_txq_info(sta->sta.txq[i]); ieee80211_txq_purge(local, txqi); } } static void __cleanup_single_sta(struct sta_info *sta) { int ac, i; struct tid_ampdu_tx *tid_tx; struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ps_data *ps; if (test_sta_flag(sta, WLAN_STA_PS_STA) || test_sta_flag(sta, WLAN_STA_PS_DRIVER) || test_sta_flag(sta, WLAN_STA_PS_DELIVER)) { if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_PS_STA); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); clear_sta_flag(sta, WLAN_STA_PS_DELIVER); atomic_dec(&ps->num_sta_ps); } ieee80211_purge_sta_txqs(sta); for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { local->total_ps_buffered -= skb_queue_len(&sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->ps_tx_buf[ac]); ieee80211_purge_tx_queue(&local->hw, &sta->tx_filtered[ac]); } if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_sta_cleanup(sta); cancel_work_sync(&sta->drv_deliver_wk); /* * Destroy aggregation state here. It would be nice to wait for the * driver to finish aggregation stop and then clean up, but for now * drivers have to handle aggregation stop being requested, followed * directly by station destruction. */ for (i = 0; i < IEEE80211_NUM_TIDS; i++) { kfree(sta->ampdu_mlme.tid_start_tx[i]); tid_tx = rcu_dereference_raw(sta->ampdu_mlme.tid_tx[i]); if (!tid_tx) continue; ieee80211_purge_tx_queue(&local->hw, &tid_tx->pending); kfree(tid_tx); } } static void cleanup_single_sta(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; __cleanup_single_sta(sta); sta_info_free(local, sta); } struct rhlist_head *sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->sta_hash, addr, sta_rht_params); } /* protected by RCU */ struct sta_info *sta_info_get(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } /* * Get sta info either from the specified interface * or from one of its vlans */ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct sta_info *sta; rcu_read_lock(); for_each_sta_info(local, addr, sta, tmp) { if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return sta; } } rcu_read_unlock(); return NULL; } struct rhlist_head *link_sta_info_hash_lookup(struct ieee80211_local *local, const u8 *addr) { return rhltable_lookup(&local->link_sta_hash, addr, link_sta_rht_params); } struct link_sta_info * link_sta_info_get_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct ieee80211_local *local = sdata->local; struct rhlist_head *tmp; struct link_sta_info *link_sta; rcu_read_lock(); for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; if (sta->sdata == sdata || (sta->sdata->bss && sta->sdata->bss == sdata->bss)) { rcu_read_unlock(); /* this is safe as the caller must already hold * another rcu read section or the mutex */ return link_sta; } } rcu_read_unlock(); return NULL; } struct ieee80211_sta * ieee80211_find_sta_by_link_addrs(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr, unsigned int *link_id) { struct ieee80211_local *local = hw_to_local(hw); struct link_sta_info *link_sta; struct rhlist_head *tmp; for_each_link_sta_info(local, addr, link_sta, tmp) { struct sta_info *sta = link_sta->sta; struct ieee80211_link_data *link; u8 _link_id = link_sta->link_id; if (!localaddr) { if (link_id) *link_id = _link_id; return &sta->sta; } link = rcu_dereference(sta->sdata->link[_link_id]); if (!link) continue; if (memcmp(link->conf->addr, localaddr, ETH_ALEN)) continue; if (link_id) *link_id = _link_id; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_link_addrs); struct sta_info *sta_info_get_by_addrs(struct ieee80211_local *local, const u8 *sta_addr, const u8 *vif_addr) { struct rhlist_head *tmp; struct sta_info *sta; for_each_sta_info(local, sta_addr, sta, tmp) { if (ether_addr_equal(vif_addr, sta->sdata->vif.addr)) return sta; } return NULL; } struct sta_info *sta_info_get_by_idx(struct ieee80211_sub_if_data *sdata, int idx) { struct ieee80211_local *local = sdata->local; struct sta_info *sta; int i = 0; list_for_each_entry_rcu(sta, &local->sta_list, list, lockdep_is_held(&local->hw.wiphy->mtx)) { if (sdata != sta->sdata) continue; if (i < idx) { ++i; continue; } return sta; } return NULL; } static void sta_info_free_link(struct link_sta_info *link_sta) { free_percpu(link_sta->pcpu_rx_stats); } static void sta_remove_link(struct sta_info *sta, unsigned int link_id, bool unhash) { struct sta_link_alloc *alloc = NULL; struct link_sta_info *link_sta; lockdep_assert_wiphy(sta->local->hw.wiphy); link_sta = rcu_access_pointer(sta->link[link_id]); if (WARN_ON(!link_sta)) return; if (unhash) link_sta_info_hash_del(sta->local, link_sta); if (test_sta_flag(sta, WLAN_STA_INSERTED)) ieee80211_link_sta_debugfs_remove(link_sta); if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); sta->sta.valid_links &= ~BIT(link_id); RCU_INIT_POINTER(sta->link[link_id], NULL); RCU_INIT_POINTER(sta->sta.link[link_id], NULL); if (alloc) { sta_info_free_link(&alloc->info); kfree_rcu(alloc, rcu_head); } ieee80211_sta_recalc_aggregates(&sta->sta); } /** * sta_info_free - free STA * * @local: pointer to the global information * @sta: STA info to free * * This function must undo everything done by sta_info_alloc() * that may happen before sta_info_insert(). It may only be * called when sta_info_insert() has not been attempted (and * if that fails, the station is freed anyway.) */ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_access_pointer(sta->link[i]); if (!link_sta) continue; sta_remove_link(sta, i, false); } /* * If we had used sta_info_pre_move_state() then we might not * have gone through the state transitions down again, so do * it here now (and warn if it's inserted). * * This will clear state such as fast TX/RX that may have been * allocated during state transitions. */ while (sta->sta_state > IEEE80211_STA_NONE) { int ret; WARN_ON_ONCE(test_sta_flag(sta, WLAN_STA_INSERTED)); ret = sta_info_move_state(sta, sta->sta_state - 1); if (WARN_ONCE(ret, "sta_info_move_state() returned %d\n", ret)) break; } if (sta->rate_ctrl) rate_control_free_sta(sta); sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif sta_info_free_link(&sta->deflink); kfree(sta); } static int sta_info_hash_add(struct ieee80211_local *local, struct sta_info *sta) { return rhltable_insert(&local->sta_hash, &sta->hash_node, sta_rht_params); } static void sta_deliver_ps_frames(struct work_struct *wk) { struct sta_info *sta; sta = container_of(wk, struct sta_info, drv_deliver_wk); if (sta->dead) return; local_bh_disable(); if (!test_sta_flag(sta, WLAN_STA_PS_STA)) ieee80211_sta_ps_deliver_wakeup(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_PSPOLL)) ieee80211_sta_ps_deliver_poll_response(sta); else if (test_and_clear_sta_flag(sta, WLAN_STA_UAPSD)) ieee80211_sta_ps_deliver_uapsd(sta); local_bh_enable(); } static int sta_prepare_rate_control(struct ieee80211_local *local, struct sta_info *sta, gfp_t gfp) { if (ieee80211_hw_check(&local->hw, HAS_RATE_CONTROL)) return 0; sta->rate_ctrl = local->rate_ctrl; sta->rate_ctrl_priv = rate_control_alloc_sta(sta->rate_ctrl, sta, gfp); if (!sta->rate_ctrl_priv) return -ENOMEM; return 0; } static int sta_info_alloc_link(struct ieee80211_local *local, struct link_sta_info *link_info, gfp_t gfp) { struct ieee80211_hw *hw = &local->hw; int i; if (ieee80211_hw_check(hw, USES_RSS)) { link_info->pcpu_rx_stats = alloc_percpu_gfp(struct ieee80211_sta_rx_stats, gfp); if (!link_info->pcpu_rx_stats) return -ENOMEM; } link_info->rx_stats.last_rx = jiffies; u64_stats_init(&link_info->rx_stats.syncp); ewma_signal_init(&link_info->rx_stats_avg.signal); ewma_avg_signal_init(&link_info->status_stats.avg_ack_signal); for (i = 0; i < ARRAY_SIZE(link_info->rx_stats_avg.chain_signal); i++) ewma_signal_init(&link_info->rx_stats_avg.chain_signal[i]); link_info->rx_omi_bw_rx = IEEE80211_STA_RX_BW_MAX; link_info->rx_omi_bw_tx = IEEE80211_STA_RX_BW_MAX; link_info->rx_omi_bw_staging = IEEE80211_STA_RX_BW_MAX; /* * Cause (a) warning(s) if IEEE80211_STA_RX_BW_MAX != 320 * or if new values are added to the enum. */ switch (link_info->cur_max_bandwidth) { case IEEE80211_STA_RX_BW_20: case IEEE80211_STA_RX_BW_40: case IEEE80211_STA_RX_BW_80: case IEEE80211_STA_RX_BW_160: case IEEE80211_STA_RX_BW_MAX: /* intentionally nothing */ break; } return 0; } static void sta_info_add_link(struct sta_info *sta, unsigned int link_id, struct link_sta_info *link_info, struct ieee80211_link_sta *link_sta) { link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); link_sta->smps_mode = IEEE80211_SMPS_OFF; link_sta->agg.max_rc_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_BA; } static struct sta_info * __sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, int link_id, const u8 *link_addr, gfp_t gfp) { struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; void *txq_data; int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); if (!sta) return NULL; sta->local = local; sta->sdata = sdata; if (sta_info_alloc_link(local, &sta->deflink, gfp)) goto free; if (link_id >= 0) { sta_info_add_link(sta, link_id, &sta->deflink, &sta->sta.deflink); sta->sta.valid_links = BIT(link_id); } else { sta_info_add_link(sta, 0, &sta->deflink, &sta->sta.deflink); } sta->sta.cur = &sta->sta.deflink.agg; spin_lock_init(&sta->lock); spin_lock_init(&sta->ps_lock); INIT_WORK(&sta->drv_deliver_wk, sta_deliver_ps_frames); wiphy_work_init(&sta->ampdu_mlme.work, ieee80211_ba_session_work); #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) { sta->mesh = kzalloc(sizeof(*sta->mesh), gfp); if (!sta->mesh) goto free; sta->mesh->plink_sta = sta; spin_lock_init(&sta->mesh->plink_lock); if (!sdata->u.mesh.user_mpm) timer_setup(&sta->mesh->plink_timer, mesh_plink_timer, 0); sta->mesh->nonpeer_pm = NL80211_MESH_POWER_ACTIVE; } #endif memcpy(sta->addr, addr, ETH_ALEN); memcpy(sta->sta.addr, addr, ETH_ALEN); memcpy(sta->deflink.addr, link_addr, ETH_ALEN); memcpy(sta->sta.deflink.addr, link_addr, ETH_ALEN); sta->sta.max_rx_aggregation_subframes = local->hw.max_rx_aggregation_subframes; /* TODO link specific alloc and assignments for MLO Link STA */ /* Extended Key ID needs to install keys for keyid 0 and 1 Rx-only. * The Tx path starts to use a key as soon as the key slot ptk_idx * references to is not NULL. To not use the initial Rx-only key * prematurely for Tx initialize ptk_idx to an impossible PTK keyid * which always will refer to a NULL key. */ BUILD_BUG_ON(ARRAY_SIZE(sta->ptk) <= INVALID_PTK_KEYIDX); sta->ptk_idx = INVALID_PTK_KEYIDX; ieee80211_init_frag_cache(&sta->frags); sta->sta_state = IEEE80211_STA_NONE; if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) sta->amsdu_mesh_control = -1; /* Mark TID as unreserved */ sta->reserved_tid = IEEE80211_TID_UNRESERVED; sta->last_connected = ktime_get_seconds(); size = sizeof(struct txq_info) + ALIGN(hw->txq_data_size, sizeof(void *)); txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); if (!txq_data) goto free; for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { struct txq_info *txq = txq_data + i * size; /* might not do anything for the (bufferable) MMPDU TXQ */ ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) goto free_txq; sta->airtime_weight = IEEE80211_DEFAULT_AIRTIME_WEIGHT; for (i = 0; i < IEEE80211_NUM_ACS; i++) { skb_queue_head_init(&sta->ps_tx_buf[i]); skb_queue_head_init(&sta->tx_filtered[i]); sta->airtime[i].deficit = sta->airtime_weight; atomic_set(&sta->airtime[i].aql_tx_pending, 0); sta->airtime[i].aql_limit_low = local->aql_txq_limit_low[i]; sta->airtime[i].aql_limit_high = local->aql_txq_limit_high[i]; } for (i = 0; i < IEEE80211_NUM_TIDS; i++) sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); for (i = 0; i < NUM_NL80211_BANDS; i++) { u32 mandatory = 0; int r; if (!hw->wiphy->bands[i]) continue; switch (i) { case NL80211_BAND_2GHZ: case NL80211_BAND_LC: /* * We use both here, even if we cannot really know for * sure the station will support both, but the only use * for this is when we don't know anything yet and send * management frames, and then we'll pick the lowest * possible rate anyway. * If we don't include _G here, we cannot find a rate * in P2P, and thus trigger the WARN_ONCE() in rate.c */ mandatory = IEEE80211_RATE_MANDATORY_B | IEEE80211_RATE_MANDATORY_G; break; case NL80211_BAND_5GHZ: mandatory = IEEE80211_RATE_MANDATORY_A; break; case NL80211_BAND_60GHZ: WARN_ON(1); mandatory = 0; break; } for (r = 0; r < hw->wiphy->bands[i]->n_bitrates; r++) { struct ieee80211_rate *rate; rate = &hw->wiphy->bands[i]->bitrates[r]; if (!(rate->flags & mandatory)) continue; sta->sta.deflink.supp_rates[i] |= BIT(r); } } sta->cparams.ce_threshold = CODEL_DISABLED_THRESHOLD; sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; sta->cparams.ce_threshold_selector = 0; sta->cparams.ce_threshold_mask = 0; sta_dbg(sdata, "Allocated STA %pM\n", sta->sta.addr); return sta; free_txq: kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); #endif kfree(sta); return NULL; } struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, const u8 *addr, gfp_t gfp) { return __sta_info_alloc(sdata, addr, -1, addr, gfp); } struct sta_info *sta_info_alloc_with_link(struct ieee80211_sub_if_data *sdata, const u8 *mld_addr, unsigned int link_id, const u8 *link_addr, gfp_t gfp) { return __sta_info_alloc(sdata, mld_addr, link_id, link_addr, gfp); } static int sta_info_insert_check(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; lockdep_assert_wiphy(sdata->local->hw.wiphy); /* * Can't be a WARN_ON because it can be triggered through a race: * something inserts a STA (on one CPU) without holding the RTNL * and another CPU turns off the net device. */ if (unlikely(!ieee80211_sdata_running(sdata))) return -ENETDOWN; if (WARN_ON(ether_addr_equal(sta->sta.addr, sdata->vif.addr) || !is_valid_ether_addr(sta->sta.addr))) return -EINVAL; /* The RCU read lock is required by rhashtable due to * asynchronous resize/rehash. We also require the mutex * for correctness. */ rcu_read_lock(); if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) && ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) { rcu_read_unlock(); return -ENOTUNIQ; } rcu_read_unlock(); return 0; } static int sta_info_insert_drv_state(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { enum ieee80211_sta_state state; int err = 0; for (state = IEEE80211_STA_NOTEXIST; state < sta->sta_state; state++) { err = drv_sta_state(local, sdata, sta, state, state + 1); if (err) break; } if (!err) { /* * Drivers using legacy sta_add/sta_remove callbacks only * get uploaded set to true after sta_add is called. */ if (!local->ops->sta_add) sta->uploaded = true; return 0; } if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { sdata_info(sdata, "failed to move IBSS STA %pM to state %d (%d) - keeping it anyway\n", sta->sta.addr, state + 1, err); err = 0; } /* unwind on error */ for (; state > IEEE80211_STA_NOTEXIST; state--) WARN_ON(drv_sta_state(local, sdata, sta, state, state - 1)); return err; } static void ieee80211_recalc_p2p_go_ps_allowed(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; bool allow_p2p_go_ps = sdata->vif.p2p; struct sta_info *sta; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata || !test_sta_flag(sta, WLAN_STA_ASSOC)) continue; if (!sta->sta.support_p2p_ps) { allow_p2p_go_ps = false; break; } } rcu_read_unlock(); if (allow_p2p_go_ps != sdata->vif.bss_conf.allow_p2p_go_ps) { sdata->vif.bss_conf.allow_p2p_go_ps = allow_p2p_go_ps; ieee80211_link_info_change_notify(sdata, &sdata->deflink, BSS_CHANGED_P2P_PS); } } static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo = NULL; int err = 0; lockdep_assert_wiphy(local->hw.wiphy); /* check if STA exists already */ if (sta_info_get_bss(sdata, sta->sta.addr)) { err = -EEXIST; goto out_cleanup; } sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL); if (!sinfo) { err = -ENOMEM; goto out_cleanup; } local->num_sta++; local->sta_generation++; smp_mb(); /* simplify things and don't accept BA sessions yet */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); /* make the station visible */ err = sta_info_hash_add(local, sta); if (err) goto out_drop_sta; if (sta->sta.valid_links) { err = link_sta_info_hash_add(local, &sta->deflink); if (err) { sta_info_hash_del(local, sta); goto out_drop_sta; } } list_add_tail_rcu(&sta->list, &local->sta_list); /* update channel context before notifying the driver about state * change, this enables driver using the updated channel context right away. */ if (sta->sta_state >= IEEE80211_STA_ASSOC) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } /* notify driver */ err = sta_info_insert_drv_state(local, sdata, sta); if (err) goto out_remove; set_sta_flag(sta, WLAN_STA_INSERTED); /* accept BA sessions now */ clear_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); if (sta->sta.valid_links) { int i; for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); if (!link_sta) continue; ieee80211_link_sta_debugfs_add(link_sta); if (sdata->vif.active_links & BIT(i)) ieee80211_link_sta_debugfs_drv_add(link_sta); } } else { ieee80211_link_sta_debugfs_add(&sta->deflink); ieee80211_link_sta_debugfs_drv_add(&sta->deflink); } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr); /* move reference to rcu-protected */ rcu_read_lock(); if (ieee80211_vif_is_mesh(&sdata->vif)) mesh_accept_plinks_update(sdata); ieee80211_check_fast_xmit(sta); return 0; out_remove: if (sta->sta.valid_links) link_sta_info_hash_del(local, &sta->deflink); sta_info_hash_del(local, sta); list_del_rcu(&sta->list); out_drop_sta: local->num_sta--; synchronize_net(); out_cleanup: cleanup_single_sta(sta); kfree(sinfo); rcu_read_lock(); return err; } int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) { struct ieee80211_local *local = sta->local; int err; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); err = sta_info_insert_check(sta); if (err) { sta_info_free(local, sta); rcu_read_lock(); return err; } return sta_info_insert_finish(sta); } int sta_info_insert(struct sta_info *sta) { int err = sta_info_insert_rcu(sta); rcu_read_unlock(); return err; } static inline void __bss_tim_set(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __set_bit() format. */ tim[id / 8] |= (1 << (id % 8)); } static inline void __bss_tim_clear(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the __clear_bit() format. */ tim[id / 8] &= ~(1 << (id % 8)); } static inline bool __bss_tim_get(u8 *tim, u16 id) { /* * This format has been mandated by the IEEE specifications, * so this line may not be changed to use the test_bit() format. */ return tim[id / 8] & (1 << (id % 8)); } static unsigned long ieee80211_tids_for_ac(int ac) { /* If we ever support TIDs > 7, this obviously needs to be adjusted */ switch (ac) { case IEEE80211_AC_VO: return BIT(6) | BIT(7); case IEEE80211_AC_VI: return BIT(4) | BIT(5); case IEEE80211_AC_BE: return BIT(0) | BIT(3); case IEEE80211_AC_BK: return BIT(1) | BIT(2); default: WARN_ON(1); return 0; } } static void __sta_info_recalc_tim(struct sta_info *sta, bool ignore_pending) { struct ieee80211_local *local = sta->local; struct ps_data *ps; bool indicate_tim = false; u8 ignore_for_tim = sta->sta.uapsd_queues; int ac; u16 id = sta->sta.aid; if (sta->sdata->vif.type == NL80211_IFTYPE_AP || sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { if (WARN_ON_ONCE(!sta->sdata->bss)) return; ps = &sta->sdata->bss->ps; #ifdef CONFIG_MAC80211_MESH } else if (ieee80211_vif_is_mesh(&sta->sdata->vif)) { ps = &sta->sdata->u.mesh.ps; #endif } else { return; } /* No need to do anything if the driver does all */ if (ieee80211_hw_check(&local->hw, AP_LINK_PS) && !local->ops->set_tim) return; if (sta->dead) goto done; /* * If all ACs are delivery-enabled then we should build * the TIM bit for all ACs anyway; if only some are then * we ignore those and build the TIM bit using only the * non-enabled ones. */ if (ignore_for_tim == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_tim = 0; if (ignore_pending) ignore_for_tim = BIT(IEEE80211_NUM_ACS) - 1; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignore_for_tim & ieee80211_ac_to_qos_mask[ac]) continue; indicate_tim |= !skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac]); if (indicate_tim) break; tids = ieee80211_tids_for_ac(ac); indicate_tim |= sta->driver_buffered_tids & tids; indicate_tim |= sta->txq_buffered_tids & tids; } done: spin_lock_bh(&local->tim_lock); if (indicate_tim == __bss_tim_get(ps->tim, id)) goto out_unlock; if (indicate_tim) __bss_tim_set(ps->tim, id); else __bss_tim_clear(ps->tim, id); if (local->ops->set_tim && !WARN_ON(sta->dead)) { local->tim_in_locked_section = true; drv_set_tim(local, &sta->sta, indicate_tim); local->tim_in_locked_section = false; } out_unlock: spin_unlock_bh(&local->tim_lock); } void sta_info_recalc_tim(struct sta_info *sta) { __sta_info_recalc_tim(sta, false); } static bool sta_info_buffer_expired(struct sta_info *sta, struct sk_buff *skb) { struct ieee80211_tx_info *info; int timeout; if (!skb) return false; info = IEEE80211_SKB_CB(skb); /* Timeout: (2 * listen_interval * beacon_int * 1024 / 1000000) sec */ timeout = (sta->listen_interval * sta->sdata->vif.bss_conf.beacon_int * 32 / 15625) * HZ; if (timeout < STA_TX_BUFFER_EXPIRE) timeout = STA_TX_BUFFER_EXPIRE; return time_after(jiffies, info->control.jiffies + timeout); } static bool sta_info_cleanup_expire_buffered_ac(struct ieee80211_local *local, struct sta_info *sta, int ac) { unsigned long flags; struct sk_buff *skb; /* * First check for frames that should expire on the filtered * queue. Frames here were rejected by the driver and are on * a separate queue to avoid reordering with normal PS-buffered * frames. They also aren't accounted for right now in the * total_ps_buffered counter. */ for (;;) { spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb = skb_peek(&sta->tx_filtered[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->tx_filtered[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); /* * Frames are queued in order, so if this one * hasn't expired yet we can stop testing. If * we actually reached the end of the queue we * also need to stop, of course. */ if (!skb) break; ieee80211_free_txskb(&local->hw, skb); } /* * Now also check the normal PS-buffered queue, this will * only find something if the filtered queue was emptied * since the filtered frames are all before the normal PS * buffered frames. */ for (;;) { spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb = skb_peek(&sta->ps_tx_buf[ac]); if (sta_info_buffer_expired(sta, skb)) skb = __skb_dequeue(&sta->ps_tx_buf[ac]); else skb = NULL; spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); /* * frames are queued in order, so if this one * hasn't expired yet (or we reached the end of * the queue) we can stop testing */ if (!skb) break; local->total_ps_buffered--; ps_dbg(sta->sdata, "Buffered frame expired (STA %pM)\n", sta->sta.addr); ieee80211_free_txskb(&local->hw, skb); } /* * Finally, recalculate the TIM bit for this station -- it might * now be clear because the station was too slow to retrieve its * frames. */ sta_info_recalc_tim(sta); /* * Return whether there are any frames still buffered, this is * used to check whether the cleanup timer still needs to run, * if there are no frames we don't need to rearm the timer. */ return !(skb_queue_empty(&sta->ps_tx_buf[ac]) && skb_queue_empty(&sta->tx_filtered[ac])); } static bool sta_info_cleanup_expire_buffered(struct ieee80211_local *local, struct sta_info *sta) { bool have_buffered = false; int ac; /* This is only necessary for stations on BSS/MBSS interfaces */ if (!sta->sdata->bss && !ieee80211_vif_is_mesh(&sta->sdata->vif)) return false; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) have_buffered |= sta_info_cleanup_expire_buffered_ac(local, sta, ac); return have_buffered; } static int __must_check __sta_info_destroy_part1(struct sta_info *sta) { struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; int ret, i; might_sleep(); if (!sta) return -ENOENT; local = sta->local; sdata = sta->sdata; lockdep_assert_wiphy(local->hw.wiphy); /* * Before removing the station from the driver and * rate control, it might still start new aggregation * sessions -- block that to make sure the tear-down * will be sufficient. */ set_sta_flag(sta, WLAN_STA_BLOCK_BA); ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); /* * Before removing the station from the driver there might be pending * rx frames on RSS queues sent prior to the disassociation - wait for * all such frames to be processed. */ drv_sync_rx_queues(local, sta); for (i = 0; i < ARRAY_SIZE(sta->link); i++) { struct link_sta_info *link_sta; if (!(sta->sta.valid_links & BIT(i))) continue; link_sta = rcu_dereference_protected(sta->link[i], lockdep_is_held(&local->hw.wiphy->mtx)); link_sta_info_hash_del(local, link_sta); } ret = sta_info_hash_del(local, sta); if (WARN_ON(ret)) return ret; /* * for TDLS peers, make sure to return to the base channel before * removal. */ if (test_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL)) { drv_tdls_cancel_channel_switch(local, sdata, &sta->sta); clear_sta_flag(sta, WLAN_STA_TDLS_OFF_CHANNEL); } list_del_rcu(&sta->list); sta->removed = true; if (sta->uploaded) drv_sta_pre_rcu_remove(local, sta->sdata, sta); if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && rcu_access_pointer(sdata->u.vlan.sta) == sta) RCU_INIT_POINTER(sdata->u.vlan.sta, NULL); return 0; } static int _sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state, bool recalc) { struct ieee80211_local *local = sta->local; might_sleep(); if (sta->sta_state == new_state) return 0; /* check allowed transitions first */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state != IEEE80211_STA_AUTH) return -EINVAL; break; case IEEE80211_STA_AUTH: if (sta->sta_state != IEEE80211_STA_NONE && sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; case IEEE80211_STA_ASSOC: if (sta->sta_state != IEEE80211_STA_AUTH && sta->sta_state != IEEE80211_STA_AUTHORIZED) return -EINVAL; break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state != IEEE80211_STA_ASSOC) return -EINVAL; break; default: WARN(1, "invalid state %d", new_state); return -EINVAL; } sta_dbg(sta->sdata, "moving STA %pM to state %d\n", sta->sta.addr, new_state); /* notify the driver before the actual changes so it can * fail the transition if the state is increasing. * The driver is required not to fail when the transition * is decreasing the state, so first, do all the preparation * work and only then, notify the driver. */ if (new_state > sta->sta_state && test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); if (err) return err; } /* reflect the change in all state variables */ switch (new_state) { case IEEE80211_STA_NONE: if (sta->sta_state == IEEE80211_STA_AUTH) clear_bit(WLAN_STA_AUTH, &sta->_flags); break; case IEEE80211_STA_AUTH: if (sta->sta_state == IEEE80211_STA_NONE) { set_bit(WLAN_STA_AUTH, &sta->_flags); } else if (sta->sta_state == IEEE80211_STA_ASSOC) { clear_bit(WLAN_STA_ASSOC, &sta->_flags); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } break; case IEEE80211_STA_ASSOC: if (sta->sta_state == IEEE80211_STA_AUTH) { set_bit(WLAN_STA_ASSOC, &sta->_flags); sta->assoc_at = ktime_get_boottime_ns(); if (recalc) { ieee80211_recalc_min_chandef(sta->sdata, -1); if (!sta->sta.support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sta->sdata); } } else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ieee80211_vif_dec_num_mcast(sta->sdata); clear_bit(WLAN_STA_AUTHORIZED, &sta->_flags); /* * If we have encryption offload, flush (station) queues * (after ensuring concurrent TX completed) so we won't * transmit anything later unencrypted if/when keys are * also removed, which might otherwise happen depending * on how the hardware offload works. */ if (local->ops->set_key) { synchronize_net(); if (local->ops->flush_sta) drv_flush_sta(local, sta->sdata, sta); else ieee80211_flush_queues(local, sta->sdata, false); } ieee80211_clear_fast_xmit(sta); ieee80211_clear_fast_rx(sta); } break; case IEEE80211_STA_AUTHORIZED: if (sta->sta_state == IEEE80211_STA_ASSOC) { ieee80211_vif_inc_num_mcast(sta->sdata); set_bit(WLAN_STA_AUTHORIZED, &sta->_flags); ieee80211_check_fast_xmit(sta); ieee80211_check_fast_rx(sta); } if (sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN || sta->sdata->vif.type == NL80211_IFTYPE_AP) cfg80211_send_layer2_update(sta->sdata->dev, sta->sta.addr); break; default: break; } if (new_state < sta->sta_state && test_sta_flag(sta, WLAN_STA_INSERTED)) { int err = drv_sta_state(sta->local, sta->sdata, sta, sta->sta_state, new_state); WARN_ONCE(err, "Driver is not allowed to fail if the sta_state is transitioning down the list: %d\n", err); } sta->sta_state = new_state; return 0; } int sta_info_move_state(struct sta_info *sta, enum ieee80211_sta_state new_state) { return _sta_info_move_state(sta, new_state, true); } static void __sta_info_destroy_part2(struct sta_info *sta, bool recalc) { struct ieee80211_local *local = sta->local; struct ieee80211_sub_if_data *sdata = sta->sdata; struct station_info *sinfo; int ret; /* * NOTE: This assumes at least synchronize_net() was done * after _part1 and before _part2! */ /* * There's a potential race in _part1 where we set WLAN_STA_BLOCK_BA * but someone might have just gotten past a check, and not yet into * queuing the work/creating the data/etc. * * Do another round of destruction so that the worker is certainly * canceled before we later free the station. * * Since this is after synchronize_rcu()/synchronize_net() we're now * certain that nobody can actually hold a reference to the STA and * be calling e.g. ieee80211_start_tx_ba_session(). */ ieee80211_sta_tear_down_BA_sessions(sta, AGG_STOP_DESTROY_STA); might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); if (sta->sta_state == IEEE80211_STA_AUTHORIZED) { ret = _sta_info_move_state(sta, IEEE80211_STA_ASSOC, recalc); WARN_ON_ONCE(ret); } /* now keys can no longer be reached */ ieee80211_free_sta_keys(local, sta); /* disable TIM bit - last chance to tell driver */ __sta_info_recalc_tim(sta, true); sta->dead = true; local->num_sta--; local->sta_generation++; while (sta->sta_state > IEEE80211_STA_NONE) { ret = _sta_info_move_state(sta, sta->sta_state - 1, recalc); if (ret) { WARN_ON_ONCE(1); break; } } if (sta->uploaded) { ret = drv_sta_state(local, sdata, sta, IEEE80211_STA_NONE, IEEE80211_STA_NOTEXIST); WARN_ON_ONCE(ret != 0); } sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr); sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL); if (sinfo) sta_set_sinfo(sta, sinfo, true); cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); kfree(sinfo); ieee80211_sta_debugfs_remove(sta); ieee80211_destroy_frag_cache(&sta->frags); cleanup_single_sta(sta); } int __must_check __sta_info_destroy(struct sta_info *sta) { int err = __sta_info_destroy_part1(sta); if (err) return err; synchronize_net(); __sta_info_destroy_part2(sta, true); return 0; } int sta_info_destroy_addr(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get(sdata, addr); return __sta_info_destroy(sta); } int sta_info_destroy_addr_bss(struct ieee80211_sub_if_data *sdata, const u8 *addr) { struct sta_info *sta; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta = sta_info_get_bss(sdata, addr); return __sta_info_destroy(sta); } static void sta_info_cleanup(struct timer_list *t) { struct ieee80211_local *local = from_timer(local, t, sta_cleanup); struct sta_info *sta; bool timer_needed = false; rcu_read_lock(); list_for_each_entry_rcu(sta, &local->sta_list, list) if (sta_info_cleanup_expire_buffered(local, sta)) timer_needed = true; rcu_read_unlock(); if (local->quiescing) return; if (!timer_needed) return; mod_timer(&local->sta_cleanup, round_jiffies(jiffies + STA_INFO_CLEANUP_INTERVAL)); } int sta_info_init(struct ieee80211_local *local) { int err; err = rhltable_init(&local->sta_hash, &sta_rht_params); if (err) return err; err = rhltable_init(&local->link_sta_hash, &link_sta_rht_params); if (err) { rhltable_destroy(&local->sta_hash); return err; } spin_lock_init(&local->tim_lock); INIT_LIST_HEAD(&local->sta_list); timer_setup(&local->sta_cleanup, sta_info_cleanup, 0); return 0; } void sta_info_stop(struct ieee80211_local *local) { timer_delete_sync(&local->sta_cleanup); rhltable_destroy(&local->sta_hash); rhltable_destroy(&local->link_sta_hash); } int __sta_info_flush(struct ieee80211_sub_if_data *sdata, bool vlans, int link_id, struct sta_info *do_not_flush_sta) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; LIST_HEAD(free_list); int ret = 0; might_sleep(); lockdep_assert_wiphy(local->hw.wiphy); WARN_ON(vlans && sdata->vif.type != NL80211_IFTYPE_AP); WARN_ON(vlans && !sdata->bss); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { if (sdata != sta->sdata && (!vlans || sdata->bss != sta->sdata->bss)) continue; if (sta == do_not_flush_sta) continue; if (link_id >= 0 && sta->sta.valid_links && !(sta->sta.valid_links & BIT(link_id))) continue; if (!WARN_ON(__sta_info_destroy_part1(sta))) list_add(&sta->free_list, &free_list); ret++; } if (!list_empty(&free_list)) { bool support_p2p_ps = true; synchronize_net(); list_for_each_entry_safe(sta, tmp, &free_list, free_list) { if (!sta->sta.support_p2p_ps) support_p2p_ps = false; __sta_info_destroy_part2(sta, false); } ieee80211_recalc_min_chandef(sdata, -1); if (!support_p2p_ps) ieee80211_recalc_p2p_go_ps_allowed(sdata); } return ret; } void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, unsigned long exp_time) { struct ieee80211_local *local = sdata->local; struct sta_info *sta, *tmp; lockdep_assert_wiphy(local->hw.wiphy); list_for_each_entry_safe(sta, tmp, &local->sta_list, list) { unsigned long last_active = ieee80211_sta_last_active(sta); if (sdata != sta->sdata) continue; if (time_is_before_jiffies(last_active + exp_time)) { sta_dbg(sta->sdata, "expiring inactive STA %pM\n", sta->sta.addr); if (ieee80211_vif_is_mesh(&sdata->vif) && test_sta_flag(sta, WLAN_STA_PS_STA)) atomic_dec(&sdata->u.mesh.ps.num_sta_ps); WARN_ON(__sta_info_destroy(sta)); } } } struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, const u8 *addr, const u8 *localaddr) { struct ieee80211_local *local = hw_to_local(hw); struct rhlist_head *tmp; struct sta_info *sta; /* * Just return a random station if localaddr is NULL * ... first in list. */ for_each_sta_info(local, addr, sta, tmp) { if (localaddr && !ether_addr_equal(sta->sdata->vif.addr, localaddr)) continue; if (!sta->uploaded) return NULL; return &sta->sta; } return NULL; } EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) { struct sta_info *sta; if (!vif) return NULL; sta = sta_info_get_bss(vif_to_sdata(vif), addr); if (!sta) return NULL; if (!sta->uploaded) return NULL; return &sta->sta; } EXPORT_SYMBOL(ieee80211_find_sta); /* powersave support code */ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct sk_buff_head pending; int filtered = 0, buffered = 0, ac, i; unsigned long flags; struct ps_data *ps; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; else if (ieee80211_vif_is_mesh(&sdata->vif)) ps = &sdata->u.mesh.ps; else return; clear_sta_flag(sta, WLAN_STA_SP); BUILD_BUG_ON(BITS_TO_LONGS(IEEE80211_NUM_TIDS) > 1); sta->driver_buffered_tids = 0; sta->txq_buffered_tids = 0; if (!ieee80211_hw_check(&local->hw, AP_LINK_PS)) drv_sta_notify(local, sdata, STA_NOTIFY_AWAKE, &sta->sta); for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { if (!sta->sta.txq[i] || !txq_has_queue(sta->sta.txq[i])) continue; schedule_and_wake_txq(local, to_txq_info(sta->sta.txq[i])); } skb_queue_head_init(&pending); /* sync with ieee80211_tx_h_unicast_ps_buf */ spin_lock_bh(&sta->ps_lock); /* Send all buffered frames to the station */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { int count = skb_queue_len(&pending), tmp; spin_lock_irqsave(&sta->tx_filtered[ac].lock, flags); skb_queue_splice_tail_init(&sta->tx_filtered[ac], &pending); spin_unlock_irqrestore(&sta->tx_filtered[ac].lock, flags); tmp = skb_queue_len(&pending); filtered += tmp - count; count = tmp; spin_lock_irqsave(&sta->ps_tx_buf[ac].lock, flags); skb_queue_splice_tail_init(&sta->ps_tx_buf[ac], &pending); spin_unlock_irqrestore(&sta->ps_tx_buf[ac].lock, flags); tmp = skb_queue_len(&pending); buffered += tmp - count; } ieee80211_add_pending_skbs(local, &pending); /* now we're no longer in the deliver code */ clear_sta_flag(sta, WLAN_STA_PS_DELIVER); /* The station might have polled and then woken up before we responded, * so clear these flags now to avoid them sticking around. */ clear_sta_flag(sta, WLAN_STA_PSPOLL); clear_sta_flag(sta, WLAN_STA_UAPSD); spin_unlock_bh(&sta->ps_lock); atomic_dec(&ps->num_sta_ps); local->total_ps_buffered -= buffered; sta_info_recalc_tim(sta); ps_dbg(sdata, "STA %pM aid %d sending %d filtered/%d PS frames since STA woke up\n", sta->sta.addr, sta->sta.aid, filtered, buffered); ieee80211_check_fast_xmit(sta); } static void ieee80211_send_null_response(struct sta_info *sta, int tid, enum ieee80211_frame_release_type reason, bool call_driver, bool more_data) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct ieee80211_qos_hdr *nullfunc; struct sk_buff *skb; int size = sizeof(*nullfunc); __le16 fc; bool qos = sta->sta.wme; struct ieee80211_tx_info *info; struct ieee80211_chanctx_conf *chanctx_conf; if (qos) { fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_QOS_NULLFUNC | IEEE80211_FCTL_FROMDS); } else { size -= 2; fc = cpu_to_le16(IEEE80211_FTYPE_DATA | IEEE80211_STYPE_NULLFUNC | IEEE80211_FCTL_FROMDS); } skb = dev_alloc_skb(local->hw.extra_tx_headroom + size); if (!skb) return; skb_reserve(skb, local->hw.extra_tx_headroom); nullfunc = skb_put(skb, size); nullfunc->frame_control = fc; nullfunc->duration_id = 0; memcpy(nullfunc->addr1, sta->sta.addr, ETH_ALEN); memcpy(nullfunc->addr2, sdata->vif.addr, ETH_ALEN); memcpy(nullfunc->addr3, sdata->vif.addr, ETH_ALEN); nullfunc->seq_ctrl = 0; skb->priority = tid; skb_set_queue_mapping(skb, ieee802_1d_to_ac[tid]); if (qos) { nullfunc->qos_ctrl = cpu_to_le16(tid); if (reason == IEEE80211_FRAME_RELEASE_UAPSD) { nullfunc->qos_ctrl |= cpu_to_le16(IEEE80211_QOS_CTL_EOSP); if (more_data) nullfunc->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); } } info = IEEE80211_SKB_CB(skb); /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. Also set EOSP to indicate this packet * ends the poll/service period. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER | IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; if (call_driver) drv_allow_buffered_frames(local, sta, BIT(tid), 1, reason, false); skb->dev = sdata->dev; rcu_read_lock(); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); if (WARN_ON(!chanctx_conf)) { rcu_read_unlock(); kfree_skb(skb); return; } info->band = chanctx_conf->def.chan->band; ieee80211_xmit(sdata, sta, skb); rcu_read_unlock(); } static int find_highest_prio_tid(unsigned long tids) { /* lower 3 TIDs aren't ordered perfectly */ if (tids & 0xF8) return fls(tids) - 1; /* TID 0 is BE just like TID 3 */ if (tids & BIT(0)) return 0; return fls(tids) - 1; } /* Indicates if the MORE_DATA bit should be set in the last * frame obtained by ieee80211_sta_ps_get_frames. * Note that driver_release_tids is relevant only if * reason = IEEE80211_FRAME_RELEASE_PSPOLL */ static bool ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs, enum ieee80211_frame_release_type reason, unsigned long driver_release_tids) { int ac; /* If the driver has data on more than one TID then * certainly there's more data if we release just a * single frame now (from a single TID). This will * only happen for PS-Poll. */ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL && hweight16(driver_release_tids) > 1) return true; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) return true; } return false; } static void ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason, struct sk_buff_head *frames, unsigned long *driver_release_tids) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; int ac; /* Get response frame(s) and more data bit for the last one. */ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { unsigned long tids; if (ignored_acs & ieee80211_ac_to_qos_mask[ac]) continue; tids = ieee80211_tids_for_ac(ac); /* if we already have frames from software, then we can't also * release from hardware queues */ if (skb_queue_empty(frames)) { *driver_release_tids |= sta->driver_buffered_tids & tids; *driver_release_tids |= sta->txq_buffered_tids & tids; } if (!*driver_release_tids) { struct sk_buff *skb; while (n_frames > 0) { skb = skb_dequeue(&sta->tx_filtered[ac]); if (!skb) { skb = skb_dequeue( &sta->ps_tx_buf[ac]); if (skb) local->total_ps_buffered--; } if (!skb) break; n_frames--; __skb_queue_tail(frames, skb); } } /* If we have more frames buffered on this AC, then abort the * loop since we can't send more data from other ACs before * the buffered frames from this. */ if (!skb_queue_empty(&sta->tx_filtered[ac]) || !skb_queue_empty(&sta->ps_tx_buf[ac])) break; } } static void ieee80211_sta_ps_deliver_response(struct sta_info *sta, int n_frames, u8 ignored_acs, enum ieee80211_frame_release_type reason) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; unsigned long driver_release_tids = 0; struct sk_buff_head frames; bool more_data; /* Service or PS-Poll period starts */ set_sta_flag(sta, WLAN_STA_SP); __skb_queue_head_init(&frames); ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason, &frames, &driver_release_tids); more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids); if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL) driver_release_tids = BIT(find_highest_prio_tid(driver_release_tids)); if (skb_queue_empty(&frames) && !driver_release_tids) { int tid, ac; /* * For PS-Poll, this can only happen due to a race condition * when we set the TIM bit and the station notices it, but * before it can poll for the frame we expire it. * * For uAPSD, this is said in the standard (11.2.1.5 h): * At each unscheduled SP for a non-AP STA, the AP shall * attempt to transmit at least one MSDU or MMPDU, but no * more than the value specified in the Max SP Length field * in the QoS Capability element from delivery-enabled ACs, * that are destined for the non-AP STA. * * Since we have no other MSDU/MMPDU, transmit a QoS null frame. */ /* This will evaluate to 1, 3, 5 or 7. */ for (ac = IEEE80211_AC_VO; ac < IEEE80211_NUM_ACS; ac++) if (!(ignored_acs & ieee80211_ac_to_qos_mask[ac])) break; tid = 7 - 2 * ac; ieee80211_send_null_response(sta, tid, reason, true, false); } else if (!driver_release_tids) { struct sk_buff_head pending; struct sk_buff *skb; int num = 0; u16 tids = 0; bool need_null = false; skb_queue_head_init(&pending); while ((skb = __skb_dequeue(&frames))) { struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); struct ieee80211_hdr *hdr = (void *) skb->data; u8 *qoshdr = NULL; num++; /* * Tell TX path to send this frame even though the * STA may still remain is PS mode after this frame * exchange. */ info->flags |= IEEE80211_TX_CTL_NO_PS_BUFFER; info->control.flags |= IEEE80211_TX_CTRL_PS_RESPONSE; /* * Use MoreData flag to indicate whether there are * more buffered frames for this STA */ if (more_data || !skb_queue_empty(&frames)) hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); else hdr->frame_control &= cpu_to_le16(~IEEE80211_FCTL_MOREDATA); if (ieee80211_is_data_qos(hdr->frame_control) || ieee80211_is_qos_nullfunc(hdr->frame_control)) qoshdr = ieee80211_get_qos_ctl(hdr); tids |= BIT(skb->priority); __skb_queue_tail(&pending, skb); /* end service period after last frame or add one */ if (!skb_queue_empty(&frames)) continue; if (reason != IEEE80211_FRAME_RELEASE_UAPSD) { /* for PS-Poll, there's only one frame */ info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; break; } /* For uAPSD, things are a bit more complicated. If the * last frame has a QoS header (i.e. is a QoS-data or * QoS-nulldata frame) then just set the EOSP bit there * and be done. * If the frame doesn't have a QoS header (which means * it should be a bufferable MMPDU) then we can't set * the EOSP bit in the QoS header; add a QoS-nulldata * frame to the list to send it after the MMPDU. * * Note that this code is only in the mac80211-release * code path, we assume that the driver will not buffer * anything but QoS-data frames, or if it does, will * create the QoS-nulldata frame by itself if needed. * * Cf. 802.11-2012 10.2.1.10 (c). */ if (qoshdr) { *qoshdr |= IEEE80211_QOS_CTL_EOSP; info->flags |= IEEE80211_TX_STATUS_EOSP | IEEE80211_TX_CTL_REQ_TX_STATUS; } else { /* The standard isn't completely clear on this * as it says the more-data bit should be set * if there are more BUs. The QoS-Null frame * we're about to send isn't buffered yet, we * only create it below, but let's pretend it * was buffered just in case some clients only * expect more-data=0 when eosp=1. */ hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_MOREDATA); need_null = true; num++; } break; } drv_allow_buffered_frames(local, sta, tids, num, reason, more_data); ieee80211_add_pending_skbs(local, &pending); if (need_null) ieee80211_send_null_response( sta, find_highest_prio_tid(tids), reason, false, false); sta_info_recalc_tim(sta); } else { int tid; /* * We need to release a frame that is buffered somewhere in the * driver ... it'll have to handle that. * Note that the driver also has to check the number of frames * on the TIDs we're releasing from - if there are more than * n_frames it has to set the more-data bit (if we didn't ask * it to set it anyway due to other buffered frames); if there * are fewer than n_frames it has to make sure to adjust that * to allow the service period to end properly. */ drv_release_buffered_frames(local, sta, driver_release_tids, n_frames, reason, more_data); /* * Note that we don't recalculate the TIM bit here as it would * most likely have no effect at all unless the driver told us * that the TID(s) became empty before returning here from the * release function. * Either way, however, when the driver tells us that the TID(s) * became empty or we find that a txq became empty, we'll do the * TIM recalculation. */ for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || txq_has_queue(sta->sta.txq[tid])) continue; sta_info_recalc_tim(sta); break; } } } void ieee80211_sta_ps_deliver_poll_response(struct sta_info *sta) { u8 ignore_for_response = sta->sta.uapsd_queues; /* * If all ACs are delivery-enabled then we should reply * from any of them, if only some are enabled we reply * only from the non-enabled ones. */ if (ignore_for_response == BIT(IEEE80211_NUM_ACS) - 1) ignore_for_response = 0; ieee80211_sta_ps_deliver_response(sta, 1, ignore_for_response, IEEE80211_FRAME_RELEASE_PSPOLL); } void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta) { int n_frames = sta->sta.max_sp; u8 delivery_enabled = sta->sta.uapsd_queues; /* * If we ever grow support for TSPEC this might happen if * the TSPEC update from hostapd comes in between a trigger * frame setting WLAN_STA_UAPSD in the RX path and this * actually getting called. */ if (!delivery_enabled) return; switch (sta->sta.max_sp) { case 1: n_frames = 2; break; case 2: n_frames = 4; break; case 3: n_frames = 6; break; case 0: /* XXX: what is a good value? */ n_frames = 128; break; } ieee80211_sta_ps_deliver_response(sta, n_frames, ~delivery_enabled, IEEE80211_FRAME_RELEASE_UAPSD); } void ieee80211_sta_block_awake(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, bool block) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); trace_api_sta_block_awake(sta->local, pubsta, block); if (block) { set_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_clear_fast_xmit(sta); return; } if (!test_sta_flag(sta, WLAN_STA_PS_DRIVER)) return; if (!test_sta_flag(sta, WLAN_STA_PS_STA)) { set_sta_flag(sta, WLAN_STA_PS_DELIVER); clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else if (test_sta_flag(sta, WLAN_STA_PSPOLL) || test_sta_flag(sta, WLAN_STA_UAPSD)) { /* must be asleep in this case */ clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_queue_work(hw, &sta->drv_deliver_wk); } else { clear_sta_flag(sta, WLAN_STA_PS_DRIVER); ieee80211_check_fast_xmit(sta); } } EXPORT_SYMBOL(ieee80211_sta_block_awake); void ieee80211_sta_eosp(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->local; trace_api_eosp(local, pubsta); clear_sta_flag(sta, WLAN_STA_SP); } EXPORT_SYMBOL(ieee80211_sta_eosp); void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); enum ieee80211_frame_release_type reason; bool more_data; trace_api_send_eosp_nullfunc(sta->local, pubsta, tid); reason = IEEE80211_FRAME_RELEASE_UAPSD; more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues, reason, 0); ieee80211_send_null_response(sta, tid, reason, false, more_data); } EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc); void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta, u8 tid, bool buffered) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); if (WARN_ON(tid >= IEEE80211_NUM_TIDS)) return; trace_api_sta_set_buffered(sta->local, pubsta, tid, buffered); if (buffered) set_bit(tid, &sta->driver_buffered_tids); else clear_bit(tid, &sta->driver_buffered_tids); sta_info_recalc_tim(sta); } EXPORT_SYMBOL(ieee80211_sta_set_buffered); void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, u32 tx_airtime, u32 rx_airtime) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); struct ieee80211_local *local = sta->sdata->local; u8 ac = ieee80211_ac_from_tid(tid); u32 airtime = 0; if (sta->local->airtime_flags & AIRTIME_USE_TX) airtime += tx_airtime; if (sta->local->airtime_flags & AIRTIME_USE_RX) airtime += rx_airtime; spin_lock_bh(&local->active_txq_lock[ac]); sta->airtime[ac].tx_airtime += tx_airtime; sta->airtime[ac].rx_airtime += rx_airtime; if (ieee80211_sta_keep_active(sta, ac)) sta->airtime[ac].deficit -= airtime; spin_unlock_bh(&local->active_txq_lock[ac]); } EXPORT_SYMBOL(ieee80211_sta_register_airtime); void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { bool first = true; int link_id; if (!sta->sta.valid_links || !sta->sta.mlo) { sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { struct ieee80211_link_sta *link_sta; int i; if (!(active_links & BIT(link_id))) continue; link_sta = rcu_dereference(sta->sta.link[link_id]); if (!link_sta) continue; if (first) { sta->cur = sta->sta.deflink.agg; first = false; continue; } sta->cur.max_amsdu_len = min(sta->cur.max_amsdu_len, link_sta->agg.max_amsdu_len); sta->cur.max_rc_amsdu_len = min(sta->cur.max_rc_amsdu_len, link_sta->agg.max_rc_amsdu_len); for (i = 0; i < ARRAY_SIZE(sta->cur.max_tid_amsdu_len); i++) sta->cur.max_tid_amsdu_len[i] = min(sta->cur.max_tid_amsdu_len[i], link_sta->agg.max_tid_amsdu_len[i]); } rcu_read_unlock(); sta->sta.cur = &sta->cur; } void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); void ieee80211_sta_update_pending_airtime(struct ieee80211_local *local, struct sta_info *sta, u8 ac, u16 tx_airtime, bool tx_completed) { int tx_pending; if (!wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) return; if (!tx_completed) { if (sta) atomic_add(tx_airtime, &sta->airtime[ac].aql_tx_pending); atomic_add(tx_airtime, &local->aql_total_pending_airtime); atomic_add(tx_airtime, &local->aql_ac_pending_airtime[ac]); return; } if (sta) { tx_pending = atomic_sub_return(tx_airtime, &sta->airtime[ac].aql_tx_pending); if (tx_pending < 0) atomic_cmpxchg(&sta->airtime[ac].aql_tx_pending, tx_pending, 0); } atomic_sub(tx_airtime, &local->aql_total_pending_airtime); tx_pending = atomic_sub_return(tx_airtime, &local->aql_ac_pending_airtime[ac]); if (WARN_ONCE(tx_pending < 0, "Device %s AC %d pending airtime underflow: %u, %u", wiphy_name(local->hw.wiphy), ac, tx_pending, tx_airtime)) { atomic_cmpxchg(&local->aql_ac_pending_airtime[ac], tx_pending, 0); atomic_sub(tx_pending, &local->aql_total_pending_airtime); } } static struct ieee80211_sta_rx_stats * sta_get_last_rx_stats(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = &sta->deflink.rx_stats; int cpu; if (!sta->deflink.pcpu_rx_stats) return stats; for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpustats; cpustats = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); if (time_after(cpustats->last_rx, stats->last_rx)) stats = cpustats; } return stats; } static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, struct rate_info *rinfo) { rinfo->bw = STA_STATS_GET(BW, rate); switch (STA_STATS_GET(TYPE, rate)) { case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); rinfo->nss = STA_STATS_GET(VHT_NSS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; rinfo->mcs = STA_STATS_GET(HT_MCS, rate); if (STA_STATS_GET(SGI, rate)) rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; break; case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; int band = STA_STATS_GET(LEGACY_BAND, rate); int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); sband = local->hw.wiphy->bands[band]; if (WARN_ON_ONCE(!sband->bitrates)) break; brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) shift = 1; else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); break; } case STA_STATS_RATE_TYPE_HE: rinfo->flags = RATE_INFO_FLAGS_HE_MCS; rinfo->mcs = STA_STATS_GET(HE_MCS, rate); rinfo->nss = STA_STATS_GET(HE_NSS, rate); rinfo->he_gi = STA_STATS_GET(HE_GI, rate); rinfo->he_ru_alloc = STA_STATS_GET(HE_RU, rate); rinfo->he_dcm = STA_STATS_GET(HE_DCM, rate); break; case STA_STATS_RATE_TYPE_EHT: rinfo->flags = RATE_INFO_FLAGS_EHT_MCS; rinfo->mcs = STA_STATS_GET(EHT_MCS, rate); rinfo->nss = STA_STATS_GET(EHT_NSS, rate); rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; } } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) { u32 rate = READ_ONCE(sta_get_last_rx_stats(sta)->last_rate); if (rate == STA_STATS_RATE_INVALID) return -EINVAL; sta_stats_decode_rate(sta->local, rate, rinfo); return 0; } static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, int tid) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } static void sta_set_tidstats(struct sta_info *sta, struct cfg80211_tid_stats *tidstats, int tid) { struct ieee80211_local *local = sta->local; int cpu; if (!(tidstats->filled & BIT(NL80211_TID_STATS_RX_MSDU))) { tidstats->rx_msdu += sta_get_tidstats_msdu(&sta->deflink.rx_stats, tid); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); tidstats->rx_msdu += sta_get_tidstats_msdu(cpurxs, tid); } } tidstats->filled |= BIT(NL80211_TID_STATS_RX_MSDU); } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU))) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU); tidstats->tx_msdu = sta->deflink.tx_stats.msdu[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_RETRIES)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_RETRIES); tidstats->tx_msdu_retries = sta->deflink.status_stats.msdu_retries[tid]; } if (!(tidstats->filled & BIT(NL80211_TID_STATS_TX_MSDU_FAILED)) && ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) { tidstats->filled |= BIT(NL80211_TID_STATS_TX_MSDU_FAILED); tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } if (tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) { unsigned int start; u64 value; do { start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } #ifdef CONFIG_MAC80211_MESH static void sta_set_mesh_sinfo(struct sta_info *sta, struct station_info *sinfo) { struct ieee80211_local *local = sta->sdata->local; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_LLID) | BIT_ULL(NL80211_STA_INFO_PLID) | BIT_ULL(NL80211_STA_INFO_PLINK_STATE) | BIT_ULL(NL80211_STA_INFO_LOCAL_PM) | BIT_ULL(NL80211_STA_INFO_PEER_PM) | BIT_ULL(NL80211_STA_INFO_NONPEER_PM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_GATE) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TO_AS); sinfo->llid = sta->mesh->llid; sinfo->plid = sta->mesh->plid; sinfo->plink_state = sta->mesh->plink_state; if (test_sta_flag(sta, WLAN_STA_TOFFSET_KNOWN)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_T_OFFSET); sinfo->t_offset = sta->mesh->t_offset; } sinfo->local_pm = sta->mesh->local_pm; sinfo->peer_pm = sta->mesh->peer_pm; sinfo->nonpeer_pm = sta->mesh->nonpeer_pm; sinfo->connected_to_gate = sta->mesh->connected_to_gate; sinfo->connected_to_as = sta->mesh->connected_to_as; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_LINK_METRIC); sinfo->airtime_link_metric = airtime_link_metric_get(local, sta); } #endif void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, bool tidstats) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta); sinfo->generation = sdata->local->sta_generation; /* do before driver, so beacon filtering drivers have a * chance to e.g. just add the number of filtered beacons * (or just modify the value entirely, of course) */ if (sdata->vif.type == NL80211_IFTYPE_STATION) sinfo->rx_beacon = sdata->deflink.u.mgd.count_beacon_signal; drv_sta_statistics(local, sdata, &sta->sta, sinfo); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_INACTIVE_TIME) | BIT_ULL(NL80211_STA_INFO_STA_FLAGS) | BIT_ULL(NL80211_STA_INFO_BSS_PARAM) | BIT_ULL(NL80211_STA_INFO_CONNECTED_TIME) | BIT_ULL(NL80211_STA_INFO_ASSOC_AT_BOOTTIME) | BIT_ULL(NL80211_STA_INFO_RX_DROP_MISC); if (sdata->vif.type == NL80211_IFTYPE_STATION) { sinfo->beacon_loss_count = sdata->deflink.u.mgd.beacon_loss_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_LOSS); } sinfo->connected_time = ktime_get_seconds() - sta->last_connected; sinfo->assoc_at = sta->assoc_at; sinfo->inactive_time = jiffies_to_msecs(jiffies - ieee80211_sta_last_active(sta)); if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_TX_BYTES64) | BIT_ULL(NL80211_STA_INFO_TX_BYTES)))) { sinfo->tx_bytes = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_bytes += sta->deflink.tx_stats.bytes[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_PACKETS))) { sinfo->tx_packets = 0; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_packets += sta->deflink.tx_stats.packets[ac]; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_PACKETS); } if (!(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_RX_BYTES64) | BIT_ULL(NL80211_STA_INFO_RX_BYTES)))) { sinfo->rx_bytes += sta_get_stats_bytes(&sta->deflink.rx_stats); if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_bytes += sta_get_stats_bytes(cpurxs); } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BYTES64); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_PACKETS))) { sinfo->rx_packets = sta->deflink.rx_stats.packets; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_packets += cpurxs->packets; } } sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_PACKETS); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_RETRIES))) { sinfo->tx_retries = sta->deflink.status_stats.retry_count; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_RETRIES); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_FAILED))) { sinfo->tx_failed = sta->deflink.status_stats.retry_failed; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_FAILED); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->rx_duration += sta->airtime[ac].rx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_DURATION))) { for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) sinfo->tx_duration += sta->airtime[ac].tx_airtime; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_DURATION); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT))) { sinfo->airtime_weight = sta->airtime_weight; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_AIRTIME_WEIGHT); } sinfo->rx_dropped_misc = sta->deflink.rx_stats.dropped; if (sta->deflink.pcpu_rx_stats) { for_each_possible_cpu(cpu) { struct ieee80211_sta_rx_stats *cpurxs; cpurxs = per_cpu_ptr(sta->deflink.pcpu_rx_stats, cpu); sinfo->rx_dropped_misc += cpurxs->dropped; } } if (sdata->vif.type == NL80211_IFTYPE_STATION && !(sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER)) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_BEACON_RX) | BIT_ULL(NL80211_STA_INFO_BEACON_SIGNAL_AVG); sinfo->rx_beacon_signal_avg = ieee80211_ave_rssi(&sdata->vif); } if (ieee80211_hw_check(&sta->local->hw, SIGNAL_DBM) || ieee80211_hw_check(&sta->local->hw, SIGNAL_UNSPEC)) { if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL))) { sinfo->signal = (s8)last_rxstats->last_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL); } if (!sta->deflink.pcpu_rx_stats && !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG))) { sinfo->signal_avg = -ewma_signal_read(&sta->deflink.rx_stats_avg.signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_SIGNAL_AVG); } } /* for the average - if pcpu_rx_stats isn't set - rxstats must point to * the sta->rx_stats struct, so the check here is fine with and without * pcpu statistics */ if (last_rxstats->chains && !(sinfo->filled & (BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL) | BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG)))) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL); if (!sta->deflink.pcpu_rx_stats) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_CHAIN_SIGNAL_AVG); sinfo->chains = last_rxstats->chains; for (i = 0; i < ARRAY_SIZE(sinfo->chain_signal); i++) { sinfo->chain_signal[i] = last_rxstats->chain_signal_last[i]; sinfo->chain_signal_avg[i] = -ewma_signal_read(&sta->deflink.rx_stats_avg.chain_signal[i]); } } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_TX_BITRATE)) && !sta->sta.valid_links && ieee80211_rate_valid(&sta->deflink.tx_stats.last_rate)) { sta_set_rate_info_tx(sta, &sta->deflink.tx_stats.last_rate, &sinfo->txrate); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_TX_BITRATE); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_RX_BITRATE)) && !sta->sta.valid_links) { if (sta_set_rate_info_rx(sta, &sinfo->rxrate) == 0) sinfo->filled |= BIT_ULL(NL80211_STA_INFO_RX_BITRATE); } if (tidstats && !cfg80211_sinfo_alloc_tid_stats(sinfo, GFP_KERNEL)) { for (i = 0; i < IEEE80211_NUM_TIDS + 1; i++) sta_set_tidstats(sta, &sinfo->pertid[i], i); } #ifdef CONFIG_MAC80211_MESH if (ieee80211_vif_is_mesh(&sdata->vif)) sta_set_mesh_sinfo(sta, sinfo); #endif sinfo->bss_param.flags = 0; if (sdata->vif.bss_conf.use_cts_prot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_CTS_PROT; if (sdata->vif.bss_conf.use_short_preamble) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_PREAMBLE; if (sdata->vif.bss_conf.use_short_slot) sinfo->bss_param.flags |= BSS_PARAM_FLAGS_SHORT_SLOT_TIME; sinfo->bss_param.dtim_period = sdata->vif.bss_conf.dtim_period; sinfo->bss_param.beacon_interval = sdata->vif.bss_conf.beacon_int; sinfo->sta_flags.set = 0; sinfo->sta_flags.mask = BIT(NL80211_STA_FLAG_AUTHORIZED) | BIT(NL80211_STA_FLAG_SHORT_PREAMBLE) | BIT(NL80211_STA_FLAG_WME) | BIT(NL80211_STA_FLAG_MFP) | BIT(NL80211_STA_FLAG_AUTHENTICATED) | BIT(NL80211_STA_FLAG_ASSOCIATED) | BIT(NL80211_STA_FLAG_TDLS_PEER); if (test_sta_flag(sta, WLAN_STA_AUTHORIZED)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHORIZED); if (test_sta_flag(sta, WLAN_STA_SHORT_PREAMBLE)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_SHORT_PREAMBLE); if (sta->sta.wme) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_WME); if (test_sta_flag(sta, WLAN_STA_MFP)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_MFP); if (test_sta_flag(sta, WLAN_STA_AUTH)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_AUTHENTICATED); if (test_sta_flag(sta, WLAN_STA_ASSOC)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_ASSOCIATED); if (test_sta_flag(sta, WLAN_STA_TDLS_PEER)) sinfo->sta_flags.set |= BIT(NL80211_STA_FLAG_TDLS_PEER); thr = sta_get_expected_throughput(sta); if (thr != 0) { sinfo->filled |= BIT_ULL(NL80211_STA_INFO_EXPECTED_THROUGHPUT); sinfo->expected_throughput = thr; } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->ack_signal = sta->deflink.status_stats.last_ack_signal; sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && sta->deflink.status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->deflink.status_stats.avg_ack_signal); sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } } u32 sta_get_expected_throughput(struct sta_info *sta) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; struct rate_control_ref *ref = NULL; u32 thr = 0; if (test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) ref = local->rate_ctrl; /* check if the driver has a SW RC implementation */ if (ref && ref->ops->get_expected_throughput) thr = ref->ops->get_expected_throughput(sta->rate_ctrl_priv); else thr = drv_get_expected_throughput(local, sta); return thr; } unsigned long ieee80211_sta_last_active(struct sta_info *sta) { struct ieee80211_sta_rx_stats *stats = sta_get_last_rx_stats(sta); if (!sta->deflink.status_stats.last_ack || time_after(stats->last_rx, sta->deflink.status_stats.last_ack)) return stats->last_rx; return sta->deflink.status_stats.last_ack; } static void sta_update_codel_params(struct sta_info *sta, u32 thr) { if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); sta->cparams.ecn = false; } else { sta->cparams.target = MS2TIME(20); sta->cparams.interval = MS2TIME(100); sta->cparams.ecn = true; } } void ieee80211_sta_set_expected_throughput(struct ieee80211_sta *pubsta, u32 thr) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); sta_update_codel_params(sta, thr); } int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct sta_link_alloc *alloc; int ret; lockdep_assert_wiphy(sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); /* must represent an MLD from the start */ if (WARN_ON(!sta->sta.valid_links)) return -EINVAL; if (WARN_ON(sta->sta.valid_links & BIT(link_id) || sta->link[link_id])) return -EBUSY; alloc = kzalloc(sizeof(*alloc), GFP_KERNEL); if (!alloc) return -ENOMEM; ret = sta_info_alloc_link(sdata->local, &alloc->info, GFP_KERNEL); if (ret) { kfree(alloc); return ret; } sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); ieee80211_link_sta_debugfs_add(&alloc->info); return 0; } void ieee80211_sta_free_link(struct sta_info *sta, unsigned int link_id) { lockdep_assert_wiphy(sta->sdata->local->hw.wiphy); WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED)); sta_remove_link(sta, link_id, false); } int ieee80211_sta_activate_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; struct link_sta_info *link_sta; u16 old_links = sta->sta.valid_links; u16 new_links = old_links | BIT(link_id); int ret; link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&sdata->local->hw.wiphy->mtx)); if (WARN_ON(old_links == new_links || !link_sta)) return -EINVAL; rcu_read_lock(); if (link_sta_info_hash_lookup(sdata->local, link_sta->addr)) { rcu_read_unlock(); return -EALREADY; } /* we only modify under the mutex so this is fine */ rcu_read_unlock(); sta->sta.valid_links = new_links; if (WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) goto hash; ieee80211_recalc_min_chandef(sdata, link_id); /* Ensure the values are updated for the driver, * redone by sta_remove_link on failure. */ ieee80211_sta_recalc_aggregates(&sta->sta); ret = drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, new_links); if (ret) { sta->sta.valid_links = old_links; sta_remove_link(sta, link_id, false); return ret; } hash: ret = link_sta_info_hash_add(sdata->local, link_sta); WARN_ON(ret); return 0; } void ieee80211_sta_remove_link(struct sta_info *sta, unsigned int link_id) { struct ieee80211_sub_if_data *sdata = sta->sdata; u16 old_links = sta->sta.valid_links; lockdep_assert_wiphy(sdata->local->hw.wiphy); sta->sta.valid_links &= ~BIT(link_id); if (!WARN_ON(!test_sta_flag(sta, WLAN_STA_INSERTED))) drv_change_sta_links(sdata->local, sdata, &sta->sta, old_links, sta->sta.valid_links); sta_remove_link(sta, link_id, true); } void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len) { u8 val; sta->sta.max_amsdu_subframes = 0; if (ext_capab_len < 8) return; /* The sender might not have sent the last bit, consider it to be 0 */ val = u8_get_bits(ext_capab[7], WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB); /* we did get all the bits, take the MSB as well */ if (ext_capab_len >= 9) val |= u8_get_bits(ext_capab[8], WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB) << 1; if (val) sta->sta.max_amsdu_subframes = 4 << (4 - val); } #ifdef CONFIG_LOCKDEP bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); return lockdep_is_held(&sta->local->hw.wiphy->mtx); } EXPORT_SYMBOL(lockdep_sta_mutex_held); #endif |
| 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 | // SPDX-License-Identifier: GPL-2.0-or-later /* * Do sleep inside a spin-lock * Copyright (c) 1999 by Takashi Iwai <tiwai@suse.de> */ #include <linux/export.h> #include <sound/core.h> #include "seq_lock.h" /* wait until all locks are released */ void snd_use_lock_sync_helper(snd_use_lock_t *lockp, const char *file, int line) { int warn_count = 5 * HZ; if (atomic_read(lockp) < 0) { pr_warn("ALSA: seq_lock: lock trouble [counter = %d] in %s:%d\n", atomic_read(lockp), file, line); return; } while (atomic_read(lockp) > 0) { if (warn_count-- == 0) pr_warn("ALSA: seq_lock: waiting [%d left] in %s:%d\n", atomic_read(lockp), file, line); schedule_timeout_uninterruptible(1); } } EXPORT_SYMBOL(snd_use_lock_sync_helper); |
| 16 23 16 12 8 13 9 14 16 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 | #ifndef __LZ4DEFS_H__ #define __LZ4DEFS_H__ /* * lz4defs.h -- common and architecture specific defines for the kernel usage * LZ4 - Fast LZ compression algorithm * Copyright (C) 2011-2016, Yann Collet. * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are * met: * * Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above * copyright notice, this list of conditions and the following disclaimer * in the documentation and/or other materials provided with the * distribution. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * You can contact the author at : * - LZ4 homepage : http://www.lz4.org * - LZ4 source repository : https://github.com/lz4/lz4 * * Changed for kernel usage by: * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> */ #include <linux/unaligned.h> #include <linux/bitops.h> #include <linux/string.h> /* memset, memcpy */ #include <linux/lz4.h> #define FORCE_INLINE __always_inline /*-************************************ * Basic Types **************************************/ #include <linux/types.h> typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; typedef uintptr_t uptrval; /*-************************************ * Architecture specifics **************************************/ #if defined(CONFIG_64BIT) #define LZ4_ARCH64 1 #else #define LZ4_ARCH64 0 #endif #if defined(__LITTLE_ENDIAN) #define LZ4_LITTLE_ENDIAN 1 #else #define LZ4_LITTLE_ENDIAN 0 #endif /*-************************************ * Constants **************************************/ #define MINMATCH 4 #define WILDCOPYLENGTH 8 #define LASTLITERALS 5 #define MFLIMIT (WILDCOPYLENGTH + MINMATCH) /* * ensure it's possible to write 2 x wildcopyLength * without overflowing output buffer */ #define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH) /* Increase this value ==> compression run slower on incompressible data */ #define LZ4_SKIPTRIGGER 6 #define HASH_UNIT sizeof(size_t) #define KB (1 << 10) #define MB (1 << 20) #define GB (1U << 30) #define MAX_DISTANCE LZ4_DISTANCE_MAX #define STEPSIZE sizeof(size_t) #define ML_BITS 4 #define ML_MASK ((1U << ML_BITS) - 1) #define RUN_BITS (8 - ML_BITS) #define RUN_MASK ((1U << RUN_BITS) - 1) /*-************************************ * Reading and writing into memory **************************************/ static FORCE_INLINE U16 LZ4_read16(const void *ptr) { return get_unaligned((const U16 *)ptr); } static FORCE_INLINE U32 LZ4_read32(const void *ptr) { return get_unaligned((const U32 *)ptr); } static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) { return get_unaligned((const size_t *)ptr); } static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) { put_unaligned(value, (U16 *)memPtr); } static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) { put_unaligned(value, (U32 *)memPtr); } static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) { return get_unaligned_le16(memPtr); } static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) { return put_unaligned_le16(value, memPtr); } /* * LZ4 relies on memcpy with a constant size being inlined. In freestanding * environments, the compiler can't assume the implementation of memcpy() is * standard compliant, so apply its specialized memcpy() inlining logic. When * possible, use __builtin_memcpy() to tell the compiler to analyze memcpy() * as-if it were standard compliant, so it can inline it in freestanding * environments. This is needed when decompressing the Linux Kernel, for example. */ #define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) #define LZ4_memmove(dst, src, size) __builtin_memmove(dst, src, size) static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { #if LZ4_ARCH64 U64 a = get_unaligned((const U64 *)src); put_unaligned(a, (U64 *)dst); #else U32 a = get_unaligned((const U32 *)src); U32 b = get_unaligned((const U32 *)src + 1); put_unaligned(a, (U32 *)dst); put_unaligned(b, (U32 *)dst + 1); #endif } /* * customized variant of memcpy, * which can overwrite up to 7 bytes beyond dstEnd */ static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, const void *srcPtr, void *dstEnd) { BYTE *d = (BYTE *)dstPtr; const BYTE *s = (const BYTE *)srcPtr; BYTE *const e = (BYTE *)dstEnd; do { LZ4_copy8(d, s); d += 8; s += 8; } while (d < e); } static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) { #if LZ4_LITTLE_ENDIAN return __ffs(val) >> 3; #else return (BITS_PER_LONG - 1 - __fls(val)) >> 3; #endif } static FORCE_INLINE unsigned int LZ4_count( const BYTE *pIn, const BYTE *pMatch, const BYTE *pInLimit) { const BYTE *const pStart = pIn; while (likely(pIn < pInLimit - (STEPSIZE - 1))) { size_t const diff = LZ4_read_ARCH(pMatch) ^ LZ4_read_ARCH(pIn); if (!diff) { pIn += STEPSIZE; pMatch += STEPSIZE; continue; } pIn += LZ4_NbCommonBytes(diff); return (unsigned int)(pIn - pStart); } #if LZ4_ARCH64 if ((pIn < (pInLimit - 3)) && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { pIn += 4; pMatch += 4; } #endif if ((pIn < (pInLimit - 1)) && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { pIn += 2; pMatch += 2; } if ((pIn < pInLimit) && (*pMatch == *pIn)) pIn++; return (unsigned int)(pIn - pStart); } typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; typedef enum { byPtr, byU32, byU16 } tableType_t; typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive; #define LZ4_STATIC_ASSERT(c) BUILD_BUG_ON(!(c)) #endif |
| 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | /* SPDX-License-Identifier: GPL-2.0-only */ /* * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright (C) 2022, 2024 Intel Corporation */ #ifndef IEEE80211_RATE_H #define IEEE80211_RATE_H #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "sta_info.h" #include "driver-ops.h" struct rate_control_ref { const struct rate_control_ops *ops; void *priv; }; void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc); void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_tx_status *st); void rate_control_rate_init(struct link_sta_info *link_sta); void rate_control_rate_init_all_links(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct link_sta_info *link_sta, u32 changed); static inline void *rate_control_alloc_sta(struct rate_control_ref *ref, struct sta_info *sta, gfp_t gfp) { spin_lock_init(&sta->rate_ctrl_lock); return ref->ops->alloc_sta(ref->priv, &sta->sta, gfp); } static inline void rate_control_free_sta(struct sta_info *sta) { struct rate_control_ref *ref = sta->rate_ctrl; struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; ref->ops->free_sta(ref->priv, ista, priv_sta); } static inline void rate_control_add_sta_debugfs(struct sta_info *sta) { #ifdef CONFIG_MAC80211_DEBUGFS struct rate_control_ref *ref = sta->rate_ctrl; if (ref && sta->debugfs_dir && ref->ops->add_sta_debugfs) ref->ops->add_sta_debugfs(ref->priv, sta->rate_ctrl_priv, sta->debugfs_dir); #endif } extern const struct debugfs_short_fops rcname_ops; static inline void rate_control_add_debugfs(struct ieee80211_local *local) { #ifdef CONFIG_MAC80211_DEBUGFS struct dentry *debugfsdir; if (!local->rate_ctrl) return; if (!local->rate_ctrl->ops->add_debugfs) return; debugfsdir = debugfs_create_dir("rc", local->hw.wiphy->debugfsdir); local->debugfs.rcdir = debugfsdir; debugfs_create_file("name", 0400, debugfsdir, local->rate_ctrl, &rcname_ops); local->rate_ctrl->ops->add_debugfs(&local->hw, local->rate_ctrl->priv, debugfsdir); #endif } void ieee80211_check_rate_mask(struct ieee80211_link_data *link); /* Get a reference to the rate control algorithm. If `name' is NULL, get the * first available algorithm. */ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, const char *name); void rate_control_deinitialize(struct ieee80211_local *local); /* Rate control algorithms */ #ifdef CONFIG_MAC80211_RC_MINSTREL int rc80211_minstrel_init(void); void rc80211_minstrel_exit(void); #else static inline int rc80211_minstrel_init(void) { return 0; } static inline void rc80211_minstrel_exit(void) { } #endif #endif /* IEEE80211_RATE_H */ |
| 29 5 4 6 4 14 4 9 2 7 9 9 1 4 4 4 4 4 9 5 4 4 4 4 4 5 4 14 14 14 5 12 14 14 8 6 6 4 2 2 12 2 7 6 7 7 7 7 14 14 11 14 25 25 25 24 1 25 1 24 1 1 1 5 2 13 1 10 9 28 28 1 1 1 9 10 1 1 6 11 14 14 12 8 23 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 | // SPDX-License-Identifier: LGPL-2.1 /* * Copyright (c) 2008,2009 NEC Software Tohoku, Ltd. * Written by Takashi Sato <t-sato@yk.jp.nec.com> * Akira Fujita <a-fujita@rs.jp.nec.com> */ #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/slab.h> #include <linux/sched/mm.h> #include "ext4_jbd2.h" #include "ext4.h" #include "ext4_extents.h" /** * get_ext_path() - Find an extent path for designated logical block number. * @inode: inode to be searched * @lblock: logical block number to find an extent path * @path: pointer to an extent path * * ext4_find_extent wrapper. Return an extent path pointer on success, * or an error pointer on failure. */ static inline struct ext4_ext_path * get_ext_path(struct inode *inode, ext4_lblk_t lblock, struct ext4_ext_path *path) { path = ext4_find_extent(inode, lblock, path, EXT4_EX_NOCACHE); if (IS_ERR(path)) return path; if (path[ext_depth(inode)].p_ext == NULL) { ext4_free_ext_path(path); return ERR_PTR(-ENODATA); } return path; } /** * ext4_double_down_write_data_sem() - write lock two inodes's i_data_sem * @first: inode to be locked * @second: inode to be locked * * Acquire write lock of i_data_sem of the two inodes */ void ext4_double_down_write_data_sem(struct inode *first, struct inode *second) { if (first < second) { down_write(&EXT4_I(first)->i_data_sem); down_write_nested(&EXT4_I(second)->i_data_sem, I_DATA_SEM_OTHER); } else { down_write(&EXT4_I(second)->i_data_sem); down_write_nested(&EXT4_I(first)->i_data_sem, I_DATA_SEM_OTHER); } } /** * ext4_double_up_write_data_sem - Release two inodes' write lock of i_data_sem * * @orig_inode: original inode structure to be released its lock first * @donor_inode: donor inode structure to be released its lock second * Release write lock of i_data_sem of two inodes (orig and donor). */ void ext4_double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode) { up_write(&EXT4_I(orig_inode)->i_data_sem); up_write(&EXT4_I(donor_inode)->i_data_sem); } /** * mext_check_coverage - Check that all extents in range has the same type * * @inode: inode in question * @from: block offset of inode * @count: block count to be checked * @unwritten: extents expected to be unwritten * @err: pointer to save error value * * Return 1 if all extents in range has expected type, and zero otherwise. */ static int mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, int unwritten, int *err) { struct ext4_ext_path *path = NULL; struct ext4_extent *ext; int ret = 0; ext4_lblk_t last = from + count; while (from < last) { path = get_ext_path(inode, from, path); if (IS_ERR(path)) { *err = PTR_ERR(path); return ret; } ext = path[ext_depth(inode)].p_ext; if (unwritten != ext4_ext_is_unwritten(ext)) goto out; from += ext4_ext_get_actual_len(ext); } ret = 1; out: ext4_free_ext_path(path); return ret; } /** * mext_folio_double_lock - Grab and lock folio on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure * @index1: folio index * @index2: folio index * @folio: result folio vector * * Grab two locked folio for inode's by inode order */ static int mext_folio_double_lock(struct inode *inode1, struct inode *inode2, pgoff_t index1, pgoff_t index2, struct folio *folio[2]) { struct address_space *mapping[2]; unsigned int flags; BUG_ON(!inode1 || !inode2); if (inode1 < inode2) { mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { swap(index1, index2); mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } flags = memalloc_nofs_save(); folio[0] = __filemap_get_folio(mapping[0], index1, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[0])); if (IS_ERR(folio[0])) { memalloc_nofs_restore(flags); return PTR_ERR(folio[0]); } folio[1] = __filemap_get_folio(mapping[1], index2, FGP_WRITEBEGIN, mapping_gfp_mask(mapping[1])); memalloc_nofs_restore(flags); if (IS_ERR(folio[1])) { folio_unlock(folio[0]); folio_put(folio[0]); return PTR_ERR(folio[1]); } /* * __filemap_get_folio() may not wait on folio's writeback if * BDI not demand that. But it is reasonable to be very conservative * here and explicitly wait on folio's writeback */ folio_wait_writeback(folio[0]); folio_wait_writeback(folio[1]); if (inode1 > inode2) swap(folio[0], folio[1]); return 0; } /* Force folio buffers uptodate w/o dropping folio's lock */ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) { struct inode *inode = folio->mapping->host; sector_t block; struct buffer_head *bh, *head; unsigned int blocksize, block_start, block_end; int nr = 0; bool partial = false; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (folio_test_uptodate(folio)) return 0; blocksize = i_blocksize(inode); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); block = folio_pos(folio) >> inode->i_blkbits; block_end = 0; bh = head; do { block_start = block_end; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (!buffer_uptodate(bh)) partial = true; continue; } if (buffer_uptodate(bh)) continue; if (!buffer_mapped(bh)) { int err = ext4_get_block(inode, block, bh, 0); if (err) return err; if (!buffer_mapped(bh)) { folio_zero_range(folio, block_start, blocksize); set_buffer_uptodate(bh); continue; } } lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); continue; } ext4_read_bh_nowait(bh, 0, NULL, false); nr++; } while (block++, (bh = bh->b_this_page) != head); /* No io required */ if (!nr) goto out; bh = head; do { if (bh_offset(bh) + blocksize <= from) continue; if (bh_offset(bh) > to) break; wait_on_buffer(bh); if (buffer_uptodate(bh)) continue; return -EIO; } while ((bh = bh->b_this_page) != head); out: if (!partial) folio_mark_uptodate(folio); return 0; } /** * move_extent_per_page - Move extent data per page * * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, pgoff_t orig_page_offset, pgoff_t donor_page_offset, int data_offset_in_page, int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct folio *folio[2] = {NULL, NULL}; handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; /* * It needs twice the amount of ordinary journal buffers because * inode and donor_inode may change each different metadata blocks. */ again: *err = 0; jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; handle = ext4_journal_start(orig_inode, EXT4_HT_MOVE_EXTENTS, jblocks); if (IS_ERR(handle)) { *err = PTR_ERR(handle); return 0; } orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; donor_blk_offset = donor_page_offset * blocks_per_page + data_offset_in_page; /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { /* Replace the last block */ tmp_data_size = orig_inode->i_size & (blocksize - 1); /* * If data_size equal zero, it shows data_size is multiples of * blocksize. So we set appropriate value. */ if (tmp_data_size == 0) tmp_data_size = blocksize; data_size = tmp_data_size + ((block_len_in_page - 1) << orig_inode->i_blkbits); } else data_size = block_len_in_page << orig_inode->i_blkbits; replaced_size = data_size; *err = mext_folio_double_lock(orig_inode, donor_inode, orig_page_offset, donor_page_offset, folio); if (unlikely(*err < 0)) goto stop_journal; /* * If orig extent was unwritten it can become initialized * at any time after i_data_sem was dropped, in order to * serialize with delalloc we have recheck extent while we * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to * fallback to data copying */ unwritten = mext_check_coverage(orig_inode, orig_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; if (!unwritten) { ext4_double_up_write_data_sem(orig_inode, donor_inode); goto data_copy; } if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto drop_data_sem; } replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_folios; } data_copy: *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; /* At this point all buffers in range are uptodate, old mapping layout * is no longer required, try to drop it now. */ if (!filemap_release_folio(folio[0], 0) || !filemap_release_folio(folio[1], 0)) { *err = -EBUSY; goto unlock_folios; } ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { block_len_in_page = replaced_count; replaced_size = block_len_in_page << orig_inode->i_blkbits; } else goto unlock_folios; } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ bh = folio_buffers(folio[0]); if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) goto repair_branches; bh = bh->b_this_page; } block_commit_write(folio[0], from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ *err = ext4_jbd2_inode_add_write(handle, orig_inode, (loff_t)orig_page_offset << PAGE_SHIFT, replaced_size); unlock_folios: folio_unlock(folio[0]); folio_put(folio[0]); folio_unlock(folio[1]); folio_put(folio[1]); stop_journal: ext4_journal_stop(handle); if (*err == -ENOSPC && ext4_should_retry_alloc(sb, &retries)) goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; repair_branches: /* * This should never ever happen! * Extents are swapped already, but we are not able to copy data. * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, orig_blk_offset, donor_blk_offset, block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { ext4_error_inode_block(orig_inode, (sector_t)(orig_blk_offset), EIO, "Unable to copy data block," " data will be lost."); *err = -EIO; } replaced_count = 0; goto unlock_folios; } /** * mext_check_arguments - Check whether move extent can be done * * @orig_inode: original inode * @donor_inode: donor inode * @orig_start: logical start offset in block for orig * @donor_start: logical start offset in block for donor * @len: the number of blocks to be moved * * Check the arguments of ext4_move_extents() whether the files can be * exchanged with each other. * Return 0 on success, or a negative error value on failure. */ static int mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode)) return -EPERM; /* Ext4 move extent does not support swap files */ if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be swap files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -ETXTBSY; } if (ext4_is_quota_file(orig_inode) && ext4_is_quota_file(donor_inode)) { ext4_debug("ext4 move extent: The argument files should not be quota files [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EOPNOTSUPP; } /* Ext4 move extent supports only extent based file */ if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: orig file is not extents " "based file [ino:orig %lu]\n", orig_inode->i_ino); return -EOPNOTSUPP; } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) { ext4_debug("ext4 move extent: donor file is not extents " "based file [ino:donor %lu]\n", donor_inode->i_ino); return -EOPNOTSUPP; } if ((!orig_inode->i_size) || (!donor_inode->i_size)) { ext4_debug("ext4 move extent: File size is 0 byte\n"); return -EINVAL; } /* Start offset should be same */ if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " "offsets are not aligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if (orig_eof <= orig_start) *len = 0; else if (orig_eof < orig_start + *len - 1) *len = orig_eof - orig_start; if (donor_eof <= donor_start) *len = 0; else if (donor_eof < donor_start + *len - 1) *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } return 0; } /** * ext4_move_extents - Exchange the specified range of a file * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file * @orig_blk: start offset in block for orig * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. * */ int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; ext4_lblk_t o_end, o_start = orig_blk; ext4_lblk_t d_start = donor_blk; int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " "should be in same FS [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* orig and donor should be different inodes */ if (orig_inode == donor_inode) { ext4_debug("ext4 move extent: The argument files should not " "be same inode [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* Regular file check */ if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { ext4_debug("ext4 move extent: The argument files should be " "regular file [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } /* TODO: it's not obvious how to swap blocks for inodes with full journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported with data journaling"); return -EOPNOTSUPP; } if (IS_ENCRYPTED(orig_inode) || IS_ENCRYPTED(donor_inode)) { ext4_msg(orig_inode->i_sb, KERN_ERR, "Online defrag not supported for encrypted files"); return -EOPNOTSUPP; } /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); /* Wait for all existing dio workers */ inode_dio_wait(orig_inode); inode_dio_wait(donor_inode); /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, donor_blk, &len); if (ret) goto out; o_end = o_start + len; *moved_len = 0; while (o_start < o_end) { struct ext4_extent *ex; ext4_lblk_t cur_blk, next_blk; pgoff_t orig_page_index, donor_page_index; int offset_in_page; int unwritten, cur_len; path = get_ext_path(orig_inode, o_start, path); if (IS_ERR(path)) { ret = PTR_ERR(path); goto out; } ex = path[path->p_depth].p_ext; cur_blk = le32_to_cpu(ex->ee_block); cur_len = ext4_ext_get_actual_len(ex); /* Check hole before the start pos */ if (cur_blk + cur_len - 1 < o_start) { next_blk = ext4_ext_next_allocated_block(path); if (next_blk == EXT_MAX_BLOCKS) { ret = -ENODATA; goto out; } d_start += next_blk - o_start; o_start = next_blk; continue; /* Check hole after the start pos */ } else if (cur_blk > o_start) { /* Skip hole */ d_start += cur_blk - o_start; o_start = cur_blk; /* Extent inside requested range ?*/ if (cur_blk >= o_end) goto out; } else { /* in_range(o_start, o_blk, o_len) */ cur_len += cur_blk - o_start; } unwritten = ext4_ext_is_unwritten(ex); if (o_end - o_start < cur_len) cur_len = o_end - o_start; orig_page_index = o_start >> (PAGE_SHIFT - orig_inode->i_blkbits); donor_page_index = d_start >> (PAGE_SHIFT - donor_inode->i_blkbits); offset_in_page = o_start % blocks_per_page; if (cur_len > blocks_per_page - offset_in_page) cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, * ->write_begin via pagefault, and jbd2_journal_commit * b. racing with ->read_folio, ->write_begin, and * ext4_get_block in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); /* Swap original branches with new branches */ *moved_len += move_extent_per_page(o_filp, donor_inode, orig_page_index, donor_page_index, offset_in_page, cur_len, unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; o_start += cur_len; d_start += cur_len; } out: if (*moved_len) { ext4_discard_preallocations(orig_inode); ext4_discard_preallocations(donor_inode); } ext4_free_ext_path(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); unlock_two_nondirectories(orig_inode, donor_inode); return ret; } |
| 2 4 1 1 1 1 40 35 6 7 1 1 6 2 2 2 2 2 1 2 1 1 2 1 1 3 1 2 1 2 7 7 1 7 1 2 1 1 2 3 2 2 5 4 1 3 1 7 4 51 3 1 1 36 36 15 20 4 1 3 25 13 3 4 2 6 1 1 3 1 2 3 5 1 5 1 4 2 4 2 5 1 6 3 4 2 3 3 3 1 3 6 6 2 4 4 2 4 2 2 2 6 3 2 6 6 6 4 2 3 1 2 3 6 6 6 6 6 3 8 8 1 13 15 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 | /* FUSE: Filesystem in Userspace Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. */ #include "fuse_i.h" #include <linux/pagemap.h> #include <linux/file.h> #include <linux/fs_context.h> #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> #include <linux/xattr.h> #include <linux/iversion.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/types.h> #include <linux/kernel.h> static bool __read_mostly allow_sys_admin_access; module_param(allow_sys_admin_access, bool, 0644); MODULE_PARM_DESC(allow_sys_admin_access, "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } #if BITS_PER_LONG >= 64 static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) { entry->d_fsdata = (void *) time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { return (u64)entry->d_fsdata; } #else union fuse_dentry { u64 time; struct rcu_head rcu; }; static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { ((union fuse_dentry *) dentry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { return ((union fuse_dentry *) entry->d_fsdata)->time; } #endif static void fuse_dentry_settime(struct dentry *dentry, u64 time) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); bool delete = !time && fc->delete_stale; /* * Mess with DCACHE_OP_DELETE because dput() will be faster without it. * Don't care about races, either way it's just an optimization */ if ((!delete && (dentry->d_flags & DCACHE_OP_DELETE)) || (delete && !(dentry->d_flags & DCACHE_OP_DELETE))) { spin_lock(&dentry->d_lock); if (!delete) dentry->d_flags &= ~DCACHE_OP_DELETE; else dentry->d_flags |= DCACHE_OP_DELETE; spin_unlock(&dentry->d_lock); } __fuse_dentry_settime(dentry, time); } /* * FUSE caches dentries and attributes with separate timeout. The * time in jiffies until the dentry/attributes are valid is stored in * dentry->d_fsdata and fuse_inode->i_time respectively. */ /* * Calculate the time in jiffies until a dentry/attributes are valid */ u64 fuse_time_to_jiffies(u64 sec, u32 nsec) { if (sec || nsec) { struct timespec64 ts = { sec, min_t(u32, nsec, NSEC_PER_SEC - 1) }; return get_jiffies_64() + timespec64_to_jiffies(&ts); } else return 0; } /* * Set dentry and possibly attribute timeouts from the lookup/mk* * replies */ void fuse_change_entry_timeout(struct dentry *entry, struct fuse_entry_out *o) { fuse_dentry_settime(entry, fuse_time_to_jiffies(o->entry_valid, o->entry_valid_nsec)); } void fuse_invalidate_attr_mask(struct inode *inode, u32 mask) { set_mask_bits(&get_fuse_inode(inode)->inval_mask, 0, mask); } /* * Mark the attributes as stale, so that at the next call to * ->getattr() they will be fetched from userspace */ void fuse_invalidate_attr(struct inode *inode) { fuse_invalidate_attr_mask(inode, STATX_BASIC_STATS); } static void fuse_dir_changed(struct inode *dir) { fuse_invalidate_attr(dir); inode_maybe_inc_iversion(dir, false); } /* * Mark the attributes as stale due to an atime change. Avoid the invalidate if * atime is not used. */ void fuse_invalidate_atime(struct inode *inode) { if (!IS_RDONLY(inode)) fuse_invalidate_attr_mask(inode, STATX_ATIME); } /* * Just mark the entry as stale, so that a next attempt to look it up * will result in a new lookup call to userspace * * This is called when a dentry is about to become negative and the * timeout is unknown (unlink, rmdir, rename and in some cases * lookup) */ void fuse_invalidate_entry_cache(struct dentry *entry) { fuse_dentry_settime(entry, 0); } /* * Same as fuse_invalidate_entry_cache(), but also try to remove the * dentry from the hash */ static void fuse_invalidate_entry(struct dentry *entry) { d_invalidate(entry); fuse_invalidate_entry_cache(entry); } static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg) { memset(outarg, 0, sizeof(struct fuse_entry_out)); args->opcode = FUSE_LOOKUP; args->nodeid = nodeid; args->in_numargs = 3; fuse_set_zero_arg0(args); args->in_args[1].size = name->len; args->in_args[1].value = name->name; args->in_args[2].size = 1; args->in_args[2].value = ""; args->out_numargs = 1; args->out_args[0].size = sizeof(struct fuse_entry_out); args->out_args[0].value = outarg; } /* * Check whether the dentry is still valid * * If the entry validity timeout has expired and the dentry is * positive, try to redo the lookup. If the lookup results in a * different inode, then let the VFS invalidate the dentry and redo * the lookup once more. If the lookup results in the same inode, * then refresh the attributes, timeouts and mark the dentry valid. */ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, struct dentry *entry, unsigned int flags) { struct inode *inode; struct fuse_mount *fm; struct fuse_inode *fi; int ret; inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) || (flags & (LOOKUP_EXCL | LOOKUP_REVAL | LOOKUP_RENAME_TARGET))) { struct fuse_entry_out outarg; FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version; /* For negative dentries, always do a fresh lookup */ if (!inode) goto invalid; ret = -ECHILD; if (flags & LOOKUP_RCU) goto out; fm = get_fuse_mount(inode); forget = fuse_alloc_forget(); ret = -ENOMEM; if (!forget) goto out; attr_version = fuse_get_attr_version(fm->fc); fuse_lookup_init(fm->fc, &args, get_node_id(dir), name, &outarg); ret = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT */ if (!ret && !outarg.nodeid) ret = -ENOENT; if (!ret) { fi = get_fuse_inode(inode); if (outarg.nodeid != get_node_id(inode) || (bool) IS_AUTOMOUNT(inode) != (bool) (outarg.attr.flags & FUSE_ATTR_SUBMOUNT)) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); goto invalid; } spin_lock(&fi->lock); fi->nlookup++; spin_unlock(&fi->lock); } kfree(forget); if (ret == -ENOMEM || ret == -EINTR) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || fuse_stale_inode(inode, outarg.generation, &outarg.attr)) goto invalid; forget_all_cached_acls(inode); fuse_change_attributes(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), attr_version); fuse_change_entry_timeout(entry, &outarg); } else if (inode) { fi = get_fuse_inode(inode); if (flags & LOOKUP_RCU) { if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state)) return -ECHILD; } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) { fuse_advise_use_readdirplus(dir); } } ret = 1; out: return ret; invalid: ret = 0; goto out; } #if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); return dentry->d_fsdata ? 0 : -ENOMEM; } static void fuse_dentry_release(struct dentry *dentry) { union fuse_dentry *fd = dentry->d_fsdata; kfree_rcu(fd, rcu); } #endif static int fuse_dentry_delete(const struct dentry *dentry) { return time_before64(fuse_dentry_time(dentry), get_jiffies_64()); } /* * Create a fuse_mount object with a new superblock (with path->dentry * as the root), and return that mount so it can be auto-mounted on * @path. */ static struct vfsmount *fuse_dentry_automount(struct path *path) { struct fs_context *fsc; struct vfsmount *mnt; struct fuse_inode *mp_fi = get_fuse_inode(d_inode(path->dentry)); fsc = fs_context_for_submount(path->mnt->mnt_sb->s_type, path->dentry); if (IS_ERR(fsc)) return ERR_CAST(fsc); /* Pass the FUSE inode of the mount for fuse_get_tree_submount() */ fsc->fs_private = mp_fi; /* Create the submount */ mnt = fc_mount(fsc); if (!IS_ERR(mnt)) mntget(mnt); put_fs_context(fsc); return mnt; } const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, #if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif .d_automount = fuse_dentry_automount, }; const struct dentry_operations fuse_root_dentry_operations = { #if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, .d_release = fuse_dentry_release, #endif }; int fuse_valid_type(int m) { return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) || S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m); } static bool fuse_valid_size(u64 size) { return size <= LLONG_MAX; } bool fuse_invalid_attr(struct fuse_attr *attr) { return !fuse_valid_type(attr->mode) || !fuse_valid_size(attr->size); } int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name, struct fuse_entry_out *outarg, struct inode **inode) { struct fuse_mount *fm = get_fuse_mount_super(sb); FUSE_ARGS(args); struct fuse_forget_link *forget; u64 attr_version, evict_ctr; int err; *inode = NULL; err = -ENAMETOOLONG; if (name->len > fm->fc->name_max) goto out; forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) goto out; attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); fuse_lookup_init(fm->fc, &args, nodeid, name, outarg); err = fuse_simple_request(fm, &args); /* Zero nodeid is same as -ENOENT, but with valid timeout */ if (err || !outarg->nodeid) goto out_put_forget; err = -EIO; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; if (outarg->nodeid == FUSE_ROOT_ID && outarg->generation != 0) { pr_warn_once("root generation should be zero\n"); outarg->generation = 0; } *inode = fuse_iget(sb, outarg->nodeid, outarg->generation, &outarg->attr, ATTR_TIMEOUT(outarg), attr_version, evict_ctr); err = -ENOMEM; if (!*inode) { fuse_queue_forget(fm->fc, forget, outarg->nodeid, 1); goto out; } err = 0; out_put_forget: kfree(forget); out: return err; } static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { int err; struct fuse_entry_out outarg; struct inode *inode; struct dentry *newent; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name, &outarg, &inode); fuse_unlock_inode(dir, locked); if (err == -ENOENT) { outarg_valid = false; err = 0; } if (err) goto out_err; err = -EIO; if (inode && get_node_id(inode) == FUSE_ROOT_ID) goto out_iput; newent = d_splice_alias(inode, entry); err = PTR_ERR(newent); if (IS_ERR(newent)) goto out_err; entry = newent ? newent : entry; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else fuse_invalidate_entry_cache(entry); if (inode) fuse_advise_use_readdirplus(dir); return newent; out_iput: iput(inode); out_err: return ERR_PTR(err); } static int get_security_context(struct dentry *entry, umode_t mode, struct fuse_in_arg *ext) { struct fuse_secctx *fctx; struct fuse_secctx_header *header; struct lsm_context lsmctx = { }; void *ptr; u32 total_len = sizeof(*header); int err, nr_ctx = 0; const char *name = NULL; size_t namelen; err = security_dentry_init_security(entry, mode, &entry->d_name, &name, &lsmctx); /* If no LSM is supporting this security hook ignore error */ if (err && err != -EOPNOTSUPP) goto out_err; if (lsmctx.len) { nr_ctx = 1; namelen = strlen(name) + 1; err = -EIO; if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || lsmctx.len > S32_MAX)) goto out_err; total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + lsmctx.len); } err = -ENOMEM; header = ptr = kzalloc(total_len, GFP_KERNEL); if (!ptr) goto out_err; header->nr_secctx = nr_ctx; header->size = total_len; ptr += sizeof(*header); if (nr_ctx) { fctx = ptr; fctx->size = lsmctx.len; ptr += sizeof(*fctx); strcpy(ptr, name); ptr += namelen; memcpy(ptr, lsmctx.context, lsmctx.len); } ext->size = total_len; ext->value = header; err = 0; out_err: if (nr_ctx) security_release_secctx(&lsmctx); return err; } static void *extend_arg(struct fuse_in_arg *buf, u32 bytes) { void *p; u32 newlen = buf->size + bytes; p = krealloc(buf->value, newlen, GFP_KERNEL); if (!p) { kfree(buf->value); buf->size = 0; buf->value = NULL; return NULL; } memset(p + buf->size, 0, bytes); buf->value = p; buf->size = newlen; return p + newlen - bytes; } static u32 fuse_ext_size(size_t size) { return FUSE_REC_ALIGN(sizeof(struct fuse_ext_header) + size); } /* * This adds just a single supplementary group that matches the parent's group. */ static int get_create_supp_group(struct mnt_idmap *idmap, struct inode *dir, struct fuse_in_arg *ext) { struct fuse_conn *fc = get_fuse_conn(dir); struct fuse_ext_header *xh; struct fuse_supp_groups *sg; kgid_t kgid = dir->i_gid; vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, kgid); gid_t parent_gid = from_kgid(fc->user_ns, kgid); u32 sg_len = fuse_ext_size(sizeof(*sg) + sizeof(sg->groups[0])); if (parent_gid == (gid_t) -1 || vfsgid_eq_kgid(vfsgid, current_fsgid()) || !vfsgid_in_group_p(vfsgid)) return 0; xh = extend_arg(ext, sg_len); if (!xh) return -ENOMEM; xh->size = sg_len; xh->type = FUSE_EXT_GROUPS; sg = (struct fuse_supp_groups *) &xh[1]; sg->nr_groups = 1; sg->groups[0] = parent_gid; return 0; } static int get_create_ext(struct mnt_idmap *idmap, struct fuse_args *args, struct inode *dir, struct dentry *dentry, umode_t mode) { struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); struct fuse_in_arg ext = { .size = 0, .value = NULL }; int err = 0; if (fc->init_security) err = get_security_context(dentry, mode, &ext); if (!err && fc->create_supp_group) err = get_create_supp_group(idmap, dir, &ext); if (!err && ext.size) { WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); args->is_ext = true; args->ext_idx = args->in_numargs++; args->in_args[args->ext_idx] = ext; } else { kfree(ext.value); } return err; } static void free_ext_value(struct fuse_args *args) { if (args->is_ext) kfree(args->in_args[args->ext_idx].value); } /* * Atomic create+open operation * * If the filesystem doesn't support this, then fall back to separate * 'mknod' + 'open' requests. */ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); struct fuse_forget_link *forget; struct fuse_create_in inarg; struct fuse_open_out *outopenp; struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) goto out_err; err = -ENOMEM; ff = fuse_file_alloc(fm, true); if (!ff) goto out_put_forget_req; if (!fm->fc->dont_mask) mode &= ~current_umask(); flags &= ~O_NOCTTY; memset(&inarg, 0, sizeof(inarg)); memset(&outentry, 0, sizeof(outentry)); inarg.flags = flags; inarg.mode = mode; inarg.umask = current_umask(); if (fm->fc->handle_killpriv_v2 && trunc && !(flags & O_EXCL) && !capable(CAP_FSETID)) { inarg.open_flags |= FUSE_OPEN_KILL_SUIDGID; } args.opcode = opcode; args.nodeid = get_node_id(dir); args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; args.out_numargs = 2; args.out_args[0].size = sizeof(outentry); args.out_args[0].value = &outentry; /* Store outarg for fuse_finish_open() */ outopenp = &ff->args->open_outarg; args.out_args[1].size = sizeof(*outopenp); args.out_args[1].value = outopenp; err = get_create_ext(idmap, &args, dir, entry, mode); if (err) goto out_free_ff; err = fuse_simple_idmap_request(idmap, fm, &args); free_ext_value(&args); if (err) goto out_free_ff; err = -EIO; if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) || fuse_invalid_attr(&outentry.attr)) goto out_free_ff; ff->fh = outopenp->fh; ff->nodeid = outentry.nodeid; ff->open_flags = outopenp->open_flags; inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation, &outentry.attr, ATTR_TIMEOUT(&outentry), 0, 0); if (!inode) { flags &= ~(O_CREAT | O_EXCL | O_TRUNC); fuse_sync_release(NULL, ff, flags); fuse_queue_forget(fm->fc, forget, outentry.nodeid, 1); err = -ENOMEM; goto out_err; } kfree(forget); d_instantiate(entry, inode); fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); if (!err) { file->private_data = ff; err = finish_open(file, entry, fuse_finish_open); } if (err) { fi = get_fuse_inode(inode); fuse_sync_release(fi, ff, flags); } else { if (fm->fc->atomic_o_trunc && trunc) truncate_pagecache(inode, 0); else if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); } return err; out_free_ff: fuse_file_free(ff); out_put_forget_req: kfree(forget); out_err: return err; } static int fuse_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); static int fuse_atomic_open(struct inode *dir, struct dentry *entry, struct file *file, unsigned flags, umode_t mode) { int err; struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); struct dentry *res = NULL; if (fuse_is_bad(dir)) return -EIO; if (d_in_lookup(entry)) { res = fuse_lookup(dir, entry, 0); if (IS_ERR(res)) return PTR_ERR(res); if (res) entry = res; } if (!(flags & O_CREAT) || d_really_is_positive(entry)) goto no_open; /* Only creates */ file->f_mode |= FMODE_CREATED; if (fc->no_create) goto mknod; err = fuse_create_open(idmap, dir, entry, file, flags, mode, FUSE_CREATE); if (err == -ENOSYS) { fc->no_create = 1; goto mknod; } else if (err == -EEXIST) fuse_invalidate_entry(entry); out_dput: dput(res); return err; mknod: err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) goto out_dput; no_open: return finish_no_open(file, res); } /* * Code shared between mknod, mkdir, symlink and link */ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; int err; struct fuse_forget_link *forget; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); forget = fuse_alloc_forget(); if (!forget) return ERR_PTR(-ENOMEM); memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); args->out_numargs = 1; args->out_args[0].size = sizeof(outarg); args->out_args[0].value = &outarg; if (args->opcode != FUSE_LINK) { err = get_create_ext(idmap, args, dir, entry, mode); if (err) goto out_put_forget_req; } err = fuse_simple_idmap_request(idmap, fm, args); free_ext_value(args); if (err) goto out_put_forget_req; err = -EIO; if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr)) goto out_put_forget_req; if ((outarg.attr.mode ^ mode) & S_IFMT) goto out_put_forget_req; inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation, &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); return ERR_PTR(-ENOMEM); } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) return d; if (d) fuse_change_entry_timeout(d, &outarg); else fuse_change_entry_timeout(entry, &outarg); fuse_dir_changed(dir); return d; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); return ERR_PTR(err); } static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_args *args, struct inode *dir, struct dentry *entry, umode_t mode) { /* * Note that when creating anything other than a directory we * can be sure create_new_entry() will NOT return an alternate * dentry as d_splice_alias() only returns an alternate dentry * for directories. So we don't need to check for that case * when passing back the result. */ WARN_ON_ONCE(S_ISDIR(mode)); return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode)); } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, dev_t rdev) { struct fuse_mknod_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.rdev = new_encode_dev(rdev); inarg.umask = current_umask(); args.opcode = FUSE_MKNOD; args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; return create_new_nondir(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode, bool excl) { return fuse_mknod(idmap, dir, entry, mode, 0); } static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, struct file *file, umode_t mode) { struct fuse_conn *fc = get_fuse_conn(dir); int err; if (fc->no_tmpfile) return -EOPNOTSUPP; err = fuse_create_open(idmap, dir, file->f_path.dentry, file, file->f_flags, mode, FUSE_TMPFILE); if (err == -ENOSYS) { fc->no_tmpfile = 1; err = -EOPNOTSUPP; } return err; } static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (!fm->fc->dont_mask) mode &= ~current_umask(); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; inarg.umask = current_umask(); args.opcode = FUSE_MKDIR; args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; return create_new_entry(idmap, fm, &args, dir, entry, S_IFDIR); } static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, const char *link) { struct fuse_mount *fm = get_fuse_mount(dir); unsigned len = strlen(link) + 1; FUSE_ARGS(args); args.opcode = FUSE_SYMLINK; args.in_numargs = 3; fuse_set_zero_arg0(&args); args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; args.in_args[2].size = len; args.in_args[2].value = link; return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) { int err = sync_inode_metadata(inode, 1); mapping_set_error(inode->i_mapping, err); } static void fuse_update_ctime_in_cache(struct inode *inode) { if (!IS_NOCMTIME(inode)) { inode_set_ctime_current(inode); mark_inode_dirty_sync(inode); fuse_flush_time_update(inode); } } void fuse_update_ctime(struct inode *inode) { fuse_invalidate_attr_mask(inode, STATX_CTIME); fuse_update_ctime_in_cache(inode); } static void fuse_entry_unlinked(struct dentry *entry) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); fi->attr_version = atomic64_inc_return(&fc->attr_version); /* * If i_nlink == 0 then unlink doesn't make sense, yet this can * happen if userspace filesystem is careless. It would be * difficult to enforce correct nlink usage so just ignore this * condition here */ if (S_ISDIR(inode->i_mode)) clear_nlink(inode); else if (inode->i_nlink > 0) drop_nlink(inode); spin_unlock(&fi->lock); fuse_invalidate_entry_cache(entry); fuse_update_ctime(inode); } static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (fuse_is_bad(dir)) return -EIO; args.opcode = FUSE_UNLINK; args.nodeid = get_node_id(dir); args.in_numargs = 2; fuse_set_zero_arg0(&args); args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); if (fuse_is_bad(dir)) return -EIO; args.opcode = FUSE_RMDIR; args.nodeid = get_node_id(dir); args.in_numargs = 2; fuse_set_zero_arg0(&args); args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; err = fuse_simple_request(fm, &args); if (!err) { fuse_dir_changed(dir); fuse_entry_unlinked(entry); } else if (err == -EINTR || err == -ENOENT) fuse_invalidate_entry(entry); return err; } static int fuse_rename_common(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags, int opcode, size_t argsize) { int err; struct fuse_rename2_in inarg; struct fuse_mount *fm = get_fuse_mount(olddir); FUSE_ARGS(args); memset(&inarg, 0, argsize); inarg.newdir = get_node_id(newdir); inarg.flags = flags; args.opcode = opcode; args.nodeid = get_node_id(olddir); args.in_numargs = 3; args.in_args[0].size = argsize; args.in_args[0].value = &inarg; args.in_args[1].size = oldent->d_name.len + 1; args.in_args[1].value = oldent->d_name.name; args.in_args[2].size = newent->d_name.len + 1; args.in_args[2].value = newent->d_name.name; err = fuse_simple_idmap_request(idmap, fm, &args); if (!err) { /* ctime changes */ fuse_update_ctime(d_inode(oldent)); if (flags & RENAME_EXCHANGE) fuse_update_ctime(d_inode(newent)); fuse_dir_changed(olddir); if (olddir != newdir) fuse_dir_changed(newdir); /* newent will end up negative */ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) fuse_entry_unlinked(newent); } else if (err == -EINTR || err == -ENOENT) { /* If request was interrupted, DEITY only knows if the rename actually took place. If the invalidation fails (e.g. some process has CWD under the renamed directory), then there can be inconsistency between the dcache and the real filesystem. Tough luck. */ fuse_invalidate_entry(oldent); if (d_really_is_positive(newent)) fuse_invalidate_entry(newent); } return err; } static int fuse_rename2(struct mnt_idmap *idmap, struct inode *olddir, struct dentry *oldent, struct inode *newdir, struct dentry *newent, unsigned int flags) { struct fuse_conn *fc = get_fuse_conn(olddir); int err; if (fuse_is_bad(olddir)) return -EIO; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags) { if (fc->no_rename2 || fc->minor < 23) return -EINVAL; err = fuse_rename_common((flags & RENAME_WHITEOUT) ? idmap : &invalid_mnt_idmap, olddir, oldent, newdir, newent, flags, FUSE_RENAME2, sizeof(struct fuse_rename2_in)); if (err == -ENOSYS) { fc->no_rename2 = 1; err = -EINVAL; } } else { err = fuse_rename_common(&invalid_mnt_idmap, olddir, oldent, newdir, newent, 0, FUSE_RENAME, sizeof(struct fuse_rename_in)); } return err; } static int fuse_link(struct dentry *entry, struct inode *newdir, struct dentry *newent) { int err; struct fuse_link_in inarg; struct inode *inode = d_inode(entry); struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); if (fm->fc->no_link) goto out; memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; args.in_numargs = 2; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) fuse_invalidate_attr(inode); if (err == -ENOSYS) fm->fc->no_link = 1; out: if (fm->fc->no_link) return -EPERM; return err; } static void fuse_fillattr(struct mnt_idmap *idmap, struct inode *inode, struct fuse_attr *attr, struct kstat *stat) { unsigned int blkbits; struct fuse_conn *fc = get_fuse_conn(inode); vfsuid_t vfsuid = make_vfsuid(idmap, fc->user_ns, make_kuid(fc->user_ns, attr->uid)); vfsgid_t vfsgid = make_vfsgid(idmap, fc->user_ns, make_kgid(fc->user_ns, attr->gid)); stat->dev = inode->i_sb->s_dev; stat->ino = attr->ino; stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777); stat->nlink = attr->nlink; stat->uid = vfsuid_into_kuid(vfsuid); stat->gid = vfsgid_into_kgid(vfsgid); stat->rdev = inode->i_rdev; stat->atime.tv_sec = attr->atime; stat->atime.tv_nsec = attr->atimensec; stat->mtime.tv_sec = attr->mtime; stat->mtime.tv_nsec = attr->mtimensec; stat->ctime.tv_sec = attr->ctime; stat->ctime.tv_nsec = attr->ctimensec; stat->size = attr->size; stat->blocks = attr->blocks; if (attr->blksize != 0) blkbits = ilog2(attr->blksize); else blkbits = inode->i_sb->s_blocksize_bits; stat->blksize = 1 << blkbits; } static void fuse_statx_to_attr(struct fuse_statx *sx, struct fuse_attr *attr) { memset(attr, 0, sizeof(*attr)); attr->ino = sx->ino; attr->size = sx->size; attr->blocks = sx->blocks; attr->atime = sx->atime.tv_sec; attr->mtime = sx->mtime.tv_sec; attr->ctime = sx->ctime.tv_sec; attr->atimensec = sx->atime.tv_nsec; attr->mtimensec = sx->mtime.tv_nsec; attr->ctimensec = sx->ctime.tv_nsec; attr->mode = sx->mode; attr->nlink = sx->nlink; attr->uid = sx->uid; attr->gid = sx->gid; attr->rdev = new_encode_dev(MKDEV(sx->rdev_major, sx->rdev_minor)); attr->blksize = sx->blksize; } static int fuse_do_statx(struct mnt_idmap *idmap, struct inode *inode, struct file *file, struct kstat *stat) { int err; struct fuse_attr attr; struct fuse_statx *sx; struct fuse_statx_in inarg; struct fuse_statx_out outarg; struct fuse_mount *fm = get_fuse_mount(inode); u64 attr_version = fuse_get_attr_version(fm->fc); FUSE_ARGS(args); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); /* Directories have separate file-handle space */ if (file && S_ISREG(inode->i_mode)) { struct fuse_file *ff = file->private_data; inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } /* For now leave sync hints as the default, request all stats. */ inarg.sx_flags = 0; inarg.sx_mask = STATX_BASIC_STATS | STATX_BTIME; args.opcode = FUSE_STATX; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (err) return err; sx = &outarg.stat; if (((sx->mask & STATX_SIZE) && !fuse_valid_size(sx->size)) || ((sx->mask & STATX_TYPE) && (!fuse_valid_type(sx->mode) || inode_wrong_type(inode, sx->mode)))) { fuse_make_bad(inode); return -EIO; } fuse_statx_to_attr(&outarg.stat, &attr); if ((sx->mask & STATX_BASIC_STATS) == STATX_BASIC_STATS) { fuse_change_attributes(inode, &attr, &outarg.stat, ATTR_TIMEOUT(&outarg), attr_version); } if (stat) { stat->result_mask = sx->mask & (STATX_BASIC_STATS | STATX_BTIME); stat->btime.tv_sec = sx->btime.tv_sec; stat->btime.tv_nsec = min_t(u32, sx->btime.tv_nsec, NSEC_PER_SEC - 1); fuse_fillattr(idmap, inode, &attr, stat); stat->result_mask |= STATX_TYPE; } return 0; } static int fuse_do_getattr(struct mnt_idmap *idmap, struct inode *inode, struct kstat *stat, struct file *file) { int err; struct fuse_getattr_in inarg; struct fuse_attr_out outarg; struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); u64 attr_version; attr_version = fuse_get_attr_version(fm->fc); memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); /* Directories have separate file-handle space */ if (file && S_ISREG(inode->i_mode)) { struct fuse_file *ff = file->private_data; inarg.getattr_flags |= FUSE_GETATTR_FH; inarg.fh = ff->fh; } args.opcode = FUSE_GETATTR; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; args.out_args[0].size = sizeof(outarg); args.out_args[0].value = &outarg; err = fuse_simple_request(fm, &args); if (!err) { if (fuse_invalid_attr(&outarg.attr) || inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; } else { fuse_change_attributes(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), attr_version); if (stat) fuse_fillattr(idmap, inode, &outarg.attr, stat); } } return err; } static int fuse_update_get_attr(struct mnt_idmap *idmap, struct inode *inode, struct file *file, struct kstat *stat, u32 request_mask, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); int err = 0; bool sync; u32 inval_mask = READ_ONCE(fi->inval_mask); u32 cache_mask = fuse_get_cache_mask(inode); /* FUSE only supports basic stats and possibly btime */ request_mask &= STATX_BASIC_STATS | STATX_BTIME; retry: if (fc->no_statx) request_mask &= STATX_BASIC_STATS; if (!request_mask) sync = false; else if (flags & AT_STATX_FORCE_SYNC) sync = true; else if (flags & AT_STATX_DONT_SYNC) sync = false; else if (request_mask & inval_mask & ~cache_mask) sync = true; else sync = time_before64(fi->i_time, get_jiffies_64()); if (sync) { forget_all_cached_acls(inode); /* Try statx if BTIME is requested */ if (!fc->no_statx && (request_mask & ~STATX_BASIC_STATS)) { err = fuse_do_statx(idmap, inode, file, stat); if (err == -ENOSYS) { fc->no_statx = 1; err = 0; goto retry; } } else { err = fuse_do_getattr(idmap, inode, stat, file); } } else if (stat) { generic_fillattr(idmap, request_mask, inode, stat); stat->mode = fi->orig_i_mode; stat->ino = fi->orig_ino; if (test_bit(FUSE_I_BTIME, &fi->state)) { stat->btime = fi->i_btime; stat->result_mask |= STATX_BTIME; } } return err; } int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) { return fuse_update_get_attr(&nop_mnt_idmap, inode, file, NULL, mask, 0); } int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags) { int err = -ENOTDIR; struct inode *parent; struct dentry *dir; struct dentry *entry; parent = fuse_ilookup(fc, parent_nodeid, NULL); if (!parent) return -ENOENT; inode_lock_nested(parent, I_MUTEX_PARENT); if (!S_ISDIR(parent->i_mode)) goto unlock; err = -ENOENT; dir = d_find_alias(parent); if (!dir) goto unlock; name->hash = full_name_hash(dir, name->name, name->len); entry = d_lookup(dir, name); dput(dir); if (!entry) goto unlock; fuse_dir_changed(parent); if (!(flags & FUSE_EXPIRE_ONLY)) d_invalidate(entry); fuse_invalidate_entry_cache(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; } if (d_mountpoint(entry)) { err = -EBUSY; goto badentry; } if (d_is_dir(entry)) { shrink_dcache_parent(entry); if (!simple_empty(entry)) { err = -ENOTEMPTY; goto badentry; } d_inode(entry)->i_flags |= S_DEAD; } dont_mount(entry); clear_nlink(d_inode(entry)); err = 0; badentry: inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { err = 0; } dput(entry); unlock: inode_unlock(parent); iput(parent); return err; } static inline bool fuse_permissible_uidgid(struct fuse_conn *fc) { const struct cred *cred = current_cred(); return (uid_eq(cred->euid, fc->user_id) && uid_eq(cred->suid, fc->user_id) && uid_eq(cred->uid, fc->user_id) && gid_eq(cred->egid, fc->group_id) && gid_eq(cred->sgid, fc->group_id) && gid_eq(cred->gid, fc->group_id)); } /* * Calling into a user-controlled filesystem gives the filesystem * daemon ptrace-like capabilities over the current process. This * means, that the filesystem daemon is able to record the exact * filesystem operations performed, and can also control the behavior * of the requester process in otherwise impossible ways. For example * it can delay the operation for arbitrary length of time allowing * DoS against the requester. * * For this reason only those processes can call into the filesystem, * for which the owner of the mount has ptrace privilege. This * excludes processes started by other users, suid or sgid processes. */ bool fuse_allow_current_process(struct fuse_conn *fc) { bool allow; if (fc->allow_other) allow = current_in_userns(fc->user_ns); else allow = fuse_permissible_uidgid(fc); if (!allow && allow_sys_admin_access && capable(CAP_SYS_ADMIN)) allow = true; return allow; } static int fuse_access(struct inode *inode, int mask) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_access_in inarg; int err; BUG_ON(mask & MAY_NOT_BLOCK); /* * We should not send FUSE_ACCESS to the userspace * when idmapped mounts are enabled as for this case * we have fc->default_permissions = 1 and access * permission checks are done on the kernel side. */ WARN_ON_ONCE(!(fm->sb->s_iflags & SB_I_NOIDMAP)); if (fm->fc->no_access) return 0; memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC); args.opcode = FUSE_ACCESS; args.nodeid = get_node_id(inode); args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { fm->fc->no_access = 1; err = 0; } return err; } static int fuse_perm_getattr(struct inode *inode, int mask) { if (mask & MAY_NOT_BLOCK) return -ECHILD; forget_all_cached_acls(inode); return fuse_do_getattr(&nop_mnt_idmap, inode, NULL, NULL); } /* * Check permission. The two basic access models of FUSE are: * * 1) Local access checking ('default_permissions' mount option) based * on file mode. This is the plain old disk filesystem permission * model. * * 2) "Remote" access checking, where server is responsible for * checking permission in each inode operation. An exception to this * is if ->permission() was invoked from sys_access() in which case an * access request is sent. Execute permission is still checked * locally based on file mode. */ static int fuse_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { struct fuse_conn *fc = get_fuse_conn(inode); bool refreshed = false; int err = 0; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fc)) return -EACCES; /* * If attributes are needed, refresh them before proceeding */ if (fc->default_permissions || ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) { struct fuse_inode *fi = get_fuse_inode(inode); u32 perm_mask = STATX_MODE | STATX_UID | STATX_GID; if (perm_mask & READ_ONCE(fi->inval_mask) || time_before64(fi->i_time, get_jiffies_64())) { refreshed = true; err = fuse_perm_getattr(inode, mask); if (err) return err; } } if (fc->default_permissions) { err = generic_permission(idmap, inode, mask); /* If permission is denied, try to refresh file attributes. This is also needed, because the root node will at first have no permissions */ if (err == -EACCES && !refreshed) { err = fuse_perm_getattr(inode, mask); if (!err) err = generic_permission(idmap, inode, mask); } /* Note: the opposite of the above test does not exist. So if permissions are revoked this won't be noticed immediately, only after the attribute timeout has expired */ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) { err = fuse_access(inode, mask); } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) { if (!(inode->i_mode & S_IXUGO)) { if (refreshed) return -EACCES; err = fuse_perm_getattr(inode, mask); if (!err && !(inode->i_mode & S_IXUGO)) return -EACCES; } } return err; } static int fuse_readlink_page(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, .descs = &desc, }; char *link; ssize_t res; ap.args.opcode = FUSE_READLINK; ap.args.nodeid = get_node_id(inode); ap.args.out_pages = true; ap.args.out_argvar = true; ap.args.page_zeroing = true; ap.args.out_numargs = 1; ap.args.out_args[0].size = desc.length; res = fuse_simple_request(fm, &ap.args); fuse_invalidate_atime(inode); if (res < 0) return res; if (WARN_ON(res >= PAGE_SIZE)) return -EIO; link = folio_address(folio); link[res] = '\0'; return 0; } static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *callback) { struct fuse_conn *fc = get_fuse_conn(inode); struct folio *folio; int err; err = -EIO; if (fuse_is_bad(inode)) goto out_err; if (fc->cache_symlinks) return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) goto out_err; folio = folio_alloc(GFP_KERNEL, 0); err = -ENOMEM; if (!folio) goto out_err; err = fuse_readlink_page(inode, folio); if (err) { folio_put(folio); goto out_err; } set_delayed_call(callback, page_put_link, &folio->page); return folio_address(folio); out_err: return ERR_PTR(err); } static int fuse_dir_open(struct inode *inode, struct file *file) { struct fuse_mount *fm = get_fuse_mount(inode); int err; if (fuse_is_bad(inode)) return -EIO; err = generic_file_open(inode, file); if (err) return err; err = fuse_do_open(fm, get_node_id(inode), file, true); if (!err) { struct fuse_file *ff = file->private_data; /* * Keep handling FOPEN_STREAM and FOPEN_NONSEEKABLE for * directories for backward compatibility, though it's unlikely * to be useful. */ if (ff->open_flags & (FOPEN_STREAM | FOPEN_NONSEEKABLE)) nonseekable_open(inode, file); if (!(ff->open_flags & FOPEN_KEEP_CACHE)) invalidate_inode_pages2(inode->i_mapping); } return err; } static int fuse_dir_release(struct inode *inode, struct file *file) { fuse_release_common(file, true); return 0; } static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); int err; if (fuse_is_bad(inode)) return -EIO; if (fc->no_fsyncdir) return 0; inode_lock(inode); err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNCDIR); if (err == -ENOSYS) { fc->no_fsyncdir = 1; err = 0; } inode_unlock(inode); return err; } static long fuse_dir_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */ if (fc->minor < 18) return -ENOTTY; return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR); } static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host); if (fc->minor < 18) return -ENOTTY; return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR); } static bool update_mtime(unsigned ivalid, bool trust_local_mtime) { /* Always update if mtime is explicitly set */ if (ivalid & ATTR_MTIME_SET) return true; /* Or if kernel i_mtime is the official one */ if (trust_local_mtime) return true; /* If it's an open(O_TRUNC) or an ftruncate(), don't update */ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE))) return false; /* In all other cases update */ return true; } static void iattr_to_fattr(struct mnt_idmap *idmap, struct fuse_conn *fc, struct iattr *iattr, struct fuse_setattr_in *arg, bool trust_local_cmtime) { unsigned ivalid = iattr->ia_valid; if (ivalid & ATTR_MODE) arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode; if (ivalid & ATTR_UID) { kuid_t fsuid = from_vfsuid(idmap, fc->user_ns, iattr->ia_vfsuid); arg->valid |= FATTR_UID; arg->uid = from_kuid(fc->user_ns, fsuid); } if (ivalid & ATTR_GID) { kgid_t fsgid = from_vfsgid(idmap, fc->user_ns, iattr->ia_vfsgid); arg->valid |= FATTR_GID; arg->gid = from_kgid(fc->user_ns, fsgid); } if (ivalid & ATTR_SIZE) arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size; if (ivalid & ATTR_ATIME) { arg->valid |= FATTR_ATIME; arg->atime = iattr->ia_atime.tv_sec; arg->atimensec = iattr->ia_atime.tv_nsec; if (!(ivalid & ATTR_ATIME_SET)) arg->valid |= FATTR_ATIME_NOW; } if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) { arg->valid |= FATTR_MTIME; arg->mtime = iattr->ia_mtime.tv_sec; arg->mtimensec = iattr->ia_mtime.tv_nsec; if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime) arg->valid |= FATTR_MTIME_NOW; } if ((ivalid & ATTR_CTIME) && trust_local_cmtime) { arg->valid |= FATTR_CTIME; arg->ctime = iattr->ia_ctime.tv_sec; arg->ctimensec = iattr->ia_ctime.tv_nsec; } } /* * Prevent concurrent writepages on inode * * This is done by adding a negative bias to the inode write counter * and waiting for all pending writes to finish. */ void fuse_set_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!inode_is_locked(inode)); spin_lock(&fi->lock); BUG_ON(fi->writectr < 0); fi->writectr += FUSE_NOWRITE; spin_unlock(&fi->lock); wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); } /* * Allow writepages on inode * * Remove the bias from the writecounter and send any queued * writepages. */ static void __fuse_release_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(fi->writectr != FUSE_NOWRITE); fi->writectr = 0; fuse_flush_writepages(inode); } void fuse_release_nowrite(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); spin_lock(&fi->lock); __fuse_release_nowrite(inode); spin_unlock(&fi->lock); } static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args, struct inode *inode, struct fuse_setattr_in *inarg_p, struct fuse_attr_out *outarg_p) { args->opcode = FUSE_SETATTR; args->nodeid = get_node_id(inode); args->in_numargs = 1; args->in_args[0].size = sizeof(*inarg_p); args->in_args[0].value = inarg_p; args->out_numargs = 1; args->out_args[0].size = sizeof(*outarg_p); args->out_args[0].value = outarg_p; } /* * Flush inode->i_mtime to the server */ int fuse_flush_times(struct inode *inode, struct fuse_file *ff) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); inarg.valid = FATTR_MTIME; inarg.mtime = inode_get_mtime_sec(inode); inarg.mtimensec = inode_get_mtime_nsec(inode); if (fm->fc->minor >= 23) { inarg.valid |= FATTR_CTIME; inarg.ctime = inode_get_ctime_sec(inode); inarg.ctimensec = inode_get_ctime_nsec(inode); } if (ff) { inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } fuse_setattr_fill(fm->fc, &args, inode, &inarg, &outarg); return fuse_simple_request(fm, &args); } /* * Set attributes, and at the same time refresh them. * * Truncation is slightly complicated, because the 'truncate' request * may fail, in which case we don't want to touch the mapping. * vmtruncate() doesn't allow for this case, so do the rlimit checking * and the actual truncation by hand. */ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr, struct file *file) { struct inode *inode = d_inode(dentry); struct fuse_mount *fm = get_fuse_mount(inode); struct fuse_conn *fc = fm->fc; struct fuse_inode *fi = get_fuse_inode(inode); struct address_space *mapping = inode->i_mapping; FUSE_ARGS(args); struct fuse_setattr_in inarg; struct fuse_attr_out outarg; bool is_truncate = false; bool is_wb = fc->writeback_cache && S_ISREG(inode->i_mode); loff_t oldsize; int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; err = setattr_prepare(idmap, dentry, attr); if (err) return err; if (attr->ia_valid & ATTR_SIZE) { if (WARN_ON(!S_ISREG(inode->i_mode))) return -EIO; is_truncate = true; } if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; err = fuse_dax_break_layouts(inode, 0, -1); if (err) { filemap_invalidate_unlock(mapping); return err; } } if (attr->ia_valid & ATTR_OPEN) { /* This is coming from open(..., ... | O_TRUNC); */ WARN_ON(!(attr->ia_valid & ATTR_SIZE)); WARN_ON(attr->ia_size != 0); if (fc->atomic_o_trunc) { /* * No need to send request to userspace, since actual * truncation has already been done by OPEN. But still * need to truncate page cache. */ i_size_write(inode, 0); truncate_pagecache(inode, 0); goto out; } file = NULL; } /* Flush dirty data/metadata before non-truncate SETATTR */ if (is_wb && attr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET | ATTR_TIMES_SET)) { err = write_inode_now(inode, true); if (err) return err; fuse_set_nowrite(inode); fuse_release_nowrite(inode); } if (is_truncate) { fuse_set_nowrite(inode); set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (trust_local_cmtime && attr->ia_size != inode->i_size) attr->ia_valid |= ATTR_MTIME | ATTR_CTIME; } memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); iattr_to_fattr(idmap, fc, attr, &inarg, trust_local_cmtime); if (file) { struct fuse_file *ff = file->private_data; inarg.valid |= FATTR_FH; inarg.fh = ff->fh; } /* Kill suid/sgid for non-directory chown unconditionally */ if (fc->handle_killpriv_v2 && !S_ISDIR(inode->i_mode) && attr->ia_valid & (ATTR_UID | ATTR_GID)) inarg.valid |= FATTR_KILL_SUIDGID; if (attr->ia_valid & ATTR_SIZE) { /* For mandatory locking in truncate */ inarg.valid |= FATTR_LOCKOWNER; inarg.lock_owner = fuse_lock_owner_id(fc, current->files); /* Kill suid/sgid for truncate only if no CAP_FSETID */ if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); goto error; } if (fuse_invalid_attr(&outarg.attr) || inode_wrong_type(inode, outarg.attr.mode)) { fuse_make_bad(inode); err = -EIO; goto error; } spin_lock(&fi->lock); /* the kernel maintains i_mtime locally */ if (trust_local_cmtime) { if (attr->ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); if (attr->ia_valid & ATTR_CTIME) inode_set_ctime_to_ts(inode, attr->ia_ctime); /* FIXME: clear I_DIRTY_SYNC? */ } fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); oldsize = inode->i_size; /* see the comment in fuse_change_attributes() */ if (!is_wb || is_truncate) i_size_write(inode, outarg.attr.size); if (is_truncate) { /* NOTE: this may release/reacquire fi->lock */ __fuse_release_nowrite(inode); } spin_unlock(&fi->lock); /* * Only call invalidate_inode_pages2() after removing * FUSE_NOWRITE, otherwise fuse_launder_folio() would deadlock. */ if ((is_truncate || !is_wb) && S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { truncate_pagecache(inode, outarg.attr.size); invalidate_inode_pages2(mapping); } clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); out: if (fault_blocked) filemap_invalidate_unlock(mapping); return 0; error: if (is_truncate) fuse_release_nowrite(inode); clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (fault_blocked) filemap_invalidate_unlock(mapping); return err; } static int fuse_setattr(struct mnt_idmap *idmap, struct dentry *entry, struct iattr *attr) { struct inode *inode = d_inode(entry); struct fuse_conn *fc = get_fuse_conn(inode); struct file *file = (attr->ia_valid & ATTR_FILE) ? attr->ia_file : NULL; int ret; if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(get_fuse_conn(inode))) return -EACCES; if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) { attr->ia_valid &= ~(ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_MODE); /* * The only sane way to reliably kill suid/sgid is to do it in * the userspace filesystem * * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv && !fc->handle_killpriv_v2) { /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. */ ret = fuse_do_getattr(idmap, inode, NULL, file); if (ret) return ret; attr->ia_mode = inode->i_mode; if (inode->i_mode & S_ISUID) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISUID; } if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISGID; } } } if (!attr->ia_valid) return 0; ret = fuse_do_setattr(idmap, entry, attr, file); if (!ret) { /* * If filesystem supports acls it may have updated acl xattrs in * the filesystem, so forget cached acls for the inode. */ if (fc->posix_acl) forget_all_cached_acls(inode); /* Directory mode changed, may need to revalidate access */ if (d_is_dir(entry) && (attr->ia_valid & ATTR_MODE)) fuse_invalidate_entry_cache(entry); } return ret; } static int fuse_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags) { struct inode *inode = d_inode(path->dentry); struct fuse_conn *fc = get_fuse_conn(inode); if (fuse_is_bad(inode)) return -EIO; if (!fuse_allow_current_process(fc)) { if (!request_mask) { /* * If user explicitly requested *nothing* then don't * error out, but return st_dev only. */ stat->result_mask = 0; stat->dev = inode->i_sb->s_dev; return 0; } return -EACCES; } return fuse_update_get_attr(idmap, inode, NULL, stat, request_mask, flags); } static const struct inode_operations fuse_dir_inode_operations = { .lookup = fuse_lookup, .mkdir = fuse_mkdir, .symlink = fuse_symlink, .unlink = fuse_unlink, .rmdir = fuse_rmdir, .rename = fuse_rename2, .link = fuse_link, .setattr = fuse_setattr, .create = fuse_create, .atomic_open = fuse_atomic_open, .tmpfile = fuse_tmpfile, .mknod = fuse_mknod, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_inode_acl = fuse_get_inode_acl, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, }; static const struct file_operations fuse_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = fuse_readdir, .open = fuse_dir_open, .release = fuse_dir_release, .fsync = fuse_dir_fsync, .unlocked_ioctl = fuse_dir_ioctl, .compat_ioctl = fuse_dir_compat_ioctl, }; static const struct inode_operations fuse_common_inode_operations = { .setattr = fuse_setattr, .permission = fuse_permission, .getattr = fuse_getattr, .listxattr = fuse_listxattr, .get_inode_acl = fuse_get_inode_acl, .get_acl = fuse_get_acl, .set_acl = fuse_set_acl, .fileattr_get = fuse_fileattr_get, .fileattr_set = fuse_fileattr_set, }; static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, .getattr = fuse_getattr, .listxattr = fuse_listxattr, }; void fuse_init_common(struct inode *inode) { inode->i_op = &fuse_common_inode_operations; } void fuse_init_dir(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); inode->i_op = &fuse_dir_inode_operations; inode->i_fop = &fuse_dir_operations; spin_lock_init(&fi->rdc.lock); fi->rdc.cached = false; fi->rdc.size = 0; fi->rdc.pos = 0; fi->rdc.version = 0; } static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { int err = fuse_readlink_page(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); folio_unlock(folio); return err; } static const struct address_space_operations fuse_symlink_aops = { .read_folio = fuse_symlink_read_folio, }; void fuse_init_symlink(struct inode *inode) { inode->i_op = &fuse_symlink_inode_operations; inode->i_data.a_ops = &fuse_symlink_aops; inode_nohighmem(inode); } |
| 3 3 3 3 3 3 3 3 163 1 23 28 6 5 160 81 155 6 108 198 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 | // SPDX-License-Identifier: GPL-2.0-only /* * misc.c * * PURPOSE * Miscellaneous routines for the OSTA-UDF(tm) filesystem. * * COPYRIGHT * (C) 1998 Dave Boynton * (C) 1998-2004 Ben Fennema * (C) 1999-2000 Stelias Computing Inc * * HISTORY * * 04/19/99 blf partial support for reading/writing specific EA's */ #include "udfdecl.h" #include <linux/fs.h> #include <linux/string.h> #include <linux/crc-itu-t.h> #include "udf_i.h" #include "udf_sb.h" struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size, uint32_t type, uint8_t loc) { uint8_t *ea = NULL, *ad = NULL; int offset; uint16_t crclen; struct udf_inode_info *iinfo = UDF_I(inode); ea = iinfo->i_data; if (iinfo->i_lenEAttr) { ad = iinfo->i_data + iinfo->i_lenEAttr; } else { ad = ea; size += sizeof(struct extendedAttrHeaderDesc); } offset = inode->i_sb->s_blocksize - udf_file_entry_alloc_offset(inode) - iinfo->i_lenAlloc; /* TODO - Check for FreeEASpace */ if (loc & 0x01 && offset >= size) { struct extendedAttrHeaderDesc *eahd; eahd = (struct extendedAttrHeaderDesc *)ea; if (iinfo->i_lenAlloc) memmove(&ad[size], ad, iinfo->i_lenAlloc); if (iinfo->i_lenEAttr) { /* check checksum/crc */ if (eahd->descTag.tagIdent != cpu_to_le16(TAG_IDENT_EAHD) || le32_to_cpu(eahd->descTag.tagLocation) != iinfo->i_location.logicalBlockNum) return NULL; } else { struct udf_sb_info *sbi = UDF_SB(inode->i_sb); size -= sizeof(struct extendedAttrHeaderDesc); iinfo->i_lenEAttr += sizeof(struct extendedAttrHeaderDesc); eahd->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EAHD); if (sbi->s_udfrev >= 0x0200) eahd->descTag.descVersion = cpu_to_le16(3); else eahd->descTag.descVersion = cpu_to_le16(2); eahd->descTag.tagSerialNum = cpu_to_le16(sbi->s_serial_number); eahd->descTag.tagLocation = cpu_to_le32( iinfo->i_location.logicalBlockNum); eahd->impAttrLocation = cpu_to_le32(0xFFFFFFFF); eahd->appAttrLocation = cpu_to_le32(0xFFFFFFFF); } offset = iinfo->i_lenEAttr; if (type < 2048) { if (le32_to_cpu(eahd->appAttrLocation) < iinfo->i_lenEAttr) { uint32_t aal = le32_to_cpu(eahd->appAttrLocation); memmove(&ea[offset - aal + size], &ea[aal], offset - aal); offset -= aal; eahd->appAttrLocation = cpu_to_le32(aal + size); } if (le32_to_cpu(eahd->impAttrLocation) < iinfo->i_lenEAttr) { uint32_t ial = le32_to_cpu(eahd->impAttrLocation); memmove(&ea[offset - ial + size], &ea[ial], offset - ial); offset -= ial; eahd->impAttrLocation = cpu_to_le32(ial + size); } } else if (type < 65536) { if (le32_to_cpu(eahd->appAttrLocation) < iinfo->i_lenEAttr) { uint32_t aal = le32_to_cpu(eahd->appAttrLocation); memmove(&ea[offset - aal + size], &ea[aal], offset - aal); offset -= aal; eahd->appAttrLocation = cpu_to_le32(aal + size); } } /* rewrite CRC + checksum of eahd */ crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(struct tag); eahd->descTag.descCRCLength = cpu_to_le16(crclen); eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd + sizeof(struct tag), crclen)); eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); iinfo->i_lenEAttr += size; return (struct genericFormat *)&ea[offset]; } return NULL; } struct genericFormat *udf_get_extendedattr(struct inode *inode, uint32_t type, uint8_t subtype) { struct genericFormat *gaf; uint8_t *ea = NULL; uint32_t offset; struct udf_inode_info *iinfo = UDF_I(inode); ea = iinfo->i_data; if (iinfo->i_lenEAttr) { struct extendedAttrHeaderDesc *eahd; eahd = (struct extendedAttrHeaderDesc *)ea; /* check checksum/crc */ if (eahd->descTag.tagIdent != cpu_to_le16(TAG_IDENT_EAHD) || le32_to_cpu(eahd->descTag.tagLocation) != iinfo->i_location.logicalBlockNum) return NULL; if (type < 2048) offset = sizeof(struct extendedAttrHeaderDesc); else if (type < 65536) offset = le32_to_cpu(eahd->impAttrLocation); else offset = le32_to_cpu(eahd->appAttrLocation); while (offset + sizeof(*gaf) < iinfo->i_lenEAttr) { uint32_t attrLength; gaf = (struct genericFormat *)&ea[offset]; attrLength = le32_to_cpu(gaf->attrLength); /* Detect undersized elements and buffer overflows */ if ((attrLength < sizeof(*gaf)) || (attrLength > (iinfo->i_lenEAttr - offset))) break; if (le32_to_cpu(gaf->attrType) == type && gaf->attrSubtype == subtype) return gaf; else offset += attrLength; } } return NULL; } /* * udf_read_tagged * * PURPOSE * Read the first block of a tagged descriptor. * * HISTORY * July 1, 1997 - Andrew E. Mileski * Written, tested, and released. */ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block, uint32_t location, uint16_t *ident) { struct tag *tag_p; struct buffer_head *bh = NULL; u8 checksum; /* Read the block */ if (block == 0xFFFFFFFF) return NULL; bh = sb_bread(sb, block); if (!bh) { udf_err(sb, "read failed, block=%u, location=%u\n", block, location); return NULL; } tag_p = (struct tag *)(bh->b_data); *ident = le16_to_cpu(tag_p->tagIdent); if (location != le32_to_cpu(tag_p->tagLocation)) { udf_debug("location mismatch block %u, tag %u != %u\n", block, le32_to_cpu(tag_p->tagLocation), location); goto error_out; } /* Verify the tag checksum */ checksum = udf_tag_checksum(tag_p); if (checksum != tag_p->tagChecksum) { udf_err(sb, "tag checksum failed, block %u: 0x%02x != 0x%02x\n", block, checksum, tag_p->tagChecksum); goto error_out; } /* Verify the tag version */ if (tag_p->descVersion != cpu_to_le16(0x0002U) && tag_p->descVersion != cpu_to_le16(0x0003U)) { udf_err(sb, "tag version 0x%04x != 0x0002 || 0x0003, block %u\n", le16_to_cpu(tag_p->descVersion), block); goto error_out; } /* Verify the descriptor CRC */ if (le16_to_cpu(tag_p->descCRCLength) + sizeof(struct tag) > sb->s_blocksize || le16_to_cpu(tag_p->descCRC) == crc_itu_t(0, bh->b_data + sizeof(struct tag), le16_to_cpu(tag_p->descCRCLength))) return bh; udf_debug("Crc failure block %u: crc = %u, crclen = %u\n", block, le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength)); error_out: brelse(bh); return NULL; } struct buffer_head *udf_read_ptagged(struct super_block *sb, struct kernel_lb_addr *loc, uint32_t offset, uint16_t *ident) { return udf_read_tagged(sb, udf_get_lb_pblock(sb, loc, offset), loc->logicalBlockNum + offset, ident); } void udf_update_tag(char *data, int length) { struct tag *tptr = (struct tag *)data; length -= sizeof(struct tag); tptr->descCRCLength = cpu_to_le16(length); tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(struct tag), length)); tptr->tagChecksum = udf_tag_checksum(tptr); } void udf_new_tag(char *data, uint16_t ident, uint16_t version, uint16_t snum, uint32_t loc, int length) { struct tag *tptr = (struct tag *)data; tptr->tagIdent = cpu_to_le16(ident); tptr->descVersion = cpu_to_le16(version); tptr->tagSerialNum = cpu_to_le16(snum); tptr->tagLocation = cpu_to_le32(loc); udf_update_tag(data, length); } u8 udf_tag_checksum(const struct tag *t) { u8 *data = (u8 *)t; u8 checksum = 0; int i; for (i = 0; i < sizeof(struct tag); ++i) if (i != 4) /* position of checksum */ checksum += data[i]; return checksum; } |
| 2 2 2 2 2 2 2 2 33 31 32 33 33 31 31 31 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 | // SPDX-License-Identifier: GPL-2.0 /* * NETLINK Netlink attributes * * Authors: Thomas Graf <tgraf@suug.ch> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> */ #include <linux/export.h> #include <linux/kernel.h> #include <linux/errno.h> #include <linux/jiffies.h> #include <linux/nospec.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> /* For these data types, attribute length should be exactly the given * size. However, to maintain compatibility with broken commands, if the * attribute length does not match the expected size a warning is emitted * to the user that the command is sending invalid data and needs to be fixed. */ static const u8 nla_attr_len[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_U8] = sizeof(u8), [NLA_U16] = sizeof(u16), [NLA_U32] = sizeof(u32), [NLA_U64] = sizeof(u64), [NLA_MSECS] = sizeof(u64), [NLA_NESTED] = NLA_HDRLEN, [NLA_S8] = sizeof(s8), [NLA_S16] = sizeof(s16), [NLA_S32] = sizeof(s32), [NLA_S64] = sizeof(s64), [NLA_BE16] = sizeof(__be16), [NLA_BE32] = sizeof(__be32), }; /* * Nested policies might refer back to the original * policy in some cases, and userspace could try to * abuse that and recurse by nesting in the right * ways. Limit recursion to avoid this problem. */ #define MAX_POLICY_RECURSION_DEPTH 10 static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth); static int validate_nla_bitfield32(const struct nlattr *nla, const u32 valid_flags_mask) { const struct nla_bitfield32 *bf = nla_data(nla); if (!valid_flags_mask) return -EINVAL; /*disallow invalid bit selector */ if (bf->selector & ~valid_flags_mask) return -EINVAL; /*disallow invalid bit values */ if (bf->value & ~valid_flags_mask) return -EINVAL; /*disallow valid bit values that are not selected*/ if (bf->value & ~bf->selector) return -EINVAL; return 0; } static int nla_validate_array(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack, unsigned int validate, unsigned int depth) { const struct nlattr *entry; int rem; nla_for_each_attr(entry, head, len, rem) { int ret; if (nla_len(entry) == 0) continue; if (nla_len(entry) < NLA_HDRLEN) { NL_SET_ERR_MSG_ATTR_POL(extack, entry, policy, "Array element too short"); return -ERANGE; } ret = __nla_validate_parse(nla_data(entry), nla_len(entry), maxtype, policy, validate, extack, NULL, depth + 1); if (ret < 0) return ret; } return 0; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range) { WARN_ON_ONCE(pt->validation_type != NLA_VALIDATE_RANGE_PTR && (pt->min < 0 || pt->max < 0)); range->min = 0; switch (pt->type) { case NLA_U8: range->max = U8_MAX; break; case NLA_U16: case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: case NLA_UINT: case NLA_MSECS: range->max = U64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { struct netlink_range_validation range; u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_MSECS: value = nla_get_u64(nla); break; case NLA_BINARY: value = nla_len(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } nla_get_range_unsigned(pt, &range); if (pt->validation_type == NLA_VALIDATE_RANGE_WARN_TOO_LONG && pt->type == NLA_BINARY && value > range.max) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, pt->type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } /* this assumes min <= max (don't validate against min) */ return 0; } if (value < range.min || value > range.max) { bool binary = pt->type == NLA_BINARY; if (binary) NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "binary attribute size out of range"); else NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range) { switch (pt->type) { case NLA_S8: range->min = S8_MIN; range->max = S8_MAX; break; case NLA_S16: range->min = S16_MIN; range->max = S16_MAX; break; case NLA_S32: range->min = S32_MIN; range->max = S32_MAX; break; case NLA_S64: case NLA_SINT: range->min = S64_MIN; range->max = S64_MAX; break; default: WARN_ON_ONCE(1); return; } switch (pt->validation_type) { case NLA_VALIDATE_RANGE: range->min = pt->min; range->max = pt->max; break; case NLA_VALIDATE_RANGE_PTR: *range = *pt->range_signed; break; case NLA_VALIDATE_MIN: range->min = pt->min; break; case NLA_VALIDATE_MAX: range->max = pt->max; break; default: break; } } static int nla_validate_int_range_signed(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { struct netlink_range_validation_signed range; s64 value; switch (pt->type) { case NLA_S8: value = nla_get_s8(nla); break; case NLA_S16: value = nla_get_s16(nla); break; case NLA_S32: value = nla_get_s32(nla); break; case NLA_S64: value = nla_get_s64(nla); break; case NLA_SINT: value = nla_get_sint(nla); break; default: return -EINVAL; } nla_get_range_signed(pt, &range); if (value < range.min || value > range.max) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "integer out of range"); return -ERANGE; } return 0; } static int nla_validate_int_range(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, unsigned int validate) { switch (pt->type) { case NLA_U8: case NLA_U16: case NLA_U32: case NLA_U64: case NLA_UINT: case NLA_MSECS: case NLA_BINARY: case NLA_BE16: case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: case NLA_S32: case NLA_S64: case NLA_SINT: return nla_validate_int_range_signed(pt, nla, extack); default: WARN_ON(1); return -EINVAL; } } static int nla_validate_mask(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack) { u64 value; switch (pt->type) { case NLA_U8: value = nla_get_u8(nla); break; case NLA_U16: value = nla_get_u16(nla); break; case NLA_U32: value = nla_get_u32(nla); break; case NLA_U64: value = nla_get_u64(nla); break; case NLA_UINT: value = nla_get_uint(nla); break; case NLA_BE16: value = ntohs(nla_get_be16(nla)); break; case NLA_BE32: value = ntohl(nla_get_be32(nla)); break; default: return -EINVAL; } if (value & ~(u64)pt->mask) { NL_SET_ERR_MSG_ATTR(extack, nla, "reserved bit set"); return -EINVAL; } return 0; } static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, unsigned int depth) { u16 strict_start_type = policy[0].strict_start_type; const struct nla_policy *pt; int minlen = 0, attrlen = nla_len(nla), type = nla_type(nla); int err = -ERANGE; if (strict_start_type && type >= strict_start_type) validate |= NL_VALIDATE_STRICT; if (type <= 0 || type > maxtype) return 0; type = array_index_nospec(type, maxtype + 1); pt = &policy[type]; BUG_ON(pt->type > NLA_TYPE_MAX); if (nla_attr_len[pt->type] && attrlen != nla_attr_len[pt->type]) { pr_warn_ratelimited("netlink: '%s': attribute type %d has an invalid length.\n", current->comm, type); if (validate & NL_VALIDATE_STRICT_ATTRS) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } } if (validate & NL_VALIDATE_NESTED) { if ((pt->type == NLA_NESTED || pt->type == NLA_NESTED_ARRAY) && !(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED is missing"); return -EINVAL; } if (pt->type != NLA_NESTED && pt->type != NLA_NESTED_ARRAY && pt->type != NLA_UNSPEC && (nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "NLA_F_NESTED not expected"); return -EINVAL; } } switch (pt->type) { case NLA_REJECT: if (extack && pt->reject_message) { NL_SET_BAD_ATTR(extack, nla); extack->_msg = pt->reject_message; return -EINVAL; } err = -EINVAL; goto out_err; case NLA_FLAG: if (attrlen > 0) goto out_err; break; case NLA_SINT: case NLA_UINT: if (attrlen != sizeof(u32) && attrlen != sizeof(u64)) { NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "invalid attribute length"); return -EINVAL; } break; case NLA_BITFIELD32: if (attrlen != sizeof(struct nla_bitfield32)) goto out_err; err = validate_nla_bitfield32(nla, pt->bitfield32_valid); if (err) goto out_err; break; case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); else minlen = attrlen; if (!minlen || memchr(nla_data(nla), '\0', minlen) == NULL) { err = -EINVAL; goto out_err; } fallthrough; case NLA_STRING: if (attrlen < 1) goto out_err; if (pt->len) { char *buf = nla_data(nla); if (buf[attrlen - 1] == '\0') attrlen--; if (attrlen > pt->len) goto out_err; } break; case NLA_BINARY: if (pt->len && attrlen > pt->len) goto out_err; break; case NLA_NESTED: /* a nested attributes is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { err = __nla_validate_parse(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, validate, extack, NULL, depth + 1); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_NESTED_ARRAY: /* a nested array attribute is allowed to be empty; if its not, * it must have a size of at least NLA_HDRLEN. */ if (attrlen == 0) break; if (attrlen < NLA_HDRLEN) goto out_err; if (pt->nested_policy) { int err; err = nla_validate_array(nla_data(nla), nla_len(nla), pt->len, pt->nested_policy, extack, validate, depth); if (err < 0) { /* * return directly to preserve the inner * error message/attribute pointer */ return err; } } break; case NLA_UNSPEC: if (validate & NL_VALIDATE_UNSPEC) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unsupported attribute"); return -EINVAL; } if (attrlen < pt->len) goto out_err; break; default: if (pt->len) minlen = pt->len; else minlen = nla_attr_minlen[pt->type]; if (attrlen < minlen) goto out_err; } /* further validation */ switch (pt->validation_type) { case NLA_VALIDATE_NONE: /* nothing to do */ break; case NLA_VALIDATE_RANGE_PTR: case NLA_VALIDATE_RANGE: case NLA_VALIDATE_RANGE_WARN_TOO_LONG: case NLA_VALIDATE_MIN: case NLA_VALIDATE_MAX: err = nla_validate_int_range(pt, nla, extack, validate); if (err) return err; break; case NLA_VALIDATE_MASK: err = nla_validate_mask(pt, nla, extack); if (err) return err; break; case NLA_VALIDATE_FUNCTION: if (pt->validate) { err = pt->validate(nla, extack); if (err) return err; } break; } return 0; out_err: NL_SET_ERR_MSG_ATTR_POL(extack, nla, pt, "Attribute failed policy validation"); return err; } static int __nla_validate_parse(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack, struct nlattr **tb, unsigned int depth) { const struct nlattr *nla; int rem; if (depth >= MAX_POLICY_RECURSION_DEPTH) { NL_SET_ERR_MSG(extack, "allowed policy recursion depth exceeded"); return -EINVAL; } if (tb) memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); nla_for_each_attr(nla, head, len, rem) { u16 type = nla_type(nla); if (type == 0 || type > maxtype) { if (validate & NL_VALIDATE_MAXTYPE) { NL_SET_ERR_MSG_ATTR(extack, nla, "Unknown attribute type"); return -EINVAL; } continue; } type = array_index_nospec(type, maxtype + 1); if (policy) { int err = validate_nla(nla, maxtype, policy, validate, extack, depth); if (err < 0) return err; } if (tb) tb[type] = (struct nlattr *)nla; } if (unlikely(rem > 0)) { pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n", rem, current->comm); NL_SET_ERR_MSG(extack, "bytes leftover after parsing attributes"); if (validate & NL_VALIDATE_TRAILING) return -EINVAL; } return 0; } /** * __nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation depends on the validate flags passed, see * &enum netlink_validation for more details on that. * See documentation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, NULL, 0); } EXPORT_SYMBOL(__nla_validate); /** * nla_policy_len - Determine the max. length of a policy * @p: policy to use * @n: number of policies * * Determines the max. length of the policy. It is currently used * to allocated Netlink buffers roughly the size of the actual * message. * * Returns 0 on success or a negative error code. */ int nla_policy_len(const struct nla_policy *p, int n) { int i, len = 0; for (i = 0; i < n; i++, p++) { if (p->len) len += nla_total_size(p->len); else if (nla_attr_len[p->type]) len += nla_total_size(nla_attr_len[p->type]); else if (nla_attr_minlen[p->type]) len += nla_total_size(nla_attr_minlen[p->type]); } return len; } EXPORT_SYMBOL(nla_policy_len); /** * __nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @validate: validation strictness * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. * Validation is controlled by the @validate parameter. * * Returns 0 on success or a negative error code. */ int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate_parse(head, len, maxtype, policy, validate, extack, tb, 0); } EXPORT_SYMBOL(__nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @attrtype: type of attribute to look for * * Returns the first attribute in the stream matching the specified type. */ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) { const struct nlattr *nla; int rem; nla_for_each_attr(nla, head, len, rem) if (nla_type(nla) == attrtype) return (struct nlattr *)nla; return NULL; } EXPORT_SYMBOL(nla_find); /** * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. * * Copies at most dstsize - 1 bytes into the destination buffer. * Unlike strscpy() the destination buffer is always padded out. * * Return: * * srclen - Returns @nla length (not including the trailing %NUL). * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. */ ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); ssize_t ret; size_t len; if (dstsize == 0 || WARN_ON_ONCE(dstsize > U16_MAX)) return -E2BIG; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; if (srclen >= dstsize) { len = dstsize - 1; ret = -E2BIG; } else { len = srclen; ret = len; } memcpy(dst, src, len); /* Zero pad end of dst. */ memset(dst + len, 0, dstsize - len); return ret; } EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer * @nla: attribute to copy the string from * @flags: the type of memory to allocate (see kmalloc). * * Returns a pointer to the allocated buffer or NULL on error. */ char *nla_strdup(const struct nlattr *nla, gfp_t flags) { size_t srclen = nla_len(nla); char *src = nla_data(nla), *dst; if (srclen > 0 && src[srclen - 1] == '\0') srclen--; dst = kmalloc(srclen + 1, flags); if (dst != NULL) { memcpy(dst, src, srclen); dst[srclen] = '\0'; } return dst; } EXPORT_SYMBOL(nla_strdup); /** * nla_memcpy - Copy a netlink attribute into another memory area * @dest: where to copy to memcpy * @src: netlink attribute to copy from * @count: size of the destination area * * Note: The number of bytes copied is limited by the length of * attribute's payload. memcpy * * Returns the number of bytes copied. */ int nla_memcpy(void *dest, const struct nlattr *src, int count) { int minlen = min_t(int, count, nla_len(src)); memcpy(dest, nla_data(src), minlen); if (count > minlen) memset(dest + minlen, 0, count - minlen); return minlen; } EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area * @nla: netlink attribute * @data: memory area * @size: size of memory area */ int nla_memcmp(const struct nlattr *nla, const void *data, size_t size) { int d = nla_len(nla) - size; if (d == 0) d = memcmp(nla_data(nla), data, size); return d; } EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string * @nla: netlink string attribute * @str: another string */ int nla_strcmp(const struct nlattr *nla, const char *str) { int len = strlen(str); char *buf = nla_data(nla); int attrlen = nla_len(nla); int d; while (attrlen > 0 && buf[attrlen - 1] == '\0') attrlen--; d = attrlen - len; if (d == 0) d = memcmp(nla_data(nla), str, len); return d; } EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** * __nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { struct nlattr *nla; nla = skb_put(skb, nla_total_size(attrlen)); nla->nla_type = attrtype; nla->nla_len = nla_attr_size(attrlen); memset((unsigned char *) nla + nla->nla_len, 0, nla_padlen(attrlen)); return nla; } EXPORT_SYMBOL(__nla_reserve); /** * __nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { nla_align_64bit(skb, padattr); return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(__nla_reserve_64bit); /** * __nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * The caller is responsible to ensure that the skb provides enough * tailroom for the payload. */ void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { return skb_put_zero(skb, NLA_ALIGN(attrlen)); } EXPORT_SYMBOL(__nla_reserve_nohdr); /** * nla_reserve - reserve room for attribute on the skb * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return NULL; return __nla_reserve(skb, attrtype, attrlen); } EXPORT_SYMBOL(nla_reserve); /** * nla_reserve_64bit - reserve room for attribute on the skb and align it * @skb: socket buffer to reserve room on * @attrtype: attribute type * @attrlen: length of attribute payload * @padattr: attribute type for the padding * * Adds a netlink attribute header to a socket buffer and reserves * room for the payload but does not copy it. It also ensure that this * attribute will have a 64-bit aligned nla_data() area. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute header and payload. */ struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return NULL; return __nla_reserve_64bit(skb, attrtype, attrlen, padattr); } EXPORT_SYMBOL(nla_reserve_64bit); /** * nla_reserve_nohdr - reserve room for attribute without header * @skb: socket buffer to reserve room on * @attrlen: length of attribute payload * * Reserves room for attribute payload without a header. * * Returns NULL if the tailroom of the skb is insufficient to store * the attribute payload. */ void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return NULL; return __nla_reserve_nohdr(skb, attrlen); } EXPORT_SYMBOL(nla_reserve_nohdr); /** * __nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { struct nlattr *nla; nla = __nla_reserve(skb, attrtype, attrlen); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put); /** * __nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute header and payload. */ void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { struct nlattr *nla; nla = __nla_reserve_64bit(skb, attrtype, attrlen, padattr); memcpy(nla_data(nla), data, attrlen); } EXPORT_SYMBOL(__nla_put_64bit); /** * __nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * The caller is responsible to ensure that the skb provides enough * tailroom for the attribute payload. */ void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { void *start; start = __nla_reserve_nohdr(skb, attrlen); memcpy(start, data, attrlen); } EXPORT_SYMBOL(__nla_put_nohdr); /** * nla_put - Add a netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < nla_total_size(attrlen))) return -EMSGSIZE; __nla_put(skb, attrtype, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put); /** * nla_put_64bit - Add a netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @attrlen: length of attribute payload * @data: head of attribute payload * @padattr: attribute type for the padding * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute header and payload. */ int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr) { size_t len; if (nla_need_padding_for_64bit(skb)) len = nla_total_size_64bit(attrlen); else len = nla_total_size(attrlen); if (unlikely(skb_tailroom(skb) < len)) return -EMSGSIZE; __nla_put_64bit(skb, attrtype, attrlen, data, padattr); return 0; } EXPORT_SYMBOL(nla_put_64bit); /** * nla_put_nohdr - Add a netlink attribute without header * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; __nla_put_nohdr(skb, attrlen, data); return 0; } EXPORT_SYMBOL(nla_put_nohdr); /** * nla_append - Add a netlink attribute without header or padding * @skb: socket buffer to add attribute to * @attrlen: length of attribute payload * @data: head of attribute payload * * Returns -EMSGSIZE if the tailroom of the skb is insufficient to store * the attribute payload. */ int nla_append(struct sk_buff *skb, int attrlen, const void *data) { if (unlikely(skb_tailroom(skb) < NLA_ALIGN(attrlen))) return -EMSGSIZE; skb_put_data(skb, data, attrlen); return 0; } EXPORT_SYMBOL(nla_append); #endif |
| 3 1 1 1 1 1 1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 | // SPDX-License-Identifier: GPL-2.0-only /* * linux/kernel/power/user.c * * This file provides the user space interface for software suspend/resume. * * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> */ #include <linux/suspend.h> #include <linux/reboot.h> #include <linux/string.h> #include <linux/device.h> #include <linux/miscdevice.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/swapops.h> #include <linux/pm.h> #include <linux/fs.h> #include <linux/compat.h> #include <linux/console.h> #include <linux/cpu.h> #include <linux/freezer.h> #include <linux/uaccess.h> #include "power.h" static bool need_wait; static struct snapshot_data { struct snapshot_handle handle; int swap; int mode; bool frozen; bool ready; bool platform_support; bool free_bitmaps; dev_t dev; } snapshot_state; int is_hibernate_resume_dev(dev_t dev) { return hibernation_available() && snapshot_state.dev == dev; } static int snapshot_open(struct inode *inode, struct file *filp) { struct snapshot_data *data; unsigned int sleep_flags; int error; if (!hibernation_available()) return -EPERM; sleep_flags = lock_system_sleep(); if (!hibernate_acquire()) { error = -EBUSY; goto Unlock; } if ((filp->f_flags & O_ACCMODE) == O_RDWR) { hibernate_release(); error = -ENOSYS; goto Unlock; } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { /* Hibernating. The image device should be accessible. */ data->swap = swap_type_of(swsusp_resume_device, 0); data->mode = O_RDONLY; data->free_bitmaps = false; error = pm_notifier_call_chain_robust(PM_HIBERNATION_PREPARE, PM_POST_HIBERNATION); } else { /* * Resuming. We may need to wait for the image device to * appear. */ need_wait = true; data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain_robust(PM_RESTORE_PREPARE, PM_POST_RESTORE); if (!error) { error = create_basic_memory_bitmaps(); data->free_bitmaps = !error; } } if (error) hibernate_release(); data->frozen = false; data->ready = false; data->platform_support = false; data->dev = 0; Unlock: unlock_system_sleep(sleep_flags); return error; } static int snapshot_release(struct inode *inode, struct file *filp) { struct snapshot_data *data; unsigned int sleep_flags; sleep_flags = lock_system_sleep(); swsusp_free(); data = filp->private_data; data->dev = 0; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); free_basic_memory_bitmaps(); thaw_processes(); } else if (data->free_bitmaps) { free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); hibernate_release(); unlock_system_sleep(sleep_flags); return 0; } static ssize_t snapshot_read(struct file *filp, char __user *buf, size_t count, loff_t *offp) { loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; unsigned int sleep_flags; ssize_t res; sleep_flags = lock_system_sleep(); data = filp->private_data; if (!data->ready) { res = -ENODATA; goto Unlock; } if (!pg_offp) { /* on page boundary? */ res = snapshot_read_next(&data->handle); if (res <= 0) goto Unlock; } else { res = PAGE_SIZE - pg_offp; } res = simple_read_from_buffer(buf, count, &pg_offp, data_of(data->handle), res); if (res > 0) *offp += res; Unlock: unlock_system_sleep(sleep_flags); return res; } static ssize_t snapshot_write(struct file *filp, const char __user *buf, size_t count, loff_t *offp) { loff_t pg_offp = *offp & ~PAGE_MASK; struct snapshot_data *data; unsigned long sleep_flags; ssize_t res; if (need_wait) { wait_for_device_probe(); need_wait = false; } sleep_flags = lock_system_sleep(); data = filp->private_data; if (!pg_offp) { res = snapshot_write_next(&data->handle); if (res <= 0) goto unlock; } else { res = PAGE_SIZE; } if (!data_of(data->handle)) { res = -EINVAL; goto unlock; } res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, buf, count); if (res > 0) *offp += res; unlock: unlock_system_sleep(sleep_flags); return res; } struct compat_resume_swap_area { compat_loff_t offset; u32 dev; } __packed; static int snapshot_set_swap_area(struct snapshot_data *data, void __user *argp) { sector_t offset; dev_t swdev; if (swsusp_swap_in_use()) return -EPERM; if (in_compat_syscall()) { struct compat_resume_swap_area swap_area; if (copy_from_user(&swap_area, argp, sizeof(swap_area))) return -EFAULT; swdev = new_decode_dev(swap_area.dev); offset = swap_area.offset; } else { struct resume_swap_area swap_area; if (copy_from_user(&swap_area, argp, sizeof(swap_area))) return -EFAULT; swdev = new_decode_dev(swap_area.dev); offset = swap_area.offset; } /* * User space encodes device types as two-byte values, * so we need to recode them */ data->swap = swap_type_of(swdev, offset); if (data->swap < 0) return swdev ? -ENODEV : -EINVAL; data->dev = swdev; return 0; } static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = 0; struct snapshot_data *data; loff_t size; sector_t offset; if (need_wait) { wait_for_device_probe(); need_wait = false; } if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) return -ENOTTY; if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) return -ENOTTY; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&system_transition_mutex)) return -EBUSY; lock_device_hotplug(); data = filp->private_data; switch (cmd) { case SNAPSHOT_FREEZE: if (data->frozen) break; ksys_sync_helper(); error = freeze_processes(); if (error) break; error = create_basic_memory_bitmaps(); if (error) thaw_processes(); else data->frozen = true; break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); free_basic_memory_bitmaps(); data->free_bitmaps = false; thaw_processes(); data->frozen = false; break; case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; } pm_restore_gfp_mask(); error = hibernation_snapshot(data->platform_support); if (!error) { error = put_user(in_suspend, (int __user *)arg); data->ready = !freezer_test_done && !error; freezer_test_done = false; } break; case SNAPSHOT_ATOMIC_RESTORE: error = snapshot_write_finalize(&data->handle); if (error) break; if (data->mode != O_WRONLY || !data->frozen || !snapshot_image_loaded(&data->handle)) { error = -EPERM; break; } error = hibernation_restore(data->platform_support); break; case SNAPSHOT_FREE: swsusp_free(); memset(&data->handle, 0, sizeof(struct snapshot_handle)); data->ready = false; /* * It is necessary to thaw kernel threads here, because * SNAPSHOT_CREATE_IMAGE may be invoked directly after * SNAPSHOT_FREE. In that case, if kernel threads were not * thawed, the preallocation of memory carried out by * hibernation_snapshot() might run into problems (i.e. it * might fail or even deadlock). */ thaw_kernel_threads(); break; case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; case SNAPSHOT_GET_IMAGE_SIZE: if (!data->ready) { error = -ENODATA; break; } size = snapshot_get_image_size(); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; } offset = alloc_swapdev_block(data->swap); if (offset) { offset <<= PAGE_SHIFT; error = put_user(offset, (loff_t __user *)arg); } else { error = -ENOSPC; } break; case SNAPSHOT_FREE_SWAP_PAGES: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; } free_all_swap_pages(data->swap); break; case SNAPSHOT_S2RAM: if (!data->frozen) { error = -EPERM; break; } /* * Tasks are frozen and the notifiers have been called with * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); data->ready = false; break; case SNAPSHOT_PLATFORM_SUPPORT: data->platform_support = !!arg; break; case SNAPSHOT_POWER_OFF: if (data->platform_support) error = hibernation_platform_enter(); break; case SNAPSHOT_SET_SWAP_AREA: error = snapshot_set_swap_area(data, (void __user *)arg); break; default: error = -ENOTTY; } unlock_device_hotplug(); mutex_unlock(&system_transition_mutex); return error; } #ifdef CONFIG_COMPAT static long snapshot_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { BUILD_BUG_ON(sizeof(loff_t) != sizeof(compat_loff_t)); switch (cmd) { case SNAPSHOT_GET_IMAGE_SIZE: case SNAPSHOT_AVAIL_SWAP_SIZE: case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_CREATE_IMAGE: case SNAPSHOT_SET_SWAP_AREA: return snapshot_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); default: return snapshot_ioctl(file, cmd, arg); } } #endif /* CONFIG_COMPAT */ static const struct file_operations snapshot_fops = { .open = snapshot_open, .release = snapshot_release, .read = snapshot_read, .write = snapshot_write, .unlocked_ioctl = snapshot_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = snapshot_compat_ioctl, #endif }; static struct miscdevice snapshot_device = { .minor = SNAPSHOT_MINOR, .name = "snapshot", .fops = &snapshot_fops, }; static int __init snapshot_device_init(void) { return misc_register(&snapshot_device); }; device_initcall(snapshot_device_init); |
| 2 2 2 2 2 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _NET_NETDEV_LOCK_H #define _NET_NETDEV_LOCK_H #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/rtnetlink.h> static inline bool netdev_trylock(struct net_device *dev) { return mutex_trylock(&dev->lock); } static inline void netdev_assert_locked(const struct net_device *dev) { lockdep_assert_held(&dev->lock); } static inline void netdev_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_assert_locked(dev); } static inline bool netdev_need_ops_lock(const struct net_device *dev) { bool ret = dev->request_ops_lock || !!dev->queue_mgmt_ops; #if IS_ENABLED(CONFIG_NET_SHAPER) ret |= !!dev->netdev_ops->net_shaper_ops; #endif return ret; } static inline void netdev_lock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_lock(dev); } static inline void netdev_unlock_ops(struct net_device *dev) { if (netdev_need_ops_lock(dev)) netdev_unlock(dev); } static inline void netdev_ops_assert_locked(const struct net_device *dev) { if (netdev_need_ops_lock(dev)) lockdep_assert_held(&dev->lock); else ASSERT_RTNL(); } static inline void netdev_ops_assert_locked_or_invisible(const struct net_device *dev) { if (dev->reg_state == NETREG_REGISTERED || dev->reg_state == NETREG_UNREGISTERING) netdev_ops_assert_locked(dev); } static inline int netdev_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) { /* Only lower devices currently grab the instance lock, so no * real ordering issues can occur. In the near future, only * hardware devices will grab instance lock which also does not * involve any ordering. Suppress lockdep ordering warnings * until (if) we start grabbing instance lock on pure SW * devices (bond/team/veth/etc). */ if (a == b) return 0; return -1; } #define netdev_lockdep_set_classes(dev) \ { \ static struct lock_class_key qdisc_tx_busylock_key; \ static struct lock_class_key qdisc_xmit_lock_key; \ static struct lock_class_key dev_addr_list_lock_key; \ static struct lock_class_key dev_instance_lock_key; \ unsigned int i; \ \ (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ lockdep_set_class(&(dev)->addr_list_lock, \ &dev_addr_list_lock_key); \ lockdep_set_class(&(dev)->lock, \ &dev_instance_lock_key); \ lock_set_cmp_fn(&dev->lock, netdev_lock_cmp_fn, NULL); \ for (i = 0; i < (dev)->num_tx_queues; i++) \ lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ &qdisc_xmit_lock_key); \ } int netdev_debug_event(struct notifier_block *nb, unsigned long event, void *ptr); #endif |
| 2 2 99 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _BCACHEFS_BTREE_JOURNAL_ITER_H #define _BCACHEFS_BTREE_JOURNAL_ITER_H #include "bkey.h" struct journal_iter { struct list_head list; enum btree_id btree_id; unsigned level; size_t idx; struct journal_keys *keys; }; /* * Iterate over keys in the btree, with keys from the journal overlaid on top: */ struct btree_and_journal_iter { struct btree_trans *trans; struct btree *b; struct btree_node_iter node_iter; struct bkey unpacked; struct journal_iter journal; struct bpos pos; bool at_end; bool prefetch; bool fail_if_too_many_whiteouts; }; static inline int __journal_key_btree_cmp(enum btree_id l_btree_id, unsigned l_level, const struct journal_key *r) { return -cmp_int(l_level, r->level) ?: cmp_int(l_btree_id, r->btree_id); } static inline int __journal_key_cmp(enum btree_id l_btree_id, unsigned l_level, struct bpos l_pos, const struct journal_key *r) { return __journal_key_btree_cmp(l_btree_id, l_level, r) ?: bpos_cmp(l_pos, r->k->k.p); } static inline int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) { return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); } struct bkey_i *bch2_journal_keys_peek_max(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_prev_min(struct bch_fs *, enum btree_id, unsigned, struct bpos, struct bpos, size_t *); struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, unsigned, struct bpos); int bch2_btree_and_journal_iter_prefetch(struct btree_trans *, struct btree_path *, struct btree_and_journal_iter *); int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_insert(struct bch_fs *, enum btree_id, unsigned, struct bkey_i *); int bch2_journal_key_delete(struct bch_fs *, enum btree_id, unsigned, struct bpos); bool bch2_key_deleted_in_journal(struct btree_trans *, enum btree_id, unsigned, struct bpos); void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, unsigned, struct bpos); void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, struct btree_and_journal_iter *, struct btree *, struct btree_node_iter, struct bpos); void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *, struct btree_and_journal_iter *, struct btree *); void bch2_journal_keys_put(struct bch_fs *); static inline void bch2_journal_keys_put_initial(struct bch_fs *c) { if (c->journal_keys.initial_ref_held) bch2_journal_keys_put(c); c->journal_keys.initial_ref_held = false; } int bch2_journal_keys_sort(struct bch_fs *); void bch2_shoot_down_journal_keys(struct bch_fs *, enum btree_id, unsigned, unsigned, struct bpos, struct bpos); void bch2_journal_keys_dump(struct bch_fs *); void bch2_fs_journal_keys_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_JOURNAL_ITER_H */ |
| 7 51 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 | /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright (C) 2002, 2004, 2005 Oracle. All rights reserved. */ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H #include <linux/fs.h> int ocfs2_map_folio_blocks(struct folio *folio, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new); void ocfs2_unlock_and_free_folios(struct folio **folios, int num_folios); int walk_page_buffers( handle_t *handle, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, OCFS2_WRITE_DIRECT, OCFS2_WRITE_MMAP, } ocfs2_write_type_t; int ocfs2_write_begin_nolock(struct address_space *mapping, loff_t pos, unsigned len, ocfs2_write_type_t type, struct folio **foliop, void **fsdata, struct buffer_head *di_bh, struct folio *mmap_folio); int ocfs2_read_inline_data(struct inode *inode, struct folio *folio, struct buffer_head *di_bh); int ocfs2_size_fits_inline_data(struct buffer_head *di_bh, u64 new_size); int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); /* all ocfs2_dio_end_io()'s fault */ #define ocfs2_iocb_is_rw_locked(iocb) \ test_bit(0, (unsigned long *)&iocb->private) static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) { set_bit(0, (unsigned long *)&iocb->private); if (level) set_bit(1, (unsigned long *)&iocb->private); else clear_bit(1, (unsigned long *)&iocb->private); } /* * Using a named enum representing lock types in terms of #N bit stored in * iocb->private, which is going to be used for communication between * ocfs2_dio_end_io() and ocfs2_file_write/read_iter(). */ enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, OCFS2_IOCB_NUM_LOCKS }; #define ocfs2_iocb_init_rw_locked(iocb) \ (iocb->private = NULL) #define ocfs2_iocb_clear_rw_locked(iocb) \ clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) #endif /* OCFS2_FILE_H */ |
| 9884 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 | /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CPUFEATURE_H #define _ASM_X86_CPUFEATURE_H #include <asm/processor.h> #if defined(__KERNEL__) && !defined(__ASSEMBLER__) #include <asm/asm.h> #include <linux/bitops.h> #include <asm/alternative.h> #include <asm/cpufeaturemasks.h> enum cpuid_leafs { CPUID_1_EDX = 0, CPUID_8000_0001_EDX, CPUID_8086_0001_EDX, CPUID_LNX_1, CPUID_1_ECX, CPUID_C000_0001_EDX, CPUID_8000_0001_ECX, CPUID_LNX_2, CPUID_LNX_3, CPUID_7_0_EBX, CPUID_D_1_EAX, CPUID_LNX_4, CPUID_7_1_EAX, CPUID_8000_0008_EBX, CPUID_6_EAX, CPUID_8000_000A_EDX, CPUID_7_ECX, CPUID_8000_0007_EBX, CPUID_7_EDX, CPUID_8000_001F_EAX, CPUID_8000_0021_EAX, CPUID_LNX_5, NR_CPUID_WORDS, }; extern const char * const x86_cap_flags[NCAPINTS*32]; extern const char * const x86_power_flags[32]; /* * In order to save room, we index into this array by doing * X86_BUG_<name> - NCAPINTS*32. */ extern const char * const x86_bug_flags[NBUGINTS*32]; #define x86_bug_flag(flag) x86_bug_flags[flag] #define test_cpu_cap(c, bit) \ arch_test_bit(bit, (unsigned long *)((c)->x86_capability)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ test_cpu_cap(c, bit)) #define this_cpu_has(bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ x86_this_cpu_test_bit(bit, cpu_info.x86_capability)) /* * This is the default CPU features testing macro to use in code. * * It is for detection of features which need kernel infrastructure to be * used. It may *not* directly test the CPU itself. Use the cpu_has() family * if you want true runtime testing of CPU features, like in hypervisor code * where you are supporting a possible guest feature where host support for it * is not relevant. */ #define cpu_feature_enabled(bit) \ (__builtin_constant_p(bit) && DISABLED_MASK_BIT_SET(bit) ? 0 : static_cpu_has(bit)) #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) extern void setup_clear_cpu_cap(unsigned int bit); extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); void check_cpufeature_deps(struct cpuinfo_x86 *c); #define setup_force_cpu_cap(bit) do { \ \ if (!boot_cpu_has(bit)) \ WARN_ON(alternatives_patched); \ \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) #define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) /* * Do not use an "m" constraint for [cap_byte] here: gcc doesn't know * that this is only used on a fallback path and will sometimes cause * it to manifest the address of boot_cpu_data in a register, fouling * the mainline (post-initialization) code. */ static __always_inline bool _static_cpu_has(u16 bit) { asm goto(ALTERNATIVE_TERNARY("jmp 6f", %c[feature], "", "jmp %l[t_no]") ".pushsection .altinstr_aux,\"ax\"\n" "6:\n" " testb %[bitnum], %a[cap_byte]\n" " jnz %l[t_yes]\n" " jmp %l[t_no]\n" ".popsection\n" : : [feature] "i" (bit), [bitnum] "i" (1 << (bit & 7)), [cap_byte] "i" (&((const char *)boot_cpu_data.x86_capability)[bit >> 3]) : : t_yes, t_no); t_yes: return true; t_no: return false; } #define static_cpu_has(bit) \ ( \ __builtin_constant_p(boot_cpu_has(bit)) ? \ boot_cpu_has(bit) : \ _static_cpu_has(bit) \ ) #define cpu_has_bug(c, bit) cpu_has(c, (bit)) #define set_cpu_bug(c, bit) set_cpu_cap(c, (bit)) #define clear_cpu_bug(c, bit) clear_cpu_cap(c, (bit)) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) #define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has #define CPU_FEATURE_TYPEFMT "x86,ven%04Xfam%04Xmod%04X" #define CPU_FEATURE_TYPEVAL boot_cpu_data.x86_vendor, boot_cpu_data.x86, \ boot_cpu_data.x86_model #endif /* defined(__KERNEL__) && !defined(__ASSEMBLER__) */ #endif /* _ASM_X86_CPUFEATURE_H */ |
| 2 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 | // SPDX-License-Identifier: GPL-2.0-or-later /* * ip_vs_est.c: simple rate estimator for IPVS * * Authors: Wensong Zhang <wensong@linuxvirtualserver.org> * * Changes: Hans Schillstrom <hans.schillstrom@ericsson.com> * Network name space (netns) aware. * Global data moved to netns i.e struct netns_ipvs * Affected data: est_list and est_lock. * estimation_timer() runs with timer per netns. * get_stats()) do the per cpu summing. */ #define KMSG_COMPONENT "IPVS" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/kernel.h> #include <linux/jiffies.h> #include <linux/types.h> #include <linux/interrupt.h> #include <linux/sysctl.h> #include <linux/list.h> #include <linux/rcupdate_wait.h> #include <net/ip_vs.h> /* This code is to estimate rate in a shorter interval (such as 8 seconds) for virtual services and real servers. For measure rate in a long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W where W = 2^(-2) NOTES. * Average bps is scaled by 2^5, while average pps and cps are scaled by 2^10. * Netlink users can see 64-bit values but sockopt users are restricted to 32-bit values for conns, packets, bps, cps and pps. * A lot of code is taken from net/core/gen_estimator.c KEY POINTS: - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled - kthreads read the cpustats to update the estimators (svcs, dests, total) - the states of estimators can be read (get stats) or modified (zero stats) from processes KTHREADS: - estimators are added initially to est_temp_list and later kthread 0 distributes them to one or many kthreads for estimation - kthread contexts are created and attached to array - the kthread tasks are started when first service is added, before that the total stats are not estimated - when configuration (cpulist/nice) is changed, the tasks are restarted by work (est_reload_work) - kthread tasks are stopped while the cpulist is empty - the kthread context holds lists with estimators (chains) which are processed every 2 seconds - as estimators can be added dynamically and in bursts, we try to spread them to multiple chains which are estimated at different time - on start, kthread 0 enters calculation phase to determine the chain limits and the limit of estimators per kthread - est_add_ktid: ktid where to add new ests, can point to empty slot where we should add kt data */ static struct lock_class_key __ipvs_est_key; static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; hlist_for_each_entry_rcu(e, chain, list) { u64 conns, inpkts, outpkts, inbytes, outbytes; u64 kconns = 0, kinpkts = 0, koutpkts = 0; u64 kinbytes = 0, koutbytes = 0; unsigned int start; int i; if (kthread_should_stop()) break; s = container_of(e, struct ip_vs_stats, est); for_each_possible_cpu(i) { c = per_cpu_ptr(s->cpustats, i); do { start = u64_stats_fetch_begin(&c->syncp); conns = u64_stats_read(&c->cnt.conns); inpkts = u64_stats_read(&c->cnt.inpkts); outpkts = u64_stats_read(&c->cnt.outpkts); inbytes = u64_stats_read(&c->cnt.inbytes); outbytes = u64_stats_read(&c->cnt.outbytes); } while (u64_stats_fetch_retry(&c->syncp, start)); kconns += conns; kinpkts += inpkts; koutpkts += outpkts; kinbytes += inbytes; koutbytes += outbytes; } spin_lock(&s->lock); s->kstats.conns = kconns; s->kstats.inpkts = kinpkts; s->kstats.outpkts = koutpkts; s->kstats.inbytes = kinbytes; s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; e->last_conns = s->kstats.conns; e->cps += ((s64)rate - (s64)e->cps) >> 2; rate = (s->kstats.inpkts - e->last_inpkts) << 9; e->last_inpkts = s->kstats.inpkts; e->inpps += ((s64)rate - (s64)e->inpps) >> 2; rate = (s->kstats.outpkts - e->last_outpkts) << 9; e->last_outpkts = s->kstats.outpkts; e->outpps += ((s64)rate - (s64)e->outpps) >> 2; /* scaled by 2^5, but divided 2 seconds */ rate = (s->kstats.inbytes - e->last_inbytes) << 4; e->last_inbytes = s->kstats.inbytes; e->inbps += ((s64)rate - (s64)e->inbps) >> 2; rate = (s->kstats.outbytes - e->last_outbytes) << 4; e->last_outbytes = s->kstats.outbytes; e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } } static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) { struct ip_vs_est_tick_data *td; int cid; rcu_read_lock(); td = rcu_dereference(kd->ticks[row]); if (!td) goto out; for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { if (kthread_should_stop()) break; ip_vs_chain_estimation(&td->chains[cid]); cond_resched_rcu(); td = rcu_dereference(kd->ticks[row]); if (!td) break; } out: rcu_read_unlock(); } static int ip_vs_estimation_kthread(void *data) { struct ip_vs_est_kt_data *kd = data; struct netns_ipvs *ipvs = kd->ipvs; int row = kd->est_row; unsigned long now; int id = kd->id; long gap; if (id > 0) { if (!ipvs->est_chain_max) return 0; } else { if (!ipvs->est_chain_max) { ipvs->est_calc_phase = 1; /* commit est_calc_phase before reading est_genid */ smp_mb(); } /* kthread 0 will handle the calc phase */ if (ipvs->est_calc_phase) ip_vs_est_calc_phase(ipvs); } while (1) { if (!id && !hlist_empty(&ipvs->est_temp_list)) ip_vs_est_drain_temp_list(ipvs); set_current_state(TASK_IDLE); if (kthread_should_stop()) break; /* before estimation, check if we should sleep */ now = jiffies; gap = kd->est_timer - now; if (gap > 0) { if (gap > IPVS_EST_TICK) { kd->est_timer = now - IPVS_EST_TICK; gap = IPVS_EST_TICK; } schedule_timeout(gap); } else { __set_current_state(TASK_RUNNING); if (gap < -8 * IPVS_EST_TICK) kd->est_timer = now; } if (kd->tick_len[row]) ip_vs_tick_estimation(kd, row); row++; if (row >= IPVS_EST_NTICKS) row = 0; WRITE_ONCE(kd->est_row, row); kd->est_timer += IPVS_EST_TICK; } __set_current_state(TASK_RUNNING); return 0; } /* Schedule stop/start for kthread tasks */ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ if (!ipvs->enable) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); } /* Start kthread task with current configuration */ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { unsigned long now; int ret = 0; long gap; lockdep_assert_held(&ipvs->est_mutex); if (kd->task) goto out; now = jiffies; gap = kd->est_timer - now; /* Sync est_timer if task is starting later */ if (abs(gap) > 4 * IPVS_EST_TICK) kd->est_timer = now; kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", ipvs->gen, kd->id); if (IS_ERR(kd->task)) { ret = PTR_ERR(kd->task); kd->task = NULL; goto out; } set_user_nice(kd->task, sysctl_est_nice(ipvs)); set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); out: return ret; } void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) { if (kd->task) { pr_info("stopping estimator thread %d...\n", kd->id); kthread_stop(kd->task); kd->task = NULL; } } /* Apply parameters to kthread */ static void ip_vs_est_set_params(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd) { kd->chain_max = ipvs->est_chain_max; /* We are using single chain on RCU preemption */ if (IPVS_EST_TICK_CHAINS == 1) kd->chain_max *= IPVS_EST_CHAIN_FACTOR; kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; } /* Create and start estimation kthread in a free or new array slot */ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) { struct ip_vs_est_kt_data *kd = NULL; int id = ipvs->est_kt_count; int ret = -ENOMEM; void *arr = NULL; int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && ipvs->enable && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); for (i = 0; i < id; i++) { if (!ipvs->est_kt_arr[i]) break; } if (i >= id) { arr = krealloc_array(ipvs->est_kt_arr, id + 1, sizeof(struct ip_vs_est_kt_data *), GFP_KERNEL); if (!arr) goto out; ipvs->est_kt_arr = arr; } else { id = i; } kd = kzalloc(sizeof(*kd), GFP_KERNEL); if (!kd) goto out; kd->ipvs = ipvs; bitmap_fill(kd->avail, IPVS_EST_NTICKS); kd->est_timer = jiffies; kd->id = id; ip_vs_est_set_params(ipvs, kd); /* Pre-allocate stats used in calc phase */ if (!id && !kd->calc_stats) { kd->calc_stats = ip_vs_stats_alloc(); if (!kd->calc_stats) goto out; } /* Start kthread tasks only when services are present */ if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; } if (arr) ipvs->est_kt_count++; ipvs->est_kt_arr[id] = kd; kd = NULL; /* Use most recent kthread for new ests */ ipvs->est_add_ktid = id; ret = 0; out: mutex_unlock(&ipvs->est_mutex); if (kd) { ip_vs_stats_free(kd->calc_stats); kfree(kd); } return ret; } /* Select ktid where to add new ests: available, unused or new slot */ static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) { int ktid, best = ipvs->est_kt_count; struct ip_vs_est_kt_data *kd; for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { kd = ipvs->est_kt_arr[ktid]; if (kd) { if (kd->est_count < kd->est_max_count) { best = ktid; break; } } else if (ktid < best) { best = ktid; } } ipvs->est_add_ktid = best; } /* Add estimator to current kthread (est_add_ktid) */ static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, struct ip_vs_estimator *est) { struct ip_vs_est_kt_data *kd = NULL; struct ip_vs_est_tick_data *td; int ktid, row, crow, cid, ret; int delay = est->ktrow; BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, "Too many chains for ktcid"); if (ipvs->est_add_ktid < ipvs->est_kt_count) { kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; if (kd) goto add_est; } ret = ip_vs_est_add_kthread(ipvs); if (ret < 0) goto out; kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; add_est: ktid = kd->id; /* For small number of estimators prefer to use few ticks, * otherwise try to add into the last estimated row. * est_row and add_row point after the row we should use */ if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) crow = READ_ONCE(kd->est_row); else crow = kd->add_row; crow += delay; if (crow >= IPVS_EST_NTICKS) crow -= IPVS_EST_NTICKS; /* Assume initial delay ? */ if (delay >= IPVS_EST_NTICKS - 1) { /* Preserve initial delay or decrease it if no space in tick */ row = crow; if (crow < IPVS_EST_NTICKS - 1) { crow++; row = find_last_bit(kd->avail, crow); } if (row >= crow) row = find_last_bit(kd->avail, IPVS_EST_NTICKS); } else { /* Preserve delay or increase it if no space in tick */ row = IPVS_EST_NTICKS; if (crow > 0) row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); if (row >= IPVS_EST_NTICKS) row = find_first_bit(kd->avail, IPVS_EST_NTICKS); } td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) { td = kzalloc(sizeof(*td), GFP_KERNEL); if (!td) { ret = -ENOMEM; goto out; } rcu_assign_pointer(kd->ticks[row], td); } cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); kd->est_count++; kd->tick_len[row]++; if (!td->chain_len[cid]) __set_bit(cid, td->present); td->chain_len[cid]++; est->ktid = ktid; est->ktrow = row; est->ktcid = cid; hlist_add_head_rcu(&est->list, &td->chains[cid]); if (td->chain_len[cid] >= kd->chain_max) { __set_bit(cid, td->full); if (kd->tick_len[row] >= kd->tick_max) __clear_bit(row, kd->avail); } /* Update est_add_ktid to point to first available/empty kt slot */ if (kd->est_count == kd->est_max_count) ip_vs_est_update_ktid(ipvs); ret = 0; out: return ret; } /* Start estimation for stats */ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; int ret; if (!ipvs->est_max_threads && ipvs->enable) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ /* We prefer this code to be short, kthread 0 will requeue the * estimator to available chain. If tasks are disabled, we * will not allocate much memory, just for kt 0. */ ret = 0; if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) ret = ip_vs_est_add_kthread(ipvs); if (ret >= 0) hlist_add_head(&est->list, &ipvs->est_temp_list); else INIT_HLIST_NODE(&est->list); return ret; } static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) { if (kd) { if (kd->task) { pr_info("stop unused estimator thread %d...\n", kd->id); kthread_stop(kd->task); } ip_vs_stats_free(kd->calc_stats); kfree(kd); } } /* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; int ktid = est->ktid; int row = est->ktrow; int cid = est->ktcid; /* Failed to add to chain ? */ if (hlist_unhashed(&est->list)) return; /* On return, estimator can be freed, dequeue it now */ /* In est_temp_list ? */ if (ktid < 0) { hlist_del(&est->list); goto end_kt0; } hlist_del_rcu(&est->list); kd = ipvs->est_kt_arr[ktid]; td = rcu_dereference_protected(kd->ticks[row], 1); __clear_bit(cid, td->full); td->chain_len[cid]--; if (!td->chain_len[cid]) __clear_bit(cid, td->present); kd->tick_len[row]--; __set_bit(row, kd->avail); if (!kd->tick_len[row]) { RCU_INIT_POINTER(kd->ticks[row], NULL); kfree_rcu(td, rcu_head); } kd->est_count--; if (kd->est_count) { /* This kt slot can become available just now, prefer it */ if (ktid < ipvs->est_add_ktid) ipvs->est_add_ktid = ktid; return; } if (ktid > 0) { mutex_lock(&ipvs->est_mutex); ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[ktid] = NULL; if (ktid == ipvs->est_kt_count - 1) { ipvs->est_kt_count--; while (ipvs->est_kt_count > 1 && !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) ipvs->est_kt_count--; } mutex_unlock(&ipvs->est_mutex); /* This slot is now empty, prefer another available kt slot */ if (ktid == ipvs->est_add_ktid) ip_vs_est_update_ktid(ipvs); } end_kt0: /* kt 0 is freed after all other kthreads and chains are empty */ if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { kd = ipvs->est_kt_arr[0]; if (!kd || !kd->est_count) { mutex_lock(&ipvs->est_mutex); if (kd) { ip_vs_est_kthread_destroy(kd); ipvs->est_kt_arr[0] = NULL; } ipvs->est_kt_count--; mutex_unlock(&ipvs->est_mutex); ipvs->est_add_ktid = 0; } } } /* Register all ests from est_temp_list to kthreads */ static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) { struct ip_vs_estimator *est; while (1) { int max = 16; mutex_lock(&__ip_vs_mutex); while (max-- > 0) { est = hlist_entry_safe(ipvs->est_temp_list.first, struct ip_vs_estimator, list); if (est) { if (kthread_should_stop()) goto unlock; hlist_del_init(&est->list); if (ip_vs_enqueue_estimator(ipvs, est) >= 0) continue; est->ktid = -1; hlist_add_head(&est->list, &ipvs->est_temp_list); /* Abort, some entries will not be estimated * until next attempt */ } goto unlock; } mutex_unlock(&__ip_vs_mutex); cond_resched(); } unlock: mutex_unlock(&__ip_vs_mutex); } /* Calculate limits for all kthreads */ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); struct ip_vs_est_kt_data *kd; struct hlist_head chain; struct ip_vs_stats *s; int cache_factor = 4; int i, loops, ntest; s32 min_est = 0; ktime_t t1, t2; int max = 8; int ret = 1; s64 diff; u64 val; INIT_HLIST_HEAD(&chain); mutex_lock(&__ip_vs_mutex); kd = ipvs->est_kt_arr[0]; mutex_unlock(&__ip_vs_mutex); s = kd ? kd->calc_stats : NULL; if (!s) goto out; hlist_add_head(&s->est.list, &chain); loops = 1; /* Get best result from many tests */ for (ntest = 0; ntest < 12; ntest++) { if (!(ntest & 3)) { /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); if (!ipvs->enable || kthread_should_stop()) goto stop; } local_bh_disable(); rcu_read_lock(); /* Put stats in cache */ ip_vs_chain_estimation(&chain); t1 = ktime_get(); for (i = loops * cache_factor; i > 0; i--) ip_vs_chain_estimation(&chain); t2 = ktime_get(); rcu_read_unlock(); local_bh_enable(); if (!ipvs->enable || kthread_should_stop()) goto stop; cond_resched(); diff = ktime_to_ns(ktime_sub(t2, t1)); if (diff <= 1 * NSEC_PER_USEC) { /* Do more loops on low time resolution */ loops *= 2; continue; } if (diff >= NSEC_PER_SEC) continue; val = diff; do_div(val, loops); if (!min_est || val < min_est) { min_est = val; /* goal: 95usec per chain */ val = 95 * NSEC_PER_USEC; if (val >= min_est) { do_div(val, min_est); max = (int)val; } else { max = 1; } } } out: if (s) hlist_del_init(&s->est.list); *chain_max = max; return ret; stop: ret = 0; goto out; } /* Calculate the parameters and apply them in context of kt #0 * ECP: est_calc_phase * ECM: est_chain_max * ECP ECM Insert Chain enable Description * --------------------------------------------------------------------------- * 0 0 est_temp_list 0 create kt #0 context * 0 0 est_temp_list 0->1 service added, start kthread #0 task * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase * 1 0 est_temp_list 1 kt #0: determine est_chain_max, * stop tasks, move ests to est_temp_list * and free kd for kthreads 1..last * 1->0 0->N kt chains 1 ests can go to kthreads * 0 N kt chains 1 drain est_temp_list, create new kthread * contexts, start tasks, estimate */ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) { int genid = atomic_read(&ipvs->est_genid); struct ip_vs_est_tick_data *td; struct ip_vs_est_kt_data *kd; struct ip_vs_estimator *est; struct ip_vs_stats *stats; int id, row, cid, delay; bool last, last_td; int chain_max; int step; if (!ip_vs_est_calc_limits(ipvs, &chain_max)) return; mutex_lock(&__ip_vs_mutex); /* Stop all other tasks, so that we can immediately move the * estimators to est_temp_list without RCU grace period */ mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ if (!ipvs->enable) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) continue; ip_vs_est_kthread_stop(kd); } mutex_unlock(&ipvs->est_mutex); /* Move all estimators to est_temp_list but carefully, * all estimators and kthread data can be released while * we reschedule. Even for kthread 0. */ step = 0; /* Order entries in est_temp_list in ascending delay, so now * walk delay(desc), id(desc), cid(asc) */ delay = IPVS_EST_NTICKS; next_delay: delay--; if (delay < 0) goto end_dequeue; last_kt: /* Destroy contexts backwards */ id = ipvs->est_kt_count; next_kt: if (!ipvs->enable || kthread_should_stop()) goto unlock; id--; if (id < 0) goto next_delay; kd = ipvs->est_kt_arr[id]; if (!kd) goto next_kt; /* kt 0 can exist with empty chains */ if (!id && kd->est_count <= 1) goto next_delay; row = kd->est_row + delay; if (row >= IPVS_EST_NTICKS) row -= IPVS_EST_NTICKS; td = rcu_dereference_protected(kd->ticks[row], 1); if (!td) goto next_kt; cid = 0; walk_chain: if (kthread_should_stop()) goto unlock; step++; if (!(step & 63)) { /* Give chance estimators to be added (to est_temp_list) * and deleted (releasing kthread contexts) */ mutex_unlock(&__ip_vs_mutex); cond_resched(); mutex_lock(&__ip_vs_mutex); /* Current kt released ? */ if (id >= ipvs->est_kt_count) goto last_kt; if (kd != ipvs->est_kt_arr[id]) goto next_kt; /* Current td released ? */ if (td != rcu_dereference_protected(kd->ticks[row], 1)) goto next_kt; /* No fatal changes on the current kd and td */ } est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, list); if (!est) { cid++; if (cid >= IPVS_EST_TICK_CHAINS) goto next_kt; goto walk_chain; } /* We can cheat and increase est_count to protect kt 0 context * from release but we prefer to keep the last estimator */ last = kd->est_count <= 1; /* Do not free kt #0 data */ if (!id && last) goto next_delay; last_td = kd->tick_len[row] <= 1; stats = container_of(est, struct ip_vs_stats, est); ip_vs_stop_estimator(ipvs, stats); /* Tasks are stopped, move without RCU grace period */ est->ktid = -1; est->ktrow = row - kd->est_row; if (est->ktrow < 0) est->ktrow += IPVS_EST_NTICKS; hlist_add_head(&est->list, &ipvs->est_temp_list); /* kd freed ? */ if (last) goto next_kt; /* td freed ? */ if (last_td) goto next_kt; goto walk_chain; end_dequeue: /* All estimators removed while calculating ? */ if (!ipvs->est_kt_count) goto unlock; kd = ipvs->est_kt_arr[0]; if (!kd) goto unlock; kd->add_row = kd->est_row; ipvs->est_chain_max = chain_max; ip_vs_est_set_params(ipvs, kd); pr_info("using max %d ests per chain, %d per kthread\n", kd->chain_max, kd->est_max_count); /* Try to keep tot_stats in kt0, enqueue it early */ if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && ipvs->tot_stats->s.est.ktid == -1) { hlist_del(&ipvs->tot_stats->s.est.list); hlist_add_head(&ipvs->tot_stats->s.est.list, &ipvs->est_temp_list); } mutex_lock(&ipvs->est_mutex); /* We completed the calc phase, new calc phase not requested */ if (genid == atomic_read(&ipvs->est_genid)) ipvs->est_calc_phase = 0; unlock2: mutex_unlock(&ipvs->est_mutex); unlock: mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; struct ip_vs_kstats *k = &stats->kstats; /* reset counters, caller must hold the stats->lock lock */ est->last_inbytes = k->inbytes; est->last_outbytes = k->outbytes; est->last_conns = k->conns; est->last_inpkts = k->inpkts; est->last_outpkts = k->outpkts; est->cps = 0; est->inpps = 0; est->outpps = 0; est->inbps = 0; est->outbps = 0; } /* Get decoded rates */ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) { struct ip_vs_estimator *e = &stats->est; dst->cps = (e->cps + 0x1FF) >> 10; dst->inpps = (e->inpps + 0x1FF) >> 10; dst->outpps = (e->outpps + 0x1FF) >> 10; dst->inbps = (e->inbps + 0xF) >> 5; dst->outbps = (e->outbps + 0xF) >> 5; } int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { INIT_HLIST_HEAD(&ipvs->est_temp_list); ipvs->est_kt_arr = NULL; ipvs->est_max_threads = 0; ipvs->est_calc_phase = 0; ipvs->est_chain_max = 0; ipvs->est_kt_count = 0; ipvs->est_add_ktid = 0; atomic_set(&ipvs->est_genid, 0); atomic_set(&ipvs->est_genid_done, 0); __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { int i; for (i = 0; i < ipvs->est_kt_count; i++) ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); kfree(ipvs->est_kt_arr); mutex_destroy(&ipvs->est_mutex); } |
| 2 2 2 2 2 2 4 4 4 4 4 4 4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 | // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PACKET - implements raw packet sockets. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Alan Cox, <gw4pts@gw4pts.ampr.org> * * Fixes: * Alan Cox : verify_area() now used correctly * Alan Cox : new skbuff lists, look ma no backlogs! * Alan Cox : tidied skbuff lists. * Alan Cox : Now uses generic datagram routines I * added. Also fixed the peek/read crash * from all old Linux datagram code. * Alan Cox : Uses the improved datagram code. * Alan Cox : Added NULL's for socket options. * Alan Cox : Re-commented the code. * Alan Cox : Use new kernel side addressing * Rob Janssen : Correct MTU usage. * Dave Platt : Counter leaks caused by incorrect * interrupt locking and some slightly * dubious gcc output. Can you read * compiler: it said _VOLATILE_ * Richard Kooijman : Timestamp fixes. * Alan Cox : New buffers. Use sk->mac.raw. * Alan Cox : sendmsg/recvmsg support. * Alan Cox : Protocol setting support * Alexey Kuznetsov : Untied from IPv4 stack. * Cyrus Durgin : Fixed kerneld for kmod. * Michal Ostrowski : Module initialization cleanup. * Ulises Alonso : Frame number limit removal and * packet_set_ring memory leak. * Eric Biederman : Allow for > 8 byte hardware addresses. * The convention is that longer addresses * will simply extend the hardware address * byte arrays at the end of sockaddr_ll * and packet_mreq. * Johann Baudy : Added TX RING. * Chetan Loke : Implemented TPACKET_V3 block abstraction * layer. * Copyright (C) 2011, <lokec@ccs.neu.edu> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/ethtool.h> #include <linux/filter.h> #include <linux/types.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/if_packet.h> #include <linux/wireless.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/slab.h> #include <linux/vmalloc.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> #include <linux/skbuff.h> #include <net/sock.h> #include <linux/errno.h> #include <linux/timer.h> #include <linux/uaccess.h> #include <asm/ioctls.h> #include <asm/page.h> #include <asm/cacheflush.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mutex.h> #include <linux/if_vlan.h> #include <linux/virtio_net.h> #include <linux/errqueue.h> #include <linux/net_tstamp.h> #include <linux/percpu.h> #ifdef CONFIG_INET #include <net/inet_common.h> #endif #include <linux/bpf.h> #include <net/compat.h> #include <linux/netfilter_netdev.h> #include "internal.h" /* Assumptions: - If the device has no dev->header_ops->create, there is no LL header visible above the device. In this case, its hard_header_len should be 0. The device may prepend its own header internally. In this case, its needed_headroom should be set to the space needed for it to add its internal header. For example, a WiFi driver pretending to be an Ethernet driver should set its hard_header_len to be the Ethernet header length, and set its needed_headroom to be (the real WiFi header length - the fake Ethernet header length). - packet socket receives packets with pulled ll header, so that SOCK_RAW should push it back. On receive: ----------- Incoming, dev_has_header(dev) == true mac_header -> ll header data -> data Outgoing, dev_has_header(dev) == true mac_header -> ll header data -> ll header Incoming, dev_has_header(dev) == false mac_header -> data However drivers often make it point to the ll header. This is incorrect because the ll header should be invisible to us. data -> data Outgoing, dev_has_header(dev) == false mac_header -> data. ll header is invisible to us. data -> data Resume If dev_has_header(dev) == false we are unable to restore the ll header, because it is invisible to us. On transmit: ------------ dev_has_header(dev) == true mac_header -> ll header data -> ll header dev_has_header(dev) == false (ll header is invisible to us) mac_header -> data data -> data We should set network_header on output to the correct position, packet classifier depends on it. */ /* Private packet socket structures. */ /* identical to struct packet_mreq except it has * a longer address field. */ struct packet_mreq_max { int mr_ifindex; unsigned short mr_type; unsigned short mr_alen; unsigned char mr_address[MAX_ADDR_LEN]; }; union tpacket_uhdr { struct tpacket_hdr *h1; struct tpacket2_hdr *h2; struct tpacket3_hdr *h3; void *raw; }; static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u, int closing, int tx_ring); #define V3_ALIGNMENT (8) #define BLK_HDR_LEN (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT)) #define BLK_PLUS_PRIV(sz_of_priv) \ (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT)) #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status) #define BLOCK_NUM_PKTS(x) ((x)->hdr.bh1.num_pkts) #define BLOCK_O2FP(x) ((x)->hdr.bh1.offset_to_first_pkt) #define BLOCK_LEN(x) ((x)->hdr.bh1.blk_len) #define BLOCK_SNUM(x) ((x)->hdr.bh1.seq_num) #define BLOCK_O2PRIV(x) ((x)->offset_to_priv) struct packet_sock; static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status); static void packet_increment_head(struct packet_ring_buffer *buff); static int prb_curr_blk_in_use(struct tpacket_block_desc *); static void *prb_dispatch_next_block(struct tpacket_kbdq_core *, struct packet_sock *); static void prb_retire_current_block(struct tpacket_kbdq_core *, struct packet_sock *, unsigned int status); static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); static void prb_retire_rx_blk_timer_expired(struct timer_list *); static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_fill_vlan_info(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void packet_flush_mclist(struct sock *sk); static u16 packet_pick_tx_queue(struct sk_buff *skb); struct packet_skb_cb { union { struct sockaddr_pkt pkt; union { /* Trick: alias skb original length with * ll.sll_family and ll.protocol in order * to save room. */ unsigned int origlen; struct sockaddr_ll ll; }; } sa; }; #define vio_le() virtio_legacy_is_little_endian() #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb)) #define GET_PBDQC_FROM_RB(x) ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc)) #define GET_PBLOCK_DESC(x, bid) \ ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer)) #define GET_CURR_PBLOCK_DESC_FROM_CORE(x) \ ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer)) #define GET_NEXT_PRB_BLK_NUM(x) \ (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \ ((x)->kactive_blk_num+1) : 0) static void __fanout_unlink(struct sock *sk, struct packet_sock *po); static void __fanout_link(struct sock *sk, struct packet_sock *po); #ifdef CONFIG_NETFILTER_EGRESS static noinline struct sk_buff *nf_hook_direct_egress(struct sk_buff *skb) { struct sk_buff *next, *head = NULL, *tail; int rc; rcu_read_lock(); for (; skb != NULL; skb = next) { next = skb->next; skb_mark_not_on_list(skb); if (!nf_hook_egress(skb, &rc, skb->dev)) continue; if (!head) head = skb; else tail->next = skb; tail = skb; } rcu_read_unlock(); return head; } #endif static int packet_xmit(const struct packet_sock *po, struct sk_buff *skb) { if (!packet_sock_flag(po, PACKET_SOCK_QDISC_BYPASS)) return dev_queue_xmit(skb); #ifdef CONFIG_NETFILTER_EGRESS if (nf_hook_egress_active()) { skb = nf_hook_direct_egress(skb); if (!skb) return NET_XMIT_DROP; } #endif return dev_direct_xmit(skb, packet_pick_tx_queue(skb)); } static struct net_device *packet_cached_dev_get(struct packet_sock *po) { struct net_device *dev; rcu_read_lock(); dev = rcu_dereference(po->cached_dev); dev_hold(dev); rcu_read_unlock(); return dev; } static void packet_cached_dev_assign(struct packet_sock *po, struct net_device *dev) { rcu_assign_pointer(po->cached_dev, dev); } static void packet_cached_dev_reset(struct packet_sock *po) { RCU_INIT_POINTER(po->cached_dev, NULL); } static u16 packet_pick_tx_queue(struct sk_buff *skb) { struct net_device *dev = skb->dev; const struct net_device_ops *ops = dev->netdev_ops; int cpu = raw_smp_processor_id(); u16 queue_index; #ifdef CONFIG_XPS skb->sender_cpu = cpu + 1; #endif skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues); if (ops->ndo_select_queue) { queue_index = ops->ndo_select_queue(dev, skb, NULL); queue_index = netdev_cap_txqueue(dev, queue_index); } else { queue_index = netdev_pick_tx(dev, skb, NULL); } return queue_index; } /* __register_prot_hook must be invoked through register_prot_hook * or from a context in which asynchronous accesses to the packet * socket is not possible (packet_create()). */ static void __register_prot_hook(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); if (!packet_sock_flag(po, PACKET_SOCK_RUNNING)) { if (po->fanout) __fanout_link(sk, po); else dev_add_pack(&po->prot_hook); sock_hold(sk); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 1); } } static void register_prot_hook(struct sock *sk) { lockdep_assert_held_once(&pkt_sk(sk)->bind_lock); __register_prot_hook(sk); } /* If the sync parameter is true, we will temporarily drop * the po->bind_lock and do a synchronize_net to make sure no * asynchronous packet processing paths still refer to the elements * of po->prot_hook. If the sync parameter is false, it is the * callers responsibility to take care of this. */ static void __unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); lockdep_assert_held_once(&po->bind_lock); packet_sock_flag_set(po, PACKET_SOCK_RUNNING, 0); if (po->fanout) __fanout_unlink(sk, po); else __dev_remove_pack(&po->prot_hook); __sock_put(sk); if (sync) { spin_unlock(&po->bind_lock); synchronize_net(); spin_lock(&po->bind_lock); } } static void unregister_prot_hook(struct sock *sk, bool sync) { struct packet_sock *po = pkt_sk(sk); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) __unregister_prot_hook(sk, sync); } static inline struct page * __pure pgv_to_page(void *addr) { if (is_vmalloc_addr(addr)) return vmalloc_to_page(addr); return virt_to_page(addr); } static void __packet_set_status(struct packet_sock *po, void *frame, int status) { union tpacket_uhdr h; /* WRITE_ONCE() are paired with READ_ONCE() in __packet_get_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: WRITE_ONCE(h.h1->tp_status, status); flush_dcache_page(pgv_to_page(&h.h1->tp_status)); break; case TPACKET_V2: WRITE_ONCE(h.h2->tp_status, status); flush_dcache_page(pgv_to_page(&h.h2->tp_status)); break; case TPACKET_V3: WRITE_ONCE(h.h3->tp_status, status); flush_dcache_page(pgv_to_page(&h.h3->tp_status)); break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } smp_wmb(); } static int __packet_get_status(const struct packet_sock *po, void *frame) { union tpacket_uhdr h; smp_rmb(); /* READ_ONCE() are paired with WRITE_ONCE() in __packet_set_status */ h.raw = frame; switch (po->tp_version) { case TPACKET_V1: flush_dcache_page(pgv_to_page(&h.h1->tp_status)); return READ_ONCE(h.h1->tp_status); case TPACKET_V2: flush_dcache_page(pgv_to_page(&h.h2->tp_status)); return READ_ONCE(h.h2->tp_status); case TPACKET_V3: flush_dcache_page(pgv_to_page(&h.h3->tp_status)); return READ_ONCE(h.h3->tp_status); default: WARN(1, "TPACKET version not supported.\n"); BUG(); return 0; } } static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts, unsigned int flags) { struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); if (shhwtstamps && (flags & SOF_TIMESTAMPING_RAW_HARDWARE) && ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts)) return TP_STATUS_TS_RAW_HARDWARE; if ((flags & SOF_TIMESTAMPING_SOFTWARE) && ktime_to_timespec64_cond(skb_tstamp(skb), ts)) return TP_STATUS_TS_SOFTWARE; return 0; } static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame, struct sk_buff *skb) { union tpacket_uhdr h; struct timespec64 ts; __u32 ts_status; if (!(ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp)))) return 0; h.raw = frame; /* * versions 1 through 3 overflow the timestamps in y2106, since they * all store the seconds in a 32-bit unsigned integer. * If we create a version 4, that should have a 64-bit timestamp, * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit * nanoseconds. */ switch (po->tp_version) { case TPACKET_V1: h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; break; case TPACKET_V2: h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; break; case TPACKET_V3: h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; break; default: WARN(1, "TPACKET version not supported.\n"); BUG(); } /* one flush is safe, as both fields always lie on the same cacheline */ flush_dcache_page(pgv_to_page(&h.h1->tp_sec)); smp_wmb(); return ts_status; } static void *packet_lookup_frame(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int position, int status) { unsigned int pg_vec_pos, frame_offset; union tpacket_uhdr h; pg_vec_pos = position / rb->frames_per_block; frame_offset = position % rb->frames_per_block; h.raw = rb->pg_vec[pg_vec_pos].buffer + (frame_offset * rb->frame_size); if (status != __packet_get_status(po, h.raw)) return NULL; return h.raw; } static void *packet_current_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { return packet_lookup_frame(po, rb, rb->head, status); } static u16 vlan_get_tci(const struct sk_buff *skb, struct net_device *dev) { struct vlan_hdr vhdr, *vh; unsigned int header_len; if (!dev) return 0; /* In the SOCK_DGRAM scenario, skb data starts at the network * protocol, which is after the VLAN headers. The outer VLAN * header is at the hard_header_len offset in non-variable * length link layer headers. If it's a VLAN device, the * min_header_len should be used to exclude the VLAN header * size. */ if (dev->min_header_len == dev->hard_header_len) header_len = dev->hard_header_len; else if (is_vlan_dev(dev)) header_len = dev->min_header_len; else return 0; vh = skb_header_pointer(skb, skb_mac_offset(skb) + header_len, sizeof(vhdr), &vhdr); if (unlikely(!vh)) return 0; return ntohs(vh->h_vlan_TCI); } static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) { __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) proto = __vlan_get_protocol_offset(skb, proto, skb_mac_offset(skb), NULL); return proto; } static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) { timer_delete_sync(&pkc->retire_blk_timer); } static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); spin_lock_bh(&rb_queue->lock); pkc->delete_blk_timer = 1; spin_unlock_bh(&rb_queue->lock); prb_del_retire_blk_timer(pkc); } static void prb_setup_retire_blk_timer(struct packet_sock *po) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, 0); pkc->retire_blk_timer.expires = jiffies; } static int prb_calc_retire_blk_tmo(struct packet_sock *po, int blk_size_in_bytes) { struct net_device *dev; unsigned int mbits, div; struct ethtool_link_ksettings ecmd; int err; rtnl_lock(); dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex); if (unlikely(!dev)) { rtnl_unlock(); return DEFAULT_PRB_RETIRE_TOV; } err = __ethtool_get_link_ksettings(dev, &ecmd); rtnl_unlock(); if (err) return DEFAULT_PRB_RETIRE_TOV; /* If the link speed is so slow you don't really * need to worry about perf anyways */ if (ecmd.base.speed < SPEED_1000 || ecmd.base.speed == SPEED_UNKNOWN) return DEFAULT_PRB_RETIRE_TOV; div = ecmd.base.speed / 1000; mbits = (blk_size_in_bytes * 8) / (1024 * 1024); if (div) mbits /= div; if (div) return mbits + 1; return mbits; } static void prb_init_ft_ops(struct tpacket_kbdq_core *p1, union tpacket_req_u *req_u) { p1->feature_req_word = req_u->req3.tp_feature_req_word; } static void init_prb_bdqc(struct packet_sock *po, struct packet_ring_buffer *rb, struct pgv *pg_vec, union tpacket_req_u *req_u) { struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd; memset(p1, 0x0, sizeof(*p1)); p1->knxt_seq_num = 1; p1->pkbdq = pg_vec; pbd = (struct tpacket_block_desc *)pg_vec[0].buffer; p1->pkblk_start = pg_vec[0].buffer; p1->kblk_size = req_u->req3.tp_block_size; p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; else p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, req_u->req3.tp_block_size); p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); prb_setup_retire_blk_timer(po); prb_open_block(p1, pbd); } /* Do NOT update the last_blk_num first. * Assumes sk_buff_head lock is held. */ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) { mod_timer(&pkc->retire_blk_timer, jiffies + pkc->tov_in_jiffies); pkc->last_kactive_blk_num = pkc->kactive_blk_num; } /* * Timer logic: * 1) We refresh the timer only when we open a block. * By doing this we don't waste cycles refreshing the timer * on packet-by-packet basis. * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * * So, if the user sets the 'tmo' to 10ms then the timer * will never fire while the block is still getting filled * (which is what we want). However, the user could choose * to close a block early and that's fine. * * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. * */ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) { struct packet_sock *po = from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer); struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring); unsigned int frozen; struct tpacket_block_desc *pbd; spin_lock(&po->sk.sk_receive_queue.lock); frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); if (unlikely(pkc->delete_blk_timer)) goto out; /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() * copy_bits() is in progress ... * timer fires on other cpu: * we can't retire the current block because copy_bits * is in progress. * */ if (BLOCK_NUM_PKTS(pbd)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { if (!frozen) { if (!BLOCK_NUM_PKTS(pbd)) { /* An empty block. Just refresh the timer. */ goto refresh_timer; } prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); if (!prb_dispatch_next_block(pkc, po)) goto refresh_timer; else goto out; } else { /* Case 1. Queue was frozen because user-space was * lagging behind. */ if (prb_curr_blk_in_use(pbd)) { /* * Ok, user-space is still behind. * So just refresh the timer. */ goto refresh_timer; } else { /* Case 2. queue was frozen,user-space caught up, * now the link went idle && the timer fired. * We don't have a block to close.So we open this * block and restart the timer. * opening a block thaws the queue,restarts timer * Thawing/timer-refresh is a side effect. */ prb_open_block(pkc, pbd); goto out; } } } refresh_timer: _prb_refresh_rx_retire_blk_timer(pkc); out: spin_unlock(&po->sk.sk_receive_queue.lock); } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, __u32 status) { /* Flush everything minus the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 u8 *start, *end; start = (u8 *)pbd1; /* Skip the block header(we know header WILL fit in 4K) */ start += PAGE_SIZE; end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end); for (; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif /* Now update the block status. */ BLOCK_STATUS(pbd1) = status; /* Flush the block header */ #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 start = (u8 *)pbd1; flush_dcache_page(pgv_to_page(start)); smp_wmb(); #endif } /* * Side effect: * * 1) flush the block * 2) Increment active_blk_num * * Note:We DONT refresh the timer on purpose. * Because almost always the next block will be opened. */ static void prb_close_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1, struct packet_sock *po, unsigned int stat) { __u32 status = TP_STATUS_USER | stat; struct tpacket3_hdr *last_pkt; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; struct sock *sk = &po->sk; if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; last_pkt = (struct tpacket3_hdr *)pkc1->prev; last_pkt->tp_next_offset = 0; /* Get the ts of the last pkt */ if (BLOCK_NUM_PKTS(pbd1)) { h1->ts_last_pkt.ts_sec = last_pkt->tp_sec; h1->ts_last_pkt.ts_nsec = last_pkt->tp_nsec; } else { /* Ok, we tmo'd - so get the current time. * * It shouldn't really happen as we don't close empty * blocks. See prb_retire_rx_blk_timer_expired(). */ struct timespec64 ts; ktime_get_real_ts64(&ts); h1->ts_last_pkt.ts_sec = ts.tv_sec; h1->ts_last_pkt.ts_nsec = ts.tv_nsec; } smp_wmb(); /* Flush the block */ prb_flush_block(pkc1, pbd1, status); sk->sk_data_ready(sk); pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1); } static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) { pkc->reset_pending_on_curr_blk = 0; } /* * Side effect of opening a block: * * 1) prb_queue is thawed. * 2) retire_blk_timer is refreshed. * */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) { struct timespec64 ts; struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1; smp_rmb(); /* We could have just memset this but we will lose the * flexibility of making the priv area sticky */ BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++; BLOCK_NUM_PKTS(pbd1) = 0; BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); ktime_get_real_ts64(&ts); h1->ts_first_pkt.ts_sec = ts.tv_sec; h1->ts_first_pkt.ts_nsec = ts.tv_nsec; pkc1->pkblk_start = (char *)pbd1; pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv); BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN; pbd1->version = pkc1->version; pkc1->prev = pkc1->nxt_offset; pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } /* * Queue freeze logic: * 1) Assume tp_block_nr = 8 blocks. * 2) At time 't0', user opens Rx ring. * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7 * 4) user-space is either sleeping or processing block '0'. * 5) tpacket_rcv is currently filling block '7', since there is no space left, * it will close block-7,loop around and try to fill block '0'. * call-flow: * __packet_lookup_frame_in_block * prb_retire_current_block() * prb_dispatch_next_block() * |->(BLOCK_STATUS == USER) evaluates to true * 5.1) Since block-0 is currently in-use, we just freeze the queue. * 6) Now there are two cases: * 6.1) Link goes idle right after the queue is frozen. * But remember, the last open_block() refreshed the timer. * When this timer expires,it will refresh itself so that we can * re-open block-0 in near future. * 6.2) Link is busy and keeps on receiving packets. This is a simple * case and __packet_lookup_frame_in_block will check if block-0 * is free and can now be re-used. */ static void prb_freeze_queue(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { pkc->reset_pending_on_curr_blk = 1; po->stats.stats3.tp_freeze_q_cnt++; } #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT)) /* * If the next block is free then we will dispatch it * and return a good offset. * Else, we will freeze the queue. * So, caller must check the return value. */ static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po) { struct tpacket_block_desc *pbd; smp_rmb(); /* 1. Get current block num */ pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* 2. If this block is currently in_use then freeze the queue */ if (TP_STATUS_USER & BLOCK_STATUS(pbd)) { prb_freeze_queue(pkc, po); return NULL; } /* * 3. * open this block and return the offset where the first packet * needs to get stored. */ prb_open_block(pkc, pbd); return (void *)pkc->nxt_offset; } static void prb_retire_current_block(struct tpacket_kbdq_core *pkc, struct packet_sock *po, unsigned int status) { struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* retire/close the current block */ if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) { /* * Plug the case where copy_bits() is in progress on * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't * have space to copy the pkt in the current block and * called prb_retire_current_block() * * We don't need to worry about the TMO case because * the timer-handler already handled this case. */ if (!(status & TP_STATUS_BLK_TMO)) { /* Waiting for skb_copy_bits to finish... */ write_lock(&pkc->blk_fill_in_prog_lock); write_unlock(&pkc->blk_fill_in_prog_lock); } prb_close_block(pkc, pbd, po, status); return; } } static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd) { return TP_STATUS_USER & BLOCK_STATUS(pbd); } static int prb_queue_frozen(struct tpacket_kbdq_core *pkc) { return pkc->reset_pending_on_curr_blk; } static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb) __releases(&pkc->blk_fill_in_prog_lock) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); read_unlock(&pkc->blk_fill_in_prog_lock); } static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb); } static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_rxhash = 0; } static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { struct packet_sock *po = container_of(pkc, struct packet_sock, rx_ring.prb_bdqc); if (skb_vlan_tag_present(pkc->skb)) { ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(po->sk.sk_type == SOCK_DGRAM && eth_type_vlan(pkc->skb->protocol))) { ppd->hv1.tp_vlan_tci = vlan_get_tci(pkc->skb, pkc->skb->dev); ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->protocol); ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { ppd->hv1.tp_vlan_tci = 0; ppd->hv1.tp_vlan_tpid = 0; ppd->tp_status = TP_STATUS_AVAILABLE; } } static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc, struct tpacket3_hdr *ppd) { ppd->hv1.tp_padding = 0; prb_fill_vlan_info(pkc, ppd); if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH) prb_fill_rxhash(pkc, ppd); else prb_clear_rxhash(pkc, ppd); } static void prb_fill_curr_block(char *curr, struct tpacket_kbdq_core *pkc, struct tpacket_block_desc *pbd, unsigned int len) __acquires(&pkc->blk_fill_in_prog_lock) { struct tpacket3_hdr *ppd; ppd = (struct tpacket3_hdr *)curr; ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len); pkc->prev = curr; pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len); BLOCK_NUM_PKTS(pbd) += 1; read_lock(&pkc->blk_fill_in_prog_lock); prb_run_all_ft_ops(pkc, ppd); } /* Assumes caller has the sk->rx_queue.lock */ static void *__packet_lookup_frame_in_block(struct packet_sock *po, struct sk_buff *skb, unsigned int len ) { struct tpacket_kbdq_core *pkc; struct tpacket_block_desc *pbd; char *curr, *end; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); /* Queue is frozen when user space is lagging behind */ if (prb_queue_frozen(pkc)) { /* * Check if that last block which caused the queue to freeze, * is still in_use by user-space. */ if (prb_curr_blk_in_use(pbd)) { /* Can't record this packet */ return NULL; } else { /* * Ok, the block was released by user-space. * Now let's open that block. * opening a block also thaws the queue. * Thawing is a side effect. */ prb_open_block(pkc, pbd); } } smp_mb(); curr = pkc->nxt_offset; pkc->skb = skb; end = (char *)pbd + pkc->kblk_size; /* first try the current block */ if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) { prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* Ok, close the current block */ prb_retire_current_block(pkc, po, 0); /* Now, try to dispatch the next block */ curr = (char *)prb_dispatch_next_block(pkc, po); if (curr) { pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); prb_fill_curr_block(curr, pkc, pbd, len); return (void *)curr; } /* * No free blocks are available.user_space hasn't caught up yet. * Queue was just frozen and now this packet will get dropped. */ return NULL; } static void *packet_current_rx_frame(struct packet_sock *po, struct sk_buff *skb, int status, unsigned int len) { char *curr = NULL; switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: curr = packet_lookup_frame(po, &po->rx_ring, po->rx_ring.head, status); return curr; case TPACKET_V3: return __packet_lookup_frame_in_block(po, skb, len); default: WARN(1, "TPACKET version not supported\n"); BUG(); return NULL; } } static void *prb_lookup_block(const struct packet_sock *po, const struct packet_ring_buffer *rb, unsigned int idx, int status) { struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb); struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx); if (status != BLOCK_STATUS(pbd)) return NULL; return pbd; } static int prb_previous_blk_num(struct packet_ring_buffer *rb) { unsigned int prev; if (rb->prb_bdqc.kactive_blk_num) prev = rb->prb_bdqc.kactive_blk_num-1; else prev = rb->prb_bdqc.knum_blocks-1; return prev; } /* Assumes caller has held the rx_queue.lock */ static void *__prb_previous_block(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = prb_previous_blk_num(rb); return prb_lookup_block(po, rb, previous, status); } static void *packet_previous_rx_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { if (po->tp_version <= TPACKET_V2) return packet_previous_frame(po, rb, status); return __prb_previous_block(po, rb, status); } static void packet_increment_rx_head(struct packet_sock *po, struct packet_ring_buffer *rb) { switch (po->tp_version) { case TPACKET_V1: case TPACKET_V2: return packet_increment_head(rb); case TPACKET_V3: default: WARN(1, "TPACKET version not supported.\n"); BUG(); return; } } static void *packet_previous_frame(struct packet_sock *po, struct packet_ring_buffer *rb, int status) { unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max; return packet_lookup_frame(po, rb, previous, status); } static void packet_increment_head(struct packet_ring_buffer *buff) { buff->head = buff->head != buff->frame_max ? buff->head+1 : 0; } static void packet_inc_pending(struct packet_ring_buffer *rb) { this_cpu_inc(*rb->pending_refcnt); } static void packet_dec_pending(struct packet_ring_buffer *rb) { this_cpu_dec(*rb->pending_refcnt); } static unsigned int packet_read_pending(const struct packet_ring_buffer *rb) { unsigned int refcnt = 0; int cpu; /* We don't use pending refcount in rx_ring. */ if (rb->pending_refcnt == NULL) return 0; for_each_possible_cpu(cpu) refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu); return refcnt; } static int packet_alloc_pending(struct packet_sock *po) { po->rx_ring.pending_refcnt = NULL; po->tx_ring.pending_refcnt = alloc_percpu(unsigned int); if (unlikely(po->tx_ring.pending_refcnt == NULL)) return -ENOBUFS; return 0; } static void packet_free_pending(struct packet_sock *po) { free_percpu(po->tx_ring.pending_refcnt); } #define ROOM_POW_OFF 2 #define ROOM_NONE 0x0 #define ROOM_LOW 0x1 #define ROOM_NORMAL 0x2 static bool __tpacket_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.frame_max) + 1; idx = READ_ONCE(po->rx_ring.head); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off) { int idx, len; len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks); idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num); if (pow_off) idx += len >> pow_off; if (idx >= len) idx -= len; return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL); } static int __packet_rcv_has_room(const struct packet_sock *po, const struct sk_buff *skb) { const struct sock *sk = &po->sk; int ret = ROOM_NONE; if (po->prot_hook.func != tpacket_rcv) { int rcvbuf = READ_ONCE(sk->sk_rcvbuf); int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc) - (skb ? skb->truesize : 0); if (avail > (rcvbuf >> ROOM_POW_OFF)) return ROOM_NORMAL; else if (avail > 0) return ROOM_LOW; else return ROOM_NONE; } if (po->tp_version == TPACKET_V3) { if (__tpacket_v3_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_v3_has_room(po, 0)) ret = ROOM_LOW; } else { if (__tpacket_has_room(po, ROOM_POW_OFF)) ret = ROOM_NORMAL; else if (__tpacket_has_room(po, 0)) ret = ROOM_LOW; } return ret; } static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb) { bool pressure; int ret; ret = __packet_rcv_has_room(po, skb); pressure = ret != ROOM_NORMAL; if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) != pressure) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, pressure); return ret; } static void packet_rcv_try_clear_pressure(struct packet_sock *po) { if (packet_sock_flag(po, PACKET_SOCK_PRESSURE) && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL) packet_sock_flag_set(po, PACKET_SOCK_PRESSURE, false); } static void packet_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_error_queue); WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive packet socket: %p\n", sk); return; } } static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) { u32 *history = po->rollover->history; u32 victim, rxhash; int i, count = 0; rxhash = skb_get_hash(skb); for (i = 0; i < ROLLOVER_HLEN; i++) if (READ_ONCE(history[i]) == rxhash) count++; victim = get_random_u32_below(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) WRITE_ONCE(history[victim], rxhash); return count > (ROLLOVER_HLEN >> 1); } static unsigned int fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return reciprocal_scale(__skb_get_hash_symmetric(skb), num); } static unsigned int fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { unsigned int val = atomic_inc_return(&f->rr_cur); return val % num; } static unsigned int fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return smp_processor_id() % num; } static unsigned int fanout_demux_rnd(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return get_random_u32_below(num); } static unsigned int fanout_demux_rollover(struct packet_fanout *f, struct sk_buff *skb, unsigned int idx, bool try_self, unsigned int num) { struct packet_sock *po, *po_next, *po_skip = NULL; unsigned int i, j, room = ROOM_NONE; po = pkt_sk(rcu_dereference(f->arr[idx])); if (try_self) { room = packet_rcv_has_room(po, skb); if (room == ROOM_NORMAL || (room == ROOM_LOW && !fanout_flow_is_huge(po, skb))) return idx; po_skip = po; } i = j = min_t(int, po->rollover->sock, num - 1); do { po_next = pkt_sk(rcu_dereference(f->arr[i])); if (po_next != po_skip && !packet_sock_flag(po_next, PACKET_SOCK_PRESSURE) && packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) { if (i != j) po->rollover->sock = i; atomic_long_inc(&po->rollover->num); if (room == ROOM_LOW) atomic_long_inc(&po->rollover->num_huge); return i; } if (++i == num) i = 0; } while (i != j); atomic_long_inc(&po->rollover->num_failed); return idx; } static unsigned int fanout_demux_qm(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { return skb_get_queue_mapping(skb) % num; } static unsigned int fanout_demux_bpf(struct packet_fanout *f, struct sk_buff *skb, unsigned int num) { struct bpf_prog *prog; unsigned int ret = 0; rcu_read_lock(); prog = rcu_dereference(f->bpf_prog); if (prog) ret = bpf_prog_run_clear_cb(prog, skb) % num; rcu_read_unlock(); return ret; } static bool fanout_has_flag(struct packet_fanout *f, u16 flag) { return f->flags & (flag >> 8); } static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct packet_fanout *f = pt->af_packet_priv; unsigned int num = READ_ONCE(f->num_members); struct net *net = read_pnet(&f->net); struct packet_sock *po; unsigned int idx; if (!net_eq(dev_net(dev), net) || !num) { kfree_skb(skb); return 0; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) { skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET); if (!skb) return 0; } switch (f->type) { case PACKET_FANOUT_HASH: default: idx = fanout_demux_hash(f, skb, num); break; case PACKET_FANOUT_LB: idx = fanout_demux_lb(f, skb, num); break; case PACKET_FANOUT_CPU: idx = fanout_demux_cpu(f, skb, num); break; case PACKET_FANOUT_RND: idx = fanout_demux_rnd(f, skb, num); break; case PACKET_FANOUT_QM: idx = fanout_demux_qm(f, skb, num); break; case PACKET_FANOUT_ROLLOVER: idx = fanout_demux_rollover(f, skb, 0, false, num); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: idx = fanout_demux_bpf(f, skb, num); break; } if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER)) idx = fanout_demux_rollover(f, skb, idx, true, num); po = pkt_sk(rcu_dereference(f->arr[idx])); return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev); } DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; spin_lock(&f->lock); rcu_assign_pointer(f->arr[f->num_members], sk); smp_wmb(); f->num_members++; if (f->num_members == 1) dev_add_pack(&f->prot_hook); spin_unlock(&f->lock); } static void __fanout_unlink(struct sock *sk, struct packet_sock *po) { struct packet_fanout *f = po->fanout; int i; spin_lock(&f->lock); for (i = 0; i < f->num_members; i++) { if (rcu_dereference_protected(f->arr[i], lockdep_is_held(&f->lock)) == sk) break; } BUG_ON(i >= f->num_members); rcu_assign_pointer(f->arr[i], rcu_dereference_protected(f->arr[f->num_members - 1], lockdep_is_held(&f->lock))); f->num_members--; if (f->num_members == 0) __dev_remove_pack(&f->prot_hook); spin_unlock(&f->lock); } static bool match_fanout_group(struct packet_type *ptype, struct sock *sk) { if (sk->sk_family != PF_PACKET) return false; return ptype->af_packet_priv == pkt_sk(sk)->fanout; } static void fanout_init_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_LB: atomic_set(&f->rr_cur, 0); break; case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: RCU_INIT_POINTER(f->bpf_prog, NULL); break; } } static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new) { struct bpf_prog *old; spin_lock(&f->lock); old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock)); rcu_assign_pointer(f->bpf_prog, new); spin_unlock(&f->lock); if (old) { synchronize_net(); bpf_prog_destroy(old); } } static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; struct sock_fprog fprog; int ret; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; ret = copy_bpf_fprog_from_user(&fprog, data, len); if (ret) return ret; ret = bpf_prog_create_from_user(&new, &fprog, NULL, false); if (ret) return ret; __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data, unsigned int len) { struct bpf_prog *new; u32 fd; if (sock_flag(&po->sk, SOCK_FILTER_LOCKED)) return -EPERM; if (len != sizeof(fd)) return -EINVAL; if (copy_from_sockptr(&fd, data, len)) return -EFAULT; new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER); if (IS_ERR(new)) return PTR_ERR(new); __fanout_set_data_bpf(po->fanout, new); return 0; } static int fanout_set_data(struct packet_sock *po, sockptr_t data, unsigned int len) { switch (po->fanout->type) { case PACKET_FANOUT_CBPF: return fanout_set_data_cbpf(po, data, len); case PACKET_FANOUT_EBPF: return fanout_set_data_ebpf(po, data, len); default: return -EINVAL; } } static void fanout_release_data(struct packet_fanout *f) { switch (f->type) { case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: __fanout_set_data_bpf(f, NULL); } } static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) { struct packet_fanout *f; list_for_each_entry(f, &fanout_list, list) { if (f->id == candidate_id && read_pnet(&f->net) == sock_net(sk)) { return false; } } return true; } static bool fanout_find_new_id(struct sock *sk, u16 *new_id) { u16 id = fanout_next_id; do { if (__fanout_id_is_free(sk, id)) { *new_id = id; fanout_next_id = id + 1; return true; } id++; } while (id != fanout_next_id); return false; } static int fanout_add(struct sock *sk, struct fanout_args *args) { struct packet_rollover *rollover = NULL; struct packet_sock *po = pkt_sk(sk); u16 type_flags = args->type_flags; struct packet_fanout *f, *match; u8 type = type_flags & 0xff; u8 flags = type_flags >> 8; u16 id = args->id; int err; switch (type) { case PACKET_FANOUT_ROLLOVER: if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER) return -EINVAL; break; case PACKET_FANOUT_HASH: case PACKET_FANOUT_LB: case PACKET_FANOUT_CPU: case PACKET_FANOUT_RND: case PACKET_FANOUT_QM: case PACKET_FANOUT_CBPF: case PACKET_FANOUT_EBPF: break; default: return -EINVAL; } mutex_lock(&fanout_mutex); err = -EALREADY; if (po->fanout) goto out; if (type == PACKET_FANOUT_ROLLOVER || (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) { err = -ENOMEM; rollover = kzalloc(sizeof(*rollover), GFP_KERNEL); if (!rollover) goto out; atomic_long_set(&rollover->num, 0); atomic_long_set(&rollover->num_huge, 0); atomic_long_set(&rollover->num_failed, 0); } if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { if (id != 0) { err = -EINVAL; goto out; } if (!fanout_find_new_id(sk, &id)) { err = -ENOMEM; goto out; } /* ephemeral flag for the first socket in the group: drop it */ flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); } match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && read_pnet(&f->net) == sock_net(sk)) { match = f; break; } } err = -EINVAL; if (match) { if (match->flags != flags) goto out; if (args->max_num_members && args->max_num_members != match->max_num_members) goto out; } else { if (args->max_num_members > PACKET_FANOUT_MAX) goto out; if (!args->max_num_members) /* legacy PACKET_FANOUT_MAX */ args->max_num_members = 256; err = -ENOMEM; match = kvzalloc(struct_size(match, arr, args->max_num_members), GFP_KERNEL); if (!match) goto out; write_pnet(&match->net, sock_net(sk)); match->id = id; match->type = type; match->flags = flags; INIT_LIST_HEAD(&match->list); spin_lock_init(&match->lock); refcount_set(&match->sk_ref, 0); fanout_init_data(match); match->prot_hook.type = po->prot_hook.type; match->prot_hook.dev = po->prot_hook.dev; match->prot_hook.func = packet_rcv_fanout; match->prot_hook.af_packet_priv = match; match->prot_hook.af_packet_net = read_pnet(&match->net); match->prot_hook.id_match = match_fanout_group; match->max_num_members = args->max_num_members; match->prot_hook.ignore_outgoing = type_flags & PACKET_FANOUT_FLAG_IGNORE_OUTGOING; list_add(&match->list, &fanout_list); } err = -EINVAL; spin_lock(&po->bind_lock); if (po->num && match->type == type && match->prot_hook.type == po->prot_hook.type && match->prot_hook.dev == po->prot_hook.dev) { err = -ENOSPC; if (refcount_read(&match->sk_ref) < match->max_num_members) { /* Paired with packet_setsockopt(PACKET_FANOUT_DATA) */ WRITE_ONCE(po->fanout, match); po->rollover = rollover; rollover = NULL; refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { __dev_remove_pack(&po->prot_hook); __fanout_link(sk, po); } err = 0; } } spin_unlock(&po->bind_lock); if (err && !refcount_read(&match->sk_ref)) { list_del(&match->list); kvfree(match); } out: kfree(rollover); mutex_unlock(&fanout_mutex); return err; } /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout. * It is the responsibility of the caller to call fanout_release_data() and * free the returned packet_fanout (after synchronize_net()) */ static struct packet_fanout *fanout_release(struct sock *sk) { struct packet_sock *po = pkt_sk(sk); struct packet_fanout *f; mutex_lock(&fanout_mutex); f = po->fanout; if (f) { po->fanout = NULL; if (refcount_dec_and_test(&f->sk_ref)) list_del(&f->list); else f = NULL; } mutex_unlock(&fanout_mutex); return f; } static bool packet_extra_vlan_len_allowed(const struct net_device *dev, struct sk_buff *skb) { /* Earlier code assumed this would be a VLAN pkt, double-check * this now that we have the actual packet in hand. We can only * do this check on Ethernet devices. */ if (unlikely(dev->type != ARPHRD_ETHER)) return false; skb_reset_mac_header(skb); return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q)); } static const struct proto_ops packet_ops; static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct sock *sk; struct sockaddr_pkt *spkt; /* * When we registered the protocol we saved the socket in the data * field for just this event. */ sk = pt->af_packet_priv; /* * Yank back the headers [hope the device set this * right or kerboom...] * * Incoming packets have ll header pulled, * push it back. * * For outgoing ones skb->data == skb_mac_header(skb) * so that this procedure is noop. */ if (skb->pkt_type == PACKET_LOOPBACK) goto out; if (!net_eq(dev_net(dev), sock_net(sk))) goto out; skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) goto oom; /* drop any routing info */ skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spkt = &PACKET_SKB_CB(skb)->sa.pkt; skb_push(skb, skb->data - skb_mac_header(skb)); /* * The SOCK_PACKET socket receives _all_ frames. */ spkt->spkt_family = dev->type; strscpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); spkt->spkt_protocol = skb->protocol; /* * Charge the memory to the socket. This is done specifically * to prevent sockets using all the memory up. */ if (sock_queue_rcv_skb(sk, skb) == 0) return 0; out: kfree_skb(skb); oom: return 0; } static void packet_parse_headers(struct sk_buff *skb, struct socket *sock) { int depth; if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) && sock->type == SOCK_RAW) { skb_reset_mac_header(skb); skb->protocol = dev_parse_header_protocol(skb); } /* Move network header to the right position for VLAN tagged packets */ if (likely(skb->dev->type == ARPHRD_ETHER) && eth_type_vlan(skb->protocol) && vlan_get_protocol_and_depth(skb, skb->protocol, &depth) != 0) skb_set_network_header(skb, depth); skb_probe_transport_header(skb); } /* * Output a raw packet to a device layer. This bypasses all the other * protocol layers and you must therefore supply it with a complete frame */ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name); struct sk_buff *skb = NULL; struct net_device *dev; struct sockcm_cookie sockc; __be16 proto = 0; int err; int extra_len = 0; /* * Get and verify the address. */ if (saddr) { if (msg->msg_namelen < sizeof(struct sockaddr)) return -EINVAL; if (msg->msg_namelen == sizeof(struct sockaddr_pkt)) proto = saddr->spkt_protocol; } else return -ENOTCONN; /* SOCK_PACKET must be sent giving an address */ /* * Find the device first to size check it */ saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0; retry: rcu_read_lock(); dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device); err = -ENODEV; if (dev == NULL) goto out_unlock; err = -ENETDOWN; if (!(dev->flags & IFF_UP)) goto out_unlock; /* * You may not queue a frame bigger than the mtu. This is the lowest level * raw protocol and you must do your own fragmentation at this level. */ if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len) goto out_unlock; if (!skb) { size_t reserved = LL_RESERVED_SPACE(dev); int tlen = dev->needed_tailroom; unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0; rcu_read_unlock(); skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL); if (skb == NULL) return -ENOBUFS; /* FIXME: Save some space for broken drivers that write a hard * header at transmission time by themselves. PPP is the notable * one here. This should really be fixed at the driver level. */ skb_reserve(skb, reserved); skb_reset_network_header(skb); /* Try to align data part correctly */ if (hhlen) { skb->data -= hhlen; skb->tail -= hhlen; if (len < hhlen) skb_reset_network_header(skb); } err = memcpy_from_msg(skb_put(skb, len), msg, len); if (err) goto out_free; goto retry; } if (!dev_validate_header(dev, skb->data, len) || !skb->len) { err = -EINVAL; goto out_unlock; } if (len > (dev->mtu + dev->hard_header_len + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_unlock; } sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); skb_setup_tx_timestamp(skb, &sockc); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); dev_queue_xmit(skb); rcu_read_unlock(); return len; out_unlock: rcu_read_unlock(); out_free: kfree_skb(skb); return err; } static unsigned int run_filter(struct sk_buff *skb, const struct sock *sk, unsigned int res) { struct sk_filter *filter; rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter != NULL) res = bpf_prog_run_clear_cb(filter->prog, skb); rcu_read_unlock(); return res; } static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, size_t *len, int vnet_hdr_sz) { struct virtio_net_hdr_mrg_rxbuf vnet_hdr = { .num_buffers = 0 }; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (virtio_net_hdr_from_skb(skb, (struct virtio_net_hdr *)&vnet_hdr, vio_le(), true, 0)) return -EINVAL; return memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_sz); } /* * This function makes lazy skb cloning in hope that most of packets * are discarded by BPF. * * Note tricky part: we DO mangle shared skb! skb->data, skb->len * and skb->cb are mangled. It works because (and until) packets * falling here are owned by current CPU. Output packets are cloned * by dev_queue_xmit_nit(), input packets are processed by net_bh * sequentially, so that if we return skb to original state on exit, * we will not harm anyone. */ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct sockaddr_ll *sll; struct packet_sock *po; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; skb->dev = dev; if (dev_has_header(dev)) { /* The device has an explicit notion of ll header, * exported to higher levels. * * Otherwise, the device hides details of its frame * structure, so that corresponding packet head is * never delivered to user. */ if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; if (snaplen > res) snaplen = res; if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) goto drop_n_acct; if (skb_shared(skb)) { struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); if (nskb == NULL) goto drop_n_acct; if (skb_head != skb->data) { skb->data = skb_head; skb->len = skb_len; } consume_skb(skb); skb = nskb; } sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8); sll = &PACKET_SKB_CB(skb)->sa.ll; sll->sll_hatype = dev->type; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; sll->sll_halen = dev_parse_header(skb, sll->sll_addr); /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg(). * Use their space for storing the original skb length. */ PACKET_SKB_CB(skb)->sa.origlen = skb->len; if (pskb_trim(skb, snaplen)) goto drop_n_acct; skb_set_owner_r(skb, sk); skb->dev = NULL; skb_dst_drop(skb); /* drop conntrack reference */ nf_reset_ct(skb); spin_lock(&sk->sk_receive_queue.lock); po->stats.stats1.tp_packets++; sock_skb_set_dropcount(sk, skb); skb_clear_delivery_time(skb); __skb_queue_tail(&sk->sk_receive_queue, skb); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); return 0; drop_n_acct: atomic_inc(&po->tp_drops); atomic_inc(&sk->sk_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; } static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { enum skb_drop_reason drop_reason = SKB_CONSUMED; struct sock *sk = NULL; struct packet_sock *po; struct sockaddr_ll *sll; union tpacket_uhdr h; u8 *skb_head = skb->data; int skb_len = skb->len; unsigned int snaplen, res; unsigned long status = TP_STATUS_USER; unsigned short macoff, hdrlen; unsigned int netoff; struct sk_buff *copy_skb = NULL; struct timespec64 ts; __u32 ts_status; unsigned int slot_id = 0; int vnet_hdr_sz = 0; /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT. * We may add members to them until current aligned size without forcing * userspace to call getsockopt(..., PACKET_HDRLEN, ...). */ BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32); BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48); if (skb->pkt_type == PACKET_LOOPBACK) goto drop; sk = pt->af_packet_priv; po = pkt_sk(sk); if (!net_eq(dev_net(dev), sock_net(sk))) goto drop; if (dev_has_header(dev)) { if (sk->sk_type != SOCK_DGRAM) skb_push(skb, skb->data - skb_mac_header(skb)); else if (skb->pkt_type == PACKET_OUTGOING) { /* Special case: outgoing packets have ll header at head */ skb_pull(skb, skb_network_offset(skb)); } } snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb); res = run_filter(skb, sk, snaplen); if (!res) goto drop_n_restore; /* If we are flooded, just give up */ if (__packet_rcv_has_room(po, skb) == ROOM_NONE) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (skb->ip_summed == CHECKSUM_PARTIAL) status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) status |= TP_STATUS_GSO_TCP; if (snaplen > res) snaplen = res; if (sk->sk_type == SOCK_DGRAM) { macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 + po->tp_reserve; } else { unsigned int maclen = skb_network_offset(skb); netoff = TPACKET_ALIGN(po->tp_hdrlen + (maclen < 16 ? 16 : maclen)) + po->tp_reserve; vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); if (vnet_hdr_sz) netoff += vnet_hdr_sz; macoff = netoff - maclen; } if (netoff > USHRT_MAX) { atomic_inc(&po->tp_drops); goto drop_n_restore; } if (po->tp_version <= TPACKET_V2) { if (macoff + snaplen > po->rx_ring.frame_size) { if (READ_ONCE(po->copy_thresh) && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { if (skb_shared(skb)) { copy_skb = skb_clone(skb, GFP_ATOMIC); } else { copy_skb = skb_get(skb); skb_head = skb->data; } if (copy_skb) { memset(&PACKET_SKB_CB(copy_skb)->sa.ll, 0, sizeof(PACKET_SKB_CB(copy_skb)->sa.ll)); skb_set_owner_r(copy_skb, sk); } } snaplen = po->rx_ring.frame_size - macoff; if ((int)snaplen < 0) { snaplen = 0; vnet_hdr_sz = 0; } } } else if (unlikely(macoff + snaplen > GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) { u32 nval; nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff; pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n", snaplen, nval, macoff); snaplen = nval; if (unlikely((int)snaplen < 0)) { snaplen = 0; macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len; vnet_hdr_sz = 0; } } spin_lock(&sk->sk_receive_queue.lock); h.raw = packet_current_rx_frame(po, skb, TP_STATUS_KERNEL, (macoff+snaplen)); if (!h.raw) goto drop_n_account; if (po->tp_version <= TPACKET_V2) { slot_id = po->rx_ring.head; if (test_bit(slot_id, po->rx_ring.rx_owner_map)) goto drop_n_account; __set_bit(slot_id, po->rx_ring.rx_owner_map); } if (vnet_hdr_sz && virtio_net_hdr_from_skb(skb, h.raw + macoff - sizeof(struct virtio_net_hdr), vio_le(), true, 0)) { if (po->tp_version == TPACKET_V3) prb_clear_blk_fill_status(&po->rx_ring); goto drop_n_account; } if (po->tp_version <= TPACKET_V2) { packet_increment_rx_head(po, &po->rx_ring); /* * LOSING will be reported till you read the stats, * because it's COR - Clear On Read. * Anyways, moving it for V1/V2 only as V3 doesn't need this * at packet level. */ if (atomic_read(&po->tp_drops)) status |= TP_STATUS_LOSING; } po->stats.stats1.tp_packets++; if (copy_skb) { status |= TP_STATUS_COPY; skb_clear_delivery_time(copy_skb); __skb_queue_tail(&sk->sk_receive_queue, copy_skb); } spin_unlock(&sk->sk_receive_queue.lock); skb_copy_bits(skb, 0, h.raw + macoff, snaplen); /* Always timestamp; prefer an existing software timestamp taken * closer to the time of capture. */ ts_status = tpacket_get_timestamp(skb, &ts, READ_ONCE(po->tp_tstamp) | SOF_TIMESTAMPING_SOFTWARE); if (!ts_status) ktime_get_real_ts64(&ts); status |= ts_status; switch (po->tp_version) { case TPACKET_V1: h.h1->tp_len = skb->len; h.h1->tp_snaplen = snaplen; h.h1->tp_mac = macoff; h.h1->tp_net = netoff; h.h1->tp_sec = ts.tv_sec; h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC; hdrlen = sizeof(*h.h1); break; case TPACKET_V2: h.h2->tp_len = skb->len; h.h2->tp_snaplen = snaplen; h.h2->tp_mac = macoff; h.h2->tp_net = netoff; h.h2->tp_sec = ts.tv_sec; h.h2->tp_nsec = ts.tv_nsec; if (skb_vlan_tag_present(skb)) { h.h2->tp_vlan_tci = skb_vlan_tag_get(skb); h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sk->sk_type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { h.h2->tp_vlan_tci = vlan_get_tci(skb, skb->dev); h.h2->tp_vlan_tpid = ntohs(skb->protocol); status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { h.h2->tp_vlan_tci = 0; h.h2->tp_vlan_tpid = 0; } memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding)); hdrlen = sizeof(*h.h2); break; case TPACKET_V3: /* tp_nxt_offset,vlan are already populated above. * So DONT clear those fields here */ h.h3->tp_status |= status; h.h3->tp_len = skb->len; h.h3->tp_snaplen = snaplen; h.h3->tp_mac = macoff; h.h3->tp_net = netoff; h.h3->tp_sec = ts.tv_sec; h.h3->tp_nsec = ts.tv_nsec; memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding)); hdrlen = sizeof(*h.h3); break; default: BUG(); } sll = h.raw + TPACKET_ALIGN(hdrlen); sll->sll_halen = dev_parse_header(skb, sll->sll_addr); sll->sll_family = AF_PACKET; sll->sll_hatype = dev->type; sll->sll_protocol = (sk->sk_type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; sll->sll_pkttype = skb->pkt_type; if (unlikely(packet_sock_flag(po, PACKET_SOCK_ORIGDEV))) sll->sll_ifindex = orig_dev->ifindex; else sll->sll_ifindex = dev->ifindex; smp_mb(); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 if (po->tp_version <= TPACKET_V2) { u8 *start, *end; end = (u8 *) PAGE_ALIGN((unsigned long) h.raw + macoff + snaplen); for (start = h.raw; start < end; start += PAGE_SIZE) flush_dcache_page(pgv_to_page(start)); } smp_wmb(); #endif if (po->tp_version <= TPACKET_V2) { spin_lock(&sk->sk_receive_queue.lock); __packet_set_status(po, h.raw, status); __clear_bit(slot_id, po->rx_ring.rx_owner_map); spin_unlock(&sk->sk_receive_queue.lock); sk->sk_data_ready(sk); } else if (po->tp_version == TPACKET_V3) { prb_clear_blk_fill_status(&po->rx_ring); } drop_n_restore: if (skb_head != skb->data && skb_shared(skb)) { skb->data = skb_head; skb->len = skb_len; } drop: sk_skb_reason_drop(sk, skb, drop_reason); return 0; drop_n_account: spin_unlock(&sk->sk_receive_queue.lock); atomic_inc(&po->tp_drops); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; sk->sk_data_ready(sk); sk_skb_reason_drop(sk, copy_skb, drop_reason); goto drop_n_restore; } static void tpacket_destruct_skb(struct sk_buff *skb) { struct packet_sock *po = pkt_sk(skb->sk); if (likely(po->tx_ring.pg_vec)) { void *ph; __u32 ts; ph = skb_zcopy_get_nouarg(skb); packet_dec_pending(&po->tx_ring); ts = __packet_set_timestamp(po, ph, skb); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); complete(&po->skb_completion); } sock_wfree(skb); } static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) { if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) && (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 > __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len))) vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(), __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) + __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2); if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len) return -EINVAL; return 0; } static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, struct virtio_net_hdr *vnet_hdr, int vnet_hdr_sz) { int ret; if (*len < vnet_hdr_sz) return -EINVAL; *len -= vnet_hdr_sz; if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) return -EFAULT; ret = __packet_snd_vnet_parse(vnet_hdr, *len); if (ret) return ret; /* move iter to point to the start of mac header */ if (vnet_hdr_sz != sizeof(struct virtio_net_hdr)) iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(struct virtio_net_hdr)); return 0; } static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, void *frame, struct net_device *dev, void *data, int tp_len, __be16 proto, unsigned char *addr, int hlen, int copylen, const struct sockcm_cookie *sockc) { union tpacket_uhdr ph; int to_write, offset, len, nr_frags, len_max; struct socket *sock = po->sk.sk_socket; struct page *page; int err; ph.raw = frame; skb->protocol = proto; skb->dev = dev; skb->priority = sockc->priority; skb->mark = sockc->mark; skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); skb_setup_tx_timestamp(skb, sockc); skb_zcopy_set_nouarg(skb, ph.raw); skb_reserve(skb, hlen); skb_reset_network_header(skb); to_write = tp_len; if (sock->type == SOCK_DGRAM) { err = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, tp_len); if (unlikely(err < 0)) return -EINVAL; } else if (copylen) { int hdrlen = min_t(int, copylen, tp_len); skb_push(skb, dev->hard_header_len); skb_put(skb, copylen - dev->hard_header_len); err = skb_store_bits(skb, 0, data, hdrlen); if (unlikely(err)) return err; if (!dev_validate_header(dev, skb->data, hdrlen)) return -EINVAL; data += hdrlen; to_write -= hdrlen; } offset = offset_in_page(data); len_max = PAGE_SIZE - offset; len = ((to_write > len_max) ? len_max : to_write); skb->data_len = to_write; skb->len += to_write; skb->truesize += to_write; refcount_add(to_write, &po->sk.sk_wmem_alloc); while (likely(to_write)) { nr_frags = skb_shinfo(skb)->nr_frags; if (unlikely(nr_frags >= MAX_SKB_FRAGS)) { pr_err("Packet exceed the number of skb frags(%u)\n", (unsigned int)MAX_SKB_FRAGS); return -EFAULT; } page = pgv_to_page(data); data += len; flush_dcache_page(page); get_page(page); skb_fill_page_desc(skb, nr_frags, page, offset, len); to_write -= len; offset = 0; len_max = PAGE_SIZE; len = ((to_write > len_max) ? len_max : to_write); } packet_parse_headers(skb, sock); return tp_len; } static int tpacket_parse_header(struct packet_sock *po, void *frame, int size_max, void **data) { union tpacket_uhdr ph; int tp_len, off; ph.raw = frame; switch (po->tp_version) { case TPACKET_V3: if (ph.h3->tp_next_offset != 0) { pr_warn_once("variable sized slot not supported"); return -EINVAL; } tp_len = ph.h3->tp_len; break; case TPACKET_V2: tp_len = ph.h2->tp_len; break; default: tp_len = ph.h1->tp_len; break; } if (unlikely(tp_len > size_max)) { pr_err("packet size is too long (%d > %d)\n", tp_len, size_max); return -EMSGSIZE; } if (unlikely(packet_sock_flag(po, PACKET_SOCK_TX_HAS_OFF))) { int off_min, off_max; off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll); off_max = po->tx_ring.frame_size - tp_len; if (po->sk.sk_type == SOCK_DGRAM) { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_net; break; case TPACKET_V2: off = ph.h2->tp_net; break; default: off = ph.h1->tp_net; break; } } else { switch (po->tp_version) { case TPACKET_V3: off = ph.h3->tp_mac; break; case TPACKET_V2: off = ph.h2->tp_mac; break; default: off = ph.h1->tp_mac; break; } } if (unlikely((off < off_min) || (off_max < off))) return -EINVAL; } else { off = po->tp_hdrlen - sizeof(struct sockaddr_ll); } *data = frame + off; return tp_len; } static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) { struct sk_buff *skb = NULL; struct net_device *dev; struct virtio_net_hdr *vnet_hdr = NULL; struct sockcm_cookie sockc; __be16 proto; int err, reserve = 0; void *ph; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); bool need_wait = !(msg->msg_flags & MSG_DONTWAIT); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); unsigned char *addr = NULL; int tp_len, size_max; void *data; int len_sum = 0; int status = TP_STATUS_AVAILABLE; int hlen, tlen, copylen = 0; long timeo = 0; mutex_lock(&po->pg_vec_lock); /* packet_sendmsg() check on tx_ring.pg_vec was lockless, * we need to confirm it under protection of pg_vec_lock. */ if (unlikely(!po->tx_ring.pg_vec)) { err = -EBUSY; goto out; } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex); if (po->sk.sk_socket->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_put; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_put; sockcm_init(&sockc, &po->sk); if (msg->msg_controllen) { err = sock_cmsg_send(&po->sk, msg, &sockc); if (unlikely(err)) goto out_put; } if (po->sk.sk_socket->type == SOCK_RAW) reserve = dev->hard_header_len; size_max = po->tx_ring.frame_size - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) size_max = dev->mtu + reserve + VLAN_HLEN; reinit_completion(&po->skb_completion); do { ph = packet_current_frame(po, &po->tx_ring, TP_STATUS_SEND_REQUEST); if (unlikely(ph == NULL)) { if (need_wait && skb) { timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT); timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo); if (timeo <= 0) { err = !timeo ? -ETIMEDOUT : -ERESTARTSYS; goto out_put; } } /* check for additional frames */ continue; } skb = NULL; tp_len = tpacket_parse_header(po, ph, size_max, &data); if (tp_len < 0) goto tpacket_error; status = TP_STATUS_SEND_REQUEST; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; if (vnet_hdr_sz) { vnet_hdr = data; data += vnet_hdr_sz; tp_len -= vnet_hdr_sz; if (tp_len < 0 || __packet_snd_vnet_parse(vnet_hdr, tp_len)) { tp_len = -EINVAL; goto tpacket_error; } copylen = __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len); } copylen = max_t(int, copylen, dev->hard_header_len); skb = sock_alloc_send_skb(&po->sk, hlen + tlen + sizeof(struct sockaddr_ll) + (copylen - dev->hard_header_len), !need_wait, &err); if (unlikely(skb == NULL)) { /* we assume the socket was initially writeable ... */ if (likely(len_sum > 0)) err = len_sum; goto out_status; } tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto, addr, hlen, copylen, &sockc); if (likely(tp_len >= 0) && tp_len > dev->mtu + reserve && !vnet_hdr_sz && !packet_extra_vlan_len_allowed(dev, skb)) tp_len = -EMSGSIZE; if (unlikely(tp_len < 0)) { tpacket_error: if (packet_sock_flag(po, PACKET_SOCK_TP_LOSS)) { __packet_set_status(po, ph, TP_STATUS_AVAILABLE); packet_increment_head(&po->tx_ring); kfree_skb(skb); continue; } else { status = TP_STATUS_WRONG_FORMAT; err = tp_len; goto out_status; } } if (vnet_hdr_sz) { if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { tp_len = -EINVAL; goto tpacket_error; } virtio_net_hdr_set_proto(skb, vnet_hdr); } skb->destructor = tpacket_destruct_skb; __packet_set_status(po, ph, TP_STATUS_SENDING); packet_inc_pending(&po->tx_ring); status = TP_STATUS_SEND_REQUEST; err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err && __packet_get_status(po, ph) == TP_STATUS_AVAILABLE) { /* skb was destructed already */ skb = NULL; goto out_status; } /* * skb was dropped but not destructed yet; * let's treat it like congestion or err < 0 */ err = 0; } packet_increment_head(&po->tx_ring); len_sum += tp_len; } while (likely((ph != NULL) || /* Note: packet_read_pending() might be slow if we have * to call it as it's per_cpu variable, but in fast-path * we already short-circuit the loop with the first * condition, and luckily don't have to go that path * anyway. */ (need_wait && packet_read_pending(&po->tx_ring)))); err = len_sum; goto out_put; out_status: __packet_set_status(po, ph, status); kfree_skb(skb); out_put: dev_put(dev); out: mutex_unlock(&po->pg_vec_lock); return err; } static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad, size_t reserve, size_t len, size_t linear, int noblock, int *err) { struct sk_buff *skb; /* Under a page? Don't bother with paged skb. */ if (prepad + len < PAGE_SIZE || !linear) linear = len; if (len - linear > MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) linear = len - MAX_SKB_FRAGS * (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER); skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock, err, PAGE_ALLOC_COSTLY_ORDER); if (!skb) return NULL; skb_reserve(skb, reserve); skb_put(skb, linear); skb->data_len = len - linear; skb->len += len - linear; return skb; } static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name); struct sk_buff *skb; struct net_device *dev; __be16 proto; unsigned char *addr = NULL; int err, reserve = 0; struct sockcm_cookie sockc; struct virtio_net_hdr vnet_hdr = { 0 }; int offset = 0; struct packet_sock *po = pkt_sk(sk); int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); int hlen, tlen, linear; int extra_len = 0; /* * Get and verify the address. */ if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = READ_ONCE(po->num); } else { err = -EINVAL; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) goto out; if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr))) goto out; proto = saddr->sll_protocol; dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex); if (sock->type == SOCK_DGRAM) { if (dev && msg->msg_namelen < dev->addr_len + offsetof(struct sockaddr_ll, sll_addr)) goto out_unlock; addr = saddr->sll_addr; } } err = -ENXIO; if (unlikely(dev == NULL)) goto out_unlock; err = -ENETDOWN; if (unlikely(!(dev->flags & IFF_UP))) goto out_unlock; sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); if (unlikely(err)) goto out_unlock; } if (sock->type == SOCK_RAW) reserve = dev->hard_header_len; if (vnet_hdr_sz) { err = packet_snd_vnet_parse(msg, &len, &vnet_hdr, vnet_hdr_sz); if (err) goto out_unlock; } if (unlikely(sock_flag(sk, SOCK_NOFCS))) { if (!netif_supports_nofcs(dev)) { err = -EPROTONOSUPPORT; goto out_unlock; } extra_len = 4; /* We're doing our own CRC */ } err = -EMSGSIZE; if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len)) goto out_unlock; err = -ENOBUFS; hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len); linear = max(linear, min_t(int, len, dev->hard_header_len)); skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear, msg->msg_flags & MSG_DONTWAIT, &err); if (skb == NULL) goto out_unlock; skb_reset_network_header(skb); err = -EINVAL; if (sock->type == SOCK_DGRAM) { offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len); if (unlikely(offset < 0)) goto out_free; } else if (reserve) { skb_reserve(skb, -reserve); if (len < reserve + sizeof(struct ipv6hdr) && dev->min_header_len != dev->hard_header_len) skb_reset_network_header(skb); } /* Returns -EFAULT on error */ err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len); if (err) goto out_free; if ((sock->type == SOCK_RAW && !dev_validate_header(dev, skb->data, len)) || !skb->len) { err = -EINVAL; goto out_free; } skb_setup_tx_timestamp(skb, &sockc); if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) && !packet_extra_vlan_len_allowed(dev, skb)) { err = -EMSGSIZE; goto out_free; } skb->protocol = proto; skb->dev = dev; skb->priority = sockc.priority; skb->mark = sockc.mark; skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); if (unlikely(extra_len == 4)) skb->no_fcs = 1; packet_parse_headers(skb, sock); if (vnet_hdr_sz) { err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); if (err) goto out_free; len += vnet_hdr_sz; virtio_net_hdr_set_proto(skb, &vnet_hdr); } err = packet_xmit(po, skb); if (unlikely(err != 0)) { if (err > 0) err = net_xmit_errno(err); if (err) goto out_unlock; } dev_put(dev); return len; out_free: kfree_skb(skb); out_unlock: dev_put(dev); out: return err; } static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); /* Reading tx_ring.pg_vec without holding pg_vec_lock is racy. * tpacket_snd() will redo the check safely. */ if (data_race(po->tx_ring.pg_vec)) return tpacket_snd(po, msg); return packet_snd(sock, msg, len); } /* * Close a PACKET socket. This is fairly simple. We immediately go * to 'closed' state and remove our protocol entry in the device list. */ static int packet_release(struct socket *sock) { struct sock *sk = sock->sk; struct packet_sock *po; struct packet_fanout *f; struct net *net; union tpacket_req_u req_u; if (!sk) return 0; net = sock_net(sk); po = pkt_sk(sk); mutex_lock(&net->packet.sklist_lock); sk_del_node_init_rcu(sk); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, sk->sk_prot, -1); spin_lock(&po->bind_lock); unregister_prot_hook(sk, false); packet_cached_dev_reset(po); if (po->prot_hook.dev) { netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); po->prot_hook.dev = NULL; } spin_unlock(&po->bind_lock); packet_flush_mclist(sk); lock_sock(sk); if (po->rx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 0); } if (po->tx_ring.pg_vec) { memset(&req_u, 0, sizeof(req_u)); packet_set_ring(sk, &req_u, 1, 1); } release_sock(sk); f = fanout_release(sk); synchronize_net(); kfree(po->rollover); if (f) { fanout_release_data(f); kvfree(f); } /* * Now the socket is dead. No more input will appear. */ sock_orphan(sk); sock->sk = NULL; /* Purge queues */ skb_queue_purge(&sk->sk_receive_queue); packet_free_pending(po); sock_put(sk); return 0; } /* * Attach a packet hook. */ static int packet_do_bind(struct sock *sk, const char *name, int ifindex, __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev = NULL; bool unlisted = false; bool need_rehook; int ret = 0; lock_sock(sk); spin_lock(&po->bind_lock); if (!proto) proto = po->num; rcu_read_lock(); if (po->fanout) { ret = -EINVAL; goto out_unlock; } if (name) { dev = dev_get_by_name_rcu(sock_net(sk), name); if (!dev) { ret = -ENODEV; goto out_unlock; } } else if (ifindex) { dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (!dev) { ret = -ENODEV; goto out_unlock; } } need_rehook = po->prot_hook.type != proto || po->prot_hook.dev != dev; if (need_rehook) { dev_hold(dev); if (packet_sock_flag(po, PACKET_SOCK_RUNNING)) { rcu_read_unlock(); /* prevents packet_notifier() from calling * register_prot_hook() */ WRITE_ONCE(po->num, 0); __unregister_prot_hook(sk, true); rcu_read_lock(); if (dev) unlisted = !dev_get_by_index_rcu(sock_net(sk), dev->ifindex); } BUG_ON(packet_sock_flag(po, PACKET_SOCK_RUNNING)); WRITE_ONCE(po->num, proto); po->prot_hook.type = proto; netdev_put(po->prot_hook.dev, &po->prot_hook.dev_tracker); if (unlikely(unlisted)) { po->prot_hook.dev = NULL; WRITE_ONCE(po->ifindex, -1); packet_cached_dev_reset(po); } else { netdev_hold(dev, &po->prot_hook.dev_tracker, GFP_ATOMIC); po->prot_hook.dev = dev; WRITE_ONCE(po->ifindex, dev ? dev->ifindex : 0); packet_cached_dev_assign(po, dev); } dev_put(dev); } if (proto == 0 || !need_rehook) goto out_unlock; if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; if (!sock_flag(sk, SOCK_DEAD)) sk_error_report(sk); } out_unlock: rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); return ret; } /* * Bind a packet socket to a device */ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality */ if (addr_len != sizeof(struct sockaddr)) return -EINVAL; /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, 0); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; /* * Check legality */ if (addr_len < sizeof(struct sockaddr_ll)) return -EINVAL; if (sll->sll_family != AF_PACKET) return -EINVAL; return packet_do_bind(sk, NULL, sll->sll_ifindex, sll->sll_protocol); } static struct proto packet_proto = { .name = "PACKET", .owner = THIS_MODULE, .obj_size = sizeof(struct packet_sock), }; /* * Create a packet of type SOCK_PACKET. */ static int packet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct packet_sock *po; __be16 proto = (__force __be16)protocol; /* weird, but documented */ int err; if (!ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW && sock->type != SOCK_PACKET) return -ESOCKTNOSUPPORT; sock->state = SS_UNCONNECTED; err = -ENOBUFS; sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern); if (sk == NULL) goto out; sock->ops = &packet_ops; if (sock->type == SOCK_PACKET) sock->ops = &packet_ops_spkt; po = pkt_sk(sk); err = packet_alloc_pending(po); if (err) goto out_sk_free; sock_init_data(sock, sk); init_completion(&po->skb_completion); sk->sk_family = PF_PACKET; po->num = proto; packet_cached_dev_reset(po); sk->sk_destruct = packet_sock_destruct; /* * Attach a protocol block */ spin_lock_init(&po->bind_lock); mutex_init(&po->pg_vec_lock); po->rollover = NULL; po->prot_hook.func = packet_rcv; if (sock->type == SOCK_PACKET) po->prot_hook.func = packet_rcv_spkt; po->prot_hook.af_packet_priv = sk; po->prot_hook.af_packet_net = sock_net(sk); if (proto) { po->prot_hook.type = proto; __register_prot_hook(sk); } mutex_lock(&net->packet.sklist_lock); sk_add_node_tail_rcu(sk, &net->packet.sklist); mutex_unlock(&net->packet.sklist_lock); sock_prot_inuse_add(net, &packet_proto, 1); return 0; out_sk_free: sk_free(sk); out: return err; } /* * Pull a packet from our receive queue and hand it to the user. * If necessary we block. */ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct sk_buff *skb; int copied, err; int vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); unsigned int origlen = 0; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE)) goto out; #if 0 /* What error should we return now? EUNATTACH? */ if (pkt_sk(sk)->ifindex < 0) return -ENODEV; #endif if (flags & MSG_ERRQUEUE) { err = sock_recv_errqueue(sk, msg, len, SOL_PACKET, PACKET_TX_TIMESTAMP); goto out; } /* * Call the generic datagram receiver. This handles all sorts * of horrible races and re-entrancy so we can forget about it * in the protocol layers. * * Now it will return ENETDOWN, if device have just gone down, * but then it will block. */ skb = skb_recv_datagram(sk, flags, &err); /* * An error occurred so return it. Because skb_recv_datagram() * handles the blocking we don't see and worry about blocking * retries. */ if (skb == NULL) goto out; packet_rcv_try_clear_pressure(pkt_sk(sk)); if (vnet_hdr_len) { err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); if (err) goto out_free; } /* You lose any data beyond the buffer you gave. If it worries * a user program they can ask the device for its MTU * anyway. */ copied = skb->len; if (copied > len) { copied = len; msg->msg_flags |= MSG_TRUNC; } err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; if (sock->type != SOCK_PACKET) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; /* Original length was stored in sockaddr_ll fields */ origlen = PACKET_SKB_CB(skb)->sa.origlen; sll->sll_family = AF_PACKET; sll->sll_protocol = (sock->type == SOCK_DGRAM) ? vlan_get_protocol_dgram(skb) : skb->protocol; } sock_recv_cmsgs(msg, sk, skb); if (msg->msg_name) { const size_t max_len = min(sizeof(skb->cb), sizeof(struct sockaddr_storage)); int copy_len; /* If the address length field is there to be filled * in, we fill it in now. */ if (sock->type == SOCK_PACKET) { __sockaddr_check_size(sizeof(struct sockaddr_pkt)); msg->msg_namelen = sizeof(struct sockaddr_pkt); copy_len = msg->msg_namelen; } else { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr); copy_len = msg->msg_namelen; if (msg->msg_namelen < sizeof(struct sockaddr_ll)) { memset(msg->msg_name + offsetof(struct sockaddr_ll, sll_addr), 0, sizeof(sll->sll_addr)); msg->msg_namelen = sizeof(struct sockaddr_ll); } } if (WARN_ON_ONCE(copy_len > max_len)) { copy_len = max_len; msg->msg_namelen = copy_len; } memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len); } if (packet_sock_flag(pkt_sk(sk), PACKET_SOCK_AUXDATA)) { struct tpacket_auxdata aux; aux.tp_status = TP_STATUS_USER; if (skb->ip_summed == CHECKSUM_PARTIAL) aux.tp_status |= TP_STATUS_CSUMNOTREADY; else if (skb->pkt_type != PACKET_OUTGOING && skb_csum_unnecessary(skb)) aux.tp_status |= TP_STATUS_CSUM_VALID; if (skb_is_gso(skb) && skb_is_gso_tcp(skb)) aux.tp_status |= TP_STATUS_GSO_TCP; aux.tp_len = origlen; aux.tp_snaplen = skb->len; aux.tp_mac = 0; aux.tp_net = skb_network_offset(skb); if (skb_vlan_tag_present(skb)) { aux.tp_vlan_tci = skb_vlan_tag_get(skb); aux.tp_vlan_tpid = ntohs(skb->vlan_proto); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else if (unlikely(sock->type == SOCK_DGRAM && eth_type_vlan(skb->protocol))) { struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll; struct net_device *dev; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), sll->sll_ifindex); if (dev) { aux.tp_vlan_tci = vlan_get_tci(skb, dev); aux.tp_vlan_tpid = ntohs(skb->protocol); aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID; } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } rcu_read_unlock(); } else { aux.tp_vlan_tci = 0; aux.tp_vlan_tpid = 0; } put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux); } /* * Free or return the buffer as appropriate. Again this * hides all the races and re-entrancy issues from us. */ err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied); out_free: skb_free_datagram(sk, skb); out: return err; } static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; if (peer) return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); } static int packet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct net_device *dev; struct sock *sk = sock->sk; struct packet_sock *po = pkt_sk(sk); DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr); int ifindex; if (peer) return -EOPNOTSUPP; ifindex = READ_ONCE(po->ifindex); sll->sll_family = AF_PACKET; sll->sll_ifindex = ifindex; sll->sll_protocol = READ_ONCE(po->num); sll->sll_pkttype = 0; rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), ifindex); if (dev) { sll->sll_hatype = dev->type; sll->sll_halen = dev->addr_len; /* Let __fortify_memcpy_chk() know the actual buffer size. */ memcpy(((struct sockaddr_storage *)sll)->__data + offsetof(struct sockaddr_ll, sll_addr) - offsetofend(struct sockaddr_ll, sll_family), dev->dev_addr, dev->addr_len); } else { sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ sll->sll_halen = 0; } rcu_read_unlock(); return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen; } static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) { switch (i->type) { case PACKET_MR_MULTICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_mc_add(dev, i->addr); else return dev_mc_del(dev, i->addr); break; case PACKET_MR_PROMISC: return dev_set_promiscuity(dev, what); case PACKET_MR_ALLMULTI: return dev_set_allmulti(dev, what); case PACKET_MR_UNICAST: if (i->alen != dev->addr_len) return -EINVAL; if (what > 0) return dev_uc_add(dev, i->addr); else return dev_uc_del(dev, i->addr); break; default: break; } return 0; } static void packet_dev_mclist_delete(struct net_device *dev, struct packet_mclist **mlp) { struct packet_mclist *ml; while ((ml = *mlp) != NULL) { if (ml->ifindex == dev->ifindex) { packet_dev_mc(dev, ml, -1); *mlp = ml->next; kfree(ml); } else mlp = &ml->next; } } static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_sock *po = pkt_sk(sk); struct packet_mclist *ml, *i; struct net_device *dev; int err; rtnl_lock(); err = -ENODEV; dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex); if (!dev) goto done; err = -EINVAL; if (mreq->mr_alen > dev->addr_len) goto done; err = -ENOBUFS; i = kmalloc(sizeof(*i), GFP_KERNEL); if (i == NULL) goto done; err = 0; for (ml = po->mclist; ml; ml = ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq->mr_alen && memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { ml->count++; /* Free the new element ... */ kfree(i); goto done; } } i->type = mreq->mr_type; i->ifindex = mreq->mr_ifindex; i->alen = mreq->mr_alen; memcpy(i->addr, mreq->mr_address, i->alen); memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen); i->count = 1; i->next = po->mclist; po->mclist = i; err = packet_dev_mc(dev, i, 1); if (err) { po->mclist = i->next; kfree(i); } done: rtnl_unlock(); return err; } static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq) { struct packet_mclist *ml, **mlp; rtnl_lock(); for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { if (ml->ifindex == mreq->mr_ifindex && ml->type == mreq->mr_type && ml->alen == mreq- |