150 150 150 149 149 150 150 300 300 300 300 41 41 41 40 41 36 35 36 630 628 628 614 203 508 150 38 38 11 593 52 592 591 592 54 592 51 592 51 591 592 591 600 1 240 229 231 8 232 239 355 355 317 313 313 330 3 2 311 311 309 308 196 3 308 308 308 239 308 112 306 4 303 305 219 303 192 305 308 308 313 266 261 8 254 266 402 402 355 807 801 772 800 13 792 13 798 20 3 793 792 691 691 693 693 645 544 462 402 646 354 405 322 123 1 277 17 17 3 38 19 34 28 6 3 3 3 23 15 23 26 28 1907 234 1901 1316 1322 3 3 3 272 273 137 184 178 45 35 10 178 136 90 12 45 45 176 176 258 41 37 8 7 5 63 63 116 255 464 463 3 3 3 178 790 21 16 21 2 21 21 21 7 7 7 10 10 10 17 17 23 3 21 21 20 21 16 10 10 10 10 22 1 1 1 9 12 7 5 5 132 131 131 2 131 131 131 131 131 131 131 131 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 // SPDX-License-Identifier: GPL-2.0-or-later /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * PF_INET protocol family socket handler. * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Florian La Roche, <flla@stud.uni-sb.de> * Alan Cox, <A.Cox@swansea.ac.uk> * * Changes (see also sock.c) * * piggy, * Karl Knutson : Socket protocol table * A.N.Kuznetsov : Socket death error in accept(). * John Richardson : Fix non blocking error in connect() * so sockets that fail to connect * don't return -EINPROGRESS. * Alan Cox : Asynchronous I/O support * Alan Cox : Keep correct socket pointer on sock * structures * when accept() ed * Alan Cox : Semantics of SO_LINGER aren't state * moved to close when you look carefully. * With this fixed and the accept bug fixed * some RPC stuff seems happier. * Niibe Yutaka : 4.4BSD style write async I/O * Alan Cox, * Tony Gale : Fixed reuse semantics. * Alan Cox : bind() shouldn't abort existing but dead * sockets. Stops FTP netin:.. I hope. * Alan Cox : bind() works correctly for RAW sockets. * Note that FreeBSD at least was broken * in this respect so be careful with * compatibility tests... * Alan Cox : routing cache support * Alan Cox : memzero the socket structure for * compactness. * Matt Day : nonblock connect error handler * Alan Cox : Allow large numbers of pending sockets * (eg for big web sites), but only if * specifically application requested. * Alan Cox : New buffering throughout IP. Used * dumbly. * Alan Cox : New buffering now used smartly. * Alan Cox : BSD rather than common sense * interpretation of listen. * Germano Caronni : Assorted small races. * Alan Cox : sendmsg/recvmsg basic support. * Alan Cox : Only sendmsg/recvmsg now supported. * Alan Cox : Locked down bind (see security list). * Alan Cox : Loosened bind a little. * Mike McLagan : ADD/DEL DLCI Ioctls * Willy Konynenberg : Transparent proxying support. * David S. Miller : New socket lookup architecture. * Some other random speedups. * Cyrus Durgin : Cleaned up file for kmod hacks. * Andi Kleen : Fix inet_stream_connect TCP race. */ #define pr_fmt(fmt) "IPv4: " fmt #include <linux/err.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/kmod.h> #include <linux/sched.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/capability.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <linux/stat.h> #include <linux/init.h> #include <linux/poll.h> #include <linux/netfilter_ipv4.h> #include <linux/random.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/inet.h> #include <linux/igmp.h> #include <linux/inetdevice.h> #include <linux/netdevice.h> #include <net/checksum.h> #include <net/ip.h> #include <net/protocol.h> #include <net/arp.h> #include <net/route.h> #include <net/ip_fib.h> #include <net/inet_connection_sock.h> #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/ping.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/raw.h> #include <net/icmp.h> #include <net/inet_common.h> #include <net/ip_tunnels.h> #include <net/xfrm.h> #include <net/net_namespace.h> #include <net/secure_seq.h> #ifdef CONFIG_IP_MROUTE #include <linux/mroute.h> #endif #include <net/l3mdev.h> #include <net/compat.h> #include <trace/events/sock.h> /* The inetsw table contains everything that inet_create needs to * build a new socket. */ static struct list_head inetsw[SOCK_MAX]; static DEFINE_SPINLOCK(inetsw_lock); /* New destruction routine */ void inet_sock_destruct(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __skb_queue_purge(&sk->sk_receive_queue); __skb_queue_purge(&sk->sk_error_queue); sk_mem_reclaim_final(sk); if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { pr_err("Attempt to release TCP socket in state %d %p\n", sk->sk_state, sk); return; } if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive inet socket %p\n", sk); return; } WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); WARN_ON_ONCE(sk->sk_wmem_queued); WARN_ON_ONCE(sk_forward_alloc_get(sk)); kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); } EXPORT_SYMBOL(inet_sock_destruct); /* * The routines beyond this point handle the behaviour of an AF_INET * socket object. Mostly it punts to the subprotocols of IP to do * the work. */ /* * Automatically bind an unbound socket. */ static int inet_autobind(struct sock *sk) { struct inet_sock *inet; /* We may need to bind the socket. */ lock_sock(sk); inet = inet_sk(sk); if (!inet->inet_num) { if (sk->sk_prot->get_port(sk, 0)) { release_sock(sk); return -EAGAIN; } inet->inet_sport = htons(inet->inet_num); } release_sock(sk); return 0; } int __inet_listen_sk(struct sock *sk, int backlog) { unsigned char old_state = sk->sk_state; int err, tcp_fastopen; if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) return -EINVAL; WRITE_ONCE(sk->sk_max_ack_backlog, backlog); /* Really, if the socket is already in listen state * we can only allow the backlog to be adjusted. */ if (old_state != TCP_LISTEN) { /* Enable TFO w/o requiring TCP_FASTOPEN socket option. * Note that only TCP sockets (SOCK_STREAM) will reach here. * Also fastopen backlog may already been set via the option * because the socket was in TCP_LISTEN state previously but * was shutdown() rather than close(). */ tcp_fastopen = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_fastopen); if ((tcp_fastopen & TFO_SERVER_WO_SOCKOPT1) && (tcp_fastopen & TFO_SERVER_ENABLE) && !inet_csk(sk)->icsk_accept_queue.fastopenq.max_qlen) { fastopen_queue_tune(sk, backlog); tcp_fastopen_init_key_once(sock_net(sk)); } err = inet_csk_listen_start(sk); if (err) return err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_LISTEN_CB, 0, NULL); } return 0; } /* * Move a socket into listening state. */ int inet_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; int err = -EINVAL; lock_sock(sk); if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) goto out; err = __inet_listen_sk(sk, backlog); out: release_sock(sk); return err; } EXPORT_SYMBOL(inet_listen); /* * Create an inet socket. */ static int inet_create(struct net *net, struct socket *sock, int protocol, int kern) { struct sock *sk; struct inet_protosw *answer; struct inet_sock *inet; struct proto *answer_prot; unsigned char answer_flags; int try_loading_module = 0; int err; if (protocol < 0 || protocol >= IPPROTO_MAX) return -EINVAL; sock->state = SS_UNCONNECTED; /* Look for the requested type/protocol pair. */ lookup_protocol: err = -ESOCKTNOSUPPORT; rcu_read_lock(); list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { err = 0; /* Check the non-wild match. */ if (protocol == answer->protocol) { if (protocol != IPPROTO_IP) break; } else { /* Check for the two wild cases. */ if (IPPROTO_IP == protocol) { protocol = answer->protocol; break; } if (IPPROTO_IP == answer->protocol) break; } err = -EPROTONOSUPPORT; } if (unlikely(err)) { if (try_loading_module < 2) { rcu_read_unlock(); /* * Be more specific, e.g. net-pf-2-proto-132-type-1 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM) */ if (++try_loading_module == 1) request_module("net-pf-%d-proto-%d-type-%d", PF_INET, protocol, sock->type); /* * Fall back to generic, e.g. net-pf-2-proto-132 * (net-pf-PF_INET-proto-IPPROTO_SCTP) */ else request_module("net-pf-%d-proto-%d", PF_INET, protocol); goto lookup_protocol; } else goto out_rcu_unlock; } err = -EPERM; if (sock->type == SOCK_RAW && !kern && !ns_capable(net->user_ns, CAP_NET_RAW)) goto out_rcu_unlock; sock->ops = answer->ops; answer_prot = answer->prot; answer_flags = answer->flags; rcu_read_unlock(); WARN_ON(!answer_prot->slab); err = -ENOMEM; sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern); if (!sk) goto out; err = 0; if (INET_PROTOSW_REUSE & answer_flags) sk->sk_reuse = SK_CAN_REUSE; inet = inet_sk(sk); inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); inet_clear_bit(NODEFRAG, sk); if (SOCK_RAW == sock->type) { inet->inet_num = protocol; if (IPPROTO_RAW == protocol) inet_set_bit(HDRINCL, sk); } if (READ_ONCE(net->ipv4.sysctl_ip_no_pmtu_disc)) inet->pmtudisc = IP_PMTUDISC_DONT; else inet->pmtudisc = IP_PMTUDISC_WANT; atomic_set(&inet->inet_id, 0); sock_init_data(sock, sk); sk->sk_destruct = inet_sock_destruct; sk->sk_protocol = protocol; sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); inet->uc_ttl = -1; inet_set_bit(MC_LOOP, sk); inet->mc_ttl = 1; inet_set_bit(MC_ALL, sk); inet->mc_index = 0; inet->mc_list = NULL; inet->rcv_tos = 0; if (inet->inet_num) { /* It assumes that any protocol which allows * the user to assign a number at socket * creation time automatically * shares. */ inet->inet_sport = htons(inet->inet_num); /* Add to protocol hash chains. */ err = sk->sk_prot->hash(sk); if (err) { sk_common_release(sk); goto out; } } if (sk->sk_prot->init) { err = sk->sk_prot->init(sk); if (err) { sk_common_release(sk); goto out; } } if (!kern) { err = BPF_CGROUP_RUN_PROG_INET_SOCK(sk); if (err) { sk_common_release(sk); goto out; } } out: return err; out_rcu_unlock: rcu_read_unlock(); goto out; } /* * The peer socket should always be NULL (or else). When we call this * function we are destroying the object and from then on nobody * should refer to it. */ int inet_release(struct socket *sock) { struct sock *sk = sock->sk; if (sk) { long timeout; if (!sk->sk_kern_sock) BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk); /* Applications forget to leave groups before exiting */ ip_mc_drop_socket(sk); /* If linger is set, we don't return until the close * is complete. Otherwise we return immediately. The * actually closing is done the same either way. * * If the close is due to the process exiting, we never * linger.. */ timeout = 0; if (sock_flag(sk, SOCK_LINGER) && !(current->flags & PF_EXITING)) timeout = sk->sk_lingertime; sk->sk_prot->close(sk, timeout); sock->sk = NULL; } return 0; } EXPORT_SYMBOL(inet_release); int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) { u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { return sk->sk_prot->bind(sk, uaddr, addr_len); } if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, CGROUP_INET4_BIND, &flags); if (err) return err; return __inet_bind(sk, uaddr, addr_len, flags); } int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { return inet_bind_sk(sock->sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; u32 tb_id = RT_TABLE_LOCAL; int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) * only if s_addr is INADDR_ANY. */ err = -EAFNOSUPPORT; if (addr->sin_family != AF_UNSPEC || addr->sin_addr.s_addr != htonl(INADDR_ANY)) goto out; } tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id; chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since * allowing applications to make a non-local bind solves * several problems with systems using dynamic addressing. * (ie. your servers still start up even if your ISDN link * is temporarily down) */ err = -EADDRNOTAVAIL; if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr, chk_addr_ret)) goto out; snum = ntohs(addr->sin_port); err = -EACCES; if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; /* We keep a pair of addresses. rcv_saddr is the one * used by hash lookups, and saddr is used for transmit. * * In the BSD API these are the same except where it * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; if (sk->sk_state != TCP_CLOSE || inet->inet_num) goto out_release_sock; inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) || (flags & BIND_FORCE_ADDRESS_NO_PORT))) { err = sk->sk_prot->get_port(sk, snum); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; goto out_release_sock; } if (!(flags & BIND_FROM_BPF)) { err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); if (err) { inet->inet_saddr = inet->inet_rcv_saddr = 0; if (sk->sk_prot->put_port) sk->sk_prot->put_port(sk); goto out_release_sock; } } } if (inet->inet_rcv_saddr) sk->sk_userlocks |= SOCK_BINDADDR_LOCK; if (snum) sk->sk_userlocks |= SOCK_BINDPORT_LOCK; inet->inet_sport = htons(inet->inet_num); inet->inet_daddr = 0; inet->inet_dport = 0; sk_dst_reset(sk); err = 0; out_release_sock: if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; } int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (uaddr->sa_family == AF_UNSPEC) return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { DEFINE_WAIT_FUNC(wait, woken_wake_function); add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ * change state of the socket from TCP_SYN_*. * Connect() does not allow to get error notifications * without closing the socket. */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; } remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } /* * Connect to a remote host. There is regrettably still a little * TCP 'magic' in here. */ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg) { struct sock *sk = sock->sk; int err; long timeo; /* * uaddr can be NULL and addr_len can be 0 if: * sk is a TCP fastopen active socket and * TCP_FASTOPEN_CONNECT sockopt is set and * we already have a valid cookie for this socket. * In this case, user can call write() after connect(). * write() will invoke tcp_sendmsg_fastopen() which calls * __inet_stream_connect(). */ if (uaddr) { if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) { sk->sk_disconnects++; err = sk->sk_prot->disconnect(sk, flags); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; goto out; } } switch (sock->state) { default: err = -EINVAL; goto out; case SS_CONNECTED: err = -EISCONN; goto out; case SS_CONNECTING: if (inet_test_bit(DEFER_CONNECT, sk)) err = is_sendmsg ? -EINPROGRESS : -EISCONN; else err = -EALREADY; /* Fall out of switch with err, set for this state */ break; case SS_UNCONNECTED: err = -EISCONN; if (sk->sk_state != TCP_CLOSE) goto out; if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); if (err) goto out; } err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; sock->state = SS_CONNECTING; if (!err && inet_test_bit(DEFER_CONNECT, sk)) goto out; /* Just entered SS_CONNECTING state; the only * difference is that return value in non-blocking * case is EINPROGRESS, rather than EALREADY. */ err = -EINPROGRESS; break; } timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { int writebias = (sk->sk_protocol == IPPROTO_TCP) && tcp_sk(sk)->fastopen_req && tcp_sk(sk)->fastopen_req->data ? 1 : 0; int dis = sk->sk_disconnects; /* Error code is set above */ if (!timeo || !inet_wait_for_connect(sk, timeo, writebias)) goto out; err = sock_intr_errno(timeo); if (signal_pending(current)) goto out; if (dis != sk->sk_disconnects) { err = -EPIPE; goto out; } } /* Connection was closed by RST, timeout, ICMP error * or another process disconnected us. */ if (sk->sk_state == TCP_CLOSE) goto sock_error; /* sk->sk_err may be not zero now, if RECVERR was ordered by user * and error was received after socket entered established state. * Hence, it is handled normally after connect() return successfully. */ sock->state = SS_CONNECTED; err = 0; out: return err; sock_error: err = sock_error(sk) ? : -ECONNABORTED; sock->state = SS_UNCONNECTED; sk->sk_disconnects++; if (sk->sk_prot->disconnect(sk, flags)) sock->state = SS_DISCONNECTING; goto out; } EXPORT_SYMBOL(__inet_stream_connect); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { int err; lock_sock(sock->sk); err = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); return err; } EXPORT_SYMBOL(inet_stream_connect); void __inet_accept(struct socket *sock, struct socket *newsock, struct sock *newsk) { sock_rps_record_flow(newsk); WARN_ON(!((1 << newsk->sk_state) & (TCPF_ESTABLISHED | TCPF_SYN_RECV | TCPF_CLOSE_WAIT | TCPF_CLOSE))); if (test_bit(SOCK_SUPPORT_ZC, &sock->flags)) set_bit(SOCK_SUPPORT_ZC, &newsock->flags); sock_graft(newsk, newsock); newsock->state = SS_CONNECTED; } /* * Accept a pending connection. The TCP layer now gives BSD semantics. */ int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk1 = sock->sk, *sk2; int err = -EINVAL; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); if (!sk2) return err; lock_sock(sk2); __inet_accept(sock, newsock, sk2); release_sock(sk2); return 0; } EXPORT_SYMBOL(inet_accept); /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); int sin_addr_len = sizeof(*sin); sin->sin_family = AF_INET; lock_sock(sk); if (peer) { if (!inet->inet_dport || (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && peer == 1)) { release_sock(sk); return -ENOTCONN; } sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET4_GETPEERNAME); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, CGROUP_INET4_GETSOCKNAME); } release_sock(sk); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sin_addr_len; } EXPORT_SYMBOL(inet_getname); int inet_send_prepare(struct sock *sk) { sock_rps_record_flow(sk); /* We may need to bind the socket. */ if (data_race(!inet_sk(sk)->inet_num) && !sk->sk_prot->no_autobind && inet_autobind(sk)) return -EAGAIN; return 0; } EXPORT_SYMBOL_GPL(inet_send_prepare); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) { struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; return INDIRECT_CALL_2(sk->sk_prot->sendmsg, tcp_sendmsg, udp_sendmsg, sk, msg, size); } EXPORT_SYMBOL(inet_sendmsg); void inet_splice_eof(struct socket *sock) { const struct proto *prot; struct sock *sk = sock->sk; if (unlikely(inet_send_prepare(sk))) return; /* IPV6_ADDRFORM can change sk->sk_prot under us. */ prot = READ_ONCE(sk->sk_prot); if (prot->splice_eof) prot->splice_eof(sock); } EXPORT_SYMBOL_GPL(inet_splice_eof); INDIRECT_CALLABLE_DECLARE(int udp_recvmsg(struct sock *, struct msghdr *, size_t, int, int *)); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; int addr_len = 0; int err; if (likely(!(flags & MSG_ERRQUEUE))) sock_rps_record_flow(sk); err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, sk, msg, size, flags, &addr_len); if (err >= 0) msg->msg_namelen = addr_len; return err; } EXPORT_SYMBOL(inet_recvmsg); int inet_shutdown(struct socket *sock, int how) { struct sock *sk = sock->sk; int err = 0; /* This should really check to make sure * the socket is a TCP socket. (WHY AC...) */ how++; /* maps 0->1 has the advantage of making bit 1 rcvs and 1->2 bit 2 snds. 2->3 */ if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ return -EINVAL; lock_sock(sk); if (sock->state == SS_CONNECTING) { if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) sock->state = SS_DISCONNECTING; else sock->state = SS_CONNECTED; } switch (sk->sk_state) { case TCP_CLOSE: err = -ENOTCONN; /* Hack to wake up other listeners, who can poll for EPOLLHUP, even on eg. unconnected UDP sockets -- RR */ fallthrough; default: WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); if (sk->sk_prot->shutdown) sk->sk_prot->shutdown(sk, how); break; /* Remaining two branches are temporary solution for missing * close() in multithreaded environment. It is _not_ a good idea, * but we have no choice until close() is repaired at VFS level. */ case TCP_LISTEN: if (!(how & RCV_SHUTDOWN)) break; fallthrough; case TCP_SYN_SENT: err = sk->sk_prot->disconnect(sk, O_NONBLOCK); sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; break; } /* Wake up anyone sleeping in poll. */ sk->sk_state_change(sk); release_sock(sk); return err; } EXPORT_SYMBOL(inet_shutdown); /* * ioctl() calls you can issue on an INET socket. Most of these are * device configuration and stuff and very rarely used. Some ioctls * pass on to the socket itself. * * NOTE: I like the idea of a module for the config stuff. ie ifconfig * loads the devconfigure module does its configuring and unloads it. * There's a good 20K of config code hanging around the kernel. */ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; int err = 0; struct net *net = sock_net(sk); void __user *p = (void __user *)arg; struct ifreq ifr; struct rtentry rt; switch (cmd) { case SIOCADDRT: case SIOCDELRT: if (copy_from_user(&rt, p, sizeof(struct rtentry))) return -EFAULT; err = ip_rt_ioctl(net, cmd, &rt); break; case SIOCRTMSG: err = -EINVAL; break; case SIOCDARP: case SIOCGARP: case SIOCSARP: err = arp_ioctl(net, cmd, (void __user *)arg); break; case SIOCGIFADDR: case SIOCGIFBRDADDR: case SIOCGIFNETMASK: case SIOCGIFDSTADDR: case SIOCGIFPFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); if (!err && put_user_ifreq(&ifr, p)) err = -EFAULT; break; case SIOCSIFADDR: case SIOCSIFBRDADDR: case SIOCSIFNETMASK: case SIOCSIFDSTADDR: case SIOCSIFPFLAGS: case SIOCSIFFLAGS: if (get_user_ifreq(&ifr, NULL, p)) return -EFAULT; err = devinet_ioctl(net, cmd, &ifr); break; default: if (sk->sk_prot->ioctl) err = sk_ioctl(sk, cmd, (void __user *)arg); else err = -ENOIOCTLCMD; break; } return err; } EXPORT_SYMBOL(inet_ioctl); #ifdef CONFIG_COMPAT static int inet_compat_routing_ioctl(struct sock *sk, unsigned int cmd, struct compat_rtentry __user *ur) { compat_uptr_t rtdev; struct rtentry rt; if (copy_from_user(&rt.rt_dst, &ur->rt_dst, 3 * sizeof(struct sockaddr)) || get_user(rt.rt_flags, &ur->rt_flags) || get_user(rt.rt_metric, &ur->rt_metric) || get_user(rt.rt_mtu, &ur->rt_mtu) || get_user(rt.rt_window, &ur->rt_window) || get_user(rt.rt_irtt, &ur->rt_irtt) || get_user(rtdev, &ur->rt_dev)) return -EFAULT; rt.rt_dev = compat_ptr(rtdev); return ip_rt_ioctl(sock_net(sk), cmd, &rt); } static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { void __user *argp = compat_ptr(arg); struct sock *sk = sock->sk; switch (cmd) { case SIOCADDRT: case SIOCDELRT: return inet_compat_routing_ioctl(sk, cmd, argp); default: if (!sk->sk_prot->compat_ioctl) return -ENOIOCTLCMD; return sk->sk_prot->compat_ioctl(sk, cmd, arg); } } #endif /* CONFIG_COMPAT */ const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_stream_connect, .socketpair = sock_no_socketpair, .accept = inet_accept, .getname = inet_getname, .poll = tcp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = inet_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, #ifdef CONFIG_MMU .mmap = tcp_mmap, #endif .splice_eof = inet_splice_eof, .splice_read = tcp_splice_read, .read_sock = tcp_read_sock, .read_skb = tcp_read_skb, .sendmsg_locked = tcp_sendmsg_locked, .peek_len = tcp_peek_len, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif .set_rcvlowat = tcp_set_rcvlowat, }; EXPORT_SYMBOL(inet_stream_ops); const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = udp_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .read_skb = udp_read_skb, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, .set_peek_off = sk_set_peek_off, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; EXPORT_SYMBOL(inet_dgram_ops); /* * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, .connect = inet_dgram_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = inet_getname, .poll = datagram_poll, .ioctl = inet_ioctl, .gettstamp = sock_gettstamp, .listen = sock_no_listen, .shutdown = inet_shutdown, .setsockopt = sock_common_setsockopt, .getsockopt = sock_common_getsockopt, .sendmsg = inet_sendmsg, .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .splice_eof = inet_splice_eof, #ifdef CONFIG_COMPAT .compat_ioctl = inet_compat_ioctl, #endif }; static const struct net_proto_family inet_family_ops = { .family = PF_INET, .create = inet_create, .owner = THIS_MODULE, }; /* Upon startup we insert all the elements in inetsw_array[] into * the linked list inetsw. */ static struct inet_protosw inetsw_array[] = { { .type = SOCK_STREAM, .protocol = IPPROTO_TCP, .prot = &tcp_prot, .ops = &inet_stream_ops, .flags = INET_PROTOSW_PERMANENT | INET_PROTOSW_ICSK, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_UDP, .prot = &udp_prot, .ops = &inet_dgram_ops, .flags = INET_PROTOSW_PERMANENT, }, { .type = SOCK_DGRAM, .protocol = IPPROTO_ICMP, .prot = &ping_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, }, { .type = SOCK_RAW, .protocol = IPPROTO_IP, /* wild card */ .prot = &raw_prot, .ops = &inet_sockraw_ops, .flags = INET_PROTOSW_REUSE, } }; #define INETSW_ARRAY_LEN ARRAY_SIZE(inetsw_array) void inet_register_protosw(struct inet_protosw *p) { struct list_head *lh; struct inet_protosw *answer; int protocol = p->protocol; struct list_head *last_perm; spin_lock_bh(&inetsw_lock); if (p->type >= SOCK_MAX) goto out_illegal; /* If we are trying to override a permanent protocol, bail. */ last_perm = &inetsw[p->type]; list_for_each(lh, &inetsw[p->type]) { answer = list_entry(lh, struct inet_protosw, list); /* Check only the non-wild match. */ if ((INET_PROTOSW_PERMANENT & answer->flags) == 0) break; if (protocol == answer->protocol) goto out_permanent; last_perm = lh; } /* Add the new entry after the last permanent entry if any, so that * the new entry does not override a permanent entry when matched with * a wild-card protocol. But it is allowed to override any existing * non-permanent entry. This means that when we remove this entry, the * system automatically returns to the old behavior. */ list_add_rcu(&p->list, last_perm); out: spin_unlock_bh(&inetsw_lock); return; out_permanent: pr_err("Attempt to override permanent protocol %d\n", protocol); goto out; out_illegal: pr_err("Ignoring attempt to register invalid socket type %d\n", p->type); goto out; } EXPORT_SYMBOL(inet_register_protosw); void inet_unregister_protosw(struct inet_protosw *p) { if (INET_PROTOSW_PERMANENT & p->flags) { pr_err("Attempt to unregister permanent protocol %d\n", p->protocol); } else { spin_lock_bh(&inetsw_lock); list_del_rcu(&p->list); spin_unlock_bh(&inetsw_lock); synchronize_net(); } } EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; struct flowi4 *fl4; struct rtable *rt; __be32 new_saddr; struct ip_options_rcu *inet_opt; int err; inet_opt = rcu_dereference_protected(inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; /* Query new route. */ fl4 = &inet->cork.fl.u.ip4; rt = ip_route_connect(fl4, daddr, 0, sk->sk_bound_dev_if, sk->sk_protocol, inet->inet_sport, inet->inet_dport, sk); if (IS_ERR(rt)) return PTR_ERR(rt); new_saddr = fl4->saddr; if (new_saddr == old_saddr) { sk_setup_caps(sk, &rt->dst); return 0; } err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { ip_rt_put(rt); return err; } sk_setup_caps(sk, &rt->dst); if (READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) > 1) { pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n", __func__, &old_saddr, &new_saddr); } /* * XXX The only one ugly spot where we need to * XXX really change the sockets identity after * XXX it has entered the hashes. -DaveM * * Besides that, it does not check for connection * uniqueness. Wait for troubles. */ return __sk_prot_rehash(sk); } int inet_sk_rebuild_header(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); __be32 daddr; struct ip_options_rcu *inet_opt; struct flowi4 *fl4; int err; /* Route is OK, nothing to do. */ if (rt) return 0; /* Reroute. */ rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); daddr = inet->inet_daddr; if (inet_opt && inet_opt->opt.srr) daddr = inet_opt->opt.faddr; rcu_read_unlock(); fl4 = &inet->cork.fl.u.ip4; rt = ip_route_output_ports(sock_net(sk), fl4, sk, daddr, inet->inet_saddr, inet->inet_dport, inet->inet_sport, sk->sk_protocol, RT_CONN_FLAGS(sk), sk->sk_bound_dev_if); if (!IS_ERR(rt)) { err = 0; sk_setup_caps(sk, &rt->dst); } else { err = PTR_ERR(rt); /* Routing failed... */ sk->sk_route_caps = 0; /* * Other protocols have to map its equivalent state to TCP_SYN_SENT. * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme */ if (!READ_ONCE(sock_net(sk)->ipv4.sysctl_ip_dynaddr) || sk->sk_state != TCP_SYN_SENT || (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || (err = inet_sk_reselect_saddr(sk)) != 0) WRITE_ONCE(sk->sk_err_soft, -err); } return err; } EXPORT_SYMBOL(inet_sk_rebuild_header); void inet_sk_set_state(struct sock *sk, int state) { trace_inet_sock_set_state(sk, sk->sk_state, state); sk->sk_state = state; } EXPORT_SYMBOL(inet_sk_set_state); void inet_sk_state_store(struct sock *sk, int newstate) { trace_inet_sock_set_state(sk, sk->sk_state, newstate); smp_store_release(&sk->sk_state, newstate); } struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features) { bool udpfrag = false, fixedid = false, gso_partial, encap; struct sk_buff *segs = ERR_PTR(-EINVAL); const struct net_offload *ops; unsigned int offset = 0; struct iphdr *iph; int proto, tot_len; int nhoff; int ihl; int id; skb_reset_network_header(skb); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*iph)))) goto out; iph = ip_hdr(skb); ihl = iph->ihl * 4; if (ihl < sizeof(*iph)) goto out; id = ntohs(iph->id); proto = iph->protocol; /* Warning: after this point, iph might be no longer valid */ if (unlikely(!pskb_may_pull(skb, ihl))) goto out; __skb_pull(skb, ihl); encap = SKB_GSO_CB(skb)->encap_level > 0; if (encap) features &= skb->dev->hw_enc_features; SKB_GSO_CB(skb)->encap_level += ihl; skb_reset_transport_header(skb); segs = ERR_PTR(-EPROTONOSUPPORT); if (!skb->encapsulation || encap) { udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); /* fixed ID is invalid if DF bit is not set */ if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) goto out; } ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { segs = ops->callbacks.gso_segment(skb, features); if (!segs) skb->network_header = skb_mac_header(skb) + nhoff - skb->head; } if (IS_ERR_OR_NULL(segs)) goto out; gso_partial = !!(skb_shinfo(segs)->gso_type & SKB_GSO_PARTIAL); skb = segs; do { iph = (struct iphdr *)(skb_mac_header(skb) + nhoff); if (udpfrag) { iph->frag_off = htons(offset >> 3); if (skb->next) iph->frag_off |= htons(IP_MF); offset += skb->len - nhoff - ihl; tot_len = skb->len - nhoff; } else if (skb_is_gso(skb)) { if (!fixedid) { iph->id = htons(id); id += skb_shinfo(skb)->gso_segs; } if (gso_partial) tot_len = skb_shinfo(skb)->gso_size + SKB_GSO_CB(skb)->data_offset + skb->head - (unsigned char *)iph; else tot_len = skb->len - nhoff; } else { if (!fixedid) iph->id = htons(id++); tot_len = skb->len - nhoff; } iph->tot_len = htons(tot_len); ip_send_check(iph); if (encap) skb_reset_inner_headers(skb); skb->network_header = (u8 *)iph - skb->head; skb_reset_mac_len(skb); } while ((skb = skb->next)); out: return segs; } static struct sk_buff *ipip_gso_segment(struct sk_buff *skb, netdev_features_t features) { if (!(skb_shinfo(skb)->gso_type & SKB_GSO_IPXIP4)) return ERR_PTR(-EINVAL); return inet_gso_segment(skb, features); } struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb) { const struct net_offload *ops; struct sk_buff *pp = NULL; const struct iphdr *iph; struct sk_buff *p; unsigned int hlen; unsigned int off; unsigned int id; int flush = 1; int proto; off = skb_gro_offset(skb); hlen = off + sizeof(*iph); iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; proto = iph->protocol; ops = rcu_dereference(inet_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) goto out; if (*(u8 *)iph != 0x45) goto out; if (ip_is_fragment(iph)) goto out; if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out; NAPI_GRO_CB(skb)->proto = proto; id = ntohl(*(__be32 *)&iph->id); flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id & ~IP_DF)); id >>= 16; list_for_each_entry(p, head, list) { struct iphdr *iph2; u16 flush_id; if (!NAPI_GRO_CB(p)->same_flow) continue; iph2 = (struct iphdr *)(p->data + off); /* The above works because, with the exception of the top * (inner most) layer, we only aggregate pkts with the same * hdr length so all the hdrs we'll need to verify will start * at the same offset. */ if ((iph->protocol ^ iph2->protocol) | ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) | ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) { NAPI_GRO_CB(p)->same_flow = 0; continue; } /* All fields must match except length and checksum. */ NAPI_GRO_CB(p)->flush |= (iph->ttl ^ iph2->ttl) | (iph->tos ^ iph2->tos) | ((iph->frag_off ^ iph2->frag_off) & htons(IP_DF)); NAPI_GRO_CB(p)->flush |= flush; /* We need to store of the IP ID check to be included later * when we can verify that this packet does in fact belong * to a given flow. */ flush_id = (u16)(id - ntohs(iph2->id)); /* This bit of code makes it much easier for us to identify * the cases where we are doing atomic vs non-atomic IP ID * checks. Specifically an atomic check can return IP ID * values 0 - 0xFFFF, while a non-atomic check can only * return 0 or 0xFFFF. */ if (!NAPI_GRO_CB(p)->is_atomic || !(iph->frag_off & htons(IP_DF))) { flush_id ^= NAPI_GRO_CB(p)->count; flush_id = flush_id ? 0xFFFF : 0; } /* If the previous IP ID value was based on an atomic * datagram we can overwrite the value and ignore it. */ if (NAPI_GRO_CB(skb)->is_atomic) NAPI_GRO_CB(p)->flush_id = flush_id; else NAPI_GRO_CB(p)->flush_id |= flush_id; } NAPI_GRO_CB(skb)->is_atomic = !!(iph->frag_off & htons(IP_DF)); NAPI_GRO_CB(skb)->flush |= flush; skb_set_network_header(skb, off); /* The above will be needed by the transport layer if there is one * immediately following this IP hdr. */ /* Note : No need to call skb_gro_postpull_rcsum() here, * as we already checked checksum over ipv4 header was 0 */ skb_gro_pull(skb, sizeof(*iph)); skb_set_transport_header(skb, skb_gro_offset(skb)); pp = indirect_call_gro_receive(tcp4_gro_receive, udp4_gro_receive, ops->callbacks.gro_receive, head, skb); out: skb_gro_flush_final(skb, pp, flush); return pp; } static struct sk_buff *ipip_gro_receive(struct list_head *head, struct sk_buff *skb) { if (NAPI_GRO_CB(skb)->encap_mark) { NAPI_GRO_CB(skb)->flush = 1; return NULL; } NAPI_GRO_CB(skb)->encap_mark = 1; return inet_gro_receive(head, skb); } #define SECONDS_PER_DAY 86400 /* inet_current_timestamp - Return IP network timestamp * * Return milliseconds since midnight in network byte order. */ __be32 inet_current_timestamp(void) { u32 secs; u32 msecs; struct timespec64 ts; ktime_get_real_ts64(&ts); /* Get secs since midnight. */ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs); /* Convert to msecs. */ msecs = secs * MSEC_PER_SEC; /* Convert nsec to msec. */ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC; /* Convert to network byte order. */ return htonl(msecs); } EXPORT_SYMBOL(inet_current_timestamp); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) { if (sk->sk_family == AF_INET) return ip_recv_error(sk, msg, len, addr_len); #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) return pingv6_ops.ipv6_recv_error(sk, msg, len, addr_len); #endif return -EINVAL; } int inet_gro_complete(struct sk_buff *skb, int nhoff) { struct iphdr *iph = (struct iphdr *)(skb->data + nhoff); const struct net_offload *ops; __be16 totlen = iph->tot_len; int proto = iph->protocol; int err = -ENOSYS; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IP)); skb_set_inner_network_header(skb, nhoff); } iph_set_totlen(iph, skb->len - nhoff); csum_replace2(&iph->check, totlen, iph->tot_len); ops = rcu_dereference(inet_offloads[proto]); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; /* Only need to add sizeof(*iph) to get to the next hdr below * because any hdr with option will have been flushed in * inet_gro_receive(). */ err = INDIRECT_CALL_2(ops->callbacks.gro_complete, tcp4_gro_complete, udp4_gro_complete, skb, nhoff + sizeof(*iph)); out: return err; } static int ipip_gro_complete(struct sk_buff *skb, int nhoff) { skb->encapsulation = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_IPXIP4; return inet_gro_complete(skb, nhoff); } int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net) { struct socket *sock; int rc = sock_create_kern(net, family, type, protocol, &sock); if (rc == 0) { *sk = sock->sk; (*sk)->sk_allocation = GFP_ATOMIC; (*sk)->sk_use_task_frag = false; /* * Unhash it so that IP input processing does not even see it, * we do not wish this socket to see incoming packets. */ (*sk)->sk_prot->unhash(*sk); } return rc; } EXPORT_SYMBOL_GPL(inet_ctl_sock_create); unsigned long snmp_fold_field(void __percpu *mib, int offt) { unsigned long res = 0; int i; for_each_possible_cpu(i) res += snmp_get_cpu_field(mib, i, offt); return res; } EXPORT_SYMBOL_GPL(snmp_fold_field); #if BITS_PER_LONG==32 u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, size_t syncp_offset) { void *bhptr; struct u64_stats_sync *syncp; u64 v; unsigned int start; bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { start = u64_stats_fetch_begin(syncp); v = *(((u64 *)bhptr) + offt); } while (u64_stats_fetch_retry(syncp, start)); return v; } EXPORT_SYMBOL_GPL(snmp_get_cpu_field64); u64 snmp_fold_field64(void __percpu *mib, int offt, size_t syncp_offset) { u64 res = 0; int cpu; for_each_possible_cpu(cpu) { res += snmp_get_cpu_field64(mib, cpu, offt, syncp_offset); } return res; } EXPORT_SYMBOL_GPL(snmp_fold_field64); #endif #ifdef CONFIG_IP_MULTICAST static const struct net_protocol igmp_protocol = { .handler = igmp_rcv, }; #endif static const struct net_protocol tcp_protocol = { .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, .icmp_strict_tag_validation = 1, }; static const struct net_protocol udp_protocol = { .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, }; static const struct net_protocol icmp_protocol = { .handler = icmp_rcv, .err_handler = icmp_err, .no_policy = 1, }; static __net_init int ipv4_mib_init_net(struct net *net) { int i; net->mib.tcp_statistics = alloc_percpu(struct tcp_mib); if (!net->mib.tcp_statistics) goto err_tcp_mib; net->mib.ip_statistics = alloc_percpu(struct ipstats_mib); if (!net->mib.ip_statistics) goto err_ip_mib; for_each_possible_cpu(i) { struct ipstats_mib *af_inet_stats; af_inet_stats = per_cpu_ptr(net->mib.ip_statistics, i); u64_stats_init(&af_inet_stats->syncp); } net->mib.net_statistics = alloc_percpu(struct linux_mib); if (!net->mib.net_statistics) goto err_net_mib; net->mib.udp_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udp_statistics) goto err_udp_mib; net->mib.udplite_statistics = alloc_percpu(struct udp_mib); if (!net->mib.udplite_statistics) goto err_udplite_mib; net->mib.icmp_statistics = alloc_percpu(struct icmp_mib); if (!net->mib.icmp_statistics) goto err_icmp_mib; net->mib.icmpmsg_statistics = kzalloc(sizeof(struct icmpmsg_mib), GFP_KERNEL); if (!net->mib.icmpmsg_statistics) goto err_icmpmsg_mib; tcp_mib_init(net); return 0; err_icmpmsg_mib: free_percpu(net->mib.icmp_statistics); err_icmp_mib: free_percpu(net->mib.udplite_statistics); err_udplite_mib: free_percpu(net->mib.udp_statistics); err_udp_mib: free_percpu(net->mib.net_statistics); err_net_mib: free_percpu(net->mib.ip_statistics); err_ip_mib: free_percpu(net->mib.tcp_statistics); err_tcp_mib: return -ENOMEM; } static __net_exit void ipv4_mib_exit_net(struct net *net) { kfree(net->mib.icmpmsg_statistics); free_percpu(net->mib.icmp_statistics); free_percpu(net->mib.udplite_statistics); free_percpu(net->mib.udp_statistics); free_percpu(net->mib.net_statistics); free_percpu(net->mib.ip_statistics); free_percpu(net->mib.tcp_statistics); #ifdef CONFIG_MPTCP /* allocated on demand, see mptcp_init_sock() */ free_percpu(net->mib.mptcp_statistics); #endif } static __net_initdata struct pernet_operations ipv4_mib_ops = { .init = ipv4_mib_init_net, .exit = ipv4_mib_exit_net, }; static int __init init_ipv4_mibs(void) { return register_pernet_subsys(&ipv4_mib_ops); } static __net_init int inet_init_net(struct net *net) { /* * Set defaults for local port range */ seqlock_init(&net->ipv4.ip_local_ports.lock); net->ipv4.ip_local_ports.range[0] = 32768; net->ipv4.ip_local_ports.range[1] = 60999; seqlock_init(&net->ipv4.ping_group_range.lock); /* * Sane defaults - nobody may create ping sockets. * Boot scripts should set this to distro-specific group. */ net->ipv4.ping_group_range.range[0] = make_kgid(&init_user_ns, 1); net->ipv4.ping_group_range.range[1] = make_kgid(&init_user_ns, 0); /* Default values for sysctl-controlled parameters. * We set them here, in case sysctl is not compiled. */ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_fwd_update_priority = 1; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; net->ipv4.sysctl_udp_early_demux = 1; net->ipv4.sysctl_tcp_early_demux = 1; net->ipv4.sysctl_nexthop_compat_mode = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif /* Some igmp sysctl, whose values are always used */ net->ipv4.sysctl_igmp_max_memberships = 20; net->ipv4.sysctl_igmp_max_msf = 10; /* IGMP reports for link-local multicast groups are enabled by default */ net->ipv4.sysctl_igmp_llm_reports = 1; net->ipv4.sysctl_igmp_qrv = 2; net->ipv4.sysctl_fib_notify_on_flag_change = 0; return 0; } static __net_initdata struct pernet_operations af_inet_ops = { .init = inet_init_net, }; static int __init init_inet_pernet_ops(void) { return register_pernet_subsys(&af_inet_ops); } static int ipv4_proc_init(void); /* * IP protocol layer initialiser */ static struct packet_offload ip_packet_offload __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .callbacks = { .gso_segment = inet_gso_segment, .gro_receive = inet_gro_receive, .gro_complete = inet_gro_complete, }, }; static const struct net_offload ipip_offload = { .callbacks = { .gso_segment = ipip_gso_segment, .gro_receive = ipip_gro_receive, .gro_complete = ipip_gro_complete, }, }; static int __init ipip_offload_init(void) { return inet_add_offload(&ipip_offload, IPPROTO_IPIP); } static int __init ipv4_offload_init(void) { /* * Add offloads */ if (udpv4_offload_init() < 0) pr_crit("%s: Cannot add UDP protocol offload\n", __func__); if (tcpv4_offload_init() < 0) pr_crit("%s: Cannot add TCP protocol offload\n", __func__); if (ipip_offload_init() < 0) pr_crit("%s: Cannot add IPIP protocol offload\n", __func__); dev_add_offload(&ip_packet_offload); return 0; } fs_initcall(ipv4_offload_init); static struct packet_type ip_packet_type __read_mostly = { .type = cpu_to_be16(ETH_P_IP), .func = ip_rcv, .list_func = ip_list_rcv, }; static int __init inet_init(void) { struct inet_protosw *q; struct list_head *r; int rc; sock_skb_cb_check_size(sizeof(struct inet_skb_parm)); raw_hashinfo_init(&raw_v4_hashinfo); rc = proto_register(&tcp_prot, 1); if (rc) goto out; rc = proto_register(&udp_prot, 1); if (rc) goto out_unregister_tcp_proto; rc = proto_register(&raw_prot, 1); if (rc) goto out_unregister_udp_proto; rc = proto_register(&ping_prot, 1); if (rc) goto out_unregister_raw_proto; /* * Tell SOCKET that we are alive... */ (void)sock_register(&inet_family_ops); #ifdef CONFIG_SYSCTL ip_static_sysctl_init(); #endif /* * Add all the base protocols. */ if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) pr_crit("%s: Cannot add ICMP protocol\n", __func__); if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) pr_crit("%s: Cannot add UDP protocol\n", __func__); if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) pr_crit("%s: Cannot add TCP protocol\n", __func__); #ifdef CONFIG_IP_MULTICAST if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) pr_crit("%s: Cannot add IGMP protocol\n", __func__); #endif /* Register the socket-side information for inet_create. */ for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) inet_register_protosw(q); /* * Set the ARP module up */ arp_init(); /* * Set the IP module up */ ip_init(); /* Initialise per-cpu ipv4 mibs */ if (init_ipv4_mibs()) panic("%s: Cannot init ipv4 mibs\n", __func__); /* Setup TCP slab cache for open requests. */ tcp_init(); /* Setup UDP memory threshold */ udp_init(); /* Add UDP-Lite (RFC 3828) */ udplite4_register(); raw_init(); ping_init(); /* * Set the ICMP layer up */ if (icmp_init() < 0) panic("Failed to create the ICMP control socket.\n"); /* * Initialise the multicast router */ #if defined(CONFIG_IP_MROUTE) if (ip_mr_init()) pr_crit("%s: Cannot init ipv4 mroute\n", __func__); #endif if (init_inet_pernet_ops()) pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__); ipv4_proc_init(); ipfrag_init(); dev_add_pack(&ip_packet_type); ip_tunnel_core_init(); rc = 0; out: return rc; out_unregister_raw_proto: proto_unregister(&raw_prot); out_unregister_udp_proto: proto_unregister(&udp_prot); out_unregister_tcp_proto: proto_unregister(&tcp_prot); goto out; } fs_initcall(inet_init); /* ------------------------------------------------------------------------ */ #ifdef CONFIG_PROC_FS static int __init ipv4_proc_init(void) { int rc = 0; if (raw_proc_init()) goto out_raw; if (tcp4_proc_init()) goto out_tcp; if (udp4_proc_init()) goto out_udp; if (ping_proc_init()) goto out_ping; if (ip_misc_proc_init()) goto out_misc; out: return rc; out_misc: ping_proc_exit(); out_ping: udp4_proc_exit(); out_udp: tcp4_proc_exit(); out_tcp: raw_proc_exit(); out_raw: rc = -ENOMEM; goto out; } #else /* CONFIG_PROC_FS */ static int __init ipv4_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */
2 3 11 2 3 1 2 2 11 11 27 26 20 27 2 2 6 1 4 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 // SPDX-License-Identifier: GPL-2.0-or-later /* * CIPSO - Commercial IP Security Option * * This is an implementation of the CIPSO 2.2 protocol as specified in * draft-ietf-cipso-ipsecurity-01.txt with additional tag types as found in * FIPS-188. While CIPSO never became a full IETF RFC standard many vendors * have chosen to adopt the protocol and over the years it has become a * de-facto standard for labeled networking. * * The CIPSO draft specification can be found in the kernel's Documentation * directory as well as the following URL: * https://tools.ietf.org/id/draft-ietf-cipso-ipsecurity-01.txt * The FIPS-188 specification can be found at the following URL: * https://www.itl.nist.gov/fipspubs/fip188.htm * * Author: Paul Moore <paul.moore@hp.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #include <linux/init.h> #include <linux/types.h> #include <linux/rcupdate.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/string.h> #include <linux/jhash.h> #include <linux/audit.h> #include <linux/slab.h> #include <net/ip.h> #include <net/icmp.h> #include <net/tcp.h> #include <net/netlabel.h> #include <net/cipso_ipv4.h> #include <linux/atomic.h> #include <linux/bug.h> #include <asm/unaligned.h> /* List of available DOI definitions */ /* XXX - This currently assumes a minimal number of different DOIs in use, * if in practice there are a lot of different DOIs this list should * probably be turned into a hash table or something similar so we * can do quick lookups. */ static DEFINE_SPINLOCK(cipso_v4_doi_list_lock); static LIST_HEAD(cipso_v4_doi_list); /* Label mapping cache */ int cipso_v4_cache_enabled = 1; int cipso_v4_cache_bucketsize = 10; #define CIPSO_V4_CACHE_BUCKETBITS 7 #define CIPSO_V4_CACHE_BUCKETS (1 << CIPSO_V4_CACHE_BUCKETBITS) #define CIPSO_V4_CACHE_REORDERLIMIT 10 struct cipso_v4_map_cache_bkt { spinlock_t lock; u32 size; struct list_head list; }; struct cipso_v4_map_cache_entry { u32 hash; unsigned char *key; size_t key_len; struct netlbl_lsm_cache *lsm_data; u32 activity; struct list_head list; }; static struct cipso_v4_map_cache_bkt *cipso_v4_cache; /* Restricted bitmap (tag #1) flags */ int cipso_v4_rbm_optfmt; int cipso_v4_rbm_strictvalid = 1; /* * Protocol Constants */ /* Maximum size of the CIPSO IP option, derived from the fact that the maximum * IPv4 header size is 60 bytes and the base IPv4 header is 20 bytes long. */ #define CIPSO_V4_OPT_LEN_MAX 40 /* Length of the base CIPSO option, this includes the option type (1 byte), the * option length (1 byte), and the DOI (4 bytes). */ #define CIPSO_V4_HDR_LEN 6 /* Base length of the restrictive category bitmap tag (tag #1). */ #define CIPSO_V4_TAG_RBM_BLEN 4 /* Base length of the enumerated category tag (tag #2). */ #define CIPSO_V4_TAG_ENUM_BLEN 4 /* Base length of the ranged categories bitmap tag (tag #5). */ #define CIPSO_V4_TAG_RNG_BLEN 4 /* The maximum number of category ranges permitted in the ranged category tag * (tag #5). You may note that the IETF draft states that the maximum number * of category ranges is 7, but if the low end of the last category range is * zero then it is possible to fit 8 category ranges because the zero should * be omitted. */ #define CIPSO_V4_TAG_RNG_CAT_MAX 8 /* Base length of the local tag (non-standard tag). * Tag definition (may change between kernel versions) * * 0 8 16 24 32 * +----------+----------+----------+----------+ * | 10000000 | 00000110 | 32-bit secid value | * +----------+----------+----------+----------+ * | in (host byte order)| * +----------+----------+ * */ #define CIPSO_V4_TAG_LOC_BLEN 6 /* * Helper Functions */ /** * cipso_v4_cache_entry_free - Frees a cache entry * @entry: the entry to free * * Description: * This function frees the memory associated with a cache entry including the * LSM cache data if there are no longer any users, i.e. reference count == 0. * */ static void cipso_v4_cache_entry_free(struct cipso_v4_map_cache_entry *entry) { if (entry->lsm_data) netlbl_secattr_cache_free(entry->lsm_data); kfree(entry->key); kfree(entry); } /** * cipso_v4_map_cache_hash - Hashing function for the CIPSO cache * @key: the hash key * @key_len: the length of the key in bytes * * Description: * The CIPSO tag hashing function. Returns a 32-bit hash value. * */ static u32 cipso_v4_map_cache_hash(const unsigned char *key, u32 key_len) { return jhash(key, key_len, 0); } /* * Label Mapping Cache Functions */ /** * cipso_v4_cache_init - Initialize the CIPSO cache * * Description: * Initializes the CIPSO label mapping cache, this function should be called * before any of the other functions defined in this file. Returns zero on * success, negative values on error. * */ static int __init cipso_v4_cache_init(void) { u32 iter; cipso_v4_cache = kcalloc(CIPSO_V4_CACHE_BUCKETS, sizeof(struct cipso_v4_map_cache_bkt), GFP_KERNEL); if (!cipso_v4_cache) return -ENOMEM; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_init(&cipso_v4_cache[iter].lock); cipso_v4_cache[iter].size = 0; INIT_LIST_HEAD(&cipso_v4_cache[iter].list); } return 0; } /** * cipso_v4_cache_invalidate - Invalidates the current CIPSO cache * * Description: * Invalidates and frees any entries in the CIPSO cache. * */ void cipso_v4_cache_invalidate(void) { struct cipso_v4_map_cache_entry *entry, *tmp_entry; u32 iter; for (iter = 0; iter < CIPSO_V4_CACHE_BUCKETS; iter++) { spin_lock_bh(&cipso_v4_cache[iter].lock); list_for_each_entry_safe(entry, tmp_entry, &cipso_v4_cache[iter].list, list) { list_del(&entry->list); cipso_v4_cache_entry_free(entry); } cipso_v4_cache[iter].size = 0; spin_unlock_bh(&cipso_v4_cache[iter].lock); } } /** * cipso_v4_cache_check - Check the CIPSO cache for a label mapping * @key: the buffer to check * @key_len: buffer length in bytes * @secattr: the security attribute struct to use * * Description: * This function checks the cache to see if a label mapping already exists for * the given key. If there is a match then the cache is adjusted and the * @secattr struct is populated with the correct LSM security attributes. The * cache is adjusted in the following manner if the entry is not already the * first in the cache bucket: * * 1. The cache entry's activity counter is incremented * 2. The previous (higher ranking) entry's activity counter is decremented * 3. If the difference between the two activity counters is geater than * CIPSO_V4_CACHE_REORDERLIMIT the two entries are swapped * * Returns zero on success, -ENOENT for a cache miss, and other negative values * on error. * */ static int cipso_v4_cache_check(const unsigned char *key, u32 key_len, struct netlbl_lsm_secattr *secattr) { u32 bkt; struct cipso_v4_map_cache_entry *entry; struct cipso_v4_map_cache_entry *prev_entry = NULL; u32 hash; if (!READ_ONCE(cipso_v4_cache_enabled)) return -ENOENT; hash = cipso_v4_map_cache_hash(key, key_len); bkt = hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); list_for_each_entry(entry, &cipso_v4_cache[bkt].list, list) { if (entry->hash == hash && entry->key_len == key_len && memcmp(entry->key, key, key_len) == 0) { entry->activity += 1; refcount_inc(&entry->lsm_data->refcount); secattr->cache = entry->lsm_data; secattr->flags |= NETLBL_SECATTR_CACHE; secattr->type = NETLBL_NLTYPE_CIPSOV4; if (!prev_entry) { spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } if (prev_entry->activity > 0) prev_entry->activity -= 1; if (entry->activity > prev_entry->activity && entry->activity - prev_entry->activity > CIPSO_V4_CACHE_REORDERLIMIT) { __list_del(entry->list.prev, entry->list.next); __list_add(&entry->list, prev_entry->list.prev, &prev_entry->list); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; } prev_entry = entry; } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return -ENOENT; } /** * cipso_v4_cache_add - Add an entry to the CIPSO cache * @cipso_ptr: pointer to CIPSO IP option * @secattr: the packet's security attributes * * Description: * Add a new entry into the CIPSO label mapping cache. Add the new entry to * head of the cache bucket's list, if the cache bucket is out of room remove * the last entry in the list first. It is important to note that there is * currently no checking for duplicate keys. Returns zero on success, * negative values on failure. * */ int cipso_v4_cache_add(const unsigned char *cipso_ptr, const struct netlbl_lsm_secattr *secattr) { int bkt_size = READ_ONCE(cipso_v4_cache_bucketsize); int ret_val = -EPERM; u32 bkt; struct cipso_v4_map_cache_entry *entry = NULL; struct cipso_v4_map_cache_entry *old_entry = NULL; u32 cipso_ptr_len; if (!READ_ONCE(cipso_v4_cache_enabled) || bkt_size <= 0) return 0; cipso_ptr_len = cipso_ptr[1]; entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) return -ENOMEM; entry->key = kmemdup(cipso_ptr, cipso_ptr_len, GFP_ATOMIC); if (!entry->key) { ret_val = -ENOMEM; goto cache_add_failure; } entry->key_len = cipso_ptr_len; entry->hash = cipso_v4_map_cache_hash(cipso_ptr, cipso_ptr_len); refcount_inc(&secattr->cache->refcount); entry->lsm_data = secattr->cache; bkt = entry->hash & (CIPSO_V4_CACHE_BUCKETS - 1); spin_lock_bh(&cipso_v4_cache[bkt].lock); if (cipso_v4_cache[bkt].size < bkt_size) { list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache[bkt].size += 1; } else { old_entry = list_entry(cipso_v4_cache[bkt].list.prev, struct cipso_v4_map_cache_entry, list); list_del(&old_entry->list); list_add(&entry->list, &cipso_v4_cache[bkt].list); cipso_v4_cache_entry_free(old_entry); } spin_unlock_bh(&cipso_v4_cache[bkt].lock); return 0; cache_add_failure: if (entry) cipso_v4_cache_entry_free(entry); return ret_val; } /* * DOI List Functions */ /** * cipso_v4_doi_search - Searches for a DOI definition * @doi: the DOI to search for * * Description: * Search the DOI definition list for a DOI definition with a DOI value that * matches @doi. The caller is responsible for calling rcu_read_[un]lock(). * Returns a pointer to the DOI definition on success and NULL on failure. */ static struct cipso_v4_doi *cipso_v4_doi_search(u32 doi) { struct cipso_v4_doi *iter; list_for_each_entry_rcu(iter, &cipso_v4_doi_list, list) if (iter->doi == doi && refcount_read(&iter->refcount)) return iter; return NULL; } /** * cipso_v4_doi_add - Add a new DOI to the CIPSO protocol engine * @doi_def: the DOI structure * @audit_info: NetLabel audit information * * Description: * The caller defines a new DOI for use by the CIPSO engine and calls this * function to add it to the list of acceptable domains. The caller must * ensure that the mapping table specified in @doi_def->map meets all of the * requirements of the mapping type (see cipso_ipv4.h for details). Returns * zero on success and non-zero on failure. * */ int cipso_v4_doi_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { int ret_val = -EINVAL; u32 iter; u32 doi; u32 doi_type; struct audit_buffer *audit_buf; doi = doi_def->doi; doi_type = doi_def->type; if (doi_def->doi == CIPSO_V4_DOI_UNKNOWN) goto doi_add_return; for (iter = 0; iter < CIPSO_V4_TAG_MAXCNT; iter++) { switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: break; case CIPSO_V4_TAG_RANGE: case CIPSO_V4_TAG_ENUM: if (doi_def->type != CIPSO_V4_MAP_PASS) goto doi_add_return; break; case CIPSO_V4_TAG_LOCAL: if (doi_def->type != CIPSO_V4_MAP_LOCAL) goto doi_add_return; break; case CIPSO_V4_TAG_INVALID: if (iter == 0) goto doi_add_return; break; default: goto doi_add_return; } } refcount_set(&doi_def->refcount, 1); spin_lock(&cipso_v4_doi_list_lock); if (cipso_v4_doi_search(doi_def->doi)) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -EEXIST; goto doi_add_return; } list_add_tail_rcu(&doi_def->list, &cipso_v4_doi_list); spin_unlock(&cipso_v4_doi_list_lock); ret_val = 0; doi_add_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_ADD, audit_info); if (audit_buf) { const char *type_str; switch (doi_type) { case CIPSO_V4_MAP_TRANS: type_str = "trans"; break; case CIPSO_V4_MAP_PASS: type_str = "pass"; break; case CIPSO_V4_MAP_LOCAL: type_str = "local"; break; default: type_str = "(unknown)"; } audit_log_format(audit_buf, " cipso_doi=%u cipso_type=%s res=%u", doi, type_str, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_free - Frees a DOI definition * @doi_def: the DOI definition * * Description: * This function frees all of the memory associated with a DOI definition. * */ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) { if (!doi_def) return; switch (doi_def->type) { case CIPSO_V4_MAP_TRANS: kfree(doi_def->map.std->lvl.cipso); kfree(doi_def->map.std->lvl.local); kfree(doi_def->map.std->cat.cipso); kfree(doi_def->map.std->cat.local); kfree(doi_def->map.std); break; } kfree(doi_def); } /** * cipso_v4_doi_free_rcu - Frees a DOI definition via the RCU pointer * @entry: the entry's RCU field * * Description: * This function is designed to be used as a callback to the call_rcu() * function so that the memory allocated to the DOI definition can be released * safely. * */ static void cipso_v4_doi_free_rcu(struct rcu_head *entry) { struct cipso_v4_doi *doi_def; doi_def = container_of(entry, struct cipso_v4_doi, rcu); cipso_v4_doi_free(doi_def); } /** * cipso_v4_doi_remove - Remove an existing DOI from the CIPSO protocol engine * @doi: the DOI value * @audit_info: NetLabel audit information * * Description: * Removes a DOI definition from the CIPSO engine. The NetLabel routines will * be called to release their own LSM domain mappings as well as our own * domain list. Returns zero on success and negative values on failure. * */ int cipso_v4_doi_remove(u32 doi, struct netlbl_audit *audit_info) { int ret_val; struct cipso_v4_doi *doi_def; struct audit_buffer *audit_buf; spin_lock(&cipso_v4_doi_list_lock); doi_def = cipso_v4_doi_search(doi); if (!doi_def) { spin_unlock(&cipso_v4_doi_list_lock); ret_val = -ENOENT; goto doi_remove_return; } list_del_rcu(&doi_def->list); spin_unlock(&cipso_v4_doi_list_lock); cipso_v4_doi_putdef(doi_def); ret_val = 0; doi_remove_return: audit_buf = netlbl_audit_start(AUDIT_MAC_CIPSOV4_DEL, audit_info); if (audit_buf) { audit_log_format(audit_buf, " cipso_doi=%u res=%u", doi, ret_val == 0 ? 1 : 0); audit_log_end(audit_buf); } return ret_val; } /** * cipso_v4_doi_getdef - Returns a reference to a valid DOI definition * @doi: the DOI value * * Description: * Searches for a valid DOI definition and if one is found it is returned to * the caller. Otherwise NULL is returned. The caller must ensure that * rcu_read_lock() is held while accessing the returned definition and the DOI * definition reference count is decremented when the caller is done. * */ struct cipso_v4_doi *cipso_v4_doi_getdef(u32 doi) { struct cipso_v4_doi *doi_def; rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto doi_getdef_return; if (!refcount_inc_not_zero(&doi_def->refcount)) doi_def = NULL; doi_getdef_return: rcu_read_unlock(); return doi_def; } /** * cipso_v4_doi_putdef - Releases a reference for the given DOI definition * @doi_def: the DOI definition * * Description: * Releases a DOI definition reference obtained from cipso_v4_doi_getdef(). * */ void cipso_v4_doi_putdef(struct cipso_v4_doi *doi_def) { if (!doi_def) return; if (!refcount_dec_and_test(&doi_def->refcount)) return; cipso_v4_cache_invalidate(); call_rcu(&doi_def->rcu, cipso_v4_doi_free_rcu); } /** * cipso_v4_doi_walk - Iterate through the DOI definitions * @skip_cnt: skip past this number of DOI definitions, updated * @callback: callback for each DOI definition * @cb_arg: argument for the callback function * * Description: * Iterate over the DOI definition list, skipping the first @skip_cnt entries. * For each entry call @callback, if @callback returns a negative value stop * 'walking' through the list and return. Updates the value in @skip_cnt upon * return. Returns zero on success, negative values on failure. * */ int cipso_v4_doi_walk(u32 *skip_cnt, int (*callback) (struct cipso_v4_doi *doi_def, void *arg), void *cb_arg) { int ret_val = -ENOENT; u32 doi_cnt = 0; struct cipso_v4_doi *iter_doi; rcu_read_lock(); list_for_each_entry_rcu(iter_doi, &cipso_v4_doi_list, list) if (refcount_read(&iter_doi->refcount) > 0) { if (doi_cnt++ < *skip_cnt) continue; ret_val = callback(iter_doi, cb_arg); if (ret_val < 0) { doi_cnt--; goto doi_walk_return; } } doi_walk_return: rcu_read_unlock(); *skip_cnt = doi_cnt; return ret_val; } /* * Label Mapping Functions */ /** * cipso_v4_map_lvl_valid - Checks to see if the given level is understood * @doi_def: the DOI definition * @level: the level to check * * Description: * Checks the given level against the given DOI definition and returns a * negative value if the level does not have a valid mapping and a zero value * if the level is defined by the DOI. * */ static int cipso_v4_map_lvl_valid(const struct cipso_v4_doi *doi_def, u8 level) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: if ((level < doi_def->map.std->lvl.cipso_size) && (doi_def->map.std->lvl.cipso[level] < CIPSO_V4_INV_LVL)) return 0; break; } return -EFAULT; } /** * cipso_v4_map_lvl_hton - Perform a level mapping from the host to the network * @doi_def: the DOI definition * @host_lvl: the host MLS level * @net_lvl: the network/CIPSO MLS level * * Description: * Perform a label mapping to translate a local MLS level to the correct * CIPSO level using the given DOI definition. Returns zero on success, * negative values otherwise. * */ static int cipso_v4_map_lvl_hton(const struct cipso_v4_doi *doi_def, u32 host_lvl, u32 *net_lvl) { switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *net_lvl = host_lvl; return 0; case CIPSO_V4_MAP_TRANS: if (host_lvl < doi_def->map.std->lvl.local_size && doi_def->map.std->lvl.local[host_lvl] < CIPSO_V4_INV_LVL) { *net_lvl = doi_def->map.std->lvl.local[host_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_lvl_ntoh - Perform a level mapping from the network to the host * @doi_def: the DOI definition * @net_lvl: the network/CIPSO MLS level * @host_lvl: the host MLS level * * Description: * Perform a label mapping to translate a CIPSO level to the correct local MLS * level using the given DOI definition. Returns zero on success, negative * values otherwise. * */ static int cipso_v4_map_lvl_ntoh(const struct cipso_v4_doi *doi_def, u32 net_lvl, u32 *host_lvl) { struct cipso_v4_std_map_tbl *map_tbl; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: *host_lvl = net_lvl; return 0; case CIPSO_V4_MAP_TRANS: map_tbl = doi_def->map.std; if (net_lvl < map_tbl->lvl.cipso_size && map_tbl->lvl.cipso[net_lvl] < CIPSO_V4_INV_LVL) { *host_lvl = doi_def->map.std->lvl.cipso[net_lvl]; return 0; } return -EPERM; } return -EINVAL; } /** * cipso_v4_map_cat_rbm_valid - Checks to see if the category bitmap is valid * @doi_def: the DOI definition * @bitmap: category bitmap * @bitmap_len: bitmap length in bytes * * Description: * Checks the given category bitmap against the given DOI definition and * returns a negative value if any of the categories in the bitmap do not have * a valid mapping and a zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rbm_valid(const struct cipso_v4_doi *doi_def, const unsigned char *bitmap, u32 bitmap_len) { int cat = -1; u32 bitmap_len_bits = bitmap_len * 8; u32 cipso_cat_size; u32 *cipso_array; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: return 0; case CIPSO_V4_MAP_TRANS: cipso_cat_size = doi_def->map.std->cat.cipso_size; cipso_array = doi_def->map.std->cat.cipso; for (;;) { cat = netlbl_bitmap_walk(bitmap, bitmap_len_bits, cat + 1, 1); if (cat < 0) break; if (cat >= cipso_cat_size || cipso_array[cat] >= CIPSO_V4_INV_CAT) return -EFAULT; } if (cat == -1) return 0; break; } return -EFAULT; } /** * cipso_v4_map_cat_rbm_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO bitmap using the given DOI definition. Returns the minimum * size in bytes of the network bitmap on success, negative values otherwise. * */ static int cipso_v4_map_cat_rbm_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int host_spot = -1; u32 net_spot = CIPSO_V4_INV_CAT; u32 net_spot_max = 0; u32 net_clen_bits = net_cat_len * 8; u32 host_cat_size = 0; u32 *host_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { host_cat_size = doi_def->map.std->cat.local_size; host_cat_array = doi_def->map.std->cat.local; } for (;;) { host_spot = netlbl_catmap_walk(secattr->attr.mls.cat, host_spot + 1); if (host_spot < 0) break; switch (doi_def->type) { case CIPSO_V4_MAP_PASS: net_spot = host_spot; break; case CIPSO_V4_MAP_TRANS: if (host_spot >= host_cat_size) return -EPERM; net_spot = host_cat_array[host_spot]; if (net_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } if (net_spot >= net_clen_bits) return -ENOSPC; netlbl_bitmap_setbit(net_cat, net_spot, 1); if (net_spot > net_spot_max) net_spot_max = net_spot; } if (++net_spot_max % 8) return net_spot_max / 8 + 1; return net_spot_max / 8; } /** * cipso_v4_map_cat_rbm_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category bitmap in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO bitmap to the correct local * MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rbm_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; int net_spot = -1; u32 host_spot = CIPSO_V4_INV_CAT; u32 net_clen_bits = net_cat_len * 8; u32 net_cat_size = 0; u32 *net_cat_array = NULL; if (doi_def->type == CIPSO_V4_MAP_TRANS) { net_cat_size = doi_def->map.std->cat.cipso_size; net_cat_array = doi_def->map.std->cat.cipso; } for (;;) { net_spot = netlbl_bitmap_walk(net_cat, net_clen_bits, net_spot + 1, 1); if (net_spot < 0) { if (net_spot == -2) return -EFAULT; return 0; } switch (doi_def->type) { case CIPSO_V4_MAP_PASS: host_spot = net_spot; break; case CIPSO_V4_MAP_TRANS: if (net_spot >= net_cat_size) return -EPERM; host_spot = net_cat_array[net_spot]; if (host_spot >= CIPSO_V4_INV_CAT) return -EPERM; break; } ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, host_spot, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return -EINVAL; } /** * cipso_v4_map_cat_enum_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @enumcat: category list * @enumcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_enum_valid(const struct cipso_v4_doi *doi_def, const unsigned char *enumcat, u32 enumcat_len) { u16 cat; int cat_prev = -1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || enumcat_len & 0x01) return -EFAULT; for (iter = 0; iter < enumcat_len; iter += 2) { cat = get_unaligned_be16(&enumcat[iter]); if (cat <= cat_prev) return -EFAULT; cat_prev = cat; } return 0; } /** * cipso_v4_map_cat_enum_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_enum_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int cat = -1; u32 cat_iter = 0; for (;;) { cat = netlbl_catmap_walk(secattr->attr.mls.cat, cat + 1); if (cat < 0) break; if ((cat_iter + 2) > net_cat_len) return -ENOSPC; *((__be16 *)&net_cat[cat_iter]) = htons(cat); cat_iter += 2; } return cat_iter; } /** * cipso_v4_map_cat_enum_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_enum_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; for (iter = 0; iter < net_cat_len; iter += 2) { ret_val = netlbl_catmap_setbit(&secattr->attr.mls.cat, get_unaligned_be16(&net_cat[iter]), GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /** * cipso_v4_map_cat_rng_valid - Checks to see if the categories are valid * @doi_def: the DOI definition * @rngcat: category list * @rngcat_len: length of the category list in bytes * * Description: * Checks the given categories against the given DOI definition and returns a * negative value if any of the categories do not have a valid mapping and a * zero value if all of the categories are valid. * */ static int cipso_v4_map_cat_rng_valid(const struct cipso_v4_doi *doi_def, const unsigned char *rngcat, u32 rngcat_len) { u16 cat_high; u16 cat_low; u32 cat_prev = CIPSO_V4_MAX_REM_CATS + 1; u32 iter; if (doi_def->type != CIPSO_V4_MAP_PASS || rngcat_len & 0x01) return -EFAULT; for (iter = 0; iter < rngcat_len; iter += 4) { cat_high = get_unaligned_be16(&rngcat[iter]); if ((iter + 4) <= rngcat_len) cat_low = get_unaligned_be16(&rngcat[iter + 2]); else cat_low = 0; if (cat_high > cat_prev) return -EFAULT; cat_prev = cat_low; } return 0; } /** * cipso_v4_map_cat_rng_hton - Perform a category mapping from host to network * @doi_def: the DOI definition * @secattr: the security attributes * @net_cat: the zero'd out category list in network/CIPSO format * @net_cat_len: the length of the CIPSO category list in bytes * * Description: * Perform a label mapping to translate a local MLS category bitmap to the * correct CIPSO category list using the given DOI definition. Returns the * size in bytes of the network category bitmap on success, negative values * otherwise. * */ static int cipso_v4_map_cat_rng_hton(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *net_cat, u32 net_cat_len) { int iter = -1; u16 array[CIPSO_V4_TAG_RNG_CAT_MAX * 2]; u32 array_cnt = 0; u32 cat_size = 0; /* make sure we don't overflow the 'array[]' variable */ if (net_cat_len > (CIPSO_V4_OPT_LEN_MAX - CIPSO_V4_HDR_LEN - CIPSO_V4_TAG_RNG_BLEN)) return -ENOSPC; for (;;) { iter = netlbl_catmap_walk(secattr->attr.mls.cat, iter + 1); if (iter < 0) break; cat_size += (iter == 0 ? 0 : sizeof(u16)); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; iter = netlbl_catmap_walkrng(secattr->attr.mls.cat, iter); if (iter < 0) return -EFAULT; cat_size += sizeof(u16); if (cat_size > net_cat_len) return -ENOSPC; array[array_cnt++] = iter; } for (iter = 0; array_cnt > 0;) { *((__be16 *)&net_cat[iter]) = htons(array[--array_cnt]); iter += 2; array_cnt--; if (array[array_cnt] != 0) { *((__be16 *)&net_cat[iter]) = htons(array[array_cnt]); iter += 2; } } return cat_size; } /** * cipso_v4_map_cat_rng_ntoh - Perform a category mapping from network to host * @doi_def: the DOI definition * @net_cat: the category list in network/CIPSO format * @net_cat_len: the length of the CIPSO bitmap in bytes * @secattr: the security attributes * * Description: * Perform a label mapping to translate a CIPSO category list to the correct * local MLS category bitmap using the given DOI definition. Returns zero on * success, negative values on failure. * */ static int cipso_v4_map_cat_rng_ntoh(const struct cipso_v4_doi *doi_def, const unsigned char *net_cat, u32 net_cat_len, struct netlbl_lsm_secattr *secattr) { int ret_val; u32 net_iter; u16 cat_low; u16 cat_high; for (net_iter = 0; net_iter < net_cat_len; net_iter += 4) { cat_high = get_unaligned_be16(&net_cat[net_iter]); if ((net_iter + 4) <= net_cat_len) cat_low = get_unaligned_be16(&net_cat[net_iter + 2]); else cat_low = 0; ret_val = netlbl_catmap_setrng(&secattr->attr.mls.cat, cat_low, cat_high, GFP_ATOMIC); if (ret_val != 0) return ret_val; } return 0; } /* * Protocol Handling Functions */ /** * cipso_v4_gentag_hdr - Generate a CIPSO option header * @doi_def: the DOI definition * @len: the total tag length in bytes, not including this header * @buf: the CIPSO option buffer * * Description: * Write a CIPSO header into the beginning of @buffer. * */ static void cipso_v4_gentag_hdr(const struct cipso_v4_doi *doi_def, unsigned char *buf, u32 len) { buf[0] = IPOPT_CIPSO; buf[1] = CIPSO_V4_HDR_LEN + len; put_unaligned_be32(doi_def->doi, &buf[2]); } /** * cipso_v4_gentag_rbm - Generate a CIPSO restricted bitmap tag (type #1) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the restricted bitmap tag, tag type #1. The * actual buffer length may be larger than the indicated size due to * translation between host and network category bitmaps. Returns the size of * the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rbm(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if ((secattr->flags & NETLBL_SECATTR_MLS_LVL) == 0) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rbm_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; /* This will send packets using the "optimized" format when * possible as specified in section 3.4.2.6 of the * CIPSO draft. */ if (READ_ONCE(cipso_v4_rbm_optfmt) && ret_val > 0 && ret_val <= 10) tag_len = 14; else tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RBITMAP; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rbm - Parse a CIPSO restricted bitmap tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO restricted bitmap tag (tag type #1) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rbm_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_enum - Generate a CIPSO enumerated tag (type #2) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the enumerated tag, tag type #2. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_enum(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_enum_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_ENUM; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_enum - Parse a CIPSO enumerated tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO enumerated tag (tag type #2) and return the security * attributes in @secattr. Return zero on success, negatives values on * failure. * */ static int cipso_v4_parsetag_enum(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_enum_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_rng - Generate a CIPSO ranged tag (type #5) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the ranged tag, tag type #5. Returns the * size of the tag on success, negative values on failure. * */ static int cipso_v4_gentag_rng(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { int ret_val; u32 tag_len; u32 level; if (!(secattr->flags & NETLBL_SECATTR_MLS_LVL)) return -EPERM; ret_val = cipso_v4_map_lvl_hton(doi_def, secattr->attr.mls.lvl, &level); if (ret_val != 0) return ret_val; if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { ret_val = cipso_v4_map_cat_rng_hton(doi_def, secattr, &buffer[4], buffer_len - 4); if (ret_val < 0) return ret_val; tag_len = 4 + ret_val; } else tag_len = 4; buffer[0] = CIPSO_V4_TAG_RANGE; buffer[1] = tag_len; buffer[3] = level; return tag_len; } /** * cipso_v4_parsetag_rng - Parse a CIPSO ranged tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO ranged tag (tag type #5) and return the security attributes * in @secattr. Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { int ret_val; u8 tag_len = tag[1]; u32 level; ret_val = cipso_v4_map_lvl_ntoh(doi_def, tag[3], &level); if (ret_val != 0) return ret_val; secattr->attr.mls.lvl = level; secattr->flags |= NETLBL_SECATTR_MLS_LVL; if (tag_len > 4) { ret_val = cipso_v4_map_cat_rng_ntoh(doi_def, &tag[4], tag_len - 4, secattr); if (ret_val != 0) { netlbl_catmap_free(secattr->attr.mls.cat); return ret_val; } if (secattr->attr.mls.cat) secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; } /** * cipso_v4_gentag_loc - Generate a CIPSO local tag (non-standard) * @doi_def: the DOI definition * @secattr: the security attributes * @buffer: the option buffer * @buffer_len: length of buffer in bytes * * Description: * Generate a CIPSO option using the local tag. Returns the size of the tag * on success, negative values on failure. * */ static int cipso_v4_gentag_loc(const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr, unsigned char *buffer, u32 buffer_len) { if (!(secattr->flags & NETLBL_SECATTR_SECID)) return -EPERM; buffer[0] = CIPSO_V4_TAG_LOCAL; buffer[1] = CIPSO_V4_TAG_LOC_BLEN; *(u32 *)&buffer[2] = secattr->attr.secid; return CIPSO_V4_TAG_LOC_BLEN; } /** * cipso_v4_parsetag_loc - Parse a CIPSO local tag * @doi_def: the DOI definition * @tag: the CIPSO tag * @secattr: the security attributes * * Description: * Parse a CIPSO local tag and return the security attributes in @secattr. * Return zero on success, negatives values on failure. * */ static int cipso_v4_parsetag_loc(const struct cipso_v4_doi *doi_def, const unsigned char *tag, struct netlbl_lsm_secattr *secattr) { secattr->attr.secid = *(u32 *)&tag[2]; secattr->flags |= NETLBL_SECATTR_SECID; return 0; } /** * cipso_v4_optptr - Find the CIPSO option in the packet * @skb: the packet * * Description: * Parse the packet's IP header looking for a CIPSO option. Returns a pointer * to the start of the CIPSO option on success, NULL if one is not found. * */ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) { const struct iphdr *iph = ip_hdr(skb); unsigned char *optptr = (unsigned char *)&(ip_hdr(skb)[1]); int optlen; int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 1; ) { switch (optptr[0]) { case IPOPT_END: return NULL; case IPOPT_NOOP: taglen = 1; break; default: taglen = optptr[1]; } if (!taglen || taglen > optlen) return NULL; if (optptr[0] == IPOPT_CIPSO) return optptr; optlen -= taglen; optptr += taglen; } return NULL; } /** * cipso_v4_validate - Validate a CIPSO option * @skb: the packet * @option: the start of the option, on error it is set to point to the error * * Description: * This routine is called to validate a CIPSO option, it checks all of the * fields to ensure that they are at least valid, see the draft snippet below * for details. If the option is valid then a zero value is returned and * the value of @option is unchanged. If the option is invalid then a * non-zero value is returned and @option is adjusted to point to the * offending portion of the option. From the IETF draft ... * * "If any field within the CIPSO options, such as the DOI identifier, is not * recognized the IP datagram is discarded and an ICMP 'parameter problem' * (type 12) is generated and returned. The ICMP code field is set to 'bad * parameter' (code 0) and the pointer is set to the start of the CIPSO field * that is unrecognized." * */ int cipso_v4_validate(const struct sk_buff *skb, unsigned char **option) { unsigned char *opt = *option; unsigned char *tag; unsigned char opt_iter; unsigned char err_offset = 0; u8 opt_len; u8 tag_len; struct cipso_v4_doi *doi_def = NULL; u32 tag_iter; /* caller already checks for length values that are too large */ opt_len = opt[1]; if (opt_len < 8) { err_offset = 1; goto validate_return; } rcu_read_lock(); doi_def = cipso_v4_doi_search(get_unaligned_be32(&opt[2])); if (!doi_def) { err_offset = 2; goto validate_return_locked; } opt_iter = CIPSO_V4_HDR_LEN; tag = opt + opt_iter; while (opt_iter < opt_len) { for (tag_iter = 0; doi_def->tags[tag_iter] != tag[0];) if (doi_def->tags[tag_iter] == CIPSO_V4_TAG_INVALID || ++tag_iter == CIPSO_V4_TAG_MAXCNT) { err_offset = opt_iter; goto validate_return_locked; } if (opt_iter + 1 == opt_len) { err_offset = opt_iter; goto validate_return_locked; } tag_len = tag[1]; if (tag_len > (opt_len - opt_iter)) { err_offset = opt_iter + 1; goto validate_return_locked; } switch (tag[0]) { case CIPSO_V4_TAG_RBITMAP: if (tag_len < CIPSO_V4_TAG_RBM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } /* We are already going to do all the verification * necessary at the socket layer so from our point of * view it is safe to turn these checks off (and less * work), however, the CIPSO draft says we should do * all the CIPSO validations here but it doesn't * really specify _exactly_ what we need to validate * ... so, just make it a sysctl tunable. */ if (READ_ONCE(cipso_v4_rbm_strictvalid)) { if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RBM_BLEN && cipso_v4_map_cat_rbm_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } } break; case CIPSO_V4_TAG_ENUM: if (tag_len < CIPSO_V4_TAG_ENUM_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_ENUM_BLEN && cipso_v4_map_cat_enum_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_RANGE: if (tag_len < CIPSO_V4_TAG_RNG_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } if (cipso_v4_map_lvl_valid(doi_def, tag[3]) < 0) { err_offset = opt_iter + 3; goto validate_return_locked; } if (tag_len > CIPSO_V4_TAG_RNG_BLEN && cipso_v4_map_cat_rng_valid(doi_def, &tag[4], tag_len - 4) < 0) { err_offset = opt_iter + 4; goto validate_return_locked; } break; case CIPSO_V4_TAG_LOCAL: /* This is a non-standard tag that we only allow for * local connections, so if the incoming interface is * not the loopback device drop the packet. Further, * there is no legitimate reason for setting this from * userspace so reject it if skb is NULL. */ if (!skb || !(skb->dev->flags & IFF_LOOPBACK)) { err_offset = opt_iter; goto validate_return_locked; } if (tag_len != CIPSO_V4_TAG_LOC_BLEN) { err_offset = opt_iter + 1; goto validate_return_locked; } break; default: err_offset = opt_iter; goto validate_return_locked; } tag += tag_len; opt_iter += tag_len; } validate_return_locked: rcu_read_unlock(); validate_return: *option = opt + err_offset; return err_offset; } /** * cipso_v4_error - Send the correct response for a bad packet * @skb: the packet * @error: the error code * @gateway: CIPSO gateway flag * * Description: * Based on the error code given in @error, send an ICMP error message back to * the originating host. From the IETF draft ... * * "If the contents of the CIPSO [option] are valid but the security label is * outside of the configured host or port label range, the datagram is * discarded and an ICMP 'destination unreachable' (type 3) is generated and * returned. The code field of the ICMP is set to 'communication with * destination network administratively prohibited' (code 9) or to * 'communication with destination host administratively prohibited' * (code 10). The value of the code is dependent on whether the originator * of the ICMP message is acting as a CIPSO host or a CIPSO gateway. The * recipient of the ICMP message MUST be able to handle either value. The * same procedure is performed if a CIPSO [option] can not be added to an * IP packet because it is too large to fit in the IP options area." * * "If the error is triggered by receipt of an ICMP message, the message is * discarded and no response is permitted (consistent with general ICMP * processing rules)." * */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { unsigned char optbuf[sizeof(struct ip_options) + 40]; struct ip_options *opt = (struct ip_options *)optbuf; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) return; /* * We might be called above the IP layer, * so we can not use icmp_send and IPCB here. */ memset(opt, 0, sizeof(struct ip_options)); opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); rcu_read_lock(); res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); else __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); } /** * cipso_v4_genopt - Generate a CIPSO option * @buf: the option buffer * @buf_len: the size of opt_buf * @doi_def: the CIPSO DOI to use * @secattr: the security attributes * * Description: * Generate a CIPSO option using the DOI definition and security attributes * passed to the function. Returns the length of the option on success and * negative values on failure. * */ static int cipso_v4_genopt(unsigned char *buf, u32 buf_len, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; u32 iter; if (buf_len <= CIPSO_V4_HDR_LEN) return -ENOSPC; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ iter = 0; do { memset(buf, 0, buf_len); switch (doi_def->tags[iter]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_gentag_rbm(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_gentag_enum(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_gentag_rng(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_gentag_loc(doi_def, secattr, &buf[CIPSO_V4_HDR_LEN], buf_len - CIPSO_V4_HDR_LEN); break; default: return -EPERM; } iter++; } while (ret_val < 0 && iter < CIPSO_V4_TAG_MAXCNT && doi_def->tags[iter] != CIPSO_V4_TAG_INVALID); if (ret_val < 0) return ret_val; cipso_v4_gentag_hdr(doi_def, buf, ret_val); return CIPSO_V4_HDR_LEN + ret_val; } /** * cipso_v4_sock_setattr - Add a CIPSO option to a socket * @sk: the socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. This function requires * exclusive access to @sk, which means it either needs to be in the * process of being created or locked. Returns zero on success and negative * values on failure. * */ int cipso_v4_sock_setattr(struct sock *sk, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *old, *opt = NULL; struct inet_sock *sk_inet; struct inet_connection_sock *sk_conn; /* In the case of sock_create_lite(), the sock->sk field is not * defined yet but it is not a problem as the only users of these * "lite" PF_INET sockets are functions which do an accept() call * afterwards so we will label the socket as part of the accept(). */ if (!sk) return 0; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto socket_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto socket_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto socket_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; sk_inet = inet_sk(sk); old = rcu_dereference_protected(sk_inet->inet_opt, lockdep_sock_is_held(sk)); if (inet_test_bit(IS_ICSK, sk)) { sk_conn = inet_csk(sk); if (old) sk_conn->icsk_ext_hdr_len -= old->opt.optlen; sk_conn->icsk_ext_hdr_len += opt->opt.optlen; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } rcu_assign_pointer(sk_inet->inet_opt, opt); if (old) kfree_rcu(old, rcu); return 0; socket_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_req_setattr - Add a CIPSO option to a connection request socket * @req: the connection request socket * @doi_def: the CIPSO DOI to use * @secattr: the specific security attributes of the socket * * Description: * Set the CIPSO option on the given socket using the DOI definition and * security attributes passed to the function. Returns zero on success and * negative values on failure. * */ int cipso_v4_req_setattr(struct request_sock *req, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val = -EPERM; unsigned char *buf = NULL; u32 buf_len; u32 opt_len; struct ip_options_rcu *opt = NULL; struct inet_request_sock *req_inet; /* We allocate the maximum CIPSO option size here so we are probably * being a little wasteful, but it makes our life _much_ easier later * on and after all we are only talking about 40 bytes. */ buf_len = CIPSO_V4_OPT_LEN_MAX; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) { ret_val = -ENOMEM; goto req_setattr_failure; } ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) goto req_setattr_failure; buf_len = ret_val; /* We can't use ip_options_get() directly because it makes a call to * ip_options_get_alloc() which allocates memory with GFP_KERNEL and * we won't always have CAP_NET_RAW even though we _always_ want to * set the IPOPT_CIPSO option. */ opt_len = (buf_len + 3) & ~3; opt = kzalloc(sizeof(*opt) + opt_len, GFP_ATOMIC); if (!opt) { ret_val = -ENOMEM; goto req_setattr_failure; } memcpy(opt->opt.__data, buf, buf_len); opt->opt.optlen = opt_len; opt->opt.cipso = sizeof(struct iphdr); kfree(buf); buf = NULL; req_inet = inet_rsk(req); opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt); if (opt) kfree_rcu(opt, rcu); return 0; req_setattr_failure: kfree(buf); kfree(opt); return ret_val; } /** * cipso_v4_delopt - Delete the CIPSO option from a set of IP options * @opt_ptr: IP option pointer * * Description: * Deletes the CIPSO IP option from a set of IP options and makes the necessary * adjustments to the IP option structure. Returns zero on success, negative * values on failure. * */ static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr) { struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1); int hdr_delta = 0; if (!opt || opt->opt.cipso == 0) return 0; if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) { u8 cipso_len; u8 cipso_off; unsigned char *cipso_ptr; int iter; int optlen_new; cipso_off = opt->opt.cipso - sizeof(struct iphdr); cipso_ptr = &opt->opt.__data[cipso_off]; cipso_len = cipso_ptr[1]; if (opt->opt.srr > opt->opt.cipso) opt->opt.srr -= cipso_len; if (opt->opt.rr > opt->opt.cipso) opt->opt.rr -= cipso_len; if (opt->opt.ts > opt->opt.cipso) opt->opt.ts -= cipso_len; if (opt->opt.router_alert > opt->opt.cipso) opt->opt.router_alert -= cipso_len; opt->opt.cipso = 0; memmove(cipso_ptr, cipso_ptr + cipso_len, opt->opt.optlen - cipso_off - cipso_len); /* determining the new total option length is tricky because of * the padding necessary, the only thing i can think to do at * this point is walk the options one-by-one, skipping the * padding at the end to determine the actual option size and * from there we can determine the new total option length */ iter = 0; optlen_new = 0; while (iter < opt->opt.optlen) if (opt->opt.__data[iter] != IPOPT_NOP) { iter += opt->opt.__data[iter + 1]; optlen_new = iter; } else iter++; hdr_delta = opt->opt.optlen; opt->opt.optlen = (optlen_new + 3) & ~3; hdr_delta -= opt->opt.optlen; } else { /* only the cipso option was present on the socket so we can * remove the entire option struct */ *opt_ptr = NULL; hdr_delta = opt->opt.optlen; kfree_rcu(opt, rcu); } return hdr_delta; } /** * cipso_v4_sock_delattr - Delete the CIPSO option from a socket * @sk: the socket * * Description: * Removes the CIPSO option from a socket, if present. * */ void cipso_v4_sock_delattr(struct sock *sk) { struct inet_sock *sk_inet; int hdr_delta; sk_inet = inet_sk(sk); hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt); if (inet_test_bit(IS_ICSK, sk) && hdr_delta > 0) { struct inet_connection_sock *sk_conn = inet_csk(sk); sk_conn->icsk_ext_hdr_len -= hdr_delta; sk_conn->icsk_sync_mss(sk, sk_conn->icsk_pmtu_cookie); } } /** * cipso_v4_req_delattr - Delete the CIPSO option from a request socket * @req: the request socket * * Description: * Removes the CIPSO option from a request socket, if present. * */ void cipso_v4_req_delattr(struct request_sock *req) { cipso_v4_delopt(&inet_rsk(req)->ireq_opt); } /** * cipso_v4_getattr - Helper function for the cipso_v4_*_getattr functions * @cipso: the CIPSO v4 option * @secattr: the security attributes * * Description: * Inspect @cipso and return the security attributes in @secattr. Returns zero * on success and negative values on failure. * */ int cipso_v4_getattr(const unsigned char *cipso, struct netlbl_lsm_secattr *secattr) { int ret_val = -ENOMSG; u32 doi; struct cipso_v4_doi *doi_def; if (cipso_v4_cache_check(cipso, cipso[1], secattr) == 0) return 0; doi = get_unaligned_be32(&cipso[2]); rcu_read_lock(); doi_def = cipso_v4_doi_search(doi); if (!doi_def) goto getattr_return; /* XXX - This code assumes only one tag per CIPSO option which isn't * really a good assumption to make but since we only support the MAC * tags right now it is a safe assumption. */ switch (cipso[6]) { case CIPSO_V4_TAG_RBITMAP: ret_val = cipso_v4_parsetag_rbm(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_ENUM: ret_val = cipso_v4_parsetag_enum(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_RANGE: ret_val = cipso_v4_parsetag_rng(doi_def, &cipso[6], secattr); break; case CIPSO_V4_TAG_LOCAL: ret_val = cipso_v4_parsetag_loc(doi_def, &cipso[6], secattr); break; } if (ret_val == 0) secattr->type = NETLBL_NLTYPE_CIPSOV4; getattr_return: rcu_read_unlock(); return ret_val; } /** * cipso_v4_sock_getattr - Get the security attributes from a sock * @sk: the sock * @secattr: the security attributes * * Description: * Query @sk to see if there is a CIPSO option attached to the sock and if * there is return the CIPSO security attributes in @secattr. This function * requires that @sk be locked, or privately held, but it does not do any * locking itself. Returns zero on success and negative values on failure. * */ int cipso_v4_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { struct ip_options_rcu *opt; int res = -ENOMSG; rcu_read_lock(); opt = rcu_dereference(inet_sk(sk)->inet_opt); if (opt && opt->opt.cipso) res = cipso_v4_getattr(opt->opt.__data + opt->opt.cipso - sizeof(struct iphdr), secattr); rcu_read_unlock(); return res; } /** * cipso_v4_skbuff_setattr - Set the CIPSO option on a packet * @skb: the packet * @doi_def: the DOI structure * @secattr: the security attributes * * Description: * Set the CIPSO option on the given packet based on the security attributes. * Returns a pointer to the IP header on success and NULL on failure. * */ int cipso_v4_skbuff_setattr(struct sk_buff *skb, const struct cipso_v4_doi *doi_def, const struct netlbl_lsm_secattr *secattr) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char buf[CIPSO_V4_OPT_LEN_MAX]; u32 buf_len = CIPSO_V4_OPT_LEN_MAX; u32 opt_len; int len_delta; ret_val = cipso_v4_genopt(buf, buf_len, doi_def, secattr); if (ret_val < 0) return ret_val; buf_len = ret_val; opt_len = (buf_len + 3) & ~3; /* we overwrite any existing options to ensure that we have enough * room for the CIPSO option, the reason is that we _need_ to guarantee * that the security label is applied to the packet - we do the same * thing when using the socket options and it hasn't caused a problem, * if we need to we can always revisit this choice later */ len_delta = opt_len - opt->optlen; /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); if (ret_val < 0) return ret_val; if (len_delta > 0) { /* we assume that the header + opt->optlen have already been * "pushed" in ip_options_build() or similar */ iph = ip_hdr(skb); skb_push(skb, len_delta); memmove((char *)iph - len_delta, iph, iph->ihl << 2); skb_reset_network_header(skb); iph = ip_hdr(skb); } else if (len_delta < 0) { iph = ip_hdr(skb); memset(iph + 1, IPOPT_NOP, opt->optlen); } else iph = ip_hdr(skb); if (opt->optlen > 0) memset(opt, 0, sizeof(*opt)); opt->optlen = opt_len; opt->cipso = sizeof(struct iphdr); opt->is_changed = 1; /* we have to do the following because we are being called from a * netfilter hook which means the packet already has had the header * fields populated and the checksum calculated - yes this means we * are doing more work than needed but we do it to keep the core * stack clean and tidy */ memcpy(iph + 1, buf, buf_len); if (opt_len > buf_len) memset((char *)(iph + 1) + buf_len, 0, opt_len - buf_len); if (len_delta != 0) { iph->ihl = 5 + (opt_len >> 2); iph_set_totlen(iph, skb->len); } ip_send_check(iph); return 0; } /** * cipso_v4_skbuff_delattr - Delete any CIPSO options from a packet * @skb: the packet * * Description: * Removes any and all CIPSO options from the given packet. Returns zero on * success, negative values on failure. * */ int cipso_v4_skbuff_delattr(struct sk_buff *skb) { int ret_val; struct iphdr *iph; struct ip_options *opt = &IPCB(skb)->opt; unsigned char *cipso_ptr; if (opt->cipso == 0) return 0; /* since we are changing the packet we should make a copy */ ret_val = skb_cow(skb, skb_headroom(skb)); if (ret_val < 0) return ret_val; /* the easiest thing to do is just replace the cipso option with noop * options since we don't change the size of the packet, although we * still need to recalculate the checksum */ iph = ip_hdr(skb); cipso_ptr = (unsigned char *)iph + opt->cipso; memset(cipso_ptr, IPOPT_NOOP, cipso_ptr[1]); opt->cipso = 0; opt->is_changed = 1; ip_send_check(iph); return 0; } /* * Setup Functions */ /** * cipso_v4_init - Initialize the CIPSO module * * Description: * Initialize the CIPSO module and prepare it for use. Returns zero on success * and negative values on failure. * */ static int __init cipso_v4_init(void) { int ret_val; ret_val = cipso_v4_cache_init(); if (ret_val != 0) panic("Failed to initialize the CIPSO/IPv4 cache (%d)\n", ret_val); return 0; } subsys_initcall(cipso_v4_init);
1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 // SPDX-License-Identifier: GPL-2.0-only // // ethtool interface for Ethernet PSE (Power Sourcing Equipment) // and PD (Powered Device) // // Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> // #include "common.h" #include "linux/pse-pd/pse.h" #include "netlink.h" #include <linux/ethtool_netlink.h> #include <linux/ethtool.h> #include <linux/phy.h> struct pse_req_info { struct ethnl_req_info base; }; struct pse_reply_data { struct ethnl_reply_data base; struct pse_control_status status; }; #define PSE_REPDATA(__reply_base) \ container_of(__reply_base, struct pse_reply_data, base) /* PSE_GET */ const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = { [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), }; static int pse_get_pse_attributes(struct net_device *dev, struct netlink_ext_ack *extack, struct pse_reply_data *data) { struct phy_device *phydev = dev->phydev; if (!phydev) { NL_SET_ERR_MSG(extack, "No PHY is attached"); return -EOPNOTSUPP; } if (!phydev->psec) { NL_SET_ERR_MSG(extack, "No PSE is attached"); return -EOPNOTSUPP; } memset(&data->status, 0, sizeof(data->status)); return pse_ethtool_get_status(phydev->psec, extack, &data->status); } static int pse_prepare_data(const struct ethnl_req_info *req_base, struct ethnl_reply_data *reply_base, const struct genl_info *info) { struct pse_reply_data *data = PSE_REPDATA(reply_base); struct net_device *dev = reply_base->dev; int ret; ret = ethnl_ops_begin(dev); if (ret < 0) return ret; ret = pse_get_pse_attributes(dev, info->extack, data); ethnl_ops_complete(dev); return ret; } static int pse_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); const struct pse_control_status *st = &data->status; int len = 0; if (st->podl_admin_state > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ if (st->podl_pw_status > 0) len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */ return len; } static int pse_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct pse_reply_data *data = PSE_REPDATA(reply_base); const struct pse_control_status *st = &data->status; if (st->podl_admin_state > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, st->podl_admin_state)) return -EMSGSIZE; if (st->podl_pw_status > 0 && nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS, st->podl_pw_status)) return -EMSGSIZE; return 0; } /* PSE_SET */ const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), }; static int ethnl_set_pse_validate(struct ethnl_req_info *req_info, struct genl_info *info) { return !!info->attrs[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]; } static int ethnl_set_pse(struct ethnl_req_info *req_info, struct genl_info *info) { struct net_device *dev = req_info->dev; struct pse_control_config config = {}; struct nlattr **tb = info->attrs; struct phy_device *phydev; /* this values are already validated by the ethnl_pse_set_policy */ config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); phydev = dev->phydev; if (!phydev) { NL_SET_ERR_MSG(info->extack, "No PHY is attached"); return -EOPNOTSUPP; } if (!phydev->psec) { NL_SET_ERR_MSG(info->extack, "No PSE is attached"); return -EOPNOTSUPP; } /* Return errno directly - PSE has no notification */ return pse_ethtool_set_config(phydev->psec, info->extack, &config); } const struct ethnl_request_ops ethnl_pse_request_ops = { .request_cmd = ETHTOOL_MSG_PSE_GET, .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, .hdr_attr = ETHTOOL_A_PSE_HEADER, .req_info_size = sizeof(struct pse_req_info), .reply_data_size = sizeof(struct pse_reply_data), .prepare_data = pse_prepare_data, .reply_size = pse_reply_size, .fill_reply = pse_fill_reply, .set_validate = ethnl_set_pse_validate, .set = ethnl_set_pse, /* PSE has no notification */ };
28 11 209 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 // SPDX-License-Identifier: GPL-2.0-only /* * Unified UUID/GUID definition * * Copyright (C) 2009, 2016 Intel Corp. * Huang Ying <ying.huang@intel.com> */ #include <linux/kernel.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/export.h> #include <linux/uuid.h> #include <linux/random.h> const guid_t guid_null; EXPORT_SYMBOL(guid_null); const uuid_t uuid_null; EXPORT_SYMBOL(uuid_null); const u8 guid_index[16] = {3,2,1,0,5,4,7,6,8,9,10,11,12,13,14,15}; const u8 uuid_index[16] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; /** * generate_random_uuid - generate a random UUID * @uuid: where to put the generated UUID * * Random UUID interface * * Used to create a Boot ID or a filesystem UUID/GUID, but can be * useful for other kernel drivers. */ void generate_random_uuid(unsigned char uuid[16]) { get_random_bytes(uuid, 16); /* Set UUID version to 4 --- truly random generation */ uuid[6] = (uuid[6] & 0x0F) | 0x40; /* Set the UUID variant to DCE */ uuid[8] = (uuid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_uuid); void generate_random_guid(unsigned char guid[16]) { get_random_bytes(guid, 16); /* Set GUID version to 4 --- truly random generation */ guid[7] = (guid[7] & 0x0F) | 0x40; /* Set the GUID variant to DCE */ guid[8] = (guid[8] & 0x3F) | 0x80; } EXPORT_SYMBOL(generate_random_guid); static void __uuid_gen_common(__u8 b[16]) { get_random_bytes(b, 16); /* reversion 0b10 */ b[8] = (b[8] & 0x3F) | 0x80; } void guid_gen(guid_t *lu) { __uuid_gen_common(lu->b); /* version 4 : random generation */ lu->b[7] = (lu->b[7] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(guid_gen); void uuid_gen(uuid_t *bu) { __uuid_gen_common(bu->b); /* version 4 : random generation */ bu->b[6] = (bu->b[6] & 0x0F) | 0x40; } EXPORT_SYMBOL_GPL(uuid_gen); /** * uuid_is_valid - checks if a UUID string is valid * @uuid: UUID string to check * * Description: * It checks if the UUID string is following the format: * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx * * where x is a hex digit. * * Return: true if input is valid UUID string. */ bool uuid_is_valid(const char *uuid) { unsigned int i; for (i = 0; i < UUID_STRING_LEN; i++) { if (i == 8 || i == 13 || i == 18 || i == 23) { if (uuid[i] != '-') return false; } else if (!isxdigit(uuid[i])) { return false; } } return true; } EXPORT_SYMBOL(uuid_is_valid); static int __uuid_parse(const char *uuid, __u8 b[16], const u8 ei[16]) { static const u8 si[16] = {0,2,4,6,9,11,14,16,19,21,24,26,28,30,32,34}; unsigned int i; if (!uuid_is_valid(uuid)) return -EINVAL; for (i = 0; i < 16; i++) { int hi = hex_to_bin(uuid[si[i] + 0]); int lo = hex_to_bin(uuid[si[i] + 1]); b[ei[i]] = (hi << 4) | lo; } return 0; } int guid_parse(const char *uuid, guid_t *u) { return __uuid_parse(uuid, u->b, guid_index); } EXPORT_SYMBOL(guid_parse); int uuid_parse(const char *uuid, uuid_t *u) { return __uuid_parse(uuid, u->b, uuid_index); } EXPORT_SYMBOL(uuid_parse);
34 2 4 4 4 4 4 3 2 1 8 6 5 2 2 5 8 2 1 2 1 1 1 1 3 2 1 1 6 6 1 1 1 2 5 1 34 1 1 1 2 1 4 1 6 1 2 2 7 1 6 3 19 19 3 2 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include "seq_oss_device.h" #include "seq_oss_synth.h" #include "seq_oss_midi.h" #include "seq_oss_event.h" #include "seq_oss_timer.h" #include <sound/seq_oss_legacy.h> #include "seq_oss_readq.h" #include "seq_oss_writeq.h" #include <linux/nospec.h> /* * prototypes */ static int extended_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev); static int chn_voice_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int chn_common_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int timing_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int local_event(struct seq_oss_devinfo *dp, union evrec *event_rec, struct snd_seq_event *ev); static int old_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev); static int note_on_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev); static int note_off_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev); static int set_note_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int note, int vel, struct snd_seq_event *ev); static int set_control_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int param, int val, struct snd_seq_event *ev); static int set_echo_event(struct seq_oss_devinfo *dp, union evrec *rec, struct snd_seq_event *ev); /* * convert an OSS event to ALSA event * return 0 : enqueued * non-zero : invalid - ignored */ int snd_seq_oss_process_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { switch (q->s.code) { case SEQ_EXTENDED: return extended_event(dp, q, ev); case EV_CHN_VOICE: return chn_voice_event(dp, q, ev); case EV_CHN_COMMON: return chn_common_event(dp, q, ev); case EV_TIMING: return timing_event(dp, q, ev); case EV_SEQ_LOCAL: return local_event(dp, q, ev); case EV_SYSEX: return snd_seq_oss_synth_sysex(dp, q->x.dev, q->x.buf, ev); case SEQ_MIDIPUTC: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; /* put a midi byte */ if (! is_write_mode(dp->file_mode)) break; if (snd_seq_oss_midi_open(dp, q->s.dev, SNDRV_SEQ_OSS_FILE_WRITE)) break; if (snd_seq_oss_midi_filemode(dp, q->s.dev) & SNDRV_SEQ_OSS_FILE_WRITE) return snd_seq_oss_midi_putc(dp, q->s.dev, q->s.parm1, ev); break; case SEQ_ECHO: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; return set_echo_event(dp, q, ev); case SEQ_PRIVATE: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; return snd_seq_oss_synth_raw_event(dp, q->c[1], q->c, ev); default: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return -EINVAL; return old_event(dp, q, ev); } return -EINVAL; } /* old type events: mode1 only */ static int old_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { switch (q->s.code) { case SEQ_NOTEOFF: return note_off_event(dp, 0, q->n.chn, q->n.note, q->n.vel, ev); case SEQ_NOTEON: return note_on_event(dp, 0, q->n.chn, q->n.note, q->n.vel, ev); case SEQ_WAIT: /* skip */ break; case SEQ_PGMCHANGE: return set_control_event(dp, 0, SNDRV_SEQ_EVENT_PGMCHANGE, q->n.chn, 0, q->n.note, ev); case SEQ_SYNCTIMER: return snd_seq_oss_timer_reset(dp->timer); } return -EINVAL; } /* 8bytes extended event: mode1 only */ static int extended_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { int val; switch (q->e.cmd) { case SEQ_NOTEOFF: return note_off_event(dp, q->e.dev, q->e.chn, q->e.p1, q->e.p2, ev); case SEQ_NOTEON: return note_on_event(dp, q->e.dev, q->e.chn, q->e.p1, q->e.p2, ev); case SEQ_PGMCHANGE: return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_PGMCHANGE, q->e.chn, 0, q->e.p1, ev); case SEQ_AFTERTOUCH: return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_CHANPRESS, q->e.chn, 0, q->e.p1, ev); case SEQ_BALANCE: /* convert -128:127 to 0:127 */ val = (char)q->e.p1; val = (val + 128) / 2; return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_CONTROLLER, q->e.chn, CTL_PAN, val, ev); case SEQ_CONTROLLER: val = ((short)q->e.p3 << 8) | (short)q->e.p2; switch (q->e.p1) { case CTRL_PITCH_BENDER: /* SEQ1 V2 control */ /* -0x2000:0x1fff */ return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_PITCHBEND, q->e.chn, 0, val, ev); case CTRL_PITCH_BENDER_RANGE: /* conversion: 100/semitone -> 128/semitone */ return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_REGPARAM, q->e.chn, 0, val*128/100, ev); default: return set_control_event(dp, q->e.dev, SNDRV_SEQ_EVENT_CONTROL14, q->e.chn, q->e.p1, val, ev); } case SEQ_VOLMODE: return snd_seq_oss_synth_raw_event(dp, q->e.dev, q->c, ev); } return -EINVAL; } /* channel voice events: mode1 and 2 */ static int chn_voice_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { if (q->v.chn >= 32) return -EINVAL; switch (q->v.cmd) { case MIDI_NOTEON: return note_on_event(dp, q->v.dev, q->v.chn, q->v.note, q->v.parm, ev); case MIDI_NOTEOFF: return note_off_event(dp, q->v.dev, q->v.chn, q->v.note, q->v.parm, ev); case MIDI_KEY_PRESSURE: return set_note_event(dp, q->v.dev, SNDRV_SEQ_EVENT_KEYPRESS, q->v.chn, q->v.note, q->v.parm, ev); } return -EINVAL; } /* channel common events: mode1 and 2 */ static int chn_common_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { if (q->l.chn >= 32) return -EINVAL; switch (q->l.cmd) { case MIDI_PGM_CHANGE: return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_PGMCHANGE, q->l.chn, 0, q->l.p1, ev); case MIDI_CTL_CHANGE: return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_CONTROLLER, q->l.chn, q->l.p1, q->l.val, ev); case MIDI_PITCH_BEND: /* conversion: 0:0x3fff -> -0x2000:0x1fff */ return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_PITCHBEND, q->l.chn, 0, q->l.val - 8192, ev); case MIDI_CHN_PRESSURE: return set_control_event(dp, q->l.dev, SNDRV_SEQ_EVENT_CHANPRESS, q->l.chn, 0, q->l.val, ev); } return -EINVAL; } /* timer events: mode1 and mode2 */ static int timing_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { switch (q->t.cmd) { case TMR_ECHO: if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) return set_echo_event(dp, q, ev); else { union evrec tmp; memset(&tmp, 0, sizeof(tmp)); /* XXX: only for little-endian! */ tmp.echo = (q->t.time << 8) | SEQ_ECHO; return set_echo_event(dp, &tmp, ev); } case TMR_STOP: if (dp->seq_mode) return snd_seq_oss_timer_stop(dp->timer); return 0; case TMR_CONTINUE: if (dp->seq_mode) return snd_seq_oss_timer_continue(dp->timer); return 0; case TMR_TEMPO: if (dp->seq_mode) return snd_seq_oss_timer_tempo(dp->timer, q->t.time); return 0; } return -EINVAL; } /* local events: mode1 and 2 */ static int local_event(struct seq_oss_devinfo *dp, union evrec *q, struct snd_seq_event *ev) { return -EINVAL; } /* * process note-on event for OSS synth * three different modes are available: * - SNDRV_SEQ_OSS_PROCESS_EVENTS (for one-voice per channel mode) * Accept note 255 as volume change. * - SNDRV_SEQ_OSS_PASS_EVENTS * Pass all events to lowlevel driver anyway * - SNDRV_SEQ_OSS_PROCESS_KEYPRESS (mostly for Emu8000) * Use key-pressure if note >= 128 */ static int note_on_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info) return -ENXIO; switch (info->arg.event_passing) { case SNDRV_SEQ_OSS_PROCESS_EVENTS: if (! info->ch || ch < 0 || ch >= info->nr_voices) { /* pass directly */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); } ch = array_index_nospec(ch, info->nr_voices); if (note == 255 && info->ch[ch].note >= 0) { /* volume control */ int type; //if (! vel) /* set volume to zero -- note off */ // type = SNDRV_SEQ_EVENT_NOTEOFF; //else if (info->ch[ch].vel) /* sample already started -- volume change */ type = SNDRV_SEQ_EVENT_KEYPRESS; else /* sample not started -- start now */ type = SNDRV_SEQ_EVENT_NOTEON; info->ch[ch].vel = vel; return set_note_event(dp, dev, type, ch, info->ch[ch].note, vel, ev); } else if (note >= 128) return -EINVAL; /* invalid */ if (note != info->ch[ch].note && info->ch[ch].note >= 0) /* note changed - note off at beginning */ set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEOFF, ch, info->ch[ch].note, 0, ev); /* set current status */ info->ch[ch].note = note; info->ch[ch].vel = vel; if (vel) /* non-zero velocity - start the note now */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); return -EINVAL; case SNDRV_SEQ_OSS_PASS_EVENTS: /* pass the event anyway */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); case SNDRV_SEQ_OSS_PROCESS_KEYPRESS: if (note >= 128) /* key pressure: shifted by 128 */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_KEYPRESS, ch, note - 128, vel, ev); else /* normal note-on event */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); } return -EINVAL; } /* * process note-off event for OSS synth */ static int note_off_event(struct seq_oss_devinfo *dp, int dev, int ch, int note, int vel, struct snd_seq_event *ev) { struct seq_oss_synthinfo *info; info = snd_seq_oss_synth_info(dp, dev); if (!info) return -ENXIO; switch (info->arg.event_passing) { case SNDRV_SEQ_OSS_PROCESS_EVENTS: if (! info->ch || ch < 0 || ch >= info->nr_voices) { /* pass directly */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEON, ch, note, vel, ev); } ch = array_index_nospec(ch, info->nr_voices); if (info->ch[ch].note >= 0) { note = info->ch[ch].note; info->ch[ch].vel = 0; info->ch[ch].note = -1; return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEOFF, ch, note, vel, ev); } return -EINVAL; /* invalid */ case SNDRV_SEQ_OSS_PASS_EVENTS: case SNDRV_SEQ_OSS_PROCESS_KEYPRESS: /* pass the event anyway */ return set_note_event(dp, dev, SNDRV_SEQ_EVENT_NOTEOFF, ch, note, vel, ev); } return -EINVAL; } /* * create a note event */ static int set_note_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int note, int vel, struct snd_seq_event *ev) { if (!snd_seq_oss_synth_info(dp, dev)) return -ENXIO; ev->type = type; snd_seq_oss_synth_addr(dp, dev, ev); ev->data.note.channel = ch; ev->data.note.note = note; ev->data.note.velocity = vel; return 0; } /* * create a control event */ static int set_control_event(struct seq_oss_devinfo *dp, int dev, int type, int ch, int param, int val, struct snd_seq_event *ev) { if (!snd_seq_oss_synth_info(dp, dev)) return -ENXIO; ev->type = type; snd_seq_oss_synth_addr(dp, dev, ev); ev->data.control.channel = ch; ev->data.control.param = param; ev->data.control.value = val; return 0; } /* * create an echo event */ static int set_echo_event(struct seq_oss_devinfo *dp, union evrec *rec, struct snd_seq_event *ev) { ev->type = SNDRV_SEQ_EVENT_ECHO; /* echo back to itself */ snd_seq_oss_fill_addr(dp, ev, dp->addr.client, dp->addr.port); memcpy(&ev->data, rec, LONG_EVENT_SIZE); return 0; } /* * event input callback from ALSA sequencer: * the echo event is processed here. */ int snd_seq_oss_event_input(struct snd_seq_event *ev, int direct, void *private_data, int atomic, int hop) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private_data; union evrec *rec; if (ev->type != SNDRV_SEQ_EVENT_ECHO) return snd_seq_oss_midi_input(ev, direct, private_data); if (ev->source.client != dp->cseq) return 0; /* ignored */ rec = (union evrec*)&ev->data; if (rec->s.code == SEQ_SYNCTIMER) { /* sync echo back */ snd_seq_oss_writeq_wakeup(dp->writeq, rec->t.time); } else { /* echo back event */ if (dp->readq == NULL) return 0; snd_seq_oss_readq_put_event(dp->readq, rec); } return 0; }
37 21 8 29 10 29 30 30 30 30 30 30 30 30 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 // SPDX-License-Identifier: GPL-2.0-only /* * MLO link handling * * Copyright (C) 2022-2023 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> #include <net/mac80211.h> #include "ieee80211_i.h" #include "driver-ops.h" #include "key.h" #include "debugfs_netdev.h" void ieee80211_link_setup(struct ieee80211_link_data *link) { if (link->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_mgd_setup_link(link); } void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, int link_id, struct ieee80211_link_data *link, struct ieee80211_bss_conf *link_conf) { bool deflink = link_id < 0; if (link_id < 0) link_id = 0; rcu_assign_pointer(sdata->vif.link_conf[link_id], link_conf); rcu_assign_pointer(sdata->link[link_id], link); link->sdata = sdata; link->link_id = link_id; link->conf = link_conf; link_conf->link_id = link_id; link_conf->vif = &sdata->vif; wiphy_work_init(&link->csa_finalize_work, ieee80211_csa_finalize_work); wiphy_work_init(&link->color_change_finalize_work, ieee80211_color_change_finalize_work); INIT_DELAYED_WORK(&link->color_collision_detect_work, ieee80211_color_collision_detection_work); INIT_LIST_HEAD(&link->assigned_chanctx_list); INIT_LIST_HEAD(&link->reserved_chanctx_list); wiphy_delayed_work_init(&link->dfs_cac_timer_work, ieee80211_dfs_cac_timer_work); if (!deflink) { switch (sdata->vif.type) { case NL80211_IFTYPE_AP: ether_addr_copy(link_conf->addr, sdata->wdev.links[link_id].addr); link_conf->bssid = link_conf->addr; WARN_ON(!(sdata->wdev.valid_links & BIT(link_id))); break; case NL80211_IFTYPE_STATION: /* station sets the bssid in ieee80211_mgd_setup_link */ break; default: WARN_ON(1); } ieee80211_link_debugfs_add(link); } } void ieee80211_link_stop(struct ieee80211_link_data *link) { if (link->sdata->vif.type == NL80211_IFTYPE_STATION) ieee80211_mgd_stop_link(link); cancel_delayed_work_sync(&link->color_collision_detect_work); ieee80211_link_release_channel(link); } struct link_container { struct ieee80211_link_data data; struct ieee80211_bss_conf conf; }; static void ieee80211_tear_down_links(struct ieee80211_sub_if_data *sdata, struct link_container **links, u16 mask) { struct ieee80211_link_data *link; LIST_HEAD(keys); unsigned int link_id; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!(mask & BIT(link_id))) continue; link = &links[link_id]->data; if (link_id == 0 && !link) link = &sdata->deflink; if (WARN_ON(!link)) continue; ieee80211_remove_link_keys(link, &keys); ieee80211_link_debugfs_remove(link); ieee80211_link_stop(link); } synchronize_rcu(); ieee80211_free_key_list(sdata->local, &keys); } static void ieee80211_free_links(struct ieee80211_sub_if_data *sdata, struct link_container **links) { unsigned int link_id; for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) kfree(links[link_id]); } static int ieee80211_check_dup_link_addrs(struct ieee80211_sub_if_data *sdata) { unsigned int i, j; for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) { struct ieee80211_link_data *link1; link1 = sdata_dereference(sdata->link[i], sdata); if (!link1) continue; for (j = i + 1; j < IEEE80211_MLD_MAX_NUM_LINKS; j++) { struct ieee80211_link_data *link2; link2 = sdata_dereference(sdata->link[j], sdata); if (!link2) continue; if (ether_addr_equal(link1->conf->addr, link2->conf->addr)) return -EALREADY; } } return 0; } static void ieee80211_set_vif_links_bitmaps(struct ieee80211_sub_if_data *sdata, u16 valid_links, u16 dormant_links) { sdata->vif.valid_links = valid_links; sdata->vif.dormant_links = dormant_links; if (!valid_links || WARN((~valid_links & dormant_links) || !(valid_links & ~dormant_links), "Invalid links: valid=0x%x, dormant=0x%x", valid_links, dormant_links)) { sdata->vif.active_links = 0; sdata->vif.dormant_links = 0; return; } switch (sdata->vif.type) { case NL80211_IFTYPE_AP: /* in an AP all links are always active */ sdata->vif.active_links = valid_links; /* AP links are not expected to be disabled */ WARN_ON(dormant_links); break; case NL80211_IFTYPE_STATION: if (sdata->vif.active_links) break; sdata->vif.active_links = valid_links & ~dormant_links; WARN_ON(hweight16(sdata->vif.active_links) > 1); break; default: WARN_ON(1); } } static int ieee80211_vif_update_links(struct ieee80211_sub_if_data *sdata, struct link_container **to_free, u16 new_links, u16 dormant_links) { u16 old_links = sdata->vif.valid_links; u16 old_active = sdata->vif.active_links; unsigned long add = new_links & ~old_links; unsigned long rem = old_links & ~new_links; unsigned int link_id; int ret; struct link_container *links[IEEE80211_MLD_MAX_NUM_LINKS] = {}, *link; struct ieee80211_bss_conf *old[IEEE80211_MLD_MAX_NUM_LINKS]; struct ieee80211_link_data *old_data[IEEE80211_MLD_MAX_NUM_LINKS]; bool use_deflink = old_links == 0; /* set for error case */ lockdep_assert_wiphy(sdata->local->hw.wiphy); memset(to_free, 0, sizeof(links)); if (old_links == new_links && dormant_links == sdata->vif.dormant_links) return 0; /* if there were no old links, need to clear the pointers to deflink */ if (!old_links) rem |= BIT(0); /* allocate new link structures first */ for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { link = kzalloc(sizeof(*link), GFP_KERNEL); if (!link) { ret = -ENOMEM; goto free; } links[link_id] = link; } /* keep track of the old pointers for the driver */ BUILD_BUG_ON(sizeof(old) != sizeof(sdata->vif.link_conf)); memcpy(old, sdata->vif.link_conf, sizeof(old)); /* and for us in error cases */ BUILD_BUG_ON(sizeof(old_data) != sizeof(sdata->link)); memcpy(old_data, sdata->link, sizeof(old_data)); /* grab old links to free later */ for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) { if (rcu_access_pointer(sdata->link[link_id]) != &sdata->deflink) { /* * we must have allocated the data through this path so * we know we can free both at the same time */ to_free[link_id] = container_of(rcu_access_pointer(sdata->link[link_id]), typeof(*links[link_id]), data); } RCU_INIT_POINTER(sdata->link[link_id], NULL); RCU_INIT_POINTER(sdata->vif.link_conf[link_id], NULL); } if (!old_links) ieee80211_debugfs_recreate_netdev(sdata, true); /* link them into data structures */ for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { WARN_ON(!use_deflink && rcu_access_pointer(sdata->link[link_id]) == &sdata->deflink); link = links[link_id]; ieee80211_link_init(sdata, link_id, &link->data, &link->conf); ieee80211_link_setup(&link->data); } if (new_links == 0) ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf); ret = ieee80211_check_dup_link_addrs(sdata); if (!ret) { /* for keys we will not be able to undo this */ ieee80211_tear_down_links(sdata, to_free, rem); ieee80211_set_vif_links_bitmaps(sdata, new_links, dormant_links); /* tell the driver */ ret = drv_change_vif_links(sdata->local, sdata, old_links & old_active, new_links & sdata->vif.active_links, old); if (!new_links) ieee80211_debugfs_recreate_netdev(sdata, false); } if (ret) { /* restore config */ memcpy(sdata->link, old_data, sizeof(old_data)); memcpy(sdata->vif.link_conf, old, sizeof(old)); ieee80211_set_vif_links_bitmaps(sdata, old_links, dormant_links); /* and free (only) the newly allocated links */ memset(to_free, 0, sizeof(links)); goto free; } /* use deflink/bss_conf again if and only if there are no more links */ use_deflink = new_links == 0; goto deinit; free: /* if we failed during allocation, only free all */ for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { kfree(links[link_id]); links[link_id] = NULL; } deinit: if (use_deflink) ieee80211_link_init(sdata, -1, &sdata->deflink, &sdata->vif.bss_conf); return ret; } int ieee80211_vif_set_links(struct ieee80211_sub_if_data *sdata, u16 new_links, u16 dormant_links) { struct link_container *links[IEEE80211_MLD_MAX_NUM_LINKS]; int ret; ret = ieee80211_vif_update_links(sdata, links, new_links, dormant_links); ieee80211_free_links(sdata, links); return ret; } static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, u16 active_links) { struct ieee80211_bss_conf *link_confs[IEEE80211_MLD_MAX_NUM_LINKS]; struct ieee80211_local *local = sdata->local; u16 old_active = sdata->vif.active_links; unsigned long rem = old_active & ~active_links; unsigned long add = active_links & ~old_active; struct sta_info *sta; unsigned int link_id; int ret, i; if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; if (sdata->vif.type != NL80211_IFTYPE_STATION) return -EINVAL; if (active_links & ~ieee80211_vif_usable_links(&sdata->vif)) return -EINVAL; /* nothing to do */ if (old_active == active_links) return 0; for (i = 0; i < IEEE80211_MLD_MAX_NUM_LINKS; i++) link_confs[i] = sdata_dereference(sdata->vif.link_conf[i], sdata); if (add) { sdata->vif.active_links |= active_links; ret = drv_change_vif_links(local, sdata, old_active, sdata->vif.active_links, link_confs); if (ret) { sdata->vif.active_links = old_active; return ret; } } for_each_set_bit(link_id, &rem, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); /* FIXME: kill TDLS connections on the link */ ieee80211_link_release_channel(link); } list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; /* this is very temporary, but do it anyway */ __ieee80211_sta_recalc_aggregates(sta, old_active | active_links); ret = drv_change_sta_links(local, sdata, &sta->sta, old_active, old_active | active_links); WARN_ON_ONCE(ret); } ret = ieee80211_key_switch_links(sdata, rem, add); WARN_ON_ONCE(ret); list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; __ieee80211_sta_recalc_aggregates(sta, active_links); ret = drv_change_sta_links(local, sdata, &sta->sta, old_active | active_links, active_links); WARN_ON_ONCE(ret); /* * Do it again, just in case - the driver might very * well have called ieee80211_sta_recalc_aggregates() * from there when filling in the new links, which * would set it wrong since the vif's active links are * not switched yet... */ __ieee80211_sta_recalc_aggregates(sta, active_links); } for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { struct ieee80211_link_data *link; link = sdata_dereference(sdata->link[link_id], sdata); ret = ieee80211_link_use_channel(link, &link->conf->chandef, IEEE80211_CHANCTX_SHARED); WARN_ON_ONCE(ret); ieee80211_mgd_set_link_qos_params(link); ieee80211_link_info_change_notify(sdata, link, BSS_CHANGED_ERP_CTS_PROT | BSS_CHANGED_ERP_PREAMBLE | BSS_CHANGED_ERP_SLOT | BSS_CHANGED_HT | BSS_CHANGED_BASIC_RATES | BSS_CHANGED_BSSID | BSS_CHANGED_CQM | BSS_CHANGED_QOS | BSS_CHANGED_TXPOWER | BSS_CHANGED_BANDWIDTH | BSS_CHANGED_TWT | BSS_CHANGED_HE_OBSS_PD | BSS_CHANGED_HE_BSS_COLOR); } old_active = sdata->vif.active_links; sdata->vif.active_links = active_links; if (rem) { ret = drv_change_vif_links(local, sdata, old_active, active_links, link_confs); WARN_ON_ONCE(ret); } return 0; } int ieee80211_set_active_links(struct ieee80211_vif *vif, u16 active_links) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); struct ieee80211_local *local = sdata->local; u16 old_active; int ret; lockdep_assert_wiphy(local->hw.wiphy); old_active = sdata->vif.active_links; if (old_active & active_links) { /* * if there's at least one link that stays active across * the change then switch to it (to those) first, and * then enable the additional links */ ret = _ieee80211_set_active_links(sdata, old_active & active_links); if (!ret) ret = _ieee80211_set_active_links(sdata, active_links); } else { /* otherwise switch directly */ ret = _ieee80211_set_active_links(sdata, active_links); } return ret; } EXPORT_SYMBOL_GPL(ieee80211_set_active_links); void ieee80211_set_active_links_async(struct ieee80211_vif *vif, u16 active_links) { struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); if (!ieee80211_sdata_running(sdata)) return; if (sdata->vif.type != NL80211_IFTYPE_STATION) return; if (active_links & ~ieee80211_vif_usable_links(&sdata->vif)) return; /* nothing to do */ if (sdata->vif.active_links == active_links) return; sdata->desired_active_links = active_links; wiphy_work_queue(sdata->local->hw.wiphy, &sdata->activate_links_work); } EXPORT_SYMBOL_GPL(ieee80211_set_active_links_async);
25091 25097 25100 25098 25101 25041 25047 25092 25043 25099 25107 4 4 2 4 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 // SPDX-License-Identifier: GPL-2.0+ /* * Restartable sequences system call * * Copyright (C) 2015, Google, Inc., * Paul Turner <pjt@google.com> and Andrew Hunter <ahh@google.com> * Copyright (C) 2015-2018, EfficiOS Inc., * Mathieu Desnoyers <mathieu.desnoyers@efficios.com> */ #include <linux/sched.h> #include <linux/uaccess.h> #include <linux/syscalls.h> #include <linux/rseq.h> #include <linux/types.h> #include <asm/ptrace.h> #define CREATE_TRACE_POINTS #include <trace/events/rseq.h> /* The original rseq structure size (including padding) is 32 bytes. */ #define ORIG_RSEQ_SIZE 32 #define RSEQ_CS_NO_RESTART_FLAGS (RSEQ_CS_FLAG_NO_RESTART_ON_PREEMPT | \ RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL | \ RSEQ_CS_FLAG_NO_RESTART_ON_MIGRATE) /* * * Restartable sequences are a lightweight interface that allows * user-level code to be executed atomically relative to scheduler * preemption and signal delivery. Typically used for implementing * per-cpu operations. * * It allows user-space to perform update operations on per-cpu data * without requiring heavy-weight atomic operations. * * Detailed algorithm of rseq user-space assembly sequences: * * init(rseq_cs) * cpu = TLS->rseq::cpu_id_start * [1] TLS->rseq::rseq_cs = rseq_cs * [start_ip] ---------------------------- * [2] if (cpu != TLS->rseq::cpu_id) * goto abort_ip; * [3] <last_instruction_in_cs> * [post_commit_ip] ---------------------------- * * The address of jump target abort_ip must be outside the critical * region, i.e.: * * [abort_ip] < [start_ip] || [abort_ip] >= [post_commit_ip] * * Steps [2]-[3] (inclusive) need to be a sequence of instructions in * userspace that can handle being interrupted between any of those * instructions, and then resumed to the abort_ip. * * 1. Userspace stores the address of the struct rseq_cs assembly * block descriptor into the rseq_cs field of the registered * struct rseq TLS area. This update is performed through a single * store within the inline assembly instruction sequence. * [start_ip] * * 2. Userspace tests to check whether the current cpu_id field match * the cpu number loaded before start_ip, branching to abort_ip * in case of a mismatch. * * If the sequence is preempted or interrupted by a signal * at or after start_ip and before post_commit_ip, then the kernel * clears TLS->__rseq_abi::rseq_cs, and sets the user-space return * ip to abort_ip before returning to user-space, so the preempted * execution resumes at abort_ip. * * 3. Userspace critical section final instruction before * post_commit_ip is the commit. The critical section is * self-terminating. * [post_commit_ip] * * 4. <success> * * On failure at [2], or if interrupted by preempt or signal delivery * between [1] and [3]: * * [abort_ip] * F1. <failure> */ static int rseq_update_cpu_node_id(struct task_struct *t) { struct rseq __user *rseq = t->rseq; u32 cpu_id = raw_smp_processor_id(); u32 node_id = cpu_to_node(cpu_id); u32 mm_cid = task_mm_cid(t); WARN_ON_ONCE((int) mm_cid < 0); if (!user_write_access_begin(rseq, t->rseq_len)) goto efault; unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end); unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end); unsafe_put_user(node_id, &rseq->node_id, efault_end); unsafe_put_user(mm_cid, &rseq->mm_cid, efault_end); /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally updated only if * t->rseq_len != ORIG_RSEQ_SIZE. */ user_write_access_end(); trace_rseq_update(t); return 0; efault_end: user_write_access_end(); efault: return -EFAULT; } static int rseq_reset_rseq_cpu_node_id(struct task_struct *t) { u32 cpu_id_start = 0, cpu_id = RSEQ_CPU_ID_UNINITIALIZED, node_id = 0, mm_cid = 0; /* * Reset cpu_id_start to its initial state (0). */ if (put_user(cpu_id_start, &t->rseq->cpu_id_start)) return -EFAULT; /* * Reset cpu_id to RSEQ_CPU_ID_UNINITIALIZED, so any user coming * in after unregistration can figure out that rseq needs to be * registered again. */ if (put_user(cpu_id, &t->rseq->cpu_id)) return -EFAULT; /* * Reset node_id to its initial state (0). */ if (put_user(node_id, &t->rseq->node_id)) return -EFAULT; /* * Reset mm_cid to its initial state (0). */ if (put_user(mm_cid, &t->rseq->mm_cid)) return -EFAULT; /* * Additional feature fields added after ORIG_RSEQ_SIZE * need to be conditionally reset only if * t->rseq_len != ORIG_RSEQ_SIZE. */ return 0; } static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs) { struct rseq_cs __user *urseq_cs; u64 ptr; u32 __user *usig; u32 sig; int ret; #ifdef CONFIG_64BIT if (get_user(ptr, &t->rseq->rseq_cs)) return -EFAULT; #else if (copy_from_user(&ptr, &t->rseq->rseq_cs, sizeof(ptr))) return -EFAULT; #endif if (!ptr) { memset(rseq_cs, 0, sizeof(*rseq_cs)); return 0; } if (ptr >= TASK_SIZE) return -EINVAL; urseq_cs = (struct rseq_cs __user *)(unsigned long)ptr; if (copy_from_user(rseq_cs, urseq_cs, sizeof(*rseq_cs))) return -EFAULT; if (rseq_cs->start_ip >= TASK_SIZE || rseq_cs->start_ip + rseq_cs->post_commit_offset >= TASK_SIZE || rseq_cs->abort_ip >= TASK_SIZE || rseq_cs->version > 0) return -EINVAL; /* Check for overflow. */ if (rseq_cs->start_ip + rseq_cs->post_commit_offset < rseq_cs->start_ip) return -EINVAL; /* Ensure that abort_ip is not in the critical section. */ if (rseq_cs->abort_ip - rseq_cs->start_ip < rseq_cs->post_commit_offset) return -EINVAL; usig = (u32 __user *)(unsigned long)(rseq_cs->abort_ip - sizeof(u32)); ret = get_user(sig, usig); if (ret) return ret; if (current->rseq_sig != sig) { printk_ratelimited(KERN_WARNING "Possible attack attempt. Unexpected rseq signature 0x%x, expecting 0x%x (pid=%d, addr=%p).\n", sig, current->rseq_sig, current->pid, usig); return -EINVAL; } return 0; } static bool rseq_warn_flags(const char *str, u32 flags) { u32 test_flags; if (!flags) return false; test_flags = flags & RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Deprecated flags (%u) in %s ABI structure", test_flags, str); test_flags = flags & ~RSEQ_CS_NO_RESTART_FLAGS; if (test_flags) pr_warn_once("Unknown flags (%u) in %s ABI structure", test_flags, str); return true; } static int rseq_need_restart(struct task_struct *t, u32 cs_flags) { u32 flags, event_mask; int ret; if (rseq_warn_flags("rseq_cs", cs_flags)) return -EINVAL; /* Get thread flags. */ ret = get_user(flags, &t->rseq->flags); if (ret) return ret; if (rseq_warn_flags("rseq", flags)) return -EINVAL; /* * Load and clear event mask atomically with respect to * scheduler preemption. */ preempt_disable(); event_mask = t->rseq_event_mask; t->rseq_event_mask = 0; preempt_enable(); return !!event_mask; } static int clear_rseq_cs(struct task_struct *t) { /* * The rseq_cs field is set to NULL on preemption or signal * delivery on top of rseq assembly block, as well as on top * of code outside of the rseq assembly block. This performs * a lazy clear of the rseq_cs field. * * Set rseq_cs to NULL. */ #ifdef CONFIG_64BIT return put_user(0UL, &t->rseq->rseq_cs); #else if (clear_user(&t->rseq->rseq_cs, sizeof(t->rseq->rseq_cs))) return -EFAULT; return 0; #endif } /* * Unsigned comparison will be true when ip >= start_ip, and when * ip < start_ip + post_commit_offset. */ static bool in_rseq_cs(unsigned long ip, struct rseq_cs *rseq_cs) { return ip - rseq_cs->start_ip < rseq_cs->post_commit_offset; } static int rseq_ip_fixup(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; int ret; ret = rseq_get_rseq_cs(t, &rseq_cs); if (ret) return ret; /* * Handle potentially not being within a critical section. * If not nested over a rseq critical section, restart is useless. * Clear the rseq_cs pointer and return. */ if (!in_rseq_cs(ip, &rseq_cs)) return clear_rseq_cs(t); ret = rseq_need_restart(t, rseq_cs.flags); if (ret <= 0) return ret; ret = clear_rseq_cs(t); if (ret) return ret; trace_rseq_ip_fixup(ip, rseq_cs.start_ip, rseq_cs.post_commit_offset, rseq_cs.abort_ip); instruction_pointer_set(regs, (unsigned long)rseq_cs.abort_ip); return 0; } /* * This resume handler must always be executed between any of: * - preemption, * - signal delivery, * and return to user-space. * * This is how we can ensure that the entire rseq critical section * will issue the commit instruction only if executed atomically with * respect to other threads scheduled on the same CPU, and with respect * to signal handlers. */ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs) { struct task_struct *t = current; int ret, sig; if (unlikely(t->flags & PF_EXITING)) return; /* * regs is NULL if and only if the caller is in a syscall path. Skip * fixup and leave rseq_cs as is so that rseq_sycall() will detect and * kill a misbehaving userspace on debug kernels. */ if (regs) { ret = rseq_ip_fixup(regs); if (unlikely(ret < 0)) goto error; } if (unlikely(rseq_update_cpu_node_id(t))) goto error; return; error: sig = ksig ? ksig->sig : 0; force_sigsegv(sig); } #ifdef CONFIG_DEBUG_RSEQ /* * Terminate the process if a syscall is issued within a restartable * sequence. */ void rseq_syscall(struct pt_regs *regs) { unsigned long ip = instruction_pointer(regs); struct task_struct *t = current; struct rseq_cs rseq_cs; if (!t->rseq) return; if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs)) force_sig(SIGSEGV); } #endif /* * sys_rseq - setup restartable sequences for caller thread. */ SYSCALL_DEFINE4(rseq, struct rseq __user *, rseq, u32, rseq_len, int, flags, u32, sig) { int ret; if (flags & RSEQ_FLAG_UNREGISTER) { if (flags & ~RSEQ_FLAG_UNREGISTER) return -EINVAL; /* Unregister rseq for current thread. */ if (current->rseq != rseq || !current->rseq) return -EINVAL; if (rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; ret = rseq_reset_rseq_cpu_node_id(current); if (ret) return ret; current->rseq = NULL; current->rseq_sig = 0; current->rseq_len = 0; return 0; } if (unlikely(flags)) return -EINVAL; if (current->rseq) { /* * If rseq is already registered, check whether * the provided address differs from the prior * one. */ if (current->rseq != rseq || rseq_len != current->rseq_len) return -EINVAL; if (current->rseq_sig != sig) return -EPERM; /* Already registered. */ return -EBUSY; } /* * If there was no rseq previously registered, ensure the provided rseq * is properly aligned, as communcated to user-space through the ELF * auxiliary vector AT_RSEQ_ALIGN. If rseq_len is the original rseq * size, the required alignment is the original struct rseq alignment. * * In order to be valid, rseq_len is either the original rseq size, or * large enough to contain all supported fields, as communicated to * user-space through the ELF auxiliary vector AT_RSEQ_FEATURE_SIZE. */ if (rseq_len < ORIG_RSEQ_SIZE || (rseq_len == ORIG_RSEQ_SIZE && !IS_ALIGNED((unsigned long)rseq, ORIG_RSEQ_SIZE)) || (rseq_len != ORIG_RSEQ_SIZE && (!IS_ALIGNED((unsigned long)rseq, __alignof__(*rseq)) || rseq_len < offsetof(struct rseq, end)))) return -EINVAL; if (!access_ok(rseq, rseq_len)) return -EFAULT; current->rseq = rseq; current->rseq_len = rseq_len; current->rseq_sig = sig; /* * If rseq was previously inactive, and has just been * registered, ensure the cpu_id_start and cpu_id fields * are updated before returning to user-space. */ rseq_set_notify_resume(current); return 0; }
36 36 36 36 36 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 // SPDX-License-Identifier: GPL-2.0-or-later /* mpih-rshift.c - MPI helper functions * Copyright (C) 1994, 1996, 1998, 1999, * 2000, 2001 Free Software Foundation, Inc. * * This file is part of GNUPG * * Note: This code is heavily based on the GNU MP Library. * Actually it's the same code with only minor changes in the * way the data is stored; this is to support the abstraction * of an optional secure memory allocation which may be used * to avoid revealing of sensitive data due to paging etc. * The GNU MP Library itself is published under the LGPL; * however I decided to publish this code under the plain GPL. */ #include "mpi-internal.h" /* Shift U (pointed to by UP and USIZE limbs long) CNT bits to the right * and store the USIZE least significant limbs of the result at WP. * The bits shifted out to the right are returned. * * Argument constraints: * 1. 0 < CNT < BITS_PER_MP_LIMB * 2. If the result is to be written over the input, WP must be <= UP. */ mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt) { mpi_limb_t high_limb, low_limb; unsigned sh_1, sh_2; mpi_size_t i; mpi_limb_t retval; sh_1 = cnt; wp -= 1; sh_2 = BITS_PER_MPI_LIMB - sh_1; high_limb = up[0]; retval = high_limb << sh_2; low_limb = high_limb; for (i = 1; i < usize; i++) { high_limb = up[i]; wp[i] = (low_limb >> sh_1) | (high_limb << sh_2); low_limb = high_limb; } wp[i] = low_limb >> sh_1; return retval; }
1 1500 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pm_qos.h> static inline void device_pm_init_common(struct device *dev) { if (!dev->power.early_init) { spin_lock_init(&dev->power.lock); dev->power.qos = NULL; dev->power.early_init = true; } } #ifdef CONFIG_PM static inline void pm_runtime_early_init(struct device *dev) { dev->power.disable_depth = 1; device_pm_init_common(dev); } extern void pm_runtime_init(struct device *dev); extern void pm_runtime_reinit(struct device *dev); extern void pm_runtime_remove(struct device *dev); extern u64 pm_runtime_active_time(struct device *dev); #define WAKE_IRQ_DEDICATED_ALLOCATED BIT(0) #define WAKE_IRQ_DEDICATED_MANAGED BIT(1) #define WAKE_IRQ_DEDICATED_REVERSE BIT(2) #define WAKE_IRQ_DEDICATED_MASK (WAKE_IRQ_DEDICATED_ALLOCATED | \ WAKE_IRQ_DEDICATED_MANAGED | \ WAKE_IRQ_DEDICATED_REVERSE) #define WAKE_IRQ_DEDICATED_ENABLED BIT(3) struct wake_irq { struct device *dev; unsigned int status; int irq; const char *name; }; extern void dev_pm_arm_wake_irq(struct wake_irq *wirq); extern void dev_pm_disarm_wake_irq(struct wake_irq *wirq); extern void dev_pm_enable_wake_irq_check(struct device *dev, bool can_change_status); extern void dev_pm_disable_wake_irq_check(struct device *dev, bool cond_disable); extern void dev_pm_enable_wake_irq_complete(struct device *dev); #ifdef CONFIG_PM_SLEEP extern void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq); extern void device_wakeup_detach_irq(struct device *dev); extern void device_wakeup_arm_wake_irqs(void); extern void device_wakeup_disarm_wake_irqs(void); #else static inline void device_wakeup_attach_irq(struct device *dev, struct wake_irq *wakeirq) {} static inline void device_wakeup_detach_irq(struct device *dev) { } #endif /* CONFIG_PM_SLEEP */ /* * sysfs.c */ extern int dpm_sysfs_add(struct device *dev); extern void dpm_sysfs_remove(struct device *dev); extern void rpm_sysfs_remove(struct device *dev); extern int wakeup_sysfs_add(struct device *dev); extern void wakeup_sysfs_remove(struct device *dev); extern int pm_qos_sysfs_add_resume_latency(struct device *dev); extern void pm_qos_sysfs_remove_resume_latency(struct device *dev); extern int pm_qos_sysfs_add_flags(struct device *dev); extern void pm_qos_sysfs_remove_flags(struct device *dev); extern int pm_qos_sysfs_add_latency_tolerance(struct device *dev); extern void pm_qos_sysfs_remove_latency_tolerance(struct device *dev); extern int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); #else /* CONFIG_PM */ static inline void pm_runtime_early_init(struct device *dev) { device_pm_init_common(dev); } static inline void pm_runtime_init(struct device *dev) {} static inline void pm_runtime_reinit(struct device *dev) {} static inline void pm_runtime_remove(struct device *dev) {} static inline int dpm_sysfs_add(struct device *dev) { return 0; } static inline void dpm_sysfs_remove(struct device *dev) {} static inline int dpm_sysfs_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid) { return 0; } #endif #ifdef CONFIG_PM_SLEEP /* kernel/power/main.c */ extern int pm_async_enabled; /* drivers/base/power/main.c */ extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { return container_of(entry, struct device, power.entry); } extern void device_pm_sleep_init(struct device *dev); extern void device_pm_add(struct device *); extern void device_pm_remove(struct device *); extern void device_pm_move_before(struct device *, struct device *); extern void device_pm_move_after(struct device *, struct device *); extern void device_pm_move_last(struct device *); extern void device_pm_check_callbacks(struct device *dev); static inline bool device_pm_initialized(struct device *dev) { return dev->power.in_dpm_list; } /* drivers/base/power/wakeup_stats.c */ extern int wakeup_source_sysfs_add(struct device *parent, struct wakeup_source *ws); extern void wakeup_source_sysfs_remove(struct wakeup_source *ws); extern int pm_wakeup_source_sysfs_add(struct device *parent); #else /* !CONFIG_PM_SLEEP */ static inline void device_pm_sleep_init(struct device *dev) {} static inline void device_pm_add(struct device *dev) {} static inline void device_pm_remove(struct device *dev) { pm_runtime_remove(dev); } static inline void device_pm_move_before(struct device *deva, struct device *devb) {} static inline void device_pm_move_after(struct device *deva, struct device *devb) {} static inline void device_pm_move_last(struct device *dev) {} static inline void device_pm_check_callbacks(struct device *dev) {} static inline bool device_pm_initialized(struct device *dev) { return device_is_registered(dev); } static inline int pm_wakeup_source_sysfs_add(struct device *parent) { return 0; } #endif /* !CONFIG_PM_SLEEP */ static inline void device_pm_init(struct device *dev) { device_pm_init_common(dev); device_pm_sleep_init(dev); pm_runtime_init(dev); }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 #ifndef _CRYPTO_GCM_H #define _CRYPTO_GCM_H #include <linux/errno.h> #include <crypto/aes.h> #include <crypto/gf128mul.h> #define GCM_AES_IV_SIZE 12 #define GCM_RFC4106_IV_SIZE 8 #define GCM_RFC4543_IV_SIZE 8 /* * validate authentication tag for GCM */ static inline int crypto_gcm_check_authsize(unsigned int authsize) { switch (authsize) { case 4: case 8: case 12: case 13: case 14: case 15: case 16: break; default: return -EINVAL; } return 0; } /* * validate authentication tag for RFC4106 */ static inline int crypto_rfc4106_check_authsize(unsigned int authsize) { switch (authsize) { case 8: case 12: case 16: break; default: return -EINVAL; } return 0; } /* * validate assoclen for RFC4106/RFC4543 */ static inline int crypto_ipsec_check_assoclen(unsigned int assoclen) { switch (assoclen) { case 16: case 20: break; default: return -EINVAL; } return 0; } struct aesgcm_ctx { be128 ghash_key; struct crypto_aes_ctx aes_ctx; unsigned int authsize; }; int aesgcm_expandkey(struct aesgcm_ctx *ctx, const u8 *key, unsigned int keysize, unsigned int authsize); void aesgcm_encrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, int crypt_len, const u8 *assoc, int assoc_len, const u8 iv[GCM_AES_IV_SIZE], u8 *authtag); bool __must_check aesgcm_decrypt(const struct aesgcm_ctx *ctx, u8 *dst, const u8 *src, int crypt_len, const u8 *assoc, int assoc_len, const u8 iv[GCM_AES_IV_SIZE], const u8 *authtag); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 /* SPDX-License-Identifier: GPL-2.0 */ /* * USB PHY defines * * These APIs may be used between USB controllers. USB device drivers * (for either host or peripheral roles) don't use these calls; they * continue to use just usb_device and usb_gadget. */ #ifndef __LINUX_USB_PHY_H #define __LINUX_USB_PHY_H #include <linux/extcon.h> #include <linux/notifier.h> #include <linux/usb.h> #include <uapi/linux/usb/charger.h> enum usb_phy_interface { USBPHY_INTERFACE_MODE_UNKNOWN, USBPHY_INTERFACE_MODE_UTMI, USBPHY_INTERFACE_MODE_UTMIW, USBPHY_INTERFACE_MODE_ULPI, USBPHY_INTERFACE_MODE_SERIAL, USBPHY_INTERFACE_MODE_HSIC, }; enum usb_phy_events { USB_EVENT_NONE, /* no events or cable disconnected */ USB_EVENT_VBUS, /* vbus valid event */ USB_EVENT_ID, /* id was grounded */ USB_EVENT_CHARGER, /* usb dedicated charger */ USB_EVENT_ENUMERATED, /* gadget driver enumerated */ }; /* associate a type with PHY */ enum usb_phy_type { USB_PHY_TYPE_UNDEFINED, USB_PHY_TYPE_USB2, USB_PHY_TYPE_USB3, }; /* OTG defines lots of enumeration states before device reset */ enum usb_otg_state { OTG_STATE_UNDEFINED = 0, /* single-role peripheral, and dual-role default-b */ OTG_STATE_B_IDLE, OTG_STATE_B_SRP_INIT, OTG_STATE_B_PERIPHERAL, /* extra dual-role default-b states */ OTG_STATE_B_WAIT_ACON, OTG_STATE_B_HOST, /* dual-role default-a */ OTG_STATE_A_IDLE, OTG_STATE_A_WAIT_VRISE, OTG_STATE_A_WAIT_BCON, OTG_STATE_A_HOST, OTG_STATE_A_SUSPEND, OTG_STATE_A_PERIPHERAL, OTG_STATE_A_WAIT_VFALL, OTG_STATE_A_VBUS_ERR, }; struct usb_phy; struct usb_otg; /* for phys connected thru an ULPI interface, the user must * provide access ops */ struct usb_phy_io_ops { int (*read)(struct usb_phy *x, u32 reg); int (*write)(struct usb_phy *x, u32 val, u32 reg); }; struct usb_charger_current { unsigned int sdp_min; unsigned int sdp_max; unsigned int dcp_min; unsigned int dcp_max; unsigned int cdp_min; unsigned int cdp_max; unsigned int aca_min; unsigned int aca_max; }; struct usb_phy { struct device *dev; const char *label; unsigned int flags; enum usb_phy_type type; enum usb_phy_events last_event; struct usb_otg *otg; struct device *io_dev; struct usb_phy_io_ops *io_ops; void __iomem *io_priv; /* to support extcon device */ struct extcon_dev *edev; struct extcon_dev *id_edev; struct notifier_block vbus_nb; struct notifier_block id_nb; struct notifier_block type_nb; /* Support USB charger */ enum usb_charger_type chg_type; enum usb_charger_state chg_state; struct usb_charger_current chg_cur; struct work_struct chg_work; /* for notification of usb_phy_events */ struct atomic_notifier_head notifier; /* to pass extra port status to the root hub */ u16 port_status; u16 port_change; /* to support controllers that have multiple phys */ struct list_head head; /* initialize/shutdown the phy */ int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); /* enable/disable VBUS */ int (*set_vbus)(struct usb_phy *x, int on); /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); /* Set phy into suspend mode */ int (*set_suspend)(struct usb_phy *x, int suspend); /* * Set wakeup enable for PHY, in that case, the PHY can be * woken up from suspend status due to external events, * like vbus change, dp/dm change and id. */ int (*set_wakeup)(struct usb_phy *x, bool enabled); /* notify phy port status change */ int (*notify_port_status)(struct usb_phy *x, int port, u16 portstatus, u16 portchange); /* notify phy connect status change */ int (*notify_connect)(struct usb_phy *x, enum usb_device_speed speed); int (*notify_disconnect)(struct usb_phy *x, enum usb_device_speed speed); /* * Charger detection method can be implemented if you need to * manually detect the charger type. */ enum usb_charger_type (*charger_detect)(struct usb_phy *x); }; /* for board-specific init logic */ extern int usb_add_phy(struct usb_phy *, enum usb_phy_type type); extern int usb_add_phy_dev(struct usb_phy *); extern void usb_remove_phy(struct usb_phy *); /* helpers for direct access thru low-level io interface */ static inline int usb_phy_io_read(struct usb_phy *x, u32 reg) { if (x && x->io_ops && x->io_ops->read) return x->io_ops->read(x, reg); return -EINVAL; } static inline int usb_phy_io_write(struct usb_phy *x, u32 val, u32 reg) { if (x && x->io_ops && x->io_ops->write) return x->io_ops->write(x, val, reg); return -EINVAL; } static inline int usb_phy_init(struct usb_phy *x) { if (x && x->init) return x->init(x); return 0; } static inline void usb_phy_shutdown(struct usb_phy *x) { if (x && x->shutdown) x->shutdown(x); } static inline int usb_phy_vbus_on(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, true); } static inline int usb_phy_vbus_off(struct usb_phy *x) { if (!x || !x->set_vbus) return 0; return x->set_vbus(x, false); } /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index); extern struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb); extern void usb_put_phy(struct usb_phy *); extern void devm_usb_put_phy(struct device *dev, struct usb_phy *x); extern void usb_phy_set_event(struct usb_phy *x, unsigned long event); extern void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA); extern void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max); extern void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state); #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_node(struct device *dev, struct device_node *node, struct notifier_block *nb) { return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) { } static inline void devm_usb_put_phy(struct device *dev, struct usb_phy *x) { } static inline void usb_phy_set_event(struct usb_phy *x, unsigned long event) { } static inline void usb_phy_set_charger_current(struct usb_phy *usb_phy, unsigned int mA) { } static inline void usb_phy_get_charger_current(struct usb_phy *usb_phy, unsigned int *min, unsigned int *max) { } static inline void usb_phy_set_charger_state(struct usb_phy *usb_phy, enum usb_charger_state state) { } #endif static inline int usb_phy_set_power(struct usb_phy *x, unsigned mA) { if (!x) return 0; usb_phy_set_charger_current(x, mA); if (x->set_power) return x->set_power(x, mA); return 0; } /* Context: can sleep */ static inline int usb_phy_set_suspend(struct usb_phy *x, int suspend) { if (x && x->set_suspend != NULL) return x->set_suspend(x, suspend); else return 0; } static inline int usb_phy_set_wakeup(struct usb_phy *x, bool enabled) { if (x && x->set_wakeup) return x->set_wakeup(x, enabled); else return 0; } static inline int usb_phy_notify_port_status(struct usb_phy *x, int port, u16 portstatus, u16 portchange) { if (x && x->notify_port_status) return x->notify_port_status(x, port, portstatus, portchange); else return 0; } static inline int usb_phy_notify_connect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_connect) return x->notify_connect(x, speed); else return 0; } static inline int usb_phy_notify_disconnect(struct usb_phy *x, enum usb_device_speed speed) { if (x && x->notify_disconnect) return x->notify_disconnect(x, speed); else return 0; } /* notifiers */ static inline int usb_register_notifier(struct usb_phy *x, struct notifier_block *nb) { return atomic_notifier_chain_register(&x->notifier, nb); } static inline void usb_unregister_notifier(struct usb_phy *x, struct notifier_block *nb) { atomic_notifier_chain_unregister(&x->notifier, nb); } static inline const char *usb_phy_type_string(enum usb_phy_type type) { switch (type) { case USB_PHY_TYPE_USB2: return "USB2 PHY"; case USB_PHY_TYPE_USB3: return "USB3 PHY"; default: return "UNKNOWN PHY TYPE"; } } #endif /* __LINUX_USB_PHY_H */
46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM vsock #if !defined(_TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H) || \ defined(TRACE_HEADER_MULTI_READ) #define _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H #include <linux/tracepoint.h> TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_SEQPACKET); #define show_type(val) \ __print_symbolic(val, \ { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }, \ { VIRTIO_VSOCK_TYPE_SEQPACKET, "SEQPACKET" }) TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RESPONSE); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RST); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_SHUTDOWN); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_RW); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_UPDATE); TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_CREDIT_REQUEST); #define show_op(val) \ __print_symbolic(val, \ { VIRTIO_VSOCK_OP_INVALID, "INVALID" }, \ { VIRTIO_VSOCK_OP_REQUEST, "REQUEST" }, \ { VIRTIO_VSOCK_OP_RESPONSE, "RESPONSE" }, \ { VIRTIO_VSOCK_OP_RST, "RST" }, \ { VIRTIO_VSOCK_OP_SHUTDOWN, "SHUTDOWN" }, \ { VIRTIO_VSOCK_OP_RW, "RW" }, \ { VIRTIO_VSOCK_OP_CREDIT_UPDATE, "CREDIT_UPDATE" }, \ { VIRTIO_VSOCK_OP_CREDIT_REQUEST, "CREDIT_REQUEST" }) TRACE_EVENT(virtio_transport_alloc_pkt, TP_PROTO( __u32 src_cid, __u32 src_port, __u32 dst_cid, __u32 dst_port, __u32 len, __u16 type, __u16 op, __u32 flags, bool zcopy ), TP_ARGS( src_cid, src_port, dst_cid, dst_port, len, type, op, flags, zcopy ), TP_STRUCT__entry( __field(__u32, src_cid) __field(__u32, src_port) __field(__u32, dst_cid) __field(__u32, dst_port) __field(__u32, len) __field(__u16, type) __field(__u16, op) __field(__u32, flags) __field(bool, zcopy) ), TP_fast_assign( __entry->src_cid = src_cid; __entry->src_port = src_port; __entry->dst_cid = dst_cid; __entry->dst_port = dst_port; __entry->len = len; __entry->type = type; __entry->op = op; __entry->flags = flags; __entry->zcopy = zcopy; ), TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x zcopy=%s", __entry->src_cid, __entry->src_port, __entry->dst_cid, __entry->dst_port, __entry->len, show_type(__entry->type), show_op(__entry->op), __entry->flags, __entry->zcopy ? "true" : "false") ); TRACE_EVENT(virtio_transport_recv_pkt, TP_PROTO( __u32 src_cid, __u32 src_port, __u32 dst_cid, __u32 dst_port, __u32 len, __u16 type, __u16 op, __u32 flags, __u32 buf_alloc, __u32 fwd_cnt ), TP_ARGS( src_cid, src_port, dst_cid, dst_port, len, type, op, flags, buf_alloc, fwd_cnt ), TP_STRUCT__entry( __field(__u32, src_cid) __field(__u32, src_port) __field(__u32, dst_cid) __field(__u32, dst_port) __field(__u32, len) __field(__u16, type) __field(__u16, op) __field(__u32, flags) __field(__u32, buf_alloc) __field(__u32, fwd_cnt) ), TP_fast_assign( __entry->src_cid = src_cid; __entry->src_port = src_port; __entry->dst_cid = dst_cid; __entry->dst_port = dst_port; __entry->len = len; __entry->type = type; __entry->op = op; __entry->flags = flags; __entry->buf_alloc = buf_alloc; __entry->fwd_cnt = fwd_cnt; ), TP_printk("%u:%u -> %u:%u len=%u type=%s op=%s flags=%#x " "buf_alloc=%u fwd_cnt=%u", __entry->src_cid, __entry->src_port, __entry->dst_cid, __entry->dst_port, __entry->len, show_type(__entry->type), show_op(__entry->op), __entry->flags, __entry->buf_alloc, __entry->fwd_cnt) ); #endif /* _TRACE_VSOCK_VIRTIO_TRANSPORT_COMMON_H */ #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_FILE vsock_virtio_transport_common /* This part must be outside protection */ #include <trace/define_trace.h>
12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef PM_TRACE_H #define PM_TRACE_H #include <linux/types.h> #ifdef CONFIG_PM_TRACE #include <asm/pm-trace.h> extern int pm_trace_enabled; extern bool pm_trace_rtc_abused; static inline bool pm_trace_rtc_valid(void) { return !pm_trace_rtc_abused; } static inline int pm_trace_is_enabled(void) { return pm_trace_enabled; } struct device; extern void set_trace_device(struct device *); extern void generate_pm_trace(const void *tracedata, unsigned int user); extern int show_trace_dev_match(char *buf, size_t size); #define TRACE_DEVICE(dev) do { \ if (pm_trace_enabled) \ set_trace_device(dev); \ } while(0) #else static inline bool pm_trace_rtc_valid(void) { return true; } static inline int pm_trace_is_enabled(void) { return 0; } #define TRACE_DEVICE(dev) do { } while (0) #define TRACE_RESUME(dev) do { } while (0) #define TRACE_SUSPEND(dev) do { } while (0) #endif #endif
120 121 121 120 155 153 111 10 155 126 112 126 114 113 113 85 82 82 126 126 155 153 10 5 5 5 5 5 5 9 9 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2008 Oracle. All rights reserved. */ #include <linux/sched.h> #include <linux/pagemap.h> #include <linux/spinlock.h> #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> #include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" #include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. For a given * eb, the lockdep key is determined by the btrfs_root it belongs to and * the level the eb occupies in the tree. * * Different roots are used for different purposes and may nest inside each * other and they require separate keysets. As lockdep keys should be * static, assign keysets according to the purpose of the root as indicated * by btrfs_root->root_key.objectid. This ensures that all special purpose * roots have separate keysets. * * Lock-nesting across peer nodes is always done with the immediate parent * node locked thus preventing deadlock. As lockdep doesn't know this, use * subclass to avoid triggering lockdep warning in such cases. * * The key is set by the readpage_end_io_hook after the buffer has passed * csum validation but before the pages are unlocked. It is also set by * btrfs_init_new_buffer on freshly allocated blocks. * * We also add a check to make sure the highest level of the tree is the * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code * needs update as well. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #if BTRFS_MAX_LEVEL != 8 #error #endif #define DEFINE_LEVEL(stem, level) \ .names[level] = "btrfs-" stem "-0" #level, #define DEFINE_NAME(stem) \ DEFINE_LEVEL(stem, 0) \ DEFINE_LEVEL(stem, 1) \ DEFINE_LEVEL(stem, 2) \ DEFINE_LEVEL(stem, 3) \ DEFINE_LEVEL(stem, 4) \ DEFINE_LEVEL(stem, 5) \ DEFINE_LEVEL(stem, 6) \ DEFINE_LEVEL(stem, 7) static struct btrfs_lockdep_keyset { u64 id; /* root objectid */ /* Longest entry: btrfs-block-group-00 */ char names[BTRFS_MAX_LEVEL][24]; struct lock_class_key keys[BTRFS_MAX_LEVEL]; } btrfs_lockdep_keysets[] = { { .id = BTRFS_ROOT_TREE_OBJECTID, DEFINE_NAME("root") }, { .id = BTRFS_EXTENT_TREE_OBJECTID, DEFINE_NAME("extent") }, { .id = BTRFS_CHUNK_TREE_OBJECTID, DEFINE_NAME("chunk") }, { .id = BTRFS_DEV_TREE_OBJECTID, DEFINE_NAME("dev") }, { .id = BTRFS_CSUM_TREE_OBJECTID, DEFINE_NAME("csum") }, { .id = BTRFS_QUOTA_TREE_OBJECTID, DEFINE_NAME("quota") }, { .id = BTRFS_TREE_LOG_OBJECTID, DEFINE_NAME("log") }, { .id = BTRFS_TREE_RELOC_OBJECTID, DEFINE_NAME("treloc") }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, DEFINE_NAME("dreloc") }, { .id = BTRFS_UUID_TREE_OBJECTID, DEFINE_NAME("uuid") }, { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, DEFINE_NAME("free-space") }, { .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") }, { .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") }, { .id = 0, DEFINE_NAME("tree") }, }; #undef DEFINE_LEVEL #undef DEFINE_NAME void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level) { struct btrfs_lockdep_keyset *ks; BUG_ON(level >= ARRAY_SIZE(ks->keys)); /* Find the matching keyset, id 0 is the default entry */ for (ks = btrfs_lockdep_keysets; ks->id; ks++) if (ks->id == objectid) break; lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]); } void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb) { if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state)) btrfs_set_buffer_lockdep_class(root->root_key.objectid, eb, btrfs_header_level(eb)); } #endif #ifdef CONFIG_BTRFS_DEBUG static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { eb->lock_owner = owner; } #else static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { } #endif /* * Extent buffer locking * ===================== * * We use a rw_semaphore for tree locking, and the semantics are exactly the * same: * * - reader/writer exclusion * - writer/writer exclusion * - reader/reader sharing * - try-lock semantics for readers and writers * * The rwsem implementation does opportunistic spinning which reduces number of * times the locking task needs to sleep. */ /* * __btrfs_tree_read_lock - lock extent buffer for read * @eb: the eb to be locked * @nest: the nesting level to be used for lockdep * * This takes the read lock on the extent buffer, using the specified nesting * level for lockdep purposes. */ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) { u64 start_ns = 0; if (trace_btrfs_tree_read_lock_enabled()) start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); trace_btrfs_tree_read_lock(eb, start_ns); } void btrfs_tree_read_lock(struct extent_buffer *eb) { __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL); } /* * Try-lock for read. * * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { trace_btrfs_try_tree_read_lock(eb); return 1; } return 0; } /* * Try-lock for write. * * Return 1 if the rwlock has been taken, 0 otherwise */ int btrfs_try_tree_write_lock(struct extent_buffer *eb) { if (down_write_trylock(&eb->lock)) { btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_try_tree_write_lock(eb); return 1; } return 0; } /* * Release read lock. */ void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); up_read(&eb->lock); } /* * Lock eb for write. * * @eb: the eb to lock * @nest: the nesting to use for the lock * * Returns with the eb->lock write locked. */ void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; if (trace_btrfs_tree_lock_enabled()) start_ns = ktime_get_ns(); down_write_nested(&eb->lock, nest); btrfs_set_eb_lock_owner(eb, current->pid); trace_btrfs_tree_lock(eb, start_ns); } void btrfs_tree_lock(struct extent_buffer *eb) { __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); } /* * Release the write lock. */ void btrfs_tree_unlock(struct extent_buffer *eb) { trace_btrfs_tree_unlock(eb); btrfs_set_eb_lock_owner(eb, 0); up_write(&eb->lock); } /* * This releases any locks held in the path starting at level and going all the * way up to the root. * * btrfs_search_slot will keep the lock held on higher nodes in a few corner * cases, such as COW of the block at slot zero in the node. This ignores * those rules, and it should only be called when there are no more updates to * be done higher up in the tree. */ void btrfs_unlock_up_safe(struct btrfs_path *path, int level) { int i; if (path->keep_locks) return; for (i = level; i < BTRFS_MAX_LEVEL; i++) { if (!path->nodes[i]) continue; if (!path->locks[i]) continue; btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]); path->locks[i] = 0; } } /* * Loop around taking references on and locking the root node of the tree until * we end up with a lock on the root node. * * Return: root extent buffer with write lock held */ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); btrfs_maybe_reset_lockdep_class(root, eb); btrfs_tree_lock(eb); if (eb == root->node) break; btrfs_tree_unlock(eb); free_extent_buffer(eb); } return eb; } /* * Loop around taking references on and locking the root node of the tree until * we end up with a lock on the root node. * * Return: root extent buffer with read lock held */ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); btrfs_maybe_reset_lockdep_class(root, eb); btrfs_tree_read_lock(eb); if (eb == root->node) break; btrfs_tree_read_unlock(eb); free_extent_buffer(eb); } return eb; } /* * Loop around taking references on and locking the root node of the tree in * nowait mode until we end up with a lock on the root node or returning to * avoid blocking. * * Return: root extent buffer with read lock held or -EAGAIN. */ struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); if (!btrfs_try_tree_read_lock(eb)) { free_extent_buffer(eb); return ERR_PTR(-EAGAIN); } if (eb == root->node) break; btrfs_tree_read_unlock(eb); free_extent_buffer(eb); } return eb; } /* * DREW locks * ========== * * DREW stands for double-reader-writer-exclusion lock. It's used in situation * where you want to provide A-B exclusion but not AA or BB. * * Currently implementation gives more priority to reader. If a reader and a * writer both race to acquire their respective sides of the lock the writer * would yield its lock as soon as it detects a concurrent reader. Additionally * if there are pending readers no new writers would be allowed to come in and * acquire the lock. */ void btrfs_drew_lock_init(struct btrfs_drew_lock *lock) { atomic_set(&lock->readers, 0); atomic_set(&lock->writers, 0); init_waitqueue_head(&lock->pending_readers); init_waitqueue_head(&lock->pending_writers); } /* Return true if acquisition is successful, false otherwise */ bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock) { if (atomic_read(&lock->readers)) return false; atomic_inc(&lock->writers); /* Ensure writers count is updated before we check for pending readers */ smp_mb__after_atomic(); if (atomic_read(&lock->readers)) { btrfs_drew_write_unlock(lock); return false; } return true; } void btrfs_drew_write_lock(struct btrfs_drew_lock *lock) { while (true) { if (btrfs_drew_try_write_lock(lock)) return; wait_event(lock->pending_writers, !atomic_read(&lock->readers)); } } void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock) { atomic_dec(&lock->writers); cond_wake_up(&lock->pending_readers); } void btrfs_drew_read_lock(struct btrfs_drew_lock *lock) { atomic_inc(&lock->readers); /* * Ensure the pending reader count is perceieved BEFORE this reader * goes to sleep in case of active writers. This guarantees new writers * won't be allowed and that the current reader will be woken up when * the last active writer finishes its jobs. */ smp_mb__after_atomic(); wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0); } void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock) { /* * atomic_dec_and_test implies a full barrier, so woken up writers * are guaranteed to see the decrement */ if (atomic_dec_and_test(&lock->readers)) wake_up(&lock->pending_writers); }
2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 // SPDX-License-Identifier: GPL-2.0-or-later /* * * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) */ #include <linux/errno.h> #include <linux/types.h> #include <linux/socket.h> #include <linux/in.h> #include <linux/kernel.h> #include <linux/timer.h> #include <linux/string.h> #include <linux/sockios.h> #include <linux/net.h> #include <linux/slab.h> #include <net/ax25.h> #include <linux/inet.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <net/sock.h> #include <net/tcp_states.h> #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/interrupt.h> #include <net/rose.h> static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose); /* * This routine purges all of the queues of frames. */ void rose_clear_queues(struct sock *sk) { skb_queue_purge(&sk->sk_write_queue); skb_queue_purge(&rose_sk(sk)->ack_queue); } /* * This routine purges the input queue of those frames that have been * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the * SDL diagram. */ void rose_frames_acked(struct sock *sk, unsigned short nr) { struct sk_buff *skb; struct rose_sock *rose = rose_sk(sk); /* * Remove all the ack-ed frames from the ack queue. */ if (rose->va != nr) { while (skb_peek(&rose->ack_queue) != NULL && rose->va != nr) { skb = skb_dequeue(&rose->ack_queue); kfree_skb(skb); rose->va = (rose->va + 1) % ROSE_MODULUS; } } } void rose_requeue_frames(struct sock *sk) { struct sk_buff *skb, *skb_prev = NULL; /* * Requeue all the un-ack-ed frames on the output queue to be picked * up by rose_kick. This arrangement handles the possibility of an * empty output queue. */ while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) { if (skb_prev == NULL) skb_queue_head(&sk->sk_write_queue, skb); else skb_append(skb_prev, skb, &sk->sk_write_queue); skb_prev = skb; } } /* * Validate that the value of nr is between va and vs. Return true or * false for testing. */ int rose_validate_nr(struct sock *sk, unsigned short nr) { struct rose_sock *rose = rose_sk(sk); unsigned short vc = rose->va; while (vc != rose->vs) { if (nr == vc) return 1; vc = (vc + 1) % ROSE_MODULUS; } return nr == rose->vs; } /* * This routine is called when the packet layer internally generates a * control frame. */ void rose_write_internal(struct sock *sk, int frametype) { struct rose_sock *rose = rose_sk(sk); struct sk_buff *skb; unsigned char *dptr; unsigned char lci1, lci2; int maxfaclen = 0; int len, faclen; int reserve; reserve = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1; len = ROSE_MIN_LEN; switch (frametype) { case ROSE_CALL_REQUEST: len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN; maxfaclen = 256; break; case ROSE_CALL_ACCEPTED: case ROSE_CLEAR_REQUEST: case ROSE_RESET_REQUEST: len += 2; break; } skb = alloc_skb(reserve + len + maxfaclen, GFP_ATOMIC); if (!skb) return; /* * Space for AX.25 header and PID. */ skb_reserve(skb, reserve); dptr = skb_put(skb, len); lci1 = (rose->lci >> 8) & 0x0F; lci2 = (rose->lci >> 0) & 0xFF; switch (frametype) { case ROSE_CALL_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = ROSE_CALL_REQ_ADDR_LEN_VAL; memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); dptr += ROSE_ADDR_LEN; faclen = rose_create_facilities(dptr, rose); skb_put(skb, faclen); dptr += faclen; break; case ROSE_CALL_ACCEPTED: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = 0x00; /* Address length */ *dptr++ = 0; /* Facilities length */ break; case ROSE_CLEAR_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = rose->cause; *dptr++ = rose->diagnostic; break; case ROSE_RESET_REQUEST: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; *dptr++ = ROSE_DTE_ORIGINATED; *dptr++ = 0; break; case ROSE_RR: case ROSE_RNR: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr = frametype; *dptr++ |= (rose->vr << 5) & 0xE0; break; case ROSE_CLEAR_CONFIRMATION: case ROSE_RESET_CONFIRMATION: *dptr++ = ROSE_GFI | lci1; *dptr++ = lci2; *dptr++ = frametype; break; default: printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype); kfree_skb(skb); return; } rose_transmit_link(skb, rose->neighbour); } int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) { unsigned char *frame; frame = skb->data; *ns = *nr = *q = *d = *m = 0; switch (frame[2]) { case ROSE_CALL_REQUEST: case ROSE_CALL_ACCEPTED: case ROSE_CLEAR_REQUEST: case ROSE_CLEAR_CONFIRMATION: case ROSE_RESET_REQUEST: case ROSE_RESET_CONFIRMATION: return frame[2]; default: break; } if ((frame[2] & 0x1F) == ROSE_RR || (frame[2] & 0x1F) == ROSE_RNR) { *nr = (frame[2] >> 5) & 0x07; return frame[2] & 0x1F; } if ((frame[2] & 0x01) == ROSE_DATA) { *q = (frame[0] & ROSE_Q_BIT) == ROSE_Q_BIT; *d = (frame[0] & ROSE_D_BIT) == ROSE_D_BIT; *m = (frame[2] & ROSE_M_BIT) == ROSE_M_BIT; *nr = (frame[2] >> 5) & 0x07; *ns = (frame[2] >> 1) & 0x07; return ROSE_DATA; } return ROSE_ILLEGAL; } static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char *pt; unsigned char l, lg, n = 0; int fac_national_digis_received = 0; do { switch (*p & 0xC0) { case 0x00: if (len < 2) return -1; p += 2; n += 2; len -= 2; break; case 0x40: if (len < 3) return -1; if (*p == FAC_NATIONAL_RAND) facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); p += 3; n += 3; len -= 3; break; case 0x80: if (len < 4) return -1; p += 4; n += 4; len -= 4; break; case 0xC0: if (len < 2) return -1; l = p[1]; if (len < 2 + l) return -1; if (*p == FAC_NATIONAL_DEST_DIGI) { if (!fac_national_digis_received) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); facilities->source_ndigis = 1; } } else if (*p == FAC_NATIONAL_SRC_DIGI) { if (!fac_national_digis_received) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); facilities->dest_ndigis = 1; } } else if (*p == FAC_NATIONAL_FAIL_CALL) { if (l < AX25_ADDR_LEN) return -1; memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); } else if (*p == FAC_NATIONAL_FAIL_ADD) { if (l < 1 + ROSE_ADDR_LEN) return -1; memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); } else if (*p == FAC_NATIONAL_DIGIS) { if (l % AX25_ADDR_LEN) return -1; fac_national_digis_received = 1; facilities->source_ndigis = 0; facilities->dest_ndigis = 0; for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { if (pt[6] & AX25_HBIT) { if (facilities->dest_ndigis >= ROSE_MAX_DIGIS) return -1; memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); } else { if (facilities->source_ndigis >= ROSE_MAX_DIGIS) return -1; memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); } } } p += l + 2; n += l + 2; len -= l + 2; break; } } while (*p != 0x00 && len > 0); return n; } static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len) { unsigned char l, n = 0; char callsign[11]; do { switch (*p & 0xC0) { case 0x00: if (len < 2) return -1; p += 2; n += 2; len -= 2; break; case 0x40: if (len < 3) return -1; p += 3; n += 3; len -= 3; break; case 0x80: if (len < 4) return -1; p += 4; n += 4; len -= 4; break; case 0xC0: if (len < 2) return -1; l = p[1]; /* Prevent overflows*/ if (l < 10 || l > 20) return -1; if (*p == FAC_CCITT_DEST_NSAP) { memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); callsign[l - 10] = '\0'; asc2ax(&facilities->source_call, callsign); } if (*p == FAC_CCITT_SRC_NSAP) { memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN); memcpy(callsign, p + 12, l - 10); callsign[l - 10] = '\0'; asc2ax(&facilities->dest_call, callsign); } p += l + 2; n += l + 2; len -= l + 2; break; } } while (*p != 0x00 && len > 0); return n; } int rose_parse_facilities(unsigned char *p, unsigned packet_len, struct rose_facilities_struct *facilities) { int facilities_len, len; facilities_len = *p++; if (facilities_len == 0 || (unsigned int)facilities_len > packet_len) return 0; while (facilities_len >= 3 && *p == 0x00) { facilities_len--; p++; switch (*p) { case FAC_NATIONAL: /* National */ len = rose_parse_national(p + 1, facilities, facilities_len - 1); break; case FAC_CCITT: /* CCITT */ len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); break; default: printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); len = 1; break; } if (len < 0) return 0; if (WARN_ON(len >= facilities_len)) return 0; facilities_len -= len + 1; p += len + 1; } return facilities_len == 0; } static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) { unsigned char *p = buffer + 1; char *callsign; char buf[11]; int len, nb; /* National Facilities */ if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { *p++ = 0x00; *p++ = FAC_NATIONAL; if (rose->rand != 0) { *p++ = FAC_NATIONAL_RAND; *p++ = (rose->rand >> 8) & 0xFF; *p++ = (rose->rand >> 0) & 0xFF; } /* Sent before older facilities */ if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) { int maxdigi = 0; *p++ = FAC_NATIONAL_DIGIS; *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis); for (nb = 0 ; nb < rose->source_ndigis ; nb++) { if (++maxdigi >= ROSE_MAX_DIGIS) break; memcpy(p, &rose->source_digis[nb], AX25_ADDR_LEN); p[6] |= AX25_HBIT; p += AX25_ADDR_LEN; } for (nb = 0 ; nb < rose->dest_ndigis ; nb++) { if (++maxdigi >= ROSE_MAX_DIGIS) break; memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN); p[6] &= ~AX25_HBIT; p += AX25_ADDR_LEN; } } /* For compatibility */ if (rose->source_ndigis > 0) { *p++ = FAC_NATIONAL_SRC_DIGI; *p++ = AX25_ADDR_LEN; memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } /* For compatibility */ if (rose->dest_ndigis > 0) { *p++ = FAC_NATIONAL_DEST_DIGI; *p++ = AX25_ADDR_LEN; memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN); p += AX25_ADDR_LEN; } } *p++ = 0x00; *p++ = FAC_CCITT; *p++ = FAC_CCITT_DEST_NSAP; callsign = ax2asc(buf, &rose->dest_call); *p++ = strlen(callsign) + 10; *p++ = (strlen(callsign) + 9) * 2; /* ??? */ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; *p++ = ROSE_ADDR_LEN * 2; memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN); p += ROSE_ADDR_LEN; memcpy(p, callsign, strlen(callsign)); p += strlen(callsign); *p++ = FAC_CCITT_SRC_NSAP; callsign = ax2asc(buf, &rose->source_call); *p++ = strlen(callsign) + 10; *p++ = (strlen(callsign) + 9) * 2; /* ??? */ *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; *p++ = ROSE_ADDR_LEN * 2; memcpy(p, &rose->source_addr, ROSE_ADDR_LEN); p += ROSE_ADDR_LEN; memcpy(p, callsign, strlen(callsign)); p += strlen(callsign); len = p - buffer; buffer[0] = len - 1; return len; } void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic) { struct rose_sock *rose = rose_sk(sk); rose_stop_timer(sk); rose_stop_idletimer(sk); rose_clear_queues(sk); rose->lci = 0; rose->state = ROSE_STATE_0; if (cause != -1) rose->cause = cause; if (diagnostic != -1) rose->diagnostic = diagnostic; sk->sk_state = TCP_CLOSE; sk->sk_err = reason; sk->sk_shutdown |= SEND_SHUTDOWN; if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); sock_set_flag(sk, SOCK_DEAD); } }
34 34 34 34 34 11 11 1 10 10 10 1 8 9 1 10 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (C) International Business Machines Corp., 2000-2004 */ #include <linux/fs.h> #include <linux/quotaops.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" #include "jfs_imap.h" #include "jfs_dinode.h" #include "jfs_debug.h" void jfs_set_inode_flags(struct inode *inode) { unsigned int flags = JFS_IP(inode)->mode2; unsigned int new_fl = 0; if (flags & JFS_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & JFS_APPEND_FL) new_fl |= S_APPEND; if (flags & JFS_NOATIME_FL) new_fl |= S_NOATIME; if (flags & JFS_DIRSYNC_FL) new_fl |= S_DIRSYNC; if (flags & JFS_SYNC_FL) new_fl |= S_SYNC; inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME | S_DIRSYNC | S_SYNC); } /* * NAME: ialloc() * * FUNCTION: Allocate a new inode * */ struct inode *ialloc(struct inode *parent, umode_t mode) { struct super_block *sb = parent->i_sb; struct inode *inode; struct jfs_inode_info *jfs_inode; int rc; inode = new_inode(sb); if (!inode) { jfs_warn("ialloc: new_inode returned NULL!"); return ERR_PTR(-ENOMEM); } jfs_inode = JFS_IP(inode); rc = diAlloc(parent, S_ISDIR(mode), inode); if (rc) { jfs_warn("ialloc: diAlloc returned %d!", rc); goto fail_put; } if (insert_inode_locked(inode) < 0) { rc = -EINVAL; goto fail_put; } inode_init_owner(&nop_mnt_idmap, inode, parent, mode); /* * New inodes need to save sane values on disk when * uid & gid mount options are used */ jfs_inode->saved_uid = inode->i_uid; jfs_inode->saved_gid = inode->i_gid; /* * Allocate inode to quota. */ rc = dquot_initialize(inode); if (rc) goto fail_drop; rc = dquot_alloc_inode(inode); if (rc) goto fail_drop; /* inherit flags from parent */ jfs_inode->mode2 = JFS_IP(parent)->mode2 & JFS_FL_INHERIT; if (S_ISDIR(mode)) { jfs_inode->mode2 |= IDIRECTORY; jfs_inode->mode2 &= ~JFS_DIRSYNC_FL; } else { jfs_inode->mode2 |= INLINEEA | ISPARSE; if (S_ISLNK(mode)) jfs_inode->mode2 &= ~(JFS_IMMUTABLE_FL|JFS_APPEND_FL); } jfs_inode->mode2 |= inode->i_mode; inode->i_blocks = 0; simple_inode_init_ts(inode); jfs_inode->otime = inode_get_ctime_sec(inode); inode->i_generation = JFS_SBI(sb)->gengen++; jfs_inode->cflag = 0; /* Zero remaining fields */ memset(&jfs_inode->acl, 0, sizeof(dxd_t)); memset(&jfs_inode->ea, 0, sizeof(dxd_t)); jfs_inode->next_index = 0; jfs_inode->acltype = 0; jfs_inode->btorder = 0; jfs_inode->btindex = 0; jfs_inode->bxflag = 0; jfs_inode->blid = 0; jfs_inode->atlhead = 0; jfs_inode->atltail = 0; jfs_inode->xtlid = 0; jfs_set_inode_flags(inode); jfs_info("ialloc returns inode = 0x%p", inode); return inode; fail_drop: dquot_drop(inode); inode->i_flags |= S_NOQUOTA; clear_nlink(inode); discard_new_inode(inode); return ERR_PTR(rc); fail_put: iput(inode); return ERR_PTR(rc); }
15 41 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_HWEIGHT_H #define _ASM_X86_HWEIGHT_H #include <asm/cpufeatures.h> #ifdef CONFIG_64BIT #define REG_IN "D" #define REG_OUT "a" #else #define REG_IN "a" #define REG_OUT "a" #endif static __always_inline unsigned int __arch_hweight32(unsigned int w) { unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); return res; } static inline unsigned int __arch_hweight16(unsigned int w) { return __arch_hweight32(w & 0xffff); } static inline unsigned int __arch_hweight8(unsigned int w) { return __arch_hweight32(w & 0xff); } #ifdef CONFIG_X86_32 static inline unsigned long __arch_hweight64(__u64 w) { return __arch_hweight32((u32)w) + __arch_hweight32((u32)(w >> 32)); } #else static __always_inline unsigned long __arch_hweight64(__u64 w) { unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT) : "="REG_OUT (res) : REG_IN (w)); return res; } #endif /* CONFIG_X86_32 */ #endif
15 57 28 14 82 81 21 12 35 1 3 3 1 76 8 8 75 74 37 36 38 1 80 86 40 35 48 47 9 132 133 56 55 6 6 6 6 7 7 4 4 18 19 4 7 7 4 4 4 7 19 19 4 19 5 5 5 6 6 6 11 11 5 1 5 3 1 6 1 11 1 1 1 1 1 1 16 16 16 6 6 81 1 1 1 1 6 41 2 67 10 1 1 1 32 32 12 12 12 12 12 12 11 10 1 1 1 1 32 30 3 3 3 3 3 3 2 2 3 32 8 31 23 11 11 11 9 9 9 2 2 9 9 9 2 2 25 1 32 39 39 39 39 39 3 3 39 5 39 28 39 37 37 37 37 37 37 36 58 54 53 54 54 19 9 9 56 19 19 5 3 19 19 19 19 19 26 39 5 5 34 39 39 40 3 3 3 43 11 5 5 5 13 24 9 4 6 19 1 4 18 18 14 13 17 13 6 2 2 1 11 11 11 6 6 5 4 4 11 11 11 10 31 31 31 41 41 41 13 13 30 31 101 101 100 101 68 5 4 64 63 61 62 62 34 62 68 83 83 83 86 86 1 79 1 78 6 1 5 4 1 83 7 75 47 47 10 83 26 10 25 157 157 155 154 151 144 144 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 // SPDX-License-Identifier: GPL-2.0-only /* * xfrm_state.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * YOSHIFUJI Hideaki @USAGI * Split up af-specific functions * Derek Atkins <derek@ihtfp.com> * Add UDP Encapsulation * */ #include <linux/compat.h> #include <linux/workqueue.h> #include <net/xfrm.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/audit.h> #include <linux/uaccess.h> #include <linux/ktime.h> #include <linux/slab.h> #include <linux/interrupt.h> #include <linux/kernel.h> #include <crypto/aead.h> #include "xfrm_hash.h" #define xfrm_state_deref_prot(table, net) \ rcu_dereference_protected((table), lockdep_is_held(&(net)->xfrm.xfrm_state_lock)) static void xfrm_state_gc_task(struct work_struct *work); /* Each xfrm_state may be linked to two tables: 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) 2. Hash table by (daddr,family,reqid) to find what SAs exist for given destination/tunnel endpoint. (output) */ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024; static struct kmem_cache *xfrm_state_cache __ro_after_init; static DECLARE_WORK(xfrm_state_gc_work, xfrm_state_gc_task); static HLIST_HEAD(xfrm_state_gc_list); static inline bool xfrm_state_hold_rcu(struct xfrm_state __rcu *x) { return refcount_inc_not_zero(&x->refcnt); } static inline unsigned int xfrm_dst_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u32 reqid, unsigned short family) { return __xfrm_dst_hash(daddr, saddr, reqid, family, net->xfrm.state_hmask); } static inline unsigned int xfrm_src_hash(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { return __xfrm_src_hash(daddr, saddr, family, net->xfrm.state_hmask); } static inline unsigned int xfrm_spi_hash(struct net *net, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask); } static unsigned int xfrm_seq_hash(struct net *net, u32 seq) { return __xfrm_seq_hash(seq, net->xfrm.state_hmask); } #define XFRM_STATE_INSERT(by, _n, _h, _type) \ { \ struct xfrm_state *_x = NULL; \ \ if (_type != XFRM_DEV_OFFLOAD_PACKET) { \ hlist_for_each_entry_rcu(_x, _h, by) { \ if (_x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ continue; \ break; \ } \ } \ \ if (!_x || _x->xso.type == XFRM_DEV_OFFLOAD_PACKET) \ /* SAD is empty or consist from HW SAs only */ \ hlist_add_head_rcu(_n, _h); \ else \ hlist_add_before_rcu(_n, &_x->by); \ } static void xfrm_hash_transfer(struct hlist_head *list, struct hlist_head *ndsttable, struct hlist_head *nsrctable, struct hlist_head *nspitable, struct hlist_head *nseqtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_state *x; hlist_for_each_entry_safe(x, tmp, list, bydst) { unsigned int h; h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family, nhashmask); XFRM_STATE_INSERT(bydst, &x->bydst, ndsttable + h, x->xso.type); h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family, nhashmask); XFRM_STATE_INSERT(bysrc, &x->bysrc, nsrctable + h, x->xso.type); if (x->id.spi) { h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family, nhashmask); XFRM_STATE_INSERT(byspi, &x->byspi, nspitable + h, x->xso.type); } if (x->km.seq) { h = __xfrm_seq_hash(x->km.seq, nhashmask); XFRM_STATE_INSERT(byseq, &x->byseq, nseqtable + h, x->xso.type); } } } static unsigned long xfrm_hash_new_size(unsigned int state_hmask) { return ((state_hmask + 1) << 1) * sizeof(struct hlist_head); } static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.state_hash_work); struct hlist_head *ndst, *nsrc, *nspi, *nseq, *odst, *osrc, *ospi, *oseq; unsigned long nsize, osize; unsigned int nhashmask, ohashmask; int i; nsize = xfrm_hash_new_size(net->xfrm.state_hmask); ndst = xfrm_hash_alloc(nsize); if (!ndst) return; nsrc = xfrm_hash_alloc(nsize); if (!nsrc) { xfrm_hash_free(ndst, nsize); return; } nspi = xfrm_hash_alloc(nsize); if (!nspi) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); return; } nseq = xfrm_hash_alloc(nsize); if (!nseq) { xfrm_hash_free(ndst, nsize); xfrm_hash_free(nsrc, nsize); xfrm_hash_free(nspi, nsize); return; } spin_lock_bh(&net->xfrm.xfrm_state_lock); write_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); nhashmask = (nsize / sizeof(struct hlist_head)) - 1U; odst = xfrm_state_deref_prot(net->xfrm.state_bydst, net); for (i = net->xfrm.state_hmask; i >= 0; i--) xfrm_hash_transfer(odst + i, ndst, nsrc, nspi, nseq, nhashmask); osrc = xfrm_state_deref_prot(net->xfrm.state_bysrc, net); ospi = xfrm_state_deref_prot(net->xfrm.state_byspi, net); oseq = xfrm_state_deref_prot(net->xfrm.state_byseq, net); ohashmask = net->xfrm.state_hmask; rcu_assign_pointer(net->xfrm.state_bydst, ndst); rcu_assign_pointer(net->xfrm.state_bysrc, nsrc); rcu_assign_pointer(net->xfrm.state_byspi, nspi); rcu_assign_pointer(net->xfrm.state_byseq, nseq); net->xfrm.state_hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_state_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_state_lock); osize = (ohashmask + 1) * sizeof(struct hlist_head); synchronize_rcu(); xfrm_hash_free(odst, osize); xfrm_hash_free(osrc, osize); xfrm_hash_free(ospi, osize); xfrm_hash_free(oseq, osize); } static DEFINE_SPINLOCK(xfrm_state_afinfo_lock); static struct xfrm_state_afinfo __rcu *xfrm_state_afinfo[NPROTO]; static DEFINE_SPINLOCK(xfrm_state_gc_lock); int __xfrm_state_delete(struct xfrm_state *x); int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); static bool km_is_alive(const struct km_event *c); void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int xfrm_register_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); int err = 0; if (!afinfo) return -EAFNOSUPPORT; #define X(afi, T, name) do { \ WARN_ON((afi)->type_ ## name); \ (afi)->type_ ## name = (T); \ } while (0) switch (type->proto) { case IPPROTO_COMP: X(afinfo, type, comp); break; case IPPROTO_AH: X(afinfo, type, ah); break; case IPPROTO_ESP: X(afinfo, type, esp); break; case IPPROTO_IPIP: X(afinfo, type, ipip); break; case IPPROTO_DSTOPTS: X(afinfo, type, dstopts); break; case IPPROTO_ROUTING: X(afinfo, type, routing); break; case IPPROTO_IPV6: X(afinfo, type, ipip6); break; default: WARN_ON(1); err = -EPROTONOSUPPORT; break; } #undef X rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type); void xfrm_unregister_type(const struct xfrm_type *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return; #define X(afi, T, name) do { \ WARN_ON((afi)->type_ ## name != (T)); \ (afi)->type_ ## name = NULL; \ } while (0) switch (type->proto) { case IPPROTO_COMP: X(afinfo, type, comp); break; case IPPROTO_AH: X(afinfo, type, ah); break; case IPPROTO_ESP: X(afinfo, type, esp); break; case IPPROTO_IPIP: X(afinfo, type, ipip); break; case IPPROTO_DSTOPTS: X(afinfo, type, dstopts); break; case IPPROTO_ROUTING: X(afinfo, type, routing); break; case IPPROTO_IPV6: X(afinfo, type, ipip6); break; default: WARN_ON(1); break; } #undef X rcu_read_unlock(); } EXPORT_SYMBOL(xfrm_unregister_type); static const struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { const struct xfrm_type *type = NULL; struct xfrm_state_afinfo *afinfo; int modload_attempted = 0; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; switch (proto) { case IPPROTO_COMP: type = afinfo->type_comp; break; case IPPROTO_AH: type = afinfo->type_ah; break; case IPPROTO_ESP: type = afinfo->type_esp; break; case IPPROTO_IPIP: type = afinfo->type_ipip; break; case IPPROTO_DSTOPTS: type = afinfo->type_dstopts; break; case IPPROTO_ROUTING: type = afinfo->type_routing; break; case IPPROTO_IPV6: type = afinfo->type_ipip6; break; default: break; } if (unlikely(type && !try_module_get(type->owner))) type = NULL; rcu_read_unlock(); if (!type && !modload_attempted) { request_module("xfrm-type-%d-%d", family, proto); modload_attempted = 1; goto retry; } return type; } static void xfrm_put_type(const struct xfrm_type *type) { module_put(type->owner); } int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; switch (type->proto) { case IPPROTO_ESP: WARN_ON(afinfo->type_offload_esp); afinfo->type_offload_esp = type; break; default: WARN_ON(1); err = -EPROTONOSUPPORT; break; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(xfrm_register_type_offload); void xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family) { struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return; switch (type->proto) { case IPPROTO_ESP: WARN_ON(afinfo->type_offload_esp != type); afinfo->type_offload_esp = NULL; break; default: WARN_ON(1); break; } rcu_read_unlock(); } EXPORT_SYMBOL(xfrm_unregister_type_offload); static const struct xfrm_type_offload * xfrm_get_type_offload(u8 proto, unsigned short family, bool try_load) { const struct xfrm_type_offload *type = NULL; struct xfrm_state_afinfo *afinfo; retry: afinfo = xfrm_state_get_afinfo(family); if (unlikely(afinfo == NULL)) return NULL; switch (proto) { case IPPROTO_ESP: type = afinfo->type_offload_esp; break; default: break; } if ((type && !try_module_get(type->owner))) type = NULL; rcu_read_unlock(); if (!type && try_load) { request_module("xfrm-offload-%d-%d", family, proto); try_load = false; goto retry; } return type; } static void xfrm_put_type_offload(const struct xfrm_type_offload *type) { module_put(type->owner); } static const struct xfrm_mode xfrm4_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, [XFRM_MODE_TRANSPORT] = { .encap = XFRM_MODE_TRANSPORT, .family = AF_INET, }, [XFRM_MODE_TUNNEL] = { .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET, }, }; static const struct xfrm_mode xfrm6_mode_map[XFRM_MODE_MAX] = { [XFRM_MODE_BEET] = { .encap = XFRM_MODE_BEET, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, [XFRM_MODE_ROUTEOPTIMIZATION] = { .encap = XFRM_MODE_ROUTEOPTIMIZATION, .family = AF_INET6, }, [XFRM_MODE_TRANSPORT] = { .encap = XFRM_MODE_TRANSPORT, .family = AF_INET6, }, [XFRM_MODE_TUNNEL] = { .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, .family = AF_INET6, }, }; static const struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) { const struct xfrm_mode *mode; if (unlikely(encap >= XFRM_MODE_MAX)) return NULL; switch (family) { case AF_INET: mode = &xfrm4_mode_map[encap]; if (mode->family == family) return mode; break; case AF_INET6: mode = &xfrm6_mode_map[encap]; if (mode->family == family) return mode; break; default: break; } return NULL; } void xfrm_state_free(struct xfrm_state *x) { kmem_cache_free(xfrm_state_cache, x); } EXPORT_SYMBOL(xfrm_state_free); static void ___xfrm_state_destroy(struct xfrm_state *x) { hrtimer_cancel(&x->mtimer); del_timer_sync(&x->rtimer); kfree(x->aead); kfree(x->aalg); kfree(x->ealg); kfree(x->calg); kfree(x->encap); kfree(x->coaddr); kfree(x->replay_esn); kfree(x->preplay_esn); if (x->type_offload) xfrm_put_type_offload(x->type_offload); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); } if (x->xfrag.page) put_page(x->xfrag.page); xfrm_dev_state_free(x); security_xfrm_state_free(x); xfrm_state_free(x); } static void xfrm_state_gc_task(struct work_struct *work) { struct xfrm_state *x; struct hlist_node *tmp; struct hlist_head gc_list; spin_lock_bh(&xfrm_state_gc_lock); hlist_move_list(&xfrm_state_gc_list, &gc_list); spin_unlock_bh(&xfrm_state_gc_lock); synchronize_rcu(); hlist_for_each_entry_safe(x, tmp, &gc_list, gclist) ___xfrm_state_destroy(x); } static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) { struct xfrm_state *x = container_of(me, struct xfrm_state, mtimer); enum hrtimer_restart ret = HRTIMER_NORESTART; time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int err = 0; spin_lock(&x->lock); xfrm_dev_state_update_curlft(x); if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) goto expired; if (x->lft.hard_add_expires_seconds) { time64_t tmo = x->lft.hard_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { if (x->xflags & XFRM_SOFT_EXPIRE) { /* enter hard expire without soft expire first?! * setting a new date could trigger this. * workaround: fix x->curflt.add_time by below: */ x->curlft.add_time = now - x->saved_tmo - 1; tmo = x->lft.hard_add_expires_seconds - x->saved_tmo; } else goto expired; } if (tmo < next) next = tmo; } if (x->lft.hard_use_expires_seconds) { time64_t tmo = x->lft.hard_use_expires_seconds + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (x->km.dying) goto resched; if (x->lft.soft_add_expires_seconds) { time64_t tmo = x->lft.soft_add_expires_seconds + x->curlft.add_time - now; if (tmo <= 0) { warn = 1; x->xflags &= ~XFRM_SOFT_EXPIRE; } else if (tmo < next) { next = tmo; x->xflags |= XFRM_SOFT_EXPIRE; x->saved_tmo = tmo; } } if (x->lft.soft_use_expires_seconds) { time64_t tmo = x->lft.soft_use_expires_seconds + (READ_ONCE(x->curlft.use_time) ? : now) - now; if (tmo <= 0) warn = 1; else if (tmo < next) next = tmo; } x->km.dying = warn; if (warn) km_state_expired(x, 0, 0); resched: if (next != TIME64_MAX) { hrtimer_forward_now(&x->mtimer, ktime_set(next, 0)); ret = HRTIMER_RESTART; } goto out; expired: if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) x->km.state = XFRM_STATE_EXPIRED; err = __xfrm_state_delete(x); if (!err) km_state_expired(x, 1, 0); xfrm_audit_state_delete(x, err ? 0 : 1, true); out: spin_unlock(&x->lock); return ret; } static void xfrm_replay_timer_handler(struct timer_list *t); struct xfrm_state *xfrm_state_alloc(struct net *net) { struct xfrm_state *x; x = kmem_cache_zalloc(xfrm_state_cache, GFP_ATOMIC); if (x) { write_pnet(&x->xs_net, net); refcount_set(&x->refcnt, 1); atomic_set(&x->tunnel_users, 0); INIT_LIST_HEAD(&x->km.all); INIT_HLIST_NODE(&x->bydst); INIT_HLIST_NODE(&x->bysrc); INIT_HLIST_NODE(&x->byspi); INIT_HLIST_NODE(&x->byseq); hrtimer_init(&x->mtimer, CLOCK_BOOTTIME, HRTIMER_MODE_ABS_SOFT); x->mtimer.function = xfrm_timer_handler; timer_setup(&x->rtimer, xfrm_replay_timer_handler, 0); x->curlft.add_time = ktime_get_real_seconds(); x->lft.soft_byte_limit = XFRM_INF; x->lft.soft_packet_limit = XFRM_INF; x->lft.hard_byte_limit = XFRM_INF; x->lft.hard_packet_limit = XFRM_INF; x->replay_maxage = 0; x->replay_maxdiff = 0; spin_lock_init(&x->lock); } return x; } EXPORT_SYMBOL(xfrm_state_alloc); void __xfrm_state_destroy(struct xfrm_state *x, bool sync) { WARN_ON(x->km.state != XFRM_STATE_DEAD); if (sync) { synchronize_rcu(); ___xfrm_state_destroy(x); } else { spin_lock_bh(&xfrm_state_gc_lock); hlist_add_head(&x->gclist, &xfrm_state_gc_list); spin_unlock_bh(&xfrm_state_gc_lock); schedule_work(&xfrm_state_gc_work); } } EXPORT_SYMBOL(__xfrm_state_destroy); int __xfrm_state_delete(struct xfrm_state *x) { struct net *net = xs_net(x); int err = -ESRCH; if (x->km.state != XFRM_STATE_DEAD) { x->km.state = XFRM_STATE_DEAD; spin_lock(&net->xfrm.xfrm_state_lock); list_del(&x->km.all); hlist_del_rcu(&x->bydst); hlist_del_rcu(&x->bysrc); if (x->km.seq) hlist_del_rcu(&x->byseq); if (x->id.spi) hlist_del_rcu(&x->byspi); net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); if (x->encap_sk) sock_put(rcu_dereference_raw(x->encap_sk)); xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. */ xfrm_state_put(x); err = 0; } return err; } EXPORT_SYMBOL(__xfrm_state_delete); int xfrm_state_delete(struct xfrm_state *x) { int err; spin_lock_bh(&x->lock); err = __xfrm_state_delete(x); spin_unlock_bh(&x->lock); return err; } EXPORT_SYMBOL(xfrm_state_delete); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (xfrm_id_proto_match(x->id.proto, proto) && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); return err; } } } return err; } static inline int xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { int i, err = 0; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; struct xfrm_dev_offload *xso; hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; if (xso->dev == dev && (err = security_xfrm_state_delete(x)) != 0) { xfrm_audit_state_delete(x, 0, task_valid); return err; } } } return err; } #else static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } static inline int xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) { return 0; } #endif int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync) { int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_flush_secctx_check(net, proto, task_valid); if (err) goto out; err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; restart: hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { if (!xfrm_state_kern(x) && xfrm_id_proto_match(x->id.proto, proto)) { xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); if (sync) xfrm_state_put_sync(x); else xfrm_state_put(x); if (!err) cnt++; spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } } out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (cnt) err = 0; return err; } EXPORT_SYMBOL(xfrm_state_flush); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) { int i, err = 0, cnt = 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid); if (err) goto out; err = -ESRCH; for (i = 0; i <= net->xfrm.state_hmask; i++) { struct xfrm_state *x; struct xfrm_dev_offload *xso; restart: hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { xso = &x->xso; if (!xfrm_state_kern(x) && xso->dev == dev) { xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = xfrm_state_delete(x); xfrm_audit_state_delete(x, err ? 0 : 1, task_valid); xfrm_state_put(x); if (!err) cnt++; spin_lock_bh(&net->xfrm.xfrm_state_lock); goto restart; } } } if (cnt) err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_dev_state_flush); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); si->sadcnt = net->xfrm.state_num; si->sadhcnt = net->xfrm.state_hmask + 1; si->sadhmcnt = xfrm_state_hashmax; spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_sad_getinfo); static void __xfrm4_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; sel->daddr.a4 = fl4->daddr; sel->saddr.a4 = fl4->saddr; sel->dport = xfrm_flowi_dport(fl, &fl4->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl4->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET; sel->prefixlen_d = 32; sel->prefixlen_s = 32; sel->proto = fl4->flowi4_proto; sel->ifindex = fl4->flowi4_oif; } static void __xfrm6_init_tempsel(struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; /* Initialize temporary selector matching only to current session. */ *(struct in6_addr *)&sel->daddr = fl6->daddr; *(struct in6_addr *)&sel->saddr = fl6->saddr; sel->dport = xfrm_flowi_dport(fl, &fl6->uli); sel->dport_mask = htons(0xffff); sel->sport = xfrm_flowi_sport(fl, &fl6->uli); sel->sport_mask = htons(0xffff); sel->family = AF_INET6; sel->prefixlen_d = 128; sel->prefixlen_s = 128; sel->proto = fl6->flowi6_proto; sel->ifindex = fl6->flowi6_oif; } static void xfrm_init_tempstate(struct xfrm_state *x, const struct flowi *fl, const struct xfrm_tmpl *tmpl, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family) { switch (family) { case AF_INET: __xfrm4_init_tempsel(&x->sel, fl); break; case AF_INET6: __xfrm6_init_tempsel(&x->sel, fl); break; } x->id = tmpl->id; switch (tmpl->encap_family) { case AF_INET: if (x->id.daddr.a4 == 0) x->id.daddr.a4 = daddr->a4; x->props.saddr = tmpl->saddr; if (x->props.saddr.a4 == 0) x->props.saddr.a4 = saddr->a4; break; case AF_INET6: if (ipv6_addr_any((struct in6_addr *)&x->id.daddr)) memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); if (ipv6_addr_any((struct in6_addr *)&x->props.saddr)) memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); break; } x->props.mode = tmpl->mode; x->props.reqid = tmpl->reqid; x->props.family = tmpl->encap_family; } static struct xfrm_state *__xfrm_state_lookup_all(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family, struct xfrm_dev_offload *xdo) { unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { #ifdef CONFIG_XFRM_OFFLOAD if (xdo->type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (xdo->dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static struct xfrm_state *__xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { unsigned int h = xfrm_spi_hash(net, daddr, spi, proto, family); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_byspi + h, byspi) { if (x->props.family != family || x->id.spi != spi || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static struct xfrm_state *__xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { unsigned int h = xfrm_src_hash(net, daddr, saddr, family); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_bysrc + h, bysrc) { if (x->props.family != family || x->id.proto != proto || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; if ((mark & x->mark.m) != x->mark.v) continue; if (!xfrm_state_hold_rcu(x)) continue; return x; } return NULL; } static inline struct xfrm_state * __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family) { struct net *net = xs_net(x); u32 mark = x->mark.v & x->mark.m; if (use_spi) return __xfrm_state_lookup(net, mark, &x->id.daddr, x->id.spi, x->id.proto, family); else return __xfrm_state_lookup_byaddr(net, mark, &x->id.daddr, &x->props.saddr, x->id.proto, family); } static void xfrm_hash_grow_check(struct net *net, int have_hash_collision) { if (have_hash_collision && (net->xfrm.state_hmask + 1) < xfrm_state_hashmax && net->xfrm.state_num > net->xfrm.state_hmask) schedule_work(&net->xfrm.state_hash_work); } static void xfrm_state_look_at(struct xfrm_policy *pol, struct xfrm_state *x, const struct flowi *fl, unsigned short family, struct xfrm_state **best, int *acq_in_progress, int *error) { /* Resolution logic: * 1. There is a valid state with matching selector. Done. * 2. Valid state with inappropriate selector. Skip. * * Entering area of "sysdeps". * * 3. If state is not valid, selector is temporary, it selects * only session which triggered previous resolution. Key * manager will do something to install a state with proper * selector. */ if (x->km.state == XFRM_STATE_VALID) { if ((x->sel.family && (x->sel.family != family || !xfrm_selector_match(&x->sel, fl, family))) || !security_xfrm_state_pol_flow_match(x, pol, &fl->u.__fl_common)) return; if (!*best || (*best)->km.dying > x->km.dying || ((*best)->km.dying == x->km.dying && (*best)->curlft.add_time < x->curlft.add_time)) *best = x; } else if (x->km.state == XFRM_STATE_ACQ) { *acq_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { if ((!x->sel.family || (x->sel.family == family && xfrm_selector_match(&x->sel, fl, family))) && security_xfrm_state_pol_flow_match(x, pol, &fl->u.__fl_common)) *error = -ESRCH; } } struct xfrm_state * xfrm_state_find(const xfrm_address_t *daddr, const xfrm_address_t *saddr, const struct flowi *fl, struct xfrm_tmpl *tmpl, struct xfrm_policy *pol, int *err, unsigned short family, u32 if_id) { static xfrm_address_t saddr_wildcard = { }; struct net *net = xp_net(pol); unsigned int h, h_wildcard; struct xfrm_state *x, *x0, *to_put; int acquire_in_progress = 0; int error = 0; struct xfrm_state *best = NULL; u32 mark = pol->mark.v & pol->mark.m; unsigned short encap_family = tmpl->encap_family; unsigned int sequence; struct km_event c; to_put = NULL; sequence = read_seqcount_begin(&net->xfrm.xfrm_state_hash_generation); rcu_read_lock(); h = xfrm_dst_hash(net, daddr, saddr, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (pol->xdo.dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } if (best || acquire_in_progress) goto found; h_wildcard = xfrm_dst_hash(net, daddr, &saddr_wildcard, tmpl->reqid, encap_family); hlist_for_each_entry_rcu(x, net->xfrm.state_bydst + h_wildcard, bydst) { #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) /* HW states are in the head of list, there is * no need to iterate further. */ break; /* Packet offload: both policy and SA should * have same device. */ if (pol->xdo.dev != x->xso.dev) continue; } else if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) /* Skip HW policy for SW lookups */ continue; #endif if (x->props.family == encap_family && x->props.reqid == tmpl->reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_addr_equal(&x->id.daddr, daddr, encap_family) && tmpl->mode == x->props.mode && tmpl->id.proto == x->id.proto && (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) xfrm_state_look_at(pol, x, fl, family, &best, &acquire_in_progress, &error); } found: x = best; if (!x && !error && !acquire_in_progress) { if (tmpl->id.spi && (x0 = __xfrm_state_lookup_all(net, mark, daddr, tmpl->id.spi, tmpl->id.proto, encap_family, &pol->xdo)) != NULL) { to_put = x0; error = -EEXIST; goto out; } c.net = net; /* If the KMs have no listeners (yet...), avoid allocating an SA * for each and every packet - garbage collection might not * handle the flood. */ if (!km_is_alive(&c)) { error = -ESRCH; goto out; } x = xfrm_state_alloc(net); if (x == NULL) { error = -ENOMEM; goto out; } /* Initialize temporary state matching only * to current session. */ xfrm_init_tempstate(x, fl, tmpl, daddr, saddr, family); memcpy(&x->mark, &pol->mark, sizeof(x->mark)); x->if_id = if_id; error = security_xfrm_state_alloc_acquire(x, pol->security, fl->flowi_secid); if (error) { x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; goto out; } #ifdef CONFIG_XFRM_OFFLOAD if (pol->xdo.type == XFRM_DEV_OFFLOAD_PACKET) { struct xfrm_dev_offload *xdo = &pol->xdo; struct xfrm_dev_offload *xso = &x->xso; xso->type = XFRM_DEV_OFFLOAD_PACKET; xso->dir = xdo->dir; xso->dev = xdo->dev; xso->real_dev = xdo->real_dev; xso->flags = XFRM_DEV_OFFLOAD_FLAG_ACQ; netdev_tracker_alloc(xso->dev, &xso->dev_tracker, GFP_ATOMIC); error = xso->dev->xfrmdev_ops->xdo_dev_state_add(x, NULL); if (error) { xso->dir = 0; netdev_put(xso->dev, &xso->dev_tracker); xso->dev = NULL; xso->real_dev = NULL; xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; goto out; } } #endif if (km_query(x, tmpl, pol) == 0) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->km.state = XFRM_STATE_ACQ; list_add(&x->km.all, &net->xfrm.state_all); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, encap_family); XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, encap_family); XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, x->xso.type); } x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } else { #ifdef CONFIG_XFRM_OFFLOAD struct xfrm_dev_offload *xso = &x->xso; if (xso->type == XFRM_DEV_OFFLOAD_PACKET) { xfrm_dev_state_delete(x); xfrm_dev_state_free(x); } #endif x->km.state = XFRM_STATE_DEAD; to_put = x; x = NULL; error = -ESRCH; } } out: if (x) { if (!xfrm_state_hold_rcu(x)) { *err = -EAGAIN; x = NULL; } } else { *err = acquire_in_progress ? -EAGAIN : error; } rcu_read_unlock(); if (to_put) xfrm_state_put(to_put); if (read_seqcount_retry(&net->xfrm.xfrm_state_hash_generation, sequence)) { *err = -EAGAIN; if (x) { xfrm_state_put(x); x = NULL; } } return x; } struct xfrm_state * xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, xfrm_address_t *daddr, xfrm_address_t *saddr, unsigned short family, u8 mode, u8 proto, u32 reqid) { unsigned int h; struct xfrm_state *rx = NULL, *x = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); h = xfrm_dst_hash(net, daddr, saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && (mark & x->mark.m) == x->mark.v && x->if_id == if_id && !(x->props.flags & XFRM_STATE_WILDRECV) && xfrm_state_addr_check(x, daddr, saddr, family) && mode == x->props.mode && proto == x->id.proto && x->km.state == XFRM_STATE_VALID) { rx = x; break; } } if (rx) xfrm_state_hold(rx); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return rx; } EXPORT_SYMBOL(xfrm_stateonly_find); struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family) { struct xfrm_state *x; struct xfrm_state_walk *w; spin_lock_bh(&net->xfrm.xfrm_state_lock); list_for_each_entry(w, &net->xfrm.state_all, all) { x = container_of(w, struct xfrm_state, km); if (x->props.family != family || x->id.spi != spi) continue; xfrm_state_hold(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } spin_unlock_bh(&net->xfrm.xfrm_state_lock); return NULL; } EXPORT_SYMBOL(xfrm_state_lookup_byspi); static void __xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); unsigned int h; list_add(&x->km.all, &net->xfrm.state_all); h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); h = xfrm_src_hash(net, &x->id.daddr, &x->props.saddr, x->props.family); XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); if (x->id.spi) { h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); } if (x->km.seq) { h = xfrm_seq_hash(net, x->km.seq); XFRM_STATE_INSERT(byseq, &x->byseq, net->xfrm.state_byseq + h, x->xso.type); } hrtimer_start(&x->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (x->replay_maxage) mod_timer(&x->rtimer, jiffies + x->replay_maxage); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); } /* net->xfrm.xfrm_state_lock is held */ static void __xfrm_state_bump_genids(struct xfrm_state *xnew) { struct net *net = xs_net(xnew); unsigned short family = xnew->props.family; u32 reqid = xnew->props.reqid; struct xfrm_state *x; unsigned int h; u32 mark = xnew->mark.v & xnew->mark.m; u32 if_id = xnew->if_id; h = xfrm_dst_hash(net, &xnew->id.daddr, &xnew->props.saddr, reqid, family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.family == family && x->props.reqid == reqid && x->if_id == if_id && (mark & x->mark.m) == x->mark.v && xfrm_addr_equal(&x->id.daddr, &xnew->id.daddr, family) && xfrm_addr_equal(&x->props.saddr, &xnew->props.saddr, family)) x->genid++; } } void xfrm_state_insert(struct xfrm_state *x) { struct net *net = xs_net(x); spin_lock_bh(&net->xfrm.xfrm_state_lock); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_insert); /* net->xfrm.xfrm_state_lock is held */ static struct xfrm_state *__find_acq_core(struct net *net, const struct xfrm_mark *m, unsigned short family, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create) { unsigned int h = xfrm_dst_hash(net, daddr, saddr, reqid, family); struct xfrm_state *x; u32 mark = m->v & m->m; hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.reqid != reqid || x->props.mode != mode || x->props.family != family || x->km.state != XFRM_STATE_ACQ || x->id.spi != 0 || x->id.proto != proto || (mark & x->mark.m) != x->mark.v || !xfrm_addr_equal(&x->id.daddr, daddr, family) || !xfrm_addr_equal(&x->props.saddr, saddr, family)) continue; xfrm_state_hold(x); return x; } if (!create) return NULL; x = xfrm_state_alloc(net); if (likely(x)) { switch (family) { case AF_INET: x->sel.daddr.a4 = daddr->a4; x->sel.saddr.a4 = saddr->a4; x->sel.prefixlen_d = 32; x->sel.prefixlen_s = 32; x->props.saddr.a4 = saddr->a4; x->id.daddr.a4 = daddr->a4; break; case AF_INET6: x->sel.daddr.in6 = daddr->in6; x->sel.saddr.in6 = saddr->in6; x->sel.prefixlen_d = 128; x->sel.prefixlen_s = 128; x->props.saddr.in6 = saddr->in6; x->id.daddr.in6 = daddr->in6; break; } x->km.state = XFRM_STATE_ACQ; x->id.proto = proto; x->props.family = family; x->props.mode = mode; x->props.reqid = reqid; x->if_id = if_id; x->mark.v = m->v; x->mark.m = m->m; x->lft.hard_add_expires_seconds = net->xfrm.sysctl_acq_expires; xfrm_state_hold(x); hrtimer_start(&x->mtimer, ktime_set(net->xfrm.sysctl_acq_expires, 0), HRTIMER_MODE_REL_SOFT); list_add(&x->km.all, &net->xfrm.state_all); XFRM_STATE_INSERT(bydst, &x->bydst, net->xfrm.state_bydst + h, x->xso.type); h = xfrm_src_hash(net, daddr, saddr, family); XFRM_STATE_INSERT(bysrc, &x->bysrc, net->xfrm.state_bysrc + h, x->xso.type); net->xfrm.state_num++; xfrm_hash_grow_check(net, x->bydst.next != NULL); } return x; } static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_add(struct xfrm_state *x) { struct net *net = xs_net(x); struct xfrm_state *x1, *to_put; int family; int err; u32 mark = x->mark.v & x->mark.m; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); family = x->props.family; to_put = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, family); if (x1) { to_put = x1; x1 = NULL; err = -EEXIST; goto out; } if (use_spi && x->km.seq) { x1 = __xfrm_find_acq_byseq(net, mark, x->km.seq); if (x1 && ((x1->id.proto != x->id.proto) || !xfrm_addr_equal(&x1->id.daddr, &x->id.daddr, family))) { to_put = x1; x1 = NULL; } } if (use_spi && !x1) x1 = __find_acq_core(net, &x->mark, family, x->props.mode, x->props.reqid, x->if_id, x->id.proto, &x->id.daddr, &x->props.saddr, 0); __xfrm_state_bump_genids(x); __xfrm_state_insert(x); err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (x1) { xfrm_state_delete(x1); xfrm_state_put(x1); } if (to_put) xfrm_state_put(to_put); return err; } EXPORT_SYMBOL(xfrm_state_add); #ifdef CONFIG_XFRM_MIGRATE static inline int clone_security(struct xfrm_state *x, struct xfrm_sec_ctx *security) { struct xfrm_user_sec_ctx *uctx; int size = sizeof(*uctx) + security->ctx_len; int err; uctx = kmalloc(size, GFP_KERNEL); if (!uctx) return -ENOMEM; uctx->exttype = XFRMA_SEC_CTX; uctx->len = size; uctx->ctx_doi = security->ctx_doi; uctx->ctx_alg = security->ctx_alg; uctx->ctx_len = security->ctx_len; memcpy(uctx + 1, security->ctx_str, security->ctx_len); err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) return err; return 0; } static struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, struct xfrm_encap_tmpl *encap) { struct net *net = xs_net(orig); struct xfrm_state *x = xfrm_state_alloc(net); if (!x) goto out; memcpy(&x->id, &orig->id, sizeof(x->id)); memcpy(&x->sel, &orig->sel, sizeof(x->sel)); memcpy(&x->lft, &orig->lft, sizeof(x->lft)); x->props.mode = orig->props.mode; x->props.replay_window = orig->props.replay_window; x->props.reqid = orig->props.reqid; x->props.family = orig->props.family; x->props.saddr = orig->props.saddr; if (orig->aalg) { x->aalg = xfrm_algo_auth_clone(orig->aalg); if (!x->aalg) goto error; } x->props.aalgo = orig->props.aalgo; if (orig->aead) { x->aead = xfrm_algo_aead_clone(orig->aead); x->geniv = orig->geniv; if (!x->aead) goto error; } if (orig->ealg) { x->ealg = xfrm_algo_clone(orig->ealg); if (!x->ealg) goto error; } x->props.ealgo = orig->props.ealgo; if (orig->calg) { x->calg = xfrm_algo_clone(orig->calg); if (!x->calg) goto error; } x->props.calgo = orig->props.calgo; if (encap || orig->encap) { if (encap) x->encap = kmemdup(encap, sizeof(*x->encap), GFP_KERNEL); else x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL); if (!x->encap) goto error; } if (orig->security) if (clone_security(x, orig->security)) goto error; if (orig->coaddr) { x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr), GFP_KERNEL); if (!x->coaddr) goto error; } if (orig->replay_esn) { if (xfrm_replay_clone(x, orig)) goto error; } memcpy(&x->mark, &orig->mark, sizeof(x->mark)); memcpy(&x->props.smark, &orig->props.smark, sizeof(x->props.smark)); x->props.flags = orig->props.flags; x->props.extra_flags = orig->props.extra_flags; x->if_id = orig->if_id; x->tfcpad = orig->tfcpad; x->replay_maxdiff = orig->replay_maxdiff; x->replay_maxage = orig->replay_maxage; memcpy(&x->curlft, &orig->curlft, sizeof(x->curlft)); x->km.state = orig->km.state; x->km.seq = orig->km.seq; x->replay = orig->replay; x->preplay = orig->preplay; x->mapping_maxage = orig->mapping_maxage; x->lastused = orig->lastused; x->new_mapping = 0; x->new_mapping_sport = 0; return x; error: xfrm_state_put(x); out: return NULL; } struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, u32 if_id) { unsigned int h; struct xfrm_state *x = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); if (m->reqid) { h = xfrm_dst_hash(net, &m->old_daddr, &m->old_saddr, m->reqid, m->old_family); hlist_for_each_entry(x, net->xfrm.state_bydst+h, bydst) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (m->reqid && x->props.reqid != m->reqid) continue; if (if_id != 0 && x->if_id != if_id) continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, m->old_family)) continue; xfrm_state_hold(x); break; } } else { h = xfrm_src_hash(net, &m->old_daddr, &m->old_saddr, m->old_family); hlist_for_each_entry(x, net->xfrm.state_bysrc+h, bysrc) { if (x->props.mode != m->mode || x->id.proto != m->proto) continue; if (if_id != 0 && x->if_id != if_id) continue; if (!xfrm_addr_equal(&x->id.daddr, &m->old_daddr, m->old_family) || !xfrm_addr_equal(&x->props.saddr, &m->old_saddr, m->old_family)) continue; xfrm_state_hold(x); break; } } spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_migrate_state_find); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap) { struct xfrm_state *xc; xc = xfrm_state_clone(x, encap); if (!xc) return NULL; xc->props.family = m->new_family; if (xfrm_init_state(xc) < 0) goto error; memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr)); memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr)); /* add state */ if (xfrm_addr_equal(&x->id.daddr, &m->new_daddr, m->new_family)) { /* a care is needed when the destination address of the state is to be updated as it is a part of triplet */ xfrm_state_insert(xc); } else { if (xfrm_state_add(xc) < 0) goto error; } return xc; error: xfrm_state_put(xc); return NULL; } EXPORT_SYMBOL(xfrm_state_migrate); #endif int xfrm_state_update(struct xfrm_state *x) { struct xfrm_state *x1, *to_put; int err; int use_spi = xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY); struct net *net = xs_net(x); to_put = NULL; spin_lock_bh(&net->xfrm.xfrm_state_lock); x1 = __xfrm_state_locate(x, use_spi, x->props.family); err = -ESRCH; if (!x1) goto out; if (xfrm_state_kern(x1)) { to_put = x1; err = -EEXIST; goto out; } if (x1->km.state == XFRM_STATE_ACQ) { __xfrm_state_insert(x); x = NULL; } err = 0; out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); if (to_put) xfrm_state_put(to_put); if (err) return err; if (!x) { xfrm_state_delete(x1); xfrm_state_put(x1); return 0; } err = -EINVAL; spin_lock_bh(&x1->lock); if (likely(x1->km.state == XFRM_STATE_VALID)) { if (x->encap && x1->encap && x->encap->encap_type == x1->encap->encap_type) memcpy(x1->encap, x->encap, sizeof(*x1->encap)); else if (x->encap || x1->encap) goto fail; if (x->coaddr && x1->coaddr) { memcpy(x1->coaddr, x->coaddr, sizeof(*x1->coaddr)); } if (!use_spi && memcmp(&x1->sel, &x->sel, sizeof(x1->sel))) memcpy(&x1->sel, &x->sel, sizeof(x1->sel)); memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); x1->km.dying = 0; hrtimer_start(&x1->mtimer, ktime_set(1, 0), HRTIMER_MODE_REL_SOFT); if (READ_ONCE(x1->curlft.use_time)) xfrm_state_check_expire(x1); if (x->props.smark.m || x->props.smark.v || x->if_id) { spin_lock_bh(&net->xfrm.xfrm_state_lock); if (x->props.smark.m || x->props.smark.v) x1->props.smark = x->props.smark; if (x->if_id) x1->if_id = x->if_id; __xfrm_state_bump_genids(x1); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } err = 0; x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); } fail: spin_unlock_bh(&x1->lock); xfrm_state_put(x1); return err; } EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { xfrm_dev_state_update_curlft(x); if (!READ_ONCE(x->curlft.use_time)) WRITE_ONCE(x->curlft.use_time, ktime_get_real_seconds()); if (x->curlft.bytes >= x->lft.hard_byte_limit || x->curlft.packets >= x->lft.hard_packet_limit) { x->km.state = XFRM_STATE_EXPIRED; hrtimer_start(&x->mtimer, 0, HRTIMER_MODE_REL_SOFT); return -EINVAL; } if (!x->km.dying && (x->curlft.bytes >= x->lft.soft_byte_limit || x->curlft.packets >= x->lft.soft_packet_limit)) { x->km.dying = 1; km_state_expired(x, 0, 0); } return 0; } EXPORT_SYMBOL(xfrm_state_check_expire); struct xfrm_state * xfrm_state_lookup(struct net *net, u32 mark, const xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family) { struct xfrm_state *x; rcu_read_lock(); x = __xfrm_state_lookup(net, mark, daddr, spi, proto, family); rcu_read_unlock(); return x; } EXPORT_SYMBOL(xfrm_state_lookup); struct xfrm_state * xfrm_state_lookup_byaddr(struct net *net, u32 mark, const xfrm_address_t *daddr, const xfrm_address_t *saddr, u8 proto, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_state_lookup_byaddr(net, mark, daddr, saddr, proto, family); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_state_lookup_byaddr); struct xfrm_state * xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, const xfrm_address_t *saddr, int create, unsigned short family) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __find_acq_core(net, mark, family, mode, reqid, if_id, proto, daddr, saddr, create); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq); #ifdef CONFIG_XFRM_SUB_POLICY #if IS_ENABLED(CONFIG_IPV6) /* distribution counting sort function for xfrm_state and xfrm_tmpl */ static void __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(const void *p), int maxclass) { int count[XFRM_MAX_DEPTH] = { }; int class[XFRM_MAX_DEPTH]; int i; for (i = 0; i < n; i++) { int c = cmp(src[i]); class[i] = c; count[c]++; } for (i = 2; i < maxclass; i++) count[i] += count[i - 1]; for (i = 0; i < n; i++) { dst[count[class[i] - 1]++] = src[i]; src[i] = NULL; } } /* Rule for xfrm_state: * * rule 1: select IPsec transport except AH * rule 2: select MIPv6 RO or inbound trigger * rule 3: select IPsec transport AH * rule 4: select IPsec tunnel * rule 5: others */ static int __xfrm6_state_sort_cmp(const void *p) { const struct xfrm_state *v = p; switch (v->props.mode) { case XFRM_MODE_TRANSPORT: if (v->id.proto != IPPROTO_AH) return 1; else return 3; #if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: return 4; } return 5; } /* Rule for xfrm_tmpl: * * rule 1: select IPsec transport * rule 2: select MIPv6 RO or inbound trigger * rule 3: select IPsec tunnel * rule 4: others */ static int __xfrm6_tmpl_sort_cmp(const void *p) { const struct xfrm_tmpl *v = p; switch (v->mode) { case XFRM_MODE_TRANSPORT: return 1; #if IS_ENABLED(CONFIG_IPV6_MIP6) case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: return 2; #endif case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: return 3; } return 4; } #else static inline int __xfrm6_state_sort_cmp(const void *p) { return 5; } static inline int __xfrm6_tmpl_sort_cmp(const void *p) { return 4; } static inline void __xfrm6_sort(void **dst, void **src, int n, int (*cmp)(const void *p), int maxclass) { int i; for (i = 0; i < n; i++) dst[i] = src[i]; } #endif /* CONFIG_IPV6 */ void xfrm_tmpl_sort(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n, unsigned short family) { int i; if (family == AF_INET6) __xfrm6_sort((void **)dst, (void **)src, n, __xfrm6_tmpl_sort_cmp, 5); else for (i = 0; i < n; i++) dst[i] = src[i]; } void xfrm_state_sort(struct xfrm_state **dst, struct xfrm_state **src, int n, unsigned short family) { int i; if (family == AF_INET6) __xfrm6_sort((void **)dst, (void **)src, n, __xfrm6_state_sort_cmp, 6); else for (i = 0; i < n; i++) dst[i] = src[i]; } #endif /* Silly enough, but I'm lazy to build resolution list */ static struct xfrm_state *__xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) { unsigned int h = xfrm_seq_hash(net, seq); struct xfrm_state *x; hlist_for_each_entry_rcu(x, net->xfrm.state_byseq + h, byseq) { if (x->km.seq == seq && (mark & x->mark.m) == x->mark.v && x->km.state == XFRM_STATE_ACQ) { xfrm_state_hold(x); return x; } } return NULL; } struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq) { struct xfrm_state *x; spin_lock_bh(&net->xfrm.xfrm_state_lock); x = __xfrm_find_acq_byseq(net, mark, seq); spin_unlock_bh(&net->xfrm.xfrm_state_lock); return x; } EXPORT_SYMBOL(xfrm_find_acq_byseq); u32 xfrm_get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } EXPORT_SYMBOL(xfrm_get_acqseq); int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack) { switch (proto) { case IPPROTO_AH: case IPPROTO_ESP: break; case IPPROTO_COMP: /* IPCOMP spi is 16-bits. */ if (max >= 0x10000) { NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535"); return -EINVAL; } break; default: NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP"); return -EINVAL; } if (min > max) { NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max"); return -EINVAL; } return 0; } EXPORT_SYMBOL(verify_spi_info); int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, struct netlink_ext_ack *extack) { struct net *net = xs_net(x); unsigned int h; struct xfrm_state *x0; int err = -ENOENT; __be32 minspi = htonl(low); __be32 maxspi = htonl(high); __be32 newspi = 0; u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_DEAD) { NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state"); goto unlock; } err = 0; if (x->id.spi) goto unlock; err = -ENOENT; if (minspi == maxspi) { x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); if (x0) { NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); xfrm_state_put(x0); goto unlock; } newspi = minspi; } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { spi = get_random_u32_inclusive(low, high); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { newspi = htonl(spi); break; } xfrm_state_put(x0); } } if (newspi) { spin_lock_bh(&net->xfrm.xfrm_state_lock); x->id.spi = newspi; h = xfrm_spi_hash(net, &x->id.daddr, x->id.spi, x->id.proto, x->props.family); XFRM_STATE_INSERT(byspi, &x->byspi, net->xfrm.state_byspi + h, x->xso.type); spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; } else { NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); } unlock: spin_unlock_bh(&x->lock); return err; } EXPORT_SYMBOL(xfrm_alloc_spi); static bool __xfrm_state_filter_match(struct xfrm_state *x, struct xfrm_address_filter *filter) { if (filter) { if ((filter->family == AF_INET || filter->family == AF_INET6) && x->props.family != filter->family) return false; return addr_match(&x->props.saddr, &filter->saddr, filter->splen) && addr_match(&x->id.daddr, &filter->daddr, filter->dplen); } return true; } int xfrm_state_walk(struct net *net, struct xfrm_state_walk *walk, int (*func)(struct xfrm_state *, int, void*), void *data) { struct xfrm_state *state; struct xfrm_state_walk *x; int err = 0; if (walk->seq != 0 && list_empty(&walk->all)) return 0; spin_lock_bh(&net->xfrm.xfrm_state_lock); if (list_empty(&walk->all)) x = list_first_entry(&net->xfrm.state_all, struct xfrm_state_walk, all); else x = list_first_entry(&walk->all, struct xfrm_state_walk, all); list_for_each_entry_from(x, &net->xfrm.state_all, all) { if (x->state == XFRM_STATE_DEAD) continue; state = container_of(x, struct xfrm_state, km); if (!xfrm_id_proto_match(state->id.proto, walk->proto)) continue; if (!__xfrm_state_filter_match(state, walk->filter)) continue; err = func(state, walk->seq, data); if (err) { list_move_tail(&walk->all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { err = -ENOENT; goto out; } list_del_init(&walk->all); out: spin_unlock_bh(&net->xfrm.xfrm_state_lock); return err; } EXPORT_SYMBOL(xfrm_state_walk); void xfrm_state_walk_init(struct xfrm_state_walk *walk, u8 proto, struct xfrm_address_filter *filter) { INIT_LIST_HEAD(&walk->all); walk->proto = proto; walk->state = XFRM_STATE_DEAD; walk->seq = 0; walk->filter = filter; } EXPORT_SYMBOL(xfrm_state_walk_init); void xfrm_state_walk_done(struct xfrm_state_walk *walk, struct net *net) { kfree(walk->filter); if (list_empty(&walk->all)) return; spin_lock_bh(&net->xfrm.xfrm_state_lock); list_del(&walk->all); spin_unlock_bh(&net->xfrm.xfrm_state_lock); } EXPORT_SYMBOL(xfrm_state_walk_done); static void xfrm_replay_timer_handler(struct timer_list *t) { struct xfrm_state *x = from_timer(x, t, rtimer); spin_lock(&x->lock); if (x->km.state == XFRM_STATE_VALID) { if (xfrm_aevent_is_on(xs_net(x))) xfrm_replay_notify(x, XFRM_REPLAY_TIMEOUT); else x->xflags |= XFRM_TIME_DEFER; } spin_unlock(&x->lock); } static LIST_HEAD(xfrm_km_list); void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify_policy) km->notify_policy(xp, dir, c); rcu_read_unlock(); } void km_state_notify(struct xfrm_state *x, const struct km_event *c) { struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) if (km->notify) km->notify(x, c); rcu_read_unlock(); } EXPORT_SYMBOL(km_policy_notify); EXPORT_SYMBOL(km_state_notify); void km_state_expired(struct xfrm_state *x, int hard, u32 portid) { struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_EXPIRE; km_state_notify(x, &c); } EXPORT_SYMBOL(km_state_expired); /* * We send to all registered managers regardless of failure * We are happy with one success */ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) { int err = -EINVAL, acqret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { acqret = km->acquire(x, t, pol); if (!acqret) err = acqret; } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_query); static int __km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { int err = -EINVAL; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->new_mapping) err = km->new_mapping(x, ipaddr, sport); if (!err) break; } rcu_read_unlock(); return err; } int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { int ret = 0; if (x->mapping_maxage) { if ((jiffies / HZ - x->new_mapping) > x->mapping_maxage || x->new_mapping_sport != sport) { x->new_mapping_sport = sport; x->new_mapping = jiffies / HZ; ret = __km_new_mapping(x, ipaddr, sport); } } else { ret = __km_new_mapping(x, ipaddr, sport); } return ret; } EXPORT_SYMBOL(km_new_mapping); void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 portid) { struct km_event c; c.data.hard = hard; c.portid = portid; c.event = XFRM_MSG_POLEXPIRE; km_policy_notify(pol, dir, &c); } EXPORT_SYMBOL(km_policy_expired); #ifdef CONFIG_XFRM_MIGRATE int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_migrate, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int err = -EINVAL; int ret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->migrate) { ret = km->migrate(sel, dir, type, m, num_migrate, k, encap); if (!ret) err = ret; } } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_migrate); #endif int km_report(struct net *net, u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr) { int err = -EINVAL; int ret; struct xfrm_mgr *km; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->report) { ret = km->report(net, proto, sel, addr); if (!ret) err = ret; } } rcu_read_unlock(); return err; } EXPORT_SYMBOL(km_report); static bool km_is_alive(const struct km_event *c) { struct xfrm_mgr *km; bool is_alive = false; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { if (km->is_alive && km->is_alive(c)) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } #if IS_ENABLED(CONFIG_XFRM_USER_COMPAT) static DEFINE_SPINLOCK(xfrm_translator_lock); static struct xfrm_translator __rcu *xfrm_translator; struct xfrm_translator *xfrm_get_translator(void) { struct xfrm_translator *xtr; rcu_read_lock(); xtr = rcu_dereference(xfrm_translator); if (unlikely(!xtr)) goto out; if (!try_module_get(xtr->owner)) xtr = NULL; out: rcu_read_unlock(); return xtr; } EXPORT_SYMBOL_GPL(xfrm_get_translator); void xfrm_put_translator(struct xfrm_translator *xtr) { module_put(xtr->owner); } EXPORT_SYMBOL_GPL(xfrm_put_translator); int xfrm_register_translator(struct xfrm_translator *xtr) { int err = 0; spin_lock_bh(&xfrm_translator_lock); if (unlikely(xfrm_translator != NULL)) err = -EEXIST; else rcu_assign_pointer(xfrm_translator, xtr); spin_unlock_bh(&xfrm_translator_lock); return err; } EXPORT_SYMBOL_GPL(xfrm_register_translator); int xfrm_unregister_translator(struct xfrm_translator *xtr) { int err = 0; spin_lock_bh(&xfrm_translator_lock); if (likely(xfrm_translator != NULL)) { if (rcu_access_pointer(xfrm_translator) != xtr) err = -EINVAL; else RCU_INIT_POINTER(xfrm_translator, NULL); } spin_unlock_bh(&xfrm_translator_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL_GPL(xfrm_unregister_translator); #endif int xfrm_user_policy(struct sock *sk, int optname, sockptr_t optval, int optlen) { int err; u8 *data; struct xfrm_mgr *km; struct xfrm_policy *pol = NULL; if (sockptr_is_null(optval) && !optlen) { xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL); xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL); __sk_dst_reset(sk); return 0; } if (optlen <= 0 || optlen > PAGE_SIZE) return -EMSGSIZE; data = memdup_sockptr(optval, optlen); if (IS_ERR(data)) return PTR_ERR(data); if (in_compat_syscall()) { struct xfrm_translator *xtr = xfrm_get_translator(); if (!xtr) { kfree(data); return -EOPNOTSUPP; } err = xtr->xlate_user_policy_sockptr(&data, optlen); xfrm_put_translator(xtr); if (err) { kfree(data); return err; } } err = -EINVAL; rcu_read_lock(); list_for_each_entry_rcu(km, &xfrm_km_list, list) { pol = km->compile_policy(sk, optname, data, optlen, &err); if (err >= 0) break; } rcu_read_unlock(); if (err >= 0) { xfrm_sk_policy_insert(sk, err, pol); xfrm_pol_put(pol); __sk_dst_reset(sk); err = 0; } kfree(data); return err; } EXPORT_SYMBOL(xfrm_user_policy); static DEFINE_SPINLOCK(xfrm_km_lock); void xfrm_register_km(struct xfrm_mgr *km) { spin_lock_bh(&xfrm_km_lock); list_add_tail_rcu(&km->list, &xfrm_km_list); spin_unlock_bh(&xfrm_km_lock); } EXPORT_SYMBOL(xfrm_register_km); void xfrm_unregister_km(struct xfrm_mgr *km) { spin_lock_bh(&xfrm_km_lock); list_del_rcu(&km->list); spin_unlock_bh(&xfrm_km_lock); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_unregister_km); int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) { int err = 0; if (WARN_ON(afinfo->family >= NPROTO)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) err = -EEXIST; else rcu_assign_pointer(xfrm_state_afinfo[afinfo->family], afinfo); spin_unlock_bh(&xfrm_state_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_state_register_afinfo); int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) { int err = 0, family = afinfo->family; if (WARN_ON(family >= NPROTO)) return -EAFNOSUPPORT; spin_lock_bh(&xfrm_state_afinfo_lock); if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { if (rcu_access_pointer(xfrm_state_afinfo[family]) != afinfo) err = -EINVAL; else RCU_INIT_POINTER(xfrm_state_afinfo[afinfo->family], NULL); } spin_unlock_bh(&xfrm_state_afinfo_lock); synchronize_rcu(); return err; } EXPORT_SYMBOL(xfrm_state_unregister_afinfo); struct xfrm_state_afinfo *xfrm_state_afinfo_get_rcu(unsigned int family) { if (unlikely(family >= NPROTO)) return NULL; return rcu_dereference(xfrm_state_afinfo[family]); } EXPORT_SYMBOL_GPL(xfrm_state_afinfo_get_rcu); struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family) { struct xfrm_state_afinfo *afinfo; if (unlikely(family >= NPROTO)) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_state_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } void xfrm_flush_gc(void) { flush_work(&xfrm_state_gc_work); } EXPORT_SYMBOL(xfrm_flush_gc); /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ void xfrm_state_delete_tunnel(struct xfrm_state *x) { if (x->tunnel) { struct xfrm_state *t = x->tunnel; if (atomic_read(&t->tunnel_users) == 2) xfrm_state_delete(t); atomic_dec(&t->tunnel_users); xfrm_state_put_sync(t); x->tunnel = NULL; } } EXPORT_SYMBOL(xfrm_state_delete_tunnel); u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) { const struct xfrm_type *type = READ_ONCE(x->type); struct crypto_aead *aead; u32 blksize, net_adj = 0; if (x->km.state != XFRM_STATE_VALID || !type || type->proto != IPPROTO_ESP) return mtu - x->props.header_len; aead = x->data; blksize = ALIGN(crypto_aead_blocksize(aead), 4); switch (x->props.mode) { case XFRM_MODE_TRANSPORT: case XFRM_MODE_BEET: if (x->props.family == AF_INET) net_adj = sizeof(struct iphdr); else if (x->props.family == AF_INET6) net_adj = sizeof(struct ipv6hdr); break; case XFRM_MODE_TUNNEL: break; default: WARN_ON_ONCE(1); break; } return ((mtu - x->props.header_len - crypto_aead_authsize(aead) - net_adj) & ~(blksize - 1)) + net_adj - 2; } EXPORT_SYMBOL_GPL(xfrm_state_mtu); int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; int family = x->props.family; int err; if (family == AF_INET && READ_ONCE(xs_net(x)->ipv4.sysctl_ip_no_pmtu_disc)) x->props.flags |= XFRM_STATE_NOPMTUDISC; err = -EPROTONOSUPPORT; if (x->sel.family != AF_UNSPEC) { inner_mode = xfrm_get_mode(x->props.mode, x->sel.family); if (inner_mode == NULL) { NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; } if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) && family != x->sel.family) { NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate a change of family"); goto error; } x->inner_mode = *inner_mode; } else { const struct xfrm_mode *inner_mode_iaf; int iafamily = AF_INET; inner_mode = xfrm_get_mode(x->props.mode, x->props.family); if (inner_mode == NULL) { NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; } x->inner_mode = *inner_mode; if (x->props.family == AF_INET) iafamily = AF_INET6; inner_mode_iaf = xfrm_get_mode(x->props.mode, iafamily); if (inner_mode_iaf) { if (inner_mode_iaf->flags & XFRM_MODE_FLAG_TUNNEL) x->inner_mode_iaf = *inner_mode_iaf; } } x->type = xfrm_get_type(x->id.proto, family); if (x->type == NULL) { NL_SET_ERR_MSG(extack, "Requested type not found"); goto error; } x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload); err = x->type->init_state(x, extack); if (err) goto error; outer_mode = xfrm_get_mode(x->props.mode, family); if (!outer_mode) { NL_SET_ERR_MSG(extack, "Requested mode not found"); err = -EPROTONOSUPPORT; goto error; } x->outer_mode = *outer_mode; if (init_replay) { err = xfrm_init_replay(x, extack); if (err) goto error; } error: return err; } EXPORT_SYMBOL(__xfrm_init_state); int xfrm_init_state(struct xfrm_state *x) { int err; err = __xfrm_init_state(x, true, false, NULL); if (!err) x->km.state = XFRM_STATE_VALID; return err; } EXPORT_SYMBOL(xfrm_init_state); int __net_init xfrm_state_init(struct net *net) { unsigned int sz; if (net_eq(net, &init_net)) xfrm_state_cache = KMEM_CACHE(xfrm_state, SLAB_HWCACHE_ALIGN | SLAB_PANIC); INIT_LIST_HEAD(&net->xfrm.state_all); sz = sizeof(struct hlist_head) * 8; net->xfrm.state_bydst = xfrm_hash_alloc(sz); if (!net->xfrm.state_bydst) goto out_bydst; net->xfrm.state_bysrc = xfrm_hash_alloc(sz); if (!net->xfrm.state_bysrc) goto out_bysrc; net->xfrm.state_byspi = xfrm_hash_alloc(sz); if (!net->xfrm.state_byspi) goto out_byspi; net->xfrm.state_byseq = xfrm_hash_alloc(sz); if (!net->xfrm.state_byseq) goto out_byseq; net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1); net->xfrm.state_num = 0; INIT_WORK(&net->xfrm.state_hash_work, xfrm_hash_resize); spin_lock_init(&net->xfrm.xfrm_state_lock); seqcount_spinlock_init(&net->xfrm.xfrm_state_hash_generation, &net->xfrm.xfrm_state_lock); return 0; out_byseq: xfrm_hash_free(net->xfrm.state_byspi, sz); out_byspi: xfrm_hash_free(net->xfrm.state_bysrc, sz); out_bysrc: xfrm_hash_free(net->xfrm.state_bydst, sz); out_bydst: return -ENOMEM; } void xfrm_state_fini(struct net *net) { unsigned int sz; flush_work(&net->xfrm.state_hash_work); flush_work(&xfrm_state_gc_work); xfrm_state_flush(net, 0, false, true); WARN_ON(!list_empty(&net->xfrm.state_all)); sz = (net->xfrm.state_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.state_byseq)); xfrm_hash_free(net->xfrm.state_byseq, sz); WARN_ON(!hlist_empty(net->xfrm.state_byspi)); xfrm_hash_free(net->xfrm.state_byspi, sz); WARN_ON(!hlist_empty(net->xfrm.state_bysrc)); xfrm_hash_free(net->xfrm.state_bysrc, sz); WARN_ON(!hlist_empty(net->xfrm.state_bydst)); xfrm_hash_free(net->xfrm.state_bydst, sz); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_helper_sainfo(struct xfrm_state *x, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = x->security; u32 spi = ntohl(x->id.spi); if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (x->props.family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &x->props.saddr.a4, &x->id.daddr.a4); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6 dst=%pI6", x->props.saddr.a6, x->id.daddr.a6); break; } audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); } static void xfrm_audit_helper_pktinfo(struct sk_buff *skb, u16 family, struct audit_buffer *audit_buf) { const struct iphdr *iph4; const struct ipv6hdr *iph6; switch (family) { case AF_INET: iph4 = ip_hdr(skb); audit_log_format(audit_buf, " src=%pI4 dst=%pI4", &iph4->saddr, &iph4->daddr); break; case AF_INET6: iph6 = ipv6_hdr(skb); audit_log_format(audit_buf, " src=%pI6 dst=%pI6 flowlbl=0x%x%02x%02x", &iph6->saddr, &iph6->daddr, iph6->flow_lbl[0] & 0x0f, iph6->flow_lbl[1], iph6->flow_lbl[2]); break; } } void xfrm_audit_state_add(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_add); void xfrm_audit_state_delete(struct xfrm_state *x, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SAD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); xfrm_audit_helper_sainfo(x, audit_buf); audit_log_format(audit_buf, " res=%u", result); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_delete); void xfrm_audit_state_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-replay-overflow"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); /* don't record the sequence number because it's inherent in this kind * of audit message */ spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x)", spi, spi); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_replay_overflow); void xfrm_audit_state_replay(struct xfrm_state *x, struct sk_buff *skb, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-replayed-pkt"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); spi = ntohl(x->id.spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_replay); void xfrm_audit_state_notfound_simple(struct sk_buff *skb, u16 family) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SA-notfound"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, family, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound_simple); void xfrm_audit_state_notfound(struct sk_buff *skb, u16 family, __be32 net_spi, __be32 net_seq) { struct audit_buffer *audit_buf; u32 spi; audit_buf = xfrm_audit_start("SA-notfound"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, family, audit_buf); spi = ntohl(net_spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_notfound); void xfrm_audit_state_icvfail(struct xfrm_state *x, struct sk_buff *skb, u8 proto) { struct audit_buffer *audit_buf; __be32 net_spi; __be32 net_seq; audit_buf = xfrm_audit_start("SA-icv-failure"); if (audit_buf == NULL) return; xfrm_audit_helper_pktinfo(skb, x->props.family, audit_buf); if (xfrm_parse_spi(skb, proto, &net_spi, &net_seq) == 0) { u32 spi = ntohl(net_spi); audit_log_format(audit_buf, " spi=%u(0x%x) seqno=%u", spi, spi, ntohl(net_seq)); } audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_state_icvfail); #endif /* CONFIG_AUDITSYSCALL */
15 14 7 7 7 14 14 15 8 14 14 14 9 5 14 14 6 15 7 7 7 7 6 1 7 6 6 6 6 7 6 1 7 7 15 6 3 3 15 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * minix directory handling functions * * Updated to filesystem version 3 by Daniel Aragones */ #include "minix.h" #include <linux/buffer_head.h> #include <linux/highmem.h> #include <linux/swap.h> typedef struct minix_dir_entry minix_dirent; typedef struct minix3_dir_entry minix3_dirent; static int minix_readdir(struct file *, struct dir_context *); const struct file_operations minix_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = minix_readdir, .fsync = generic_file_fsync, }; static inline void dir_put_page(struct page *page) { kunmap(page); put_page(page); } /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. */ static unsigned minix_last_byte(struct inode *inode, unsigned long page_nr) { unsigned last_byte = PAGE_SIZE; if (page_nr == (inode->i_size >> PAGE_SHIFT)) last_byte = inode->i_size & (PAGE_SIZE - 1); return last_byte; } static void dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; struct inode *dir = mapping->host; block_write_end(NULL, mapping, pos, len, len, page, NULL); if (pos+len > dir->i_size) { i_size_write(dir, pos+len); mark_inode_dirty(dir); } unlock_page(page); } static int minix_handle_dirsync(struct inode *dir) { int err; err = filemap_write_and_wait(dir->i_mapping); if (!err) err = sync_inode_metadata(dir, 1); return err; } static struct page * dir_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) kmap(page); return page; } static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) { return (void*)((char*)de + sbi->s_dirsize); } static int minix_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct minix_sb_info *sbi = minix_sb(sb); unsigned chunk_size = sbi->s_dirsize; unsigned long npages = dir_pages(inode); unsigned long pos = ctx->pos; unsigned offset; unsigned long n; ctx->pos = pos = ALIGN(pos, chunk_size); if (pos >= inode->i_size) return 0; offset = pos & ~PAGE_MASK; n = pos >> PAGE_SHIFT; for ( ; n < npages; n++, offset = 0) { char *p, *kaddr, *limit; struct page *page = dir_get_page(inode, n); if (IS_ERR(page)) continue; kaddr = (char *)page_address(page); p = kaddr+offset; limit = kaddr + minix_last_byte(inode, n) - chunk_size; for ( ; p <= limit; p = minix_next_entry(p, sbi)) { const char *name; __u32 inumber; if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; name = de3->name; inumber = de3->inode; } else { minix_dirent *de = (minix_dirent *)p; name = de->name; inumber = de->inode; } if (inumber) { unsigned l = strnlen(name, sbi->s_namelen); if (!dir_emit(ctx, name, l, inumber, DT_UNKNOWN)) { dir_put_page(page); return 0; } } ctx->pos += chunk_size; } dir_put_page(page); } return 0; } static inline int namecompare(int len, int maxlen, const char * name, const char * buffer) { if (len < maxlen && buffer[len]) return 0; return !memcmp(name, buffer, len); } /* * minix_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the cache buffer in which the entry was found, and the entry * itself (as a parameter - res_dir). It does NOT read the inode of the * entry - you'll have to do that yourself if you want to. */ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page) { const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct inode * dir = d_inode(dentry->d_parent); struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); unsigned long n; unsigned long npages = dir_pages(dir); struct page *page = NULL; char *p; char *namx; __u32 inumber; *res_page = NULL; for (n = 0; n < npages; n++) { char *kaddr, *limit; page = dir_get_page(dir, n); if (IS_ERR(page)) continue; kaddr = (char*)page_address(page); limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; namx = de3->name; inumber = de3->inode; } else { minix_dirent *de = (minix_dirent *)p; namx = de->name; inumber = de->inode; } if (!inumber) continue; if (namecompare(namelen, sbi->s_namelen, name, namx)) goto found; } dir_put_page(page); } return NULL; found: *res_page = page; return (minix_dirent *)p; } int minix_add_link(struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); const char * name = dentry->d_name.name; int namelen = dentry->d_name.len; struct super_block * sb = dir->i_sb; struct minix_sb_info * sbi = minix_sb(sb); struct page *page = NULL; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr, *p; minix_dirent *de; minix3_dirent *de3; loff_t pos; int err; char *namx = NULL; __u32 inumber; /* * We take care of directory expansion in the same loop * This code plays outside i_size, so it locks the page * to protect that region. */ for (n = 0; n <= npages; n++) { char *limit, *dir_end; page = dir_get_page(dir, n); err = PTR_ERR(page); if (IS_ERR(page)) goto out; lock_page(page); kaddr = (char*)page_address(page); dir_end = kaddr + minix_last_byte(dir, n); limit = kaddr + PAGE_SIZE - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { de = (minix_dirent *)p; de3 = (minix3_dirent *)p; if (sbi->s_version == MINIX_V3) { namx = de3->name; inumber = de3->inode; } else { namx = de->name; inumber = de->inode; } if (p == dir_end) { /* We hit i_size */ if (sbi->s_version == MINIX_V3) de3->inode = 0; else de->inode = 0; goto got_it; } if (!inumber) goto got_it; err = -EEXIST; if (namecompare(namelen, sbi->s_namelen, name, namx)) goto out_unlock; } unlock_page(page); dir_put_page(page); } BUG(); return -EINVAL; got_it: pos = page_offset(page) + p - (char *)page_address(page); err = minix_prepare_chunk(page, pos, sbi->s_dirsize); if (err) goto out_unlock; memcpy (namx, name, namelen); if (sbi->s_version == MINIX_V3) { memset (namx + namelen, 0, sbi->s_dirsize - namelen - 4); de3->inode = inode->i_ino; } else { memset (namx + namelen, 0, sbi->s_dirsize - namelen - 2); de->inode = inode->i_ino; } dir_commit_chunk(page, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); err = minix_handle_dirsync(dir); out_put: dir_put_page(page); out: return err; out_unlock: unlock_page(page); goto out_put; } int minix_delete_entry(struct minix_dir_entry *de, struct page *page) { struct inode *inode = page->mapping->host; char *kaddr = page_address(page); loff_t pos = page_offset(page) + (char*)de - kaddr; struct minix_sb_info *sbi = minix_sb(inode->i_sb); unsigned len = sbi->s_dirsize; int err; lock_page(page); err = minix_prepare_chunk(page, pos, len); if (err) { unlock_page(page); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = 0; else de->inode = 0; dir_commit_chunk(page, pos, len); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); mark_inode_dirty(inode); return minix_handle_dirsync(inode); } int minix_make_empty(struct inode *inode, struct inode *dir) { struct page *page = grab_cache_page(inode->i_mapping, 0); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *kaddr; int err; if (!page) return -ENOMEM; err = minix_prepare_chunk(page, 0, 2 * sbi->s_dirsize); if (err) { unlock_page(page); goto fail; } kaddr = kmap_atomic(page); memset(kaddr, 0, PAGE_SIZE); if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)kaddr; de3->inode = inode->i_ino; strcpy(de3->name, "."); de3 = minix_next_entry(de3, sbi); de3->inode = dir->i_ino; strcpy(de3->name, ".."); } else { minix_dirent *de = (minix_dirent *)kaddr; de->inode = inode->i_ino; strcpy(de->name, "."); de = minix_next_entry(de, sbi); de->inode = dir->i_ino; strcpy(de->name, ".."); } kunmap_atomic(kaddr); dir_commit_chunk(page, 0, 2 * sbi->s_dirsize); err = minix_handle_dirsync(inode); fail: put_page(page); return err; } /* * routine to check that the specified directory is empty (for rmdir) */ int minix_empty_dir(struct inode * inode) { struct page *page = NULL; unsigned long i, npages = dir_pages(inode); struct minix_sb_info *sbi = minix_sb(inode->i_sb); char *name; __u32 inumber; for (i = 0; i < npages; i++) { char *p, *kaddr, *limit; page = dir_get_page(inode, i); if (IS_ERR(page)) continue; kaddr = (char *)page_address(page); limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize; for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) { if (sbi->s_version == MINIX_V3) { minix3_dirent *de3 = (minix3_dirent *)p; name = de3->name; inumber = de3->inode; } else { minix_dirent *de = (minix_dirent *)p; name = de->name; inumber = de->inode; } if (inumber != 0) { /* check for . and .. */ if (name[0] != '.') goto not_empty; if (!name[1]) { if (inumber != inode->i_ino) goto not_empty; } else if (name[1] != '.') goto not_empty; else if (name[2]) goto not_empty; } } dir_put_page(page); } return 1; not_empty: dir_put_page(page); return 0; } /* Releases the page */ int minix_set_link(struct minix_dir_entry *de, struct page *page, struct inode *inode) { struct inode *dir = page->mapping->host; struct minix_sb_info *sbi = minix_sb(dir->i_sb); loff_t pos = page_offset(page) + (char *)de-(char*)page_address(page); int err; lock_page(page); err = minix_prepare_chunk(page, pos, sbi->s_dirsize); if (err) { unlock_page(page); return err; } if (sbi->s_version == MINIX_V3) ((minix3_dirent *)de)->inode = inode->i_ino; else de->inode = inode->i_ino; dir_commit_chunk(page, pos, sbi->s_dirsize); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); mark_inode_dirty(dir); return minix_handle_dirsync(dir); } struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p) { struct page *page = dir_get_page(dir, 0); struct minix_sb_info *sbi = minix_sb(dir->i_sb); struct minix_dir_entry *de = NULL; if (!IS_ERR(page)) { de = minix_next_entry(page_address(page), sbi); *p = page; } return de; } ino_t minix_inode_by_name(struct dentry *dentry) { struct page *page; struct minix_dir_entry *de = minix_find_entry(dentry, &page); ino_t res = 0; if (de) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct minix_sb_info *sbi = minix_sb(inode->i_sb); if (sbi->s_version == MINIX_V3) res = ((minix3_dirent *) de)->inode; else res = de->inode; dir_put_page(page); } return res; }
14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ /* * fs/bfs/bfs.h * Copyright (C) 1999-2018 Tigran Aivazian <aivazian.tigran@gmail.com> */ #ifndef _FS_BFS_BFS_H #define _FS_BFS_BFS_H #include <linux/bfs_fs.h> /* In theory BFS supports up to 512 inodes, numbered from 2 (for /) up to 513 inclusive. In actual fact, attempting to create the 512th inode (i.e. inode No. 513 or file No. 511) will fail with ENOSPC in bfs_add_entry(): the root directory cannot contain so many entries, counting '..'. So, mkfs.bfs(8) should really limit its -N option to 511 and not 512. For now, we just print a warning if a filesystem is mounted with such "impossible to fill up" number of inodes */ #define BFS_MAX_LASTI 513 /* * BFS file system in-core superblock info */ struct bfs_sb_info { unsigned long si_blocks; unsigned long si_freeb; unsigned long si_freei; unsigned long si_lf_eblk; unsigned long si_lasti; DECLARE_BITMAP(si_imap, BFS_MAX_LASTI+1); struct mutex bfs_lock; }; /* * BFS file system in-core inode info */ struct bfs_inode_info { unsigned long i_dsk_ino; /* inode number from the disk, can be 0 */ unsigned long i_sblock; unsigned long i_eblock; struct inode vfs_inode; }; static inline struct bfs_sb_info *BFS_SB(struct super_block *sb) { return sb->s_fs_info; } static inline struct bfs_inode_info *BFS_I(struct inode *inode) { return container_of(inode, struct bfs_inode_info, vfs_inode); } #define printf(format, args...) \ printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args) /* inode.c */ extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); extern void bfs_dump_imap(const char *, struct super_block *); /* file.c */ extern const struct inode_operations bfs_file_inops; extern const struct file_operations bfs_file_operations; extern const struct address_space_operations bfs_aops; /* dir.c */ extern const struct inode_operations bfs_dir_inops; extern const struct file_operations bfs_dir_operations; #endif /* _FS_BFS_BFS_H */
4 4 4 5 4 4 4 4 3 4 4 4 4 2 2 4 4 4 4 4 4 3 3 1 9 9 9 2 2 2 1 2 2 2 1 4 4 4 4 4 4 4 4 4 4 4 1 1 4 4 5 4 2 4 4 5 5 5 5 4 4 4 3 4 4 2 4 4 4 2 2 4 5 5 1 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 3 4 3 4 4 1 4 3 2 2 1 1 3 1 4 4 3 3 3 3 3 3 4 4 4 4 4 4 4 4 1 4 4 4 1 4 4 4 4 4 4 4 5 5 5 5 4 1 3 5 3 4 4 3 4 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2018 HUAWEI, Inc. * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ #include "compress.h" #include <linux/psi.h> #include <linux/cpuhotplug.h> #include <trace/events/erofs.h> #define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_INLINE_BVECS 2 /* * let's leave a type here in case of introducing * another tagged pointer later. */ typedef void *z_erofs_next_pcluster_t; struct z_erofs_bvec { struct page *page; int offset; unsigned int end; }; #define __Z_EROFS_BVSET(name, total) \ struct name { \ /* point to the next page which contains the following bvecs */ \ struct page *nextpage; \ struct z_erofs_bvec bvec[total]; \ } __Z_EROFS_BVSET(z_erofs_bvset,); __Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); /* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only * for everyone else; * * L: Field should be protected by the pcluster lock; * * A: Field should be accessed / updated in atomic for parallelized code. */ struct z_erofs_pcluster { struct erofs_workgroup obj; struct mutex lock; /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; /* L: the maximum decompression size of this round */ unsigned int length; /* L: total number of bvecs */ unsigned int vcnt; /* I: page offset of start position of decompression */ unsigned short pageofs_out; /* I: page offset of inline compressed data */ unsigned short pageofs_in; union { /* L: inline a certain number of bvec for bootstrap */ struct z_erofs_bvset_inline bvset; /* I: can be used to free the pcluster by RCU. */ struct rcu_head rcu; }; union { /* I: physical cluster size in pages */ unsigned short pclusterpages; /* I: tailpacking inline compressed size */ unsigned short tailpacking_size; }; /* I: compression algorithm format */ unsigned char algorithmformat; /* L: whether partial decompression or not */ bool partial; /* L: indicate several pageofs_outs or not */ bool multibases; /* A: compressed bvecs (can be cached or inplaced pages) */ struct z_erofs_bvec compressed_bvecs[]; }; /* the end of a chain of pclusters */ #define Z_EROFS_PCLUSTER_TAIL ((void *) 0x700 + POISON_POINTER_DELTA) #define Z_EROFS_PCLUSTER_NIL (NULL) struct z_erofs_decompressqueue { struct super_block *sb; atomic_t pending_bios; z_erofs_next_pcluster_t head; union { struct completion done; struct work_struct work; struct kthread_work kthread_work; } u; bool eio, sync; }; static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) { return !pcl->obj.index; } static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { if (z_erofs_is_inline_pcluster(pcl)) return 1; return pcl->pclusterpages; } /* * bit 30: I/O error occurred on this page * bit 0 - 29: remaining parts to complete this page */ #define Z_EROFS_PAGE_EIO (1 << 30) static inline void z_erofs_onlinepage_init(struct page *page) { union { atomic_t o; unsigned long v; } u = { .o = ATOMIC_INIT(1) }; set_page_private(page, u.v); smp_wmb(); SetPagePrivate(page); } static inline void z_erofs_onlinepage_split(struct page *page) { atomic_inc((atomic_t *)&page->private); } static void z_erofs_onlinepage_endio(struct page *page, int err) { int orig, v; DBG_BUGON(!PagePrivate(page)); do { orig = atomic_read((atomic_t *)&page->private); v = (orig - 1) | (err ? Z_EROFS_PAGE_EIO : 0); } while (atomic_cmpxchg((atomic_t *)&page->private, orig, v) != orig); if (!(v & ~Z_EROFS_PAGE_EIO)) { set_page_private(page, 0); ClearPagePrivate(page); if (!(v & Z_EROFS_PAGE_EIO)) SetPageUptodate(page); unlock_page(page); } } #define Z_EROFS_ONSTACK_PAGES 32 /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. */ struct z_erofs_pcluster_slab { struct kmem_cache *slab; unsigned int maxpages; char name[48]; }; #define _PCLP(n) { .maxpages = n } static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) }; struct z_erofs_bvec_iter { struct page *bvpage; struct z_erofs_bvset *bvset; unsigned int nr, cur; }; static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) { if (iter->bvpage) kunmap_local(iter->bvset); return iter->bvpage; } static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) { unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; /* have to access nextpage in advance, otherwise it will be unmapped */ struct page *nextpage = iter->bvset->nextpage; struct page *oldpage; DBG_BUGON(!nextpage); oldpage = z_erofs_bvec_iter_end(iter); iter->bvpage = nextpage; iter->bvset = kmap_local_page(nextpage); iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); iter->cur = 0; return oldpage; } static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, struct z_erofs_bvset_inline *bvset, unsigned int bootstrap_nr, unsigned int cur) { *iter = (struct z_erofs_bvec_iter) { .nr = bootstrap_nr, .bvset = (struct z_erofs_bvset *)bvset, }; while (cur > iter->nr) { cur -= iter->nr; z_erofs_bvset_flip(iter); } iter->cur = cur; } static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **candidate_bvpage, struct page **pagepool) { if (iter->cur >= iter->nr) { struct page *nextpage = *candidate_bvpage; if (!nextpage) { nextpage = erofs_allocpage(pagepool, GFP_NOFS); if (!nextpage) return -ENOMEM; set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE); } DBG_BUGON(iter->bvset->nextpage); iter->bvset->nextpage = nextpage; z_erofs_bvset_flip(iter); iter->bvset->nextpage = NULL; *candidate_bvpage = NULL; } iter->bvset->bvec[iter->cur++] = *bvec; return 0; } static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, struct z_erofs_bvec *bvec, struct page **old_bvpage) { if (iter->cur == iter->nr) *old_bvpage = z_erofs_bvset_flip(iter); else *old_bvpage = NULL; *bvec = iter->bvset->bvec[iter->cur++]; } static void z_erofs_destroy_pcluster_pool(void) { int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { if (!pcluster_pool[i].slab) continue; kmem_cache_destroy(pcluster_pool[i].slab); pcluster_pool[i].slab = NULL; } } static int z_erofs_create_pcluster_pool(void) { struct z_erofs_pcluster_slab *pcs; struct z_erofs_pcluster *a; unsigned int size; for (pcs = pcluster_pool; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { size = struct_size(a, compressed_bvecs, pcs->maxpages); sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); pcs->slab = kmem_cache_create(pcs->name, size, 0, SLAB_RECLAIM_ACCOUNT, NULL); if (pcs->slab) continue; z_erofs_destroy_pcluster_pool(); return -ENOMEM; } return 0; } static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) { int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; struct z_erofs_pcluster *pcl; if (nrpages > pcs->maxpages) continue; pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); if (!pcl) return ERR_PTR(-ENOMEM); pcl->pclusterpages = nrpages; return pcl; } return ERR_PTR(-EINVAL); } static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) { unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i; for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; if (pclusterpages > pcs->maxpages) continue; kmem_cache_free(pcs->slab, pcl); return; } DBG_BUGON(1); } static struct workqueue_struct *z_erofs_workqueue __read_mostly; #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD static struct kthread_worker __rcu **z_erofs_pcpu_workers; static void erofs_destroy_percpu_workers(void) { struct kthread_worker *worker; unsigned int cpu; for_each_possible_cpu(cpu) { worker = rcu_dereference_protected( z_erofs_pcpu_workers[cpu], 1); rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); if (worker) kthread_destroy_worker(worker); } kfree(z_erofs_pcpu_workers); } static struct kthread_worker *erofs_init_percpu_worker(int cpu) { struct kthread_worker *worker = kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); if (IS_ERR(worker)) return worker; if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) sched_set_fifo_low(worker->task); return worker; } static int erofs_init_percpu_workers(void) { struct kthread_worker *worker; unsigned int cpu; z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), sizeof(struct kthread_worker *), GFP_ATOMIC); if (!z_erofs_pcpu_workers) return -ENOMEM; for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ worker = erofs_init_percpu_worker(cpu); if (!IS_ERR(worker)) rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); } return 0; } #else static inline void erofs_destroy_percpu_workers(void) {} static inline int erofs_init_percpu_workers(void) { return 0; } #endif #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); static enum cpuhp_state erofs_cpuhp_state; static int erofs_cpu_online(unsigned int cpu) { struct kthread_worker *worker, *old; worker = erofs_init_percpu_worker(cpu); if (IS_ERR(worker)) return PTR_ERR(worker); spin_lock(&z_erofs_pcpu_worker_lock); old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], lockdep_is_held(&z_erofs_pcpu_worker_lock)); if (!old) rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); spin_unlock(&z_erofs_pcpu_worker_lock); if (old) kthread_destroy_worker(worker); return 0; } static int erofs_cpu_offline(unsigned int cpu) { struct kthread_worker *worker; spin_lock(&z_erofs_pcpu_worker_lock); worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], lockdep_is_held(&z_erofs_pcpu_worker_lock)); rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); spin_unlock(&z_erofs_pcpu_worker_lock); synchronize_rcu(); if (worker) kthread_destroy_worker(worker); return 0; } static int erofs_cpu_hotplug_init(void) { int state; state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); if (state < 0) return state; erofs_cpuhp_state = state; return 0; } static void erofs_cpu_hotplug_destroy(void) { if (erofs_cpuhp_state) cpuhp_remove_state_nocalls(erofs_cpuhp_state); } #else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ static inline int erofs_cpu_hotplug_init(void) { return 0; } static inline void erofs_cpu_hotplug_destroy(void) {} #endif void z_erofs_exit_zip_subsystem(void) { erofs_cpu_hotplug_destroy(); erofs_destroy_percpu_workers(); destroy_workqueue(z_erofs_workqueue); z_erofs_destroy_pcluster_pool(); } int __init z_erofs_init_zip_subsystem(void) { int err = z_erofs_create_pcluster_pool(); if (err) goto out_error_pcluster_pool; z_erofs_workqueue = alloc_workqueue("erofs_worker", WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); if (!z_erofs_workqueue) { err = -ENOMEM; goto out_error_workqueue_init; } err = erofs_init_percpu_workers(); if (err) goto out_error_pcpu_worker; err = erofs_cpu_hotplug_init(); if (err < 0) goto out_error_cpuhp_init; return err; out_error_cpuhp_init: erofs_destroy_percpu_workers(); out_error_pcpu_worker: destroy_workqueue(z_erofs_workqueue); out_error_workqueue_init: z_erofs_destroy_pcluster_pool(); out_error_pcluster_pool: return err; } enum z_erofs_pclustermode { Z_EROFS_PCLUSTER_INFLIGHT, /* * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it * could be dispatched into bypass queue later due to uptodated managed * pages. All related online pages cannot be reused for inplace I/O (or * bvpage) since it can be directly decoded without I/O submission. */ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The pcluster was just linked to a decompression chain by us. It can * also be linked with the remaining pclusters, which means if the * processing page is the tail page of a pcluster, this pcluster can * safely use the whole page (since the previous pcluster is within the * same chain) for in-place I/O, as illustrated below: * ___________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current pcl) | (of the previous pcl) | * |___PCLUSTER_FOLLOWED___|_____PCLUSTER_FOLLOWED_____| * * [ (*) the page above can be used as inplace I/O. ] */ Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; struct erofs_map_blocks map; struct z_erofs_bvec_iter biter; struct page *pagepool; struct page *candidate_bvpage; struct z_erofs_pcluster *pcl; z_erofs_next_pcluster_t owned_head; enum z_erofs_pclustermode mode; erofs_off_t headoffset; /* a pointer used to pick up inplace I/O pages */ unsigned int icur; }; #define DECOMPRESS_FRONTEND_INIT(__i) { \ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ .mode = Z_EROFS_PCLUSTER_FOLLOWED } static bool z_erofs_should_alloc_cache(struct z_erofs_decompress_frontend *fe) { unsigned int cachestrategy = EROFS_I_SB(fe->inode)->opt.cache_strategy; if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) return false; if (!(fe->map.m_flags & EROFS_MAP_FULL_MAPPED)) return true; if (cachestrategy >= EROFS_ZIP_CACHE_READAROUND && fe->map.m_la < fe->headoffset) return true; return false; } static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe) { struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); struct z_erofs_pcluster *pcl = fe->pcl; bool shouldalloc = z_erofs_should_alloc_cache(fe); bool standalone = true; /* * optimistic allocation without direct reclaim since inplace I/O * can be used if low memory otherwise. */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; unsigned int i; if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; void *t; /* mark pages just found for debugging */ struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; page = find_get_page(mc, pcl->obj.index + i); if (page) { t = (void *)((unsigned long)page | 1); } else { /* I/O is needed, no possible to decompress directly */ standalone = false; if (!shouldalloc) continue; /* * try to use cached I/O if page allocation * succeeds or fallback to in-place I/O instead * to avoid any direct reclaim. */ newpage = erofs_allocpage(&fe->pagepool, gfp); if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); t = (void *)((unsigned long)newpage | 1); } if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) continue; if (page) put_page(page); else if (newpage) erofs_pagepool_add(&fe->pagepool, newpage); } /* * don't do inplace I/O if all compressed pages are available in * managed cache since it can be moved to the bypass queue instead. */ if (standalone) fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); int i; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* * refcount of workgroup is now freezed as 0, * therefore no need to worry about available decompression users. */ for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; /* block other users from reclaiming or migrating the page */ if (!trylock_page(page)) return -EBUSY; if (!erofs_page_is_managed(sbi, page)) continue; /* barrier is implied in the following 'unlock_page' */ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); detach_page_private(page); unlock_page(page); } return 0; } static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) { struct z_erofs_pcluster *pcl = folio_get_private(folio); bool ret; int i; if (!folio_test_private(folio)) return true; ret = false; spin_lock(&pcl->obj.lockref.lock); if (pcl->obj.lockref.count > 0) goto out; DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); for (i = 0; i < pcl->pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page == &folio->page) { WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); ret = true; break; } } if (ret) folio_detach_private(folio); out: spin_unlock(&pcl->obj.lockref.lock); return ret; } /* * It will be called only on inode eviction. In case that there are still some * decompression requests in progress, wait with rescheduling for a bit here. * An extra lock could be introduced instead but it seems unnecessary. */ static void z_erofs_cache_invalidate_folio(struct folio *folio, size_t offset, size_t length) { const size_t stop = length + offset; /* Check for potential overflow in debug mode */ DBG_BUGON(stop > folio_size(folio) || stop < length); if (offset == 0 && stop == folio_size(folio)) while (!z_erofs_cache_release_folio(folio, GFP_NOFS)) cond_resched(); } static const struct address_space_operations z_erofs_cache_aops = { .release_folio = z_erofs_cache_release_folio, .invalidate_folio = z_erofs_cache_invalidate_folio, }; int erofs_init_managed_cache(struct super_block *sb) { struct inode *const inode = new_inode(sb); if (!inode) return -ENOMEM; set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); EROFS_SB(sb)->managed_cache = inode; return 0; } static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec) { struct z_erofs_pcluster *const pcl = fe->pcl; while (fe->icur > 0) { if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, NULL, bvec->page)) { pcl->compressed_bvecs[fe->icur] = *bvec; return true; } } return false; } /* callers must be with pcluster lock held */ static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, struct z_erofs_bvec *bvec, bool exclusive) { int ret; if (exclusive) { /* give priority for inplaceio to use file pages first */ if (z_erofs_try_inplace_io(fe, bvec)) return 0; /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && !fe->candidate_bvpage) fe->candidate_bvpage = bvec->page; } ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage, &fe->pagepool); fe->pcl->vcnt += (ret >= 0); return ret; } static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) { struct z_erofs_pcluster *pcl = f->pcl; z_erofs_next_pcluster_t *owned_head = &f->owned_head; /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; /* so we can attach this pcluster to our submission chain. */ f->mode = Z_EROFS_PCLUSTER_FOLLOWED; return; } /* type 2, it belongs to an ongoing chain */ f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; struct erofs_workgroup *grp; int err; if (!(map->m_flags & EROFS_MAP_ENCODED) || (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) { DBG_BUGON(1); return -EFSCORRUPTED; } /* no available pcluster, let's allocate one */ pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : map->m_plen >> PAGE_SHIFT); if (IS_ERR(pcl)) return PTR_ERR(pcl); spin_lock_init(&pcl->obj.lockref.lock); pcl->obj.lockref.count = 1; /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; pcl->length = 0; pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = fe->owned_head; pcl->pageofs_out = map->m_la & ~PAGE_MASK; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. */ mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); if (ztailpacking) { pcl->obj.index = 0; /* which indicates ztailpacking */ pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa); pcl->tailpacking_size = map->m_plen; } else { pcl->obj.index = map->m_pa >> PAGE_SHIFT; grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); if (IS_ERR(grp)) { err = PTR_ERR(grp); goto err_out; } if (grp != &pcl->obj) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); err = -EEXIST; goto err_out; } } fe->owned_head = &pcl->next; fe->pcl = pcl; return 0; err_out: mutex_unlock(&pcl->lock); z_erofs_free_pcluster(pcl); return err; } static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct erofs_workgroup *grp = NULL; int ret; DBG_BUGON(fe->pcl); /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); if (!(map->m_flags & EROFS_MAP_META)) { grp = erofs_find_workgroup(sb, blknr); } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); return -EFSCORRUPTED; } if (grp) { fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); ret = -EEXIST; } else { ret = z_erofs_register_pcluster(fe); } if (ret == -EEXIST) { mutex_lock(&fe->pcl->lock); z_erofs_try_to_claim_pcluster(fe); } else if (ret) { return ret; } z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); if (!z_erofs_is_inline_pcluster(fe->pcl)) { /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { void *mptr; mptr = erofs_read_metabuf(&map->buf, sb, blknr, EROFS_NO_KMAP); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); return ret; } get_page(map->buf.page); WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page); fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* file-backed inplace I/O pages are traversed in reverse order */ fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } /* * keep in mind that no referenced pclusters will be freed * only after a RCU grace period. */ static void z_erofs_rcu_callback(struct rcu_head *head) { z_erofs_free_pcluster(container_of(head, struct z_erofs_pcluster, rcu)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); call_rcu(&pcl->rcu, z_erofs_rcu_callback); } static void z_erofs_pcluster_end(struct z_erofs_decompress_frontend *fe) { struct z_erofs_pcluster *pcl = fe->pcl; if (!pcl) return; z_erofs_bvec_iter_end(&fe->biter); mutex_unlock(&pcl->lock); if (fe->candidate_bvpage) fe->candidate_bvpage = NULL; /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. */ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) erofs_workgroup_put(&pcl->obj); fe->pcl = NULL; } static int z_erofs_read_fragment(struct super_block *sb, struct page *page, unsigned int cur, unsigned int end, erofs_off_t pos) { struct inode *packed_inode = EROFS_SB(sb)->packed_inode; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; unsigned int cnt; u8 *src; if (!packed_inode) return -EFSCORRUPTED; buf.inode = packed_inode; for (; cur < end; cur += cnt, pos += cnt) { cnt = min_t(unsigned int, end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); src = erofs_bread(&buf, erofs_blknr(sb, pos), EROFS_KMAP); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); } memcpy_to_page(page, cur, src + erofs_blkoff(sb, pos), cnt); } erofs_put_metabuf(&buf); return 0; } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, struct page *page) { struct inode *const inode = fe->inode; struct erofs_map_blocks *const map = &fe->map; const loff_t offset = page_offset(page); bool tight = true, exclusive; unsigned int cur, end, len, split; int err = 0; z_erofs_onlinepage_init(page); split = 0; end = PAGE_SIZE; repeat: if (offset + end - 1 < map->m_la || offset + end - 1 >= map->m_la + map->m_llen) { z_erofs_pcluster_end(fe); map->m_la = offset + end - 1; map->m_llen = 0; err = z_erofs_map_blocks_iter(inode, map, 0); if (err) goto out; } cur = offset > map->m_la ? 0 : map->m_la - offset; /* bump split parts first to avoid several separate cases */ ++split; if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); tight = false; goto next_part; } if (map->m_flags & EROFS_MAP_FRAGMENT) { erofs_off_t fpos = offset + cur - map->m_la; len = min_t(unsigned int, map->m_llen - fpos, end - cur); err = z_erofs_read_fragment(inode->i_sb, page, cur, cur + len, EROFS_I(inode)->z_fragmentoff + fpos); if (err) goto out; tight = false; goto next_part; } if (!fe->pcl) { err = z_erofs_pcluster_begin(fe); if (err) goto out; } /* * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used * for inplace I/O or bvpage (should be processed in a strict order.) */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); exclusive = (!cur && ((split <= 1) || tight)); if (cur) tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { .page = page, .offset = offset - map->m_la, .end = end, }), exclusive); if (err) goto out; z_erofs_onlinepage_split(page); if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) fe->pcl->multibases = true; if (fe->pcl->length < offset + end - map->m_la) { fe->pcl->length = offset + end - map->m_la; fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; } if ((map->m_flags & EROFS_MAP_FULL_MAPPED) && !(map->m_flags & EROFS_MAP_PARTIAL_REF) && fe->pcl->length == map->m_llen) fe->pcl->partial = false; next_part: /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: z_erofs_onlinepage_endio(page, err); return err; } static bool z_erofs_is_sync_decompress(struct erofs_sb_info *sbi, unsigned int readahead_pages) { /* auto: enable for read_folio, disable for readahead */ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && !readahead_pages) return true; if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && (readahead_pages <= sbi->opt.max_sync_decompress_pages)) return true; return false; } static bool z_erofs_page_is_invalidated(struct page *page) { return !page->mapping && !z_erofs_is_shortlived_page(page); } struct z_erofs_decompress_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; /* pages with the longest decompressed length for deduplication */ struct page **decompressed_pages; /* pages to keep the compressed data */ struct page **compressed_pages; struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; }; struct z_erofs_bvec_item { struct z_erofs_bvec bvec; struct list_head list; }; static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, struct z_erofs_bvec *bvec) { struct z_erofs_bvec_item *item; unsigned int pgnr; if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || bvec->offset + bvec->end == be->pcl->length)) { pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); if (!be->decompressed_pages[pgnr]) { be->decompressed_pages[pgnr] = bvec->page; return; } } /* (cold path) one pcluster is requested multiple times */ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); item->bvec = *bvec; list_add(&item->list, &be->decompressed_secondary_bvecs); } static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, int err) { unsigned int off0 = be->pcl->pageofs_out; struct list_head *p, *n; list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { struct z_erofs_bvec_item *bvi; unsigned int end, cur; void *dst, *src; bvi = container_of(p, struct z_erofs_bvec_item, list); cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, bvi->bvec.end); dst = kmap_local_page(bvi->bvec.page); while (cur < end) { unsigned int pgnr, scur, len; pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; DBG_BUGON(pgnr >= be->nr_pages); scur = bvi->bvec.offset + cur - ((pgnr << PAGE_SHIFT) - off0); len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); if (!be->decompressed_pages[pgnr]) { err = -EFSCORRUPTED; cur += len; continue; } src = kmap_local_page(be->decompressed_pages[pgnr]); memcpy(dst + cur, src + scur, len); kunmap_local(src); cur += len; } kunmap_local(dst); z_erofs_onlinepage_endio(bvi->bvec.page, err); list_del(p); kfree(bvi); } } static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) { struct z_erofs_pcluster *pcl = be->pcl; struct z_erofs_bvec_iter biter; struct page *old_bvpage; int i; z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); for (i = 0; i < pcl->vcnt; ++i) { struct z_erofs_bvec bvec; z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); z_erofs_do_decompressed_bvec(be, &bvec); } old_bvpage = z_erofs_bvec_iter_end(&biter); if (old_bvpage) z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); } static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, bool *overlapped) { struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); int i, err = 0; *overlapped = false; for (i = 0; i < pclusterpages; ++i) { struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; struct page *page = bvec->page; /* compressed pages ought to be present before decompressing */ if (!page) { DBG_BUGON(1); continue; } be->compressed_pages[i] = page; if (z_erofs_is_inline_pcluster(pcl)) { if (!PageUptodate(page)) err = -EIO; continue; } DBG_BUGON(z_erofs_page_is_invalidated(page)); if (!z_erofs_is_shortlived_page(page)) { if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } z_erofs_do_decompressed_bvec(be, bvec); *overlapped = true; } } if (err) return err; return 0; } static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, int err) { struct erofs_sb_info *const sbi = EROFS_SB(be->sb); struct z_erofs_pcluster *pcl = be->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); const struct z_erofs_decompressor *decompressor = &erofs_decompressors[pcl->algorithmformat]; unsigned int i, inputsize; int err2; struct page *page; bool overlapped; mutex_lock(&pcl->lock); be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; /* allocate (de)compressed page arrays if cannot be kept on stack */ be->decompressed_pages = NULL; be->compressed_pages = NULL; be->onstack_used = 0; if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { be->decompressed_pages = be->onstack_pages; be->onstack_used = be->nr_pages; memset(be->decompressed_pages, 0, sizeof(struct page *) * be->nr_pages); } if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) be->compressed_pages = be->onstack_pages + be->onstack_used; if (!be->decompressed_pages) be->decompressed_pages = kvcalloc(be->nr_pages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); if (!be->compressed_pages) be->compressed_pages = kvcalloc(pclusterpages, sizeof(struct page *), GFP_KERNEL | __GFP_NOFAIL); z_erofs_parse_out_bvecs(be); err2 = z_erofs_parse_in_bvecs(be, &overlapped); if (err2) err = err2; if (err) goto out; if (z_erofs_is_inline_pcluster(pcl)) inputsize = pcl->tailpacking_size; else inputsize = pclusterpages * PAGE_SIZE; err = decompressor->decompress(&(struct z_erofs_decompress_req) { .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = inputsize, .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, .fillgaps = pcl->multibases, }, be->pagepool); out: /* must handle all compressed pages before actual file pages */ if (z_erofs_is_inline_pcluster(pcl)) { page = pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); } else { for (i = 0; i < pclusterpages; ++i) { page = pcl->compressed_bvecs[i].page; if (erofs_page_is_managed(sbi, page)) continue; /* recycle all individual short-lived pages */ (void)z_erofs_put_shortlivedpage(be->pagepool, page); WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); } } if (be->compressed_pages < be->onstack_pages || be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) kvfree(be->compressed_pages); z_erofs_fill_other_copies(be, err); for (i = 0; i < be->nr_pages; ++i) { page = be->decompressed_pages[i]; if (!page) continue; DBG_BUGON(z_erofs_page_is_invalidated(page)); /* recycle all individual short-lived pages */ if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; z_erofs_onlinepage_endio(page, err); } if (be->decompressed_pages != be->onstack_pages) kvfree(be->decompressed_pages); pcl->length = 0; pcl->partial = true; pcl->multibases = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); mutex_unlock(&pcl->lock); return err; } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, struct page **pagepool) { struct z_erofs_decompress_backend be = { .sb = io->sb, .pagepool = pagepool, .decompressed_secondary_bvecs = LIST_HEAD_INIT(be.decompressed_secondary_bvecs), }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL) { DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); be.pcl = container_of(owned, struct z_erofs_pcluster, next); owned = READ_ONCE(be.pcl->next); z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); if (z_erofs_is_inline_pcluster(be.pcl)) z_erofs_free_pcluster(be.pcl); else erofs_workgroup_put(&be.pcl->obj); } } static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL); z_erofs_decompress_queue(bgq, &pagepool); erofs_release_pages(&pagepool); kvfree(bgq); } #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) { z_erofs_decompressqueue_work((struct work_struct *)work); } #endif static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; } if (atomic_add_return(bios, &io->pending_bios)) return; /* Use (kthread_)work and sync decompression for atomic contexts only */ if (!in_task() || irqs_disabled() || rcu_read_lock_any_held()) { #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD struct kthread_worker *worker; rcu_read_lock(); worker = rcu_dereference( z_erofs_pcpu_workers[raw_smp_processor_id()]); if (!worker) { INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); queue_work(z_erofs_workqueue, &io->u.work); } else { kthread_queue_work(worker, &io->u.kthread_work); } rcu_read_unlock(); #else queue_work(z_erofs_workqueue, &io->u.work); #endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; return; } z_erofs_decompressqueue_work(&io->u.work); } static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, struct page **pagepool, struct address_space *mc) { const pgoff_t index = pcl->obj.index; gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; struct address_space *mapping; struct page *oldpage, *page; int justfound; repeat: page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) goto out_allocpage; justfound = (unsigned long)page & 1UL; page = (struct page *)((unsigned long)page & ~1UL); /* * preallocated cached pages, which is used to avoid direct reclaim * otherwise, it will go inplace I/O path instead. */ if (page->private == Z_EROFS_PREALLOCATED_PAGE) { WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); set_page_private(page, 0); tocache = true; goto out_tocache; } mapping = READ_ONCE(page->mapping); /* * file-backed online pages in plcuster are all locked steady, * therefore it is impossible for `mapping' to be NULL. */ if (mapping && mapping != mc) /* ought to be unmanaged pages */ goto out; /* directly return for shortlived page as well */ if (z_erofs_is_shortlived_page(page)) goto out; lock_page(page); /* only true if page reclaim goes wrong, should never happen */ DBG_BUGON(justfound && PagePrivate(page)); /* the page is still in manage cache */ if (page->mapping == mc) { WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if * the page is already in compressed_bvecs[]. */ DBG_BUGON(!justfound); justfound = 0; set_page_private(page, (unsigned long)pcl); SetPagePrivate(page); } /* no need to submit io if it is already up-to-date */ if (PageUptodate(page)) { unlock_page(page); page = NULL; } goto out; } /* * the managed page has been truncated, it's unsafe to * reuse this one, let's allocate a new cache-managed page. */ DBG_BUGON(page->mapping); DBG_BUGON(!justfound); tocache = true; unlock_page(page); put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, oldpage, page)) { erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } out_tocache: if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { /* turn into temporary page if fails (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); goto out; } attach_page_private(page, pcl); /* drop a refcount added by allocpage (then we have 2 refs here) */ put_page(page); out: /* the only exit (for tracing and debugging) */ return page; } static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; if (fg && !*fg) { q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); if (!q) { *fg = true; goto fg_out; } #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD kthread_init_work(&q->u.kthread_work, z_erofs_decompressqueue_kthread_work); #else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); #endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; q->sync = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL; return q; } /* define decompression jobqueue types */ enum { JQ_BYPASS, JQ_SUBMIT, NR_JOBQUEUES, }; static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) { z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL); WRITE_ONCE(*submit_qtail, owned_head); WRITE_ONCE(*bypass_qtail, &pcl->next); qtail[JQ_BYPASS] = &pcl->next; } static void z_erofs_decompressqueue_endio(struct bio *bio) { struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; DBG_BUGON(PageUptodate(page)); DBG_BUGON(z_erofs_page_is_invalidated(page)); if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { if (!err) SetPageUptodate(page); unlock_page(page); } } if (err) q->eio = true; z_erofs_decompress_kickoff(q, -1); bio_put(bio); } static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct z_erofs_decompressqueue *fgq, bool *force_fg, bool readahead) { struct super_block *sb = f->inode->i_sb; struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; unsigned long pflags; int memstall = 0; /* * if managed cache is enabled, bypass jobqueue is needed, * no need to read from device for all pclusters in this queue. */ q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; /* by default, all need io submission */ q[JQ_SUBMIT]->head = owned_head; do { struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; bool bypass = true; DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); pcl = container_of(owned_head, struct z_erofs_pcluster, next); owned_head = READ_ONCE(pcl->next); if (z_erofs_is_inline_pcluster(pcl)) { move_to_bypass_jobqueue(pcl, qtail, owned_head); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { .m_pa = erofs_pos(sb, pcl->obj.index), }; (void)erofs_map_dev(sb, &mdev); cur = erofs_blknr(sb, mdev.m_pa); end = cur + pcl->pclusterpages; do { struct page *page; page = pickup_page_for_submission(pcl, i++, &f->pagepool, mc); if (!page) continue; if (bio && (cur != last_index + 1 || last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); if (memstall) { psi_memstall_leave(&pflags); memstall = 0; } bio = NULL; } if (unlikely(PageWorkingset(page)) && !memstall) { psi_memstall_enter(&pflags); memstall = 1; } if (!bio) { bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_decompressqueue_endio; last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << (sb->s_blocksize_bits - 9); bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; last_index = cur; bypass = false; } while (++cur < end); if (!bypass) qtail[JQ_SUBMIT] = &pcl->next; else move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); if (bio) { submit_bio(bio); if (memstall) psi_memstall_leave(&pflags); } /* * although background is preferred, no one is pending for submission. * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, bool force_fg, bool ra) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; if (f->owned_head == Z_EROFS_PCLUSTER_TAIL) return; z_erofs_submit_queue(f, io, &force_fg, ra); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], &f->pagepool); if (!force_fg) return; /* wait until all bios are completed */ wait_for_completion_io(&io[JQ_SUBMIT].u.done); /* handle synchronous decompress queue in the caller context */ z_erofs_decompress_queue(&io[JQ_SUBMIT], &f->pagepool); } /* * Since partial uptodate is still unimplemented for now, we have to use * approximate readmore strategies as a start. */ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, struct readahead_control *rac, bool backmost) { struct inode *inode = f->inode; struct erofs_map_blocks *map = &f->map; erofs_off_t cur, end, headoffset = f->headoffset; int err; if (backmost) { if (rac) end = headoffset + readahead_length(rac) - 1; else end = headoffset + PAGE_SIZE - 1; map->m_la = end; err = z_erofs_map_blocks_iter(inode, map, EROFS_GET_BLOCKS_READMORE); if (err) return; /* expand ra for the trailing edge if readahead */ if (rac) { cur = round_up(map->m_la + map->m_llen, PAGE_SIZE); readahead_expand(rac, headoffset, cur - headoffset); return; } end = round_up(end, PAGE_SIZE); } else { end = round_up(map->m_la, PAGE_SIZE); if (!map->m_llen) return; } cur = map->m_la + map->m_llen - 1; while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; struct page *page; page = erofs_grab_cache_page_nowait(inode->i_mapping, index); if (page) { if (PageUptodate(page)) unlock_page(page); else (void)z_erofs_do_read_page(f, page); put_page(page); } if (cur < PAGE_SIZE) break; cur = (index << PAGE_SHIFT) - 1; } } static int z_erofs_read_folio(struct file *file, struct folio *folio) { struct inode *const inode = folio->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); int err; trace_erofs_read_folio(folio, false); f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT; z_erofs_pcluster_readmore(&f, NULL, true); err = z_erofs_do_read_page(&f, &folio->page); z_erofs_pcluster_readmore(&f, NULL, false); z_erofs_pcluster_end(&f); /* if some compressed cluster ready, need submit them anyway */ z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, 0), false); if (err && err != -EINTR) erofs_err(inode->i_sb, "read error %d @ %lu of nid %llu", err, folio->index, EROFS_I(inode)->nid); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); return err; } static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct folio *head = NULL, *folio; unsigned int nr_folios; int err; f.headoffset = readahead_pos(rac); z_erofs_pcluster_readmore(&f, rac, true); nr_folios = readahead_count(rac); trace_erofs_readpages(inode, readahead_index(rac), nr_folios, false); while ((folio = readahead_folio(rac))) { folio->private = head; head = folio; } /* traverse in reverse order for best metadata I/O performance */ while (head) { folio = head; head = folio_get_private(folio); err = z_erofs_do_read_page(&f, &folio->page); if (err && err != -EINTR) erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu", folio->index, EROFS_I(inode)->nid); } z_erofs_pcluster_readmore(&f, rac, false); z_erofs_pcluster_end(&f); z_erofs_runqueue(&f, z_erofs_is_sync_decompress(sbi, nr_folios), true); erofs_put_metabuf(&f.map.buf); erofs_release_pages(&f.pagepool); } const struct address_space_operations z_erofs_aops = { .read_folio = z_erofs_read_folio, .readahead = z_erofs_readahead, };
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 /* * linux/fs/nls/nls_cp775.c * * Charset cp775 translation tables. * Generated automatically from the Unicode and charset * tables from the Unicode Organization (www.unicode.org). * The Unicode to charset table has only exact mappings. */ #include <linux/module.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/nls.h> #include <linux/errno.h> static const wchar_t charset2uni[256] = { /* 0x00*/ 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f, /* 0x10*/ 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, /* 0x20*/ 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c, 0x002d, 0x002e, 0x002f, /* 0x30*/ 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, /* 0x40*/ 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, 0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f, /* 0x50*/ 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, 0x0058, 0x0059, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, /* 0x60*/ 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, 0x006e, 0x006f, /* 0x70*/ 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, 0x0078, 0x0079, 0x007a, 0x007b, 0x007c, 0x007d, 0x007e, 0x007f, /* 0x80*/ 0x0106, 0x00fc, 0x00e9, 0x0101, 0x00e4, 0x0123, 0x00e5, 0x0107, 0x0142, 0x0113, 0x0156, 0x0157, 0x012b, 0x0179, 0x00c4, 0x00c5, /* 0x90*/ 0x00c9, 0x00e6, 0x00c6, 0x014d, 0x00f6, 0x0122, 0x00a2, 0x015a, 0x015b, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x00a4, /* 0xa0*/ 0x0100, 0x012a, 0x00f3, 0x017b, 0x017c, 0x017a, 0x201d, 0x00a6, 0x00a9, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x0141, 0x00ab, 0x00bb, /* 0xb0*/ 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x0104, 0x010c, 0x0118, 0x0116, 0x2563, 0x2551, 0x2557, 0x255d, 0x012e, 0x0160, 0x2510, /* 0xc0*/ 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x0172, 0x016a, 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x017d, /* 0xd0*/ 0x0105, 0x010d, 0x0119, 0x0117, 0x012f, 0x0161, 0x0173, 0x016b, 0x017e, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* 0xe0*/ 0x00d3, 0x00df, 0x014c, 0x0143, 0x00f5, 0x00d5, 0x00b5, 0x0144, 0x0136, 0x0137, 0x013b, 0x013c, 0x0146, 0x0112, 0x0145, 0x2019, /* 0xf0*/ 0x00ad, 0x00b1, 0x201c, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x201e, 0x00b0, 0x2219, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0, }; static const unsigned char page00[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xff, 0x00, 0x96, 0x9c, 0x9f, 0x00, 0xa7, 0xf5, /* 0xa0-0xa7 */ 0x00, 0xa8, 0x00, 0xae, 0xaa, 0xf0, 0xa9, 0x00, /* 0xa8-0xaf */ 0xf8, 0xf1, 0xfd, 0xfc, 0x00, 0xe6, 0xf4, 0xfa, /* 0xb0-0xb7 */ 0x00, 0xfb, 0x00, 0xaf, 0xac, 0xab, 0xf3, 0x00, /* 0xb8-0xbf */ 0x00, 0x00, 0x00, 0x00, 0x8e, 0x8f, 0x92, 0x00, /* 0xc0-0xc7 */ 0x00, 0x90, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xc8-0xcf */ 0x00, 0x00, 0x00, 0xe0, 0x00, 0xe5, 0x99, 0x9e, /* 0xd0-0xd7 */ 0x9d, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0xe1, /* 0xd8-0xdf */ 0x00, 0x00, 0x00, 0x00, 0x84, 0x86, 0x91, 0x00, /* 0xe0-0xe7 */ 0x00, 0x82, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xe8-0xef */ 0x00, 0x00, 0x00, 0xa2, 0x00, 0xe4, 0x94, 0xf6, /* 0xf0-0xf7 */ 0x9b, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, /* 0xf8-0xff */ }; static const unsigned char page01[256] = { 0xa0, 0x83, 0x00, 0x00, 0xb5, 0xd0, 0x80, 0x87, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xb6, 0xd1, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0xed, 0x89, 0x00, 0x00, 0xb8, 0xd3, /* 0x10-0x17 */ 0xb7, 0xd2, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x95, 0x85, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0xa1, 0x8c, 0x00, 0x00, 0xbd, 0xd4, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0xe9, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0xea, 0xeb, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0xad, 0x88, 0xe3, 0xe7, 0xee, 0xec, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0xe2, 0x93, 0x00, 0x00, /* 0x48-0x4f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x8b, /* 0x50-0x57 */ 0x00, 0x00, 0x97, 0x98, 0x00, 0x00, 0x00, 0x00, /* 0x58-0x5f */ 0xbe, 0xd5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x60-0x67 */ 0x00, 0x00, 0xc7, 0xd7, 0x00, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0xc6, 0xd6, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x8d, 0xa5, 0xa3, 0xa4, 0xcf, 0xd8, 0x00, /* 0x78-0x7f */ }; static const unsigned char page20[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xef, 0x00, 0x00, 0xf2, 0xa6, 0xf7, 0x00, /* 0x18-0x1f */ }; static const unsigned char page22[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0x00, 0xf9, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */ }; static const unsigned char page25[256] = { 0xc4, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */ 0x00, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, /* 0x08-0x0f */ 0xbf, 0x00, 0x00, 0x00, 0xc0, 0x00, 0x00, 0x00, /* 0x10-0x17 */ 0xd9, 0x00, 0x00, 0x00, 0xc3, 0x00, 0x00, 0x00, /* 0x18-0x1f */ 0x00, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, /* 0x20-0x27 */ 0x00, 0x00, 0x00, 0x00, 0xc2, 0x00, 0x00, 0x00, /* 0x28-0x2f */ 0x00, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, /* 0x30-0x37 */ 0x00, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, /* 0x38-0x3f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x40-0x47 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x48-0x4f */ 0xcd, 0xba, 0x00, 0x00, 0xc9, 0x00, 0x00, 0xbb, /* 0x50-0x57 */ 0x00, 0x00, 0xc8, 0x00, 0x00, 0xbc, 0x00, 0x00, /* 0x58-0x5f */ 0xcc, 0x00, 0x00, 0xb9, 0x00, 0x00, 0xcb, 0x00, /* 0x60-0x67 */ 0x00, 0xca, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, /* 0x68-0x6f */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x70-0x77 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x78-0x7f */ 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, /* 0x80-0x87 */ 0xdb, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, /* 0x88-0x8f */ 0xde, 0xb0, 0xb1, 0xb2, 0x00, 0x00, 0x00, 0x00, /* 0x90-0x97 */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x98-0x9f */ 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0xa0-0xa7 */ }; static const unsigned char *const page_uni2charset[256] = { page00, page01, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, page20, NULL, page22, NULL, NULL, page25, NULL, NULL, }; static const unsigned char charset2lower[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x40-0x47 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x48-0x4f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x50-0x57 */ 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, /* 0x60-0x67 */ 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, /* 0x68-0x6f */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, /* 0x70-0x77 */ 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x87, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, /* 0x80-0x87 */ 0x88, 0x89, 0x8b, 0x8b, 0x8c, 0xa5, 0x84, 0x86, /* 0x88-0x8f */ 0x82, 0x91, 0x91, 0x93, 0x94, 0x85, 0x96, 0x98, /* 0x90-0x97 */ 0x98, 0x94, 0x81, 0x9b, 0x9c, 0x9b, 0x9e, 0x9f, /* 0x98-0x9f */ 0x83, 0x8c, 0xa2, 0xa4, 0xa4, 0xa5, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0x88, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xd0, 0xd1, 0xd2, /* 0xb0-0xb7 */ 0xd3, 0xb9, 0xba, 0xbb, 0xbc, 0xd4, 0xd5, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xd6, 0xd7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xd8, /* 0xc8-0xcf */ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, /* 0xd0-0xd7 */ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xa2, 0xe1, 0x93, 0xe7, 0xe4, 0xe4, 0xe6, 0xe7, /* 0xe0-0xe7 */ 0xe9, 0xe9, 0xeb, 0xeb, 0xec, 0x89, 0xec, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static const unsigned char charset2upper[256] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* 0x00-0x07 */ 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, /* 0x08-0x0f */ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, /* 0x10-0x17 */ 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, /* 0x18-0x1f */ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, /* 0x20-0x27 */ 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, /* 0x28-0x2f */ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, /* 0x30-0x37 */ 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, /* 0x38-0x3f */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x40-0x47 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x48-0x4f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x50-0x57 */ 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, /* 0x58-0x5f */ 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, /* 0x60-0x67 */ 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, /* 0x68-0x6f */ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, /* 0x70-0x77 */ 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, /* 0x78-0x7f */ 0x80, 0x9a, 0x90, 0xa0, 0x8e, 0x95, 0x8f, 0x80, /* 0x80-0x87 */ 0xad, 0xed, 0x8a, 0x8a, 0xa1, 0x8d, 0x8e, 0x8f, /* 0x88-0x8f */ 0x90, 0x92, 0x92, 0xe2, 0x99, 0x95, 0x96, 0x97, /* 0x90-0x97 */ 0x97, 0x99, 0x9a, 0x9d, 0x9c, 0x9d, 0x9e, 0x9f, /* 0x98-0x9f */ 0xa0, 0xa1, 0xe0, 0xa3, 0xa3, 0x8d, 0xa6, 0xa7, /* 0xa0-0xa7 */ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, /* 0xa8-0xaf */ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, /* 0xb0-0xb7 */ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, /* 0xb8-0xbf */ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, /* 0xc0-0xc7 */ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, /* 0xc8-0xcf */ 0xb5, 0xb6, 0xb7, 0xb8, 0xbd, 0xbe, 0xc6, 0xc7, /* 0xd0-0xd7 */ 0xcf, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, /* 0xd8-0xdf */ 0xe0, 0xe1, 0xe2, 0xe3, 0xe5, 0xe5, 0x00, 0xe3, /* 0xe0-0xe7 */ 0xe8, 0xe8, 0xea, 0xea, 0xee, 0xed, 0xee, 0xef, /* 0xe8-0xef */ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, /* 0xf0-0xf7 */ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, /* 0xf8-0xff */ }; static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { const unsigned char *uni2charset; unsigned char cl = uni & 0x00ff; unsigned char ch = (uni & 0xff00) >> 8; if (boundlen <= 0) return -ENAMETOOLONG; uni2charset = page_uni2charset[ch]; if (uni2charset && uni2charset[cl]) out[0] = uni2charset[cl]; else return -EINVAL; return 1; } static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { *uni = charset2uni[*rawstring]; if (*uni == 0x0000) return -EINVAL; return 1; } static struct nls_table table = { .charset = "cp775", .uni2char = uni2char, .char2uni = char2uni, .charset2lower = charset2lower, .charset2upper = charset2upper, }; static int __init init_nls_cp775(void) { return register_nls(&table); } static void __exit exit_nls_cp775(void) { unregister_nls(&table); } module_init(init_nls_cp775) module_exit(exit_nls_cp775) MODULE_LICENSE("Dual BSD/GPL");
3 3 24 24 89 46 46 88 56 56 56 56 89 90 32 42 56 61 42 42 42 42 42 42 32 32 41 36 36 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 24 24 24 13 12 13 13 12 13 13 13 1 1 13 1 19 8 1 18 22 22 14 14 14 1 13 8 3 3 3 3 3 3 3 61 61 46 1 46 11 46 46 46 46 46 46 46 46 4 46 46 45 46 46 38 34 8 46 45 46 28 28 12 21 21 44 12 17 17 12 7 7 7 7 4 7 6 7 21 26 22 22 22 26 4 22 21 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 21 3 22 22 22 22 19 3 21 27 27 27 26 26 22 22 22 27 46 46 36 9 36 36 36 9 9 9 9 9 46 46 46 46 1 46 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 // SPDX-License-Identifier: GPL-2.0 /* * fs/f2fs/checkpoint.c * * Copyright (c) 2012 Samsung Electronics Co., Ltd. * http://www.samsung.com/ */ #include <linux/fs.h> #include <linux/bio.h> #include <linux/mpage.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/f2fs_fs.h> #include <linux/pagevec.h> #include <linux/swap.h> #include <linux/kthread.h> #include "f2fs.h" #include "node.h" #include "segment.h" #include "iostat.h" #include <trace/events/f2fs.h> #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason) { f2fs_build_fault_attr(sbi, 0, 0); if (!end_io) f2fs_flush_merged_writes(sbi); f2fs_handle_critical_error(sbi, reason, end_io); } /* * We guarantee no failure on the returned page. */ struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } f2fs_wait_on_page_writeback(page, META, true, true); if (!PageUptodate(page)) SetPageUptodate(page); return page; } static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); struct page *page; struct f2fs_io_info fio = { .sbi = sbi, .type = META, .op = REQ_OP_READ, .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, .is_por = !is_meta ? 1 : 0, }; int err; if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } if (PageUptodate(page)) goto out; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) { f2fs_put_page(page, 1); return ERR_PTR(err); } f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } if (unlikely(!PageUptodate(page))) { f2fs_handle_page_eio(sbi, page->index, META); f2fs_put_page(page, 1); return ERR_PTR(-EIO); } out: return page; } struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, true); } struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) { struct page *page; int count = 0; retry: page = __get_meta_page(sbi, index, true); if (IS_ERR(page)) { if (PTR_ERR(page) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } return page; } /* for POR only */ struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) { return __get_meta_page(sbi, index, false); } static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { struct seg_entry *se; unsigned int segno, offset; bool exist; if (type == DATA_GENERIC) return true; segno = GET_SEGNO(sbi, blkaddr); offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); se = get_seg_entry(sbi, segno); exist = f2fs_test_bit(offset, se->cur_valid_map); /* skip data, if we already have an error in checkpoint. */ if (unlikely(f2fs_cp_error(sbi))) return exist; if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); return exist; } if (!exist && type == DATA_GENERIC_ENHANCE) { f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d", blkaddr, exist); set_sbi_flag(sbi, SBI_NEED_FSCK); dump_stack(); } return exist; } bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { if (time_to_inject(sbi, FAULT_BLKADDR)) return false; switch (type) { case META_NAT: break; case META_SIT: if (unlikely(blkaddr >= SIT_BLK_CNT(sbi))) return false; break; case META_SSA: if (unlikely(blkaddr >= MAIN_BLKADDR(sbi) || blkaddr < SM_I(sbi)->ssa_blkaddr)) return false; break; case META_CP: if (unlikely(blkaddr >= SIT_I(sbi)->sit_base_addr || blkaddr < __start_cp_addr(sbi))) return false; break; case META_POR: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) return false; break; case DATA_GENERIC: case DATA_GENERIC_ENHANCE: case DATA_GENERIC_ENHANCE_READ: case DATA_GENERIC_ENHANCE_UPDATE: if (unlikely(blkaddr >= MAX_BLKADDR(sbi) || blkaddr < MAIN_BLKADDR(sbi))) { /* Skip to emit an error message. */ if (unlikely(f2fs_cp_error(sbi))) return false; f2fs_warn(sbi, "access invalid blkaddr:%u", blkaddr); set_sbi_flag(sbi, SBI_NEED_FSCK); dump_stack(); return false; } else { return __is_bitmap_valid(sbi, blkaddr, type); } break; case META_GENERIC: if (unlikely(blkaddr < SEG0_BLKADDR(sbi) || blkaddr >= MAIN_BLKADDR(sbi))) return false; break; default: BUG(); } return true; } /* * Readahead CP/NAT/SIT/SSA/POR pages */ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { struct page *page; block_t blkno = start; struct f2fs_io_info fio = { .sbi = sbi, .type = META, .op = REQ_OP_READ, .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, .in_list = 0, .is_por = (type == META_POR) ? 1 : 0, }; struct blk_plug plug; int err; if (unlikely(type == META_POR)) fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) goto out; switch (type) { case META_NAT: if (unlikely(blkno >= NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; /* get nat block addr */ fio.new_blkaddr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: if (unlikely(blkno >= TOTAL_SEGS(sbi))) goto out; /* get sit block addr */ fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); break; case META_SSA: case META_CP: case META_POR: fio.new_blkaddr = blkno; break; default: BUG(); } page = f2fs_grab_cache_page(META_MAPPING(sbi), fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { f2fs_put_page(page, 1); continue; } fio.page = page; err = f2fs_submit_page_bio(&fio); f2fs_put_page(page, err ? 1 : 0); if (!err) f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); } out: blk_finish_plug(&plug); return blkno - start; } void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, unsigned int ra_blocks) { struct page *page; bool readahead = false; if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) return; page = find_get_page(META_MAPPING(sbi), index); if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); if (readahead) f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); } static int __f2fs_write_meta_page(struct page *page, struct writeback_control *wbc, enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); trace_f2fs_writepage(page, META); if (unlikely(f2fs_cp_error(sbi))) { if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) { ClearPageUptodate(page); dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); return 0; } goto redirty_out; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0)) goto redirty_out; f2fs_do_write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_write(sbi, META); return 0; redirty_out: redirty_page_for_writepage(wbc, page); return AOP_WRITEPAGE_ACTIVATE; } static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { return __f2fs_write_meta_page(page, wbc, FS_META_IO); } static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; /* collect a number of dirty meta pages and write together */ if (wbc->sync_mode != WB_SYNC_ALL && get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; /* if locked failed, cp will flush dirty pages instead */ if (!f2fs_down_write_trylock(&sbi->cp_global_sem)) goto skip_write; trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = f2fs_sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); f2fs_up_write(&sbi->cp_global_sem); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); trace_f2fs_writepages(mapping->host, wbc, META); return 0; } long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, prev = ULONG_MAX; struct folio_batch fbatch; long nwritten = 0; int nr_folios; struct writeback_control wbc = { .for_reclaim = 0, }; struct blk_plug plug; folio_batch_init(&fbatch); blk_start_plug(&plug); while ((nr_folios = filemap_get_folios_tag(mapping, &index, (pgoff_t)-1, PAGECACHE_TAG_DIRTY, &fbatch))) { int i; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; if (nr_to_write != LONG_MAX && i != 0 && folio->index != prev + folio_nr_pages(fbatch.folios[i-1])) { folio_batch_release(&fbatch); goto stop; } folio_lock(folio); if (unlikely(folio->mapping != mapping)) { continue_unlock: folio_unlock(folio); continue; } if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } f2fs_wait_on_page_writeback(&folio->page, META, true, true); if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; if (__f2fs_write_meta_page(&folio->page, &wbc, io_type)) { folio_unlock(folio); break; } nwritten += folio_nr_pages(folio); prev = folio->index; if (unlikely(nwritten >= nr_to_write)) break; } folio_batch_release(&fbatch); cond_resched(); } stop: if (nwritten) f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); return nwritten; } static bool f2fs_dirty_meta_folio(struct address_space *mapping, struct folio *folio) { trace_f2fs_set_page_dirty(&folio->page, META); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); if (filemap_dirty_folio(mapping, folio)) { inc_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_META); set_page_private_reference(&folio->page); return true; } return false; } const struct address_space_operations f2fs_meta_aops = { .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, .release_folio = f2fs_release_folio, .migrate_folio = filemap_migrate_folio, }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e = NULL, *new = NULL; if (type == FLUSH_INO) { rcu_read_lock(); e = radix_tree_lookup(&im->ino_root, ino); rcu_read_unlock(); } retry: if (!e) new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS, true, NULL); radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (!e) { if (!new) { spin_unlock(&im->ino_lock); radix_tree_preload_end(); goto retry; } e = new; if (unlikely(radix_tree_insert(&im->ino_root, ino, e))) f2fs_bug_on(sbi, 1); memset(e, 0, sizeof(struct ino_entry)); e->ino = ino; list_add_tail(&e->list, &im->ino_list); if (type != ORPHAN_INO) im->ino_num++; } if (type == FLUSH_INO) f2fs_set_bit(devidx, (char *)&e->dirty_device); spin_unlock(&im->ino_lock); radix_tree_preload_end(); if (new && e != new) kmem_cache_free(ino_entry_slab, new); } static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e; spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (e) { list_del(&e->list); radix_tree_delete(&im->ino_root, ino); im->ino_num--; spin_unlock(&im->ino_lock); kmem_cache_free(ino_entry_slab, e); return; } spin_unlock(&im->ino_lock); } void f2fs_add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, 0, type); } void f2fs_remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); } /* mode should be APPEND_INO, UPDATE_INO or TRANS_DIR_INO */ bool f2fs_exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) { struct inode_management *im = &sbi->im[mode]; struct ino_entry *e; spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); spin_unlock(&im->ino_lock); return e ? true : false; } void f2fs_release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); list_for_each_entry_safe(e, tmp, &im->ino_list, list) { list_del(&e->list); radix_tree_delete(&im->ino_root, e->ino); kmem_cache_free(ino_entry_slab, e); im->ino_num--; } spin_unlock(&im->ino_lock); } } void f2fs_set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { __add_ino_entry(sbi, ino, devidx, type); } bool f2fs_is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino, unsigned int devidx, int type) { struct inode_management *im = &sbi->im[type]; struct ino_entry *e; bool is_dirty = false; spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device)) is_dirty = true; spin_unlock(&im->ino_lock); return is_dirty; } int f2fs_acquire_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; int err = 0; spin_lock(&im->ino_lock); if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); return -ENOSPC; } if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else im->ino_num++; spin_unlock(&im->ino_lock); return err; } void f2fs_release_orphan_inode(struct f2fs_sb_info *sbi) { struct inode_management *im = &sbi->im[ORPHAN_INO]; spin_lock(&im->ino_lock); f2fs_bug_on(sbi, im->ino_num == 0); im->ino_num--; spin_unlock(&im->ino_lock); } void f2fs_add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO); f2fs_update_inode_page(inode); } void f2fs_remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { /* remove orphan entry from orphan list */ __remove_ino_entry(sbi, ino, ORPHAN_INO); } static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; struct node_info ni; int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry * to orphan inode. */ f2fs_bug_on(sbi, PTR_ERR(inode) == -ENOENT); return PTR_ERR(inode); } err = f2fs_dquot_initialize(inode); if (err) { iput(inode); goto err_out; } clear_nlink(inode); /* truncate all the data during iput */ iput(inode); err = f2fs_get_node_info(sbi, ino, &ni, false); if (err) goto err_out; /* ENOMEM was fully retried in f2fs_evict_inode. */ if (ni.blk_addr != NULL_ADDR) { err = -EIO; goto err_out; } return 0; err_out: set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: orphan failed (ino=%x), run fsck to fix.", __func__, ino); return err; } int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; if (f2fs_hw_is_readonly(sbi)) { f2fs_info(sbi, "write access unavailable, skipping orphan cleanup"); return 0; } if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "orphan cleanup on readonly fs"); start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { struct page *page; struct f2fs_orphan_block *orphan_blk; page = f2fs_get_meta_page(sbi, start_blk + i); if (IS_ERR(page)) { err = PTR_ERR(page); goto out; } orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); out: set_sbi_flag(sbi, SBI_IS_RECOVERED); return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) { struct list_head *head; struct f2fs_orphan_block *orphan_blk = NULL; unsigned int nentries = 0; unsigned short index = 1; unsigned short orphan_blocks; struct page *page = NULL; struct ino_entry *orphan = NULL; struct inode_management *im = &sbi->im[ORPHAN_INO]; orphan_blocks = GET_ORPHAN_BLOCKS(im->ino_num); /* * we don't need to do spin_lock(&im->ino_lock) here, since all the * orphan inode operations are covered under f2fs_lock_op(). * And, spin_lock should be avoided due to page operations below. */ head = &im->ino_list; /* loop for each orphan inode entry and write them in journal block */ list_for_each_entry(orphan, head, list) { if (!page) { page = f2fs_grab_meta_page(sbi, start_blk++); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); } orphan_blk->ino[nentries++] = cpu_to_le32(orphan->ino); if (nentries == F2FS_ORPHANS_PER_BLOCK) { /* * an orphan block is full of 1020 entries, * then we need to flush current orphan blocks * and bring another one in memory */ orphan_blk->blk_addr = cpu_to_le16(index); orphan_blk->blk_count = cpu_to_le16(orphan_blocks); orphan_blk->entry_count = cpu_to_le32(nentries); set_page_dirty(page); f2fs_put_page(page, 1); index++; nentries = 0; page = NULL; } } if (page) { orphan_blk->blk_addr = cpu_to_le16(index); orphan_blk->blk_count = cpu_to_le16(orphan_blocks); orphan_blk->entry_count = cpu_to_le32(nentries); set_page_dirty(page); f2fs_put_page(page, 1); } } static __u32 f2fs_checkpoint_chksum(struct f2fs_sb_info *sbi, struct f2fs_checkpoint *ckpt) { unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset); __u32 chksum; chksum = f2fs_crc32(sbi, ckpt, chksum_ofs); if (chksum_ofs < CP_CHKSUM_OFFSET) { chksum_ofs += sizeof(chksum); chksum = f2fs_chksum(sbi, chksum, (__u8 *)ckpt + chksum_ofs, F2FS_BLKSIZE - chksum_ofs); } return chksum; } static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, struct f2fs_checkpoint **cp_block, struct page **cp_page, unsigned long long *version) { size_t crc_offset = 0; __u32 crc; *cp_page = f2fs_get_meta_page(sbi, cp_addr); if (IS_ERR(*cp_page)) return PTR_ERR(*cp_page); *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); if (crc_offset < CP_MIN_CHKSUM_OFFSET || crc_offset > CP_CHKSUM_OFFSET) { f2fs_put_page(*cp_page, 1); f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset); return -EINVAL; } crc = f2fs_checkpoint_chksum(sbi, *cp_block); if (crc != cur_cp_crc(*cp_block)) { f2fs_put_page(*cp_page, 1); f2fs_warn(sbi, "invalid crc value"); return -EINVAL; } *version = cur_cp_version(*cp_block); return 0; } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long long *version) { struct page *cp_page_1 = NULL, *cp_page_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_1, version); if (err) return NULL; cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count); if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) { f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u", le32_to_cpu(cp_block->cp_pack_total_block_count)); goto invalid_cp; } pre_version = *version; cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, &cp_page_2, version); if (err) goto invalid_cp; cur_version = *version; if (cur_version == pre_version) { *version = cur_version; f2fs_put_page(cp_page_2, 1); return cp_page_1; } f2fs_put_page(cp_page_2, 1); invalid_cp: f2fs_put_page(cp_page_1, 1); return NULL; } int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *cp_block; struct f2fs_super_block *fsb = sbi->raw_super; struct page *cp1, *cp2, *cur_page; unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; unsigned int cp_blks = 1 + __cp_payload(sbi); block_t cp_blk_no; int i; int err; sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks), GFP_KERNEL); if (!sbi->ckpt) return -ENOMEM; /* * Finding out valid cp block involves read both * sets( cp pack 1 and cp pack 2) */ cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr); cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version); /* The second checkpoint pack should start at the next segment */ cp_start_blk_no += ((unsigned long long)1) << le32_to_cpu(fsb->log_blocks_per_seg); cp2 = validate_checkpoint(sbi, cp_start_blk_no, &cp2_version); if (cp1 && cp2) { if (ver_after(cp2_version, cp1_version)) cur_page = cp2; else cur_page = cp1; } else if (cp1) { cur_page = cp1; } else if (cp2) { cur_page = cp2; } else { err = -EFSCORRUPTED; goto fail_no_cp; } cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); if (cur_page == cp1) sbi->cur_cp_pack = 1; else sbi->cur_cp_pack = 2; /* Sanity checking of checkpoint */ if (f2fs_sanity_check_ckpt(sbi)) { err = -EFSCORRUPTED; goto free_fail_no_cp; } if (cp_blks <= 1) goto done; cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); if (cur_page == cp2) cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); for (i = 1; i < cp_blks; i++) { void *sit_bitmap_ptr; unsigned char *ckpt = (unsigned char *)sbi->ckpt; cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); if (IS_ERR(cur_page)) { err = PTR_ERR(cur_page); goto free_fail_no_cp; } sit_bitmap_ptr = page_address(cur_page); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); f2fs_put_page(cur_page, 1); } done: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); return 0; free_fail_no_cp: f2fs_put_page(cp1, 1); f2fs_put_page(cp2, 1); fail_no_cp: kvfree(sbi->ckpt); return err; } static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; if (is_inode_flag_set(inode, flag)) return; set_inode_flag(inode, flag); list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } static void __remove_dirty_inode(struct inode *inode, enum inode_type type) { int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) return; list_del_init(&F2FS_I(inode)->dirty_list); clear_inode_flag(inode, flag); stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void f2fs_update_dirty_folio(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; spin_lock(&sbi->inode_lock[type]); if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); spin_unlock(&sbi->inode_lock[type]); set_page_private_reference(&folio->page); } void f2fs_remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) return; spin_lock(&sbi->inode_lock[type]); __remove_dirty_inode(inode, type); spin_unlock(&sbi->inode_lock[type]); } int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type, bool from_cp) { struct list_head *head; struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) { trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return -EIO; } spin_lock(&sbi->inode_lock[type]); head = &sbi->inode_list[type]; if (list_empty(head)) { spin_unlock(&sbi->inode_lock[type]); trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, get_pages(sbi, is_dir ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { unsigned long cur_ino = inode->i_ino; if (from_cp) F2FS_I(inode)->cp_task = current; F2FS_I(inode)->wb_task = current; filemap_fdatawrite(inode->i_mapping); F2FS_I(inode)->wb_task = NULL; if (from_cp) F2FS_I(inode)->cp_task = NULL; iput(inode); /* We need to give cpu to another writers. */ if (ino == cur_ino) cond_resched(); else ino = cur_ino; } else { /* * We should submit bio, since it exists several * writebacking dentry pages in the freeing inode. */ f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; } static int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) { struct list_head *head = &sbi->inode_list[DIRTY_META]; struct inode *inode; struct f2fs_inode_info *fi; s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); while (total--) { if (unlikely(f2fs_cp_error(sbi))) return -EIO; spin_lock(&sbi->inode_lock[DIRTY_META]); if (list_empty(head)) { spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { sync_inode_metadata(inode, 0); /* it's on eviction */ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) f2fs_update_inode_page(inode); iput(inode); } } return 0; } static void __prepare_cp_block(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); nid_t last_nid = nm_i->next_scan_nid; next_free_nid(sbi, &last_nid); ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); ckpt->next_free_nid = cpu_to_le32(last_nid); } static bool __need_flush_quota(struct f2fs_sb_info *sbi) { bool ret = false; if (!is_journalled_quota(sbi)) return false; if (!f2fs_down_write_trylock(&sbi->quota_sem)) return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { ret = false; } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_FLUSH)) { clear_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); ret = true; } else if (get_pages(sbi, F2FS_DIRTY_QDATA)) { ret = true; } f2fs_up_write(&sbi->quota_sem); return ret; } /* * Freeze all the FS-operations for checkpoint. */ static int block_operations(struct f2fs_sb_info *sbi) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .for_reclaim = 0, }; int err = 0, cnt = 0; /* * Let's flush inline_data in dirty node pages. */ f2fs_flush_inline_data(sbi); retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { int locked; if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); set_sbi_flag(sbi, SBI_QUOTA_NEED_FLUSH); goto retry_flush_dents; } f2fs_unlock_all(sbi); /* only failed during mount/umount/freeze/quotactl */ locked = down_read_trylock(&sbi->sb->s_umount); f2fs_quota_sync(sbi->sb, -1); if (locked) up_read(&sbi->sb->s_umount); cond_resched(); goto retry_flush_quotas; } retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true); if (err) return err; cond_resched(); goto retry_flush_quotas; } /* * POR: we should ensure that there are no dirty node pages * until finishing nat/sit flush. inode->i_blocks can be updated. */ f2fs_down_write(&sbi->node_change); if (get_pages(sbi, F2FS_DIRTY_IMETA)) { f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) return err; cond_resched(); goto retry_flush_quotas; } retry_flush_nodes: f2fs_down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { f2fs_up_write(&sbi->node_write); atomic_inc(&sbi->wb_sync_req[NODE]); err = f2fs_sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); atomic_dec(&sbi->wb_sync_req[NODE]); if (err) { f2fs_up_write(&sbi->node_change); f2fs_unlock_all(sbi); return err; } cond_resched(); goto retry_flush_nodes; } /* * sbi->node_change is used only for AIO write_begin path which produces * dirty node blocks and some checkpoint values by block allocation. */ __prepare_cp_block(sbi); f2fs_up_write(&sbi->node_change); return err; } static void unblock_operations(struct f2fs_sb_info *sbi) { f2fs_up_write(&sbi->node_write); f2fs_unlock_all(sbi); } void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) { DEFINE_WAIT(wait); for (;;) { if (!get_pages(sbi, type)) break; if (unlikely(f2fs_cp_error(sbi) && !is_sbi_flag_set(sbi, SBI_IS_CLOSE))) break; if (type == F2FS_DIRTY_META) f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); else if (type == F2FS_WB_CP_DATA) f2fs_submit_merged_write(sbi, DATA); prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); io_schedule_timeout(DEFAULT_IO_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); } static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; if (cpc->reason & CP_UMOUNT) { if (le32_to_cpu(ckpt->cp_pack_total_block_count) + NM_I(sbi)->nat_bits_blocks > sbi->blocks_per_seg) { clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Disable nat_bits due to no space"); } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && f2fs_nat_bitmap_enabled(sbi)) { f2fs_enable_nat_bits(sbi); set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); f2fs_notice(sbi, "Rebuild and enable nat_bits"); } } spin_lock_irqsave(&sbi->cp_lock, flags); if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); else __clear_ckpt_flags(ckpt, CP_TRIMMED_FLAG); if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); if (orphan_num) __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); else __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) __set_ckpt_flags(ckpt, CP_FSCK_FLAG); if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS)) __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); else __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG); if (is_sbi_flag_set(sbi, SBI_CP_DISABLED)) __set_ckpt_flags(ckpt, CP_DISABLED_FLAG); else __clear_ckpt_flags(ckpt, CP_DISABLED_FLAG); if (is_sbi_flag_set(sbi, SBI_CP_DISABLED_QUICK)) __set_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); else __clear_ckpt_flags(ckpt, CP_DISABLED_QUICK_FLAG); if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); else __clear_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) __set_ckpt_flags(ckpt, CP_QUOTA_NEED_FSCK_FLAG); /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); __clear_ckpt_flags(ckpt, CP_NOCRC_RECOVERY_FLAG); spin_unlock_irqrestore(&sbi->cp_lock, flags); } static void commit_checkpoint(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct writeback_control wbc = { .for_reclaim = 0, }; /* * filemap_get_folios_tag and lock_page again will take * some extra time. Therefore, f2fs_update_meta_pages and * f2fs_sync_meta_pages are combined in this function. */ struct page *page = f2fs_grab_meta_page(sbi, blk_addr); int err; f2fs_wait_on_page_writeback(page, META, true, true); memcpy(page_address(page), src, PAGE_SIZE); set_page_dirty(page); if (unlikely(!clear_page_dirty_for_io(page))) f2fs_bug_on(sbi, 1); /* writeout cp pack 2 page */ err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); if (unlikely(err && f2fs_cp_error(sbi))) { f2fs_put_page(page, 1); return; } f2fs_bug_on(sbi, err); f2fs_put_page(page, 0); /* submit checkpoint (with barrier if NOBARRIER is not set) */ f2fs_submit_merged_write(sbi, META_FLUSH); } static inline u64 get_sectors_written(struct block_device *bdev) { return (u64)part_stat_read(bdev, sectors[STAT_WRITE]); } u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) { if (f2fs_is_multi_device(sbi)) { u64 sectors = 0; int i; for (i = 0; i < sbi->s_ndevs; i++) sectors += get_sectors_written(FDEV(i).bdev); return sectors; } return get_sectors_written(sbi->sb->s_bdev); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); u64 kbytes_written; int err; /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_NODE); ckpt->cur_node_segno[i] = cpu_to_le32(curseg->segno); ckpt->cur_node_blkoff[i] = cpu_to_le16(curseg->next_blkoff); ckpt->alloc_type[i + CURSEG_HOT_NODE] = curseg->alloc_type; } for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i + CURSEG_HOT_DATA); ckpt->cur_data_segno[i] = cpu_to_le32(curseg->segno); ckpt->cur_data_blkoff[i] = cpu_to_le16(curseg->next_blkoff); ckpt->alloc_type[i + CURSEG_HOT_DATA] = curseg->alloc_type; } /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = f2fs_npages_for_summary_flush(sbi, false); spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); if (__remain_node_summaries(cpc->reason)) ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); else ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks); /* update ckpt flag for checkpoint */ update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); crc32 = f2fs_checkpoint_chksum(sbi, ckpt); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); start_blk = __start_cp_next_addr(sbi); /* write nat bits */ if ((cpc->reason & CP_UMOUNT) && is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; cp_ver |= ((__u64)crc32 << 32); *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) f2fs_update_meta_page(sbi, nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), blk + i); } /* write out checkpoint buffer at block 0 */ f2fs_update_meta_page(sbi, ckpt, start_blk++); for (i = 1; i < 1 + cp_payload_blks; i++) f2fs_update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, start_blk++); if (orphan_num) { write_orphan_inodes(sbi, start_blk); start_blk += orphan_blocks; } f2fs_write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; /* Record write statistics in the hot node summary */ kbytes_written = sbi->kbytes_written; kbytes_written += (f2fs_get_sectors_written(sbi) - sbi->sectors_written_start) >> 1; seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); if (__remain_node_summaries(cpc->reason)) { f2fs_write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; percpu_counter_set(&sbi->alloc_valid_block_count, 0); percpu_counter_set(&sbi->rf_node_block_count, 0); /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); /* Wait for all dirty meta pages to be submitted for IO */ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) return err; /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); /* * invalidate intermediate page cache borrowed from meta inode which are * used for migration of encrypted, verity or compressed inode's blocks. */ if (f2fs_sb_has_encrypt(sbi) || f2fs_sb_has_verity(sbi) || f2fs_sb_has_compression(sbi)) invalidate_mapping_pages(META_MAPPING(sbi), MAIN_BLKADDR(sbi), MAX_BLKADDR(sbi) - 1); f2fs_release_ino_entry(sbi, false); f2fs_reset_fsync_node_info(sbi); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); clear_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); spin_lock(&sbi->stat_lock); sbi->unusable_block_count = 0; spin_unlock(&sbi->stat_lock); __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is * updated during writing checkpoint. */ if (get_pages(sbi, F2FS_DIRTY_NODES) || get_pages(sbi, F2FS_DIRTY_IMETA)) set_sbi_flag(sbi, SBI_IS_DIRTY); f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0; } int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; int err = 0; if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return -EROFS; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { if (cpc->reason != CP_PAUSE) return 0; f2fs_warn(sbi, "Start checkpoint disabled!"); } if (cpc->reason != CP_RESIZE) f2fs_down_write(&sbi->cp_global_sem); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; goto out; } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); err = block_operations(sbi); if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ if (cpc->reason & CP_DISCARD) { if (!f2fs_exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; } if (NM_I(sbi)->nat_cnt[DIRTY_NAT] == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { f2fs_flush_sit_entries(sbi, cpc); f2fs_clear_prefree_segments(sbi, cpc); unblock_operations(sbi); goto out; } } /* * update checkpoint pack index * Increase the version number so that * SIT entries and seg summaries are written at correct place */ ckpt_ver = cur_cp_version(ckpt); ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ err = f2fs_flush_nat_entries(sbi, cpc); if (err) { f2fs_err(sbi, "f2fs_flush_nat_entries failed err:%d, stop checkpoint", err); f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); goto stop; } f2fs_flush_sit_entries(sbi, cpc); /* save inmem log status */ f2fs_save_inmem_curseg(sbi); err = do_checkpoint(sbi, cpc); if (err) { f2fs_err(sbi, "do_checkpoint failed err:%d, stop checkpoint", err); f2fs_bug_on(sbi, !f2fs_cp_error(sbi)); f2fs_release_discard_addrs(sbi); } else { f2fs_clear_prefree_segments(sbi, cpc); } f2fs_restore_inmem_curseg(sbi); stat_inc_cp_count(sbi); stop: unblock_operations(sbi); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); /* update CP_TIME to trigger checkpoint periodically */ f2fs_update_time(sbi, CP_TIME); trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: if (cpc->reason != CP_RESIZE) f2fs_up_write(&sbi->cp_global_sem); return err; } void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi) { int i; for (i = 0; i < MAX_INO_ENTRY; i++) { struct inode_management *im = &sbi->im[i]; INIT_RADIX_TREE(&im->ino_root, GFP_ATOMIC); spin_lock_init(&im->ino_lock); INIT_LIST_HEAD(&im->ino_list); im->ino_num = 0; } sbi->max_orphans = (sbi->blocks_per_seg - F2FS_CP_PACKS - NR_CURSEG_PERSIST_TYPE - __cp_payload(sbi)) * F2FS_ORPHANS_PER_BLOCK; } int __init f2fs_create_checkpoint_caches(void) { ino_entry_slab = f2fs_kmem_cache_create("f2fs_ino_entry", sizeof(struct ino_entry)); if (!ino_entry_slab) return -ENOMEM; f2fs_inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry", sizeof(struct inode_entry)); if (!f2fs_inode_entry_slab) { kmem_cache_destroy(ino_entry_slab); return -ENOMEM; } return 0; } void f2fs_destroy_checkpoint_caches(void) { kmem_cache_destroy(ino_entry_slab); kmem_cache_destroy(f2fs_inode_entry_slab); } static int __write_checkpoint_sync(struct f2fs_sb_info *sbi) { struct cp_control cpc = { .reason = CP_SYNC, }; int err; f2fs_down_write(&sbi->gc_lock); err = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); return err; } static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; struct ckpt_req *req, *next; struct llist_node *dispatch_list; u64 sum_diff = 0, diff, count = 0; int ret; dispatch_list = llist_del_all(&cprc->issue_list); if (!dispatch_list) return; dispatch_list = llist_reverse_order(dispatch_list); ret = __write_checkpoint_sync(sbi); atomic_inc(&cprc->issued_ckpt); llist_for_each_entry_safe(req, next, dispatch_list, llnode) { diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); req->ret = ret; complete(&req->wait); sum_diff += diff; count++; } atomic_sub(count, &cprc->queued_ckpt); atomic_add(count, &cprc->total_ckpt); spin_lock(&cprc->stat_lock); cprc->cur_time = (unsigned int)div64_u64(sum_diff, count); if (cprc->peak_time < cprc->cur_time) cprc->peak_time = cprc->cur_time; spin_unlock(&cprc->stat_lock); } static int issue_checkpoint_thread(void *data) { struct f2fs_sb_info *sbi = data; struct ckpt_req_control *cprc = &sbi->cprc_info; wait_queue_head_t *q = &cprc->ckpt_wait_queue; repeat: if (kthread_should_stop()) return 0; if (!llist_empty(&cprc->issue_list)) __checkpoint_and_complete_reqs(sbi); wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&cprc->issue_list)); goto repeat; } static void flush_remained_ckpt_reqs(struct f2fs_sb_info *sbi, struct ckpt_req *wait_req) { struct ckpt_req_control *cprc = &sbi->cprc_info; if (!llist_empty(&cprc->issue_list)) { __checkpoint_and_complete_reqs(sbi); } else { /* already dispatched by issue_checkpoint_thread */ if (wait_req) wait_for_completion(&wait_req->wait); } } static void init_ckpt_req(struct ckpt_req *req) { memset(req, 0, sizeof(struct ckpt_req)); init_completion(&req->wait); req->queue_time = ktime_get(); } int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; struct ckpt_req req; struct cp_control cpc; cpc.reason = __get_cp_reason(sbi); if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { int ret; f2fs_down_write(&sbi->gc_lock); ret = f2fs_write_checkpoint(sbi, &cpc); f2fs_up_write(&sbi->gc_lock); return ret; } if (!cprc->f2fs_issue_ckpt) return __write_checkpoint_sync(sbi); init_ckpt_req(&req); llist_add(&req.llnode, &cprc->issue_list); atomic_inc(&cprc->queued_ckpt); /* * update issue_list before we wake up issue_checkpoint thread, * this smp_mb() pairs with another barrier in ___wait_event(), * see more details in comments of waitqueue_active(). */ smp_mb(); if (waitqueue_active(&cprc->ckpt_wait_queue)) wake_up(&cprc->ckpt_wait_queue); if (cprc->f2fs_issue_ckpt) wait_for_completion(&req.wait); else flush_remained_ckpt_reqs(sbi, &req); return req.ret; } int f2fs_start_ckpt_thread(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct ckpt_req_control *cprc = &sbi->cprc_info; if (cprc->f2fs_issue_ckpt) return 0; cprc->f2fs_issue_ckpt = kthread_run(issue_checkpoint_thread, sbi, "f2fs_ckpt-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(cprc->f2fs_issue_ckpt)) { int err = PTR_ERR(cprc->f2fs_issue_ckpt); cprc->f2fs_issue_ckpt = NULL; return err; } set_task_ioprio(cprc->f2fs_issue_ckpt, cprc->ckpt_thread_ioprio); return 0; } void f2fs_stop_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; struct task_struct *ckpt_task; if (!cprc->f2fs_issue_ckpt) return; ckpt_task = cprc->f2fs_issue_ckpt; cprc->f2fs_issue_ckpt = NULL; kthread_stop(ckpt_task); f2fs_flush_ckpt_thread(sbi); } void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; flush_remained_ckpt_reqs(sbi, NULL); /* Let's wait for the previous dispatched checkpoint. */ while (atomic_read(&cprc->queued_ckpt)) io_schedule_timeout(DEFAULT_IO_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) { struct ckpt_req_control *cprc = &sbi->cprc_info; atomic_set(&cprc->issued_ckpt, 0); atomic_set(&cprc->total_ckpt, 0); atomic_set(&cprc->queued_ckpt, 0); cprc->ckpt_thread_ioprio = DEFAULT_CHECKPOINT_IOPRIO; init_waitqueue_head(&cprc->ckpt_wait_queue); init_llist_head(&cprc->issue_list); spin_lock_init(&cprc->stat_lock); }
1976 1977 83 9 458 459 457 423 419 424 402 402 401 401 23 23 23 23 23 52 52 481 3264 3265 3264 3263 3264 3262 3264 1415 1415 3262 3261 3265 3266 1452 1415 1415 1415 3263 3261 3265 3265 3262 38 38 38 37 38 22 75 123 123 42 123 100 122 123 123 123 123 1 22 22 22 22 22 1 22 22 22 1 22 22 22 22 22 22 22 22 22 22 22 22 3225 3228 412 22 22 22 21 25 25 3 3 4 4 4 25 24 25 25 25 20 3162 3163 3165 3166 886 2126 2124 2121 886 887 186 2126 2126 1412 2123 13 13 13 20 20 20 20 20 20 482 1413 1413 1411 1413 1413 3149 1 1413 34 1 1412 1411 1411 1413 1381 1413 1412 1412 1410 3 57 4 3 3 4 4 441 1973 1864 1977 1977 1503 243 1413 59 480 25 481 480 481 34 34 34 34 14 14 84 83 84 84 83 84 84 135 102 102 102 102 135 746 3227 3221 1876 1877 1875 1721 1722 1722 719 721 723 724 724 722 722 1 721 722 14 1469 1469 1469 353 9 1469 1469 1469 1468 1469 1467 1394 1467 3428 2088 2091 737 736 14 14 724 722 1469 2087 3223 1437 1436 3225 3226 3223 2123 3223 3227 503 2127 3228 3226 3226 2084 1070 3225 3226 3223 3226 3224 3193 2125 1818 1721 102 3227 57 3 3 3 3 3 2 2 2 3 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 1 1 1 1 1 1 1 1 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 3 3 3 3 57 57 57 57 57 57 57 57 57 57 57 57 57 57 1 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 56 57 57 57 57 57 57 57 57 57 1 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 3 3 3 3 3 3 3 3 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 // SPDX-License-Identifier: GPL-2.0 /* * Block multiqueue core code * * Copyright (C) 2013-2014 Jens Axboe * Copyright (C) 2013-2014 Christoph Hellwig */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> #include <linux/interrupt.h> #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/sysctl.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/delay.h> #include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> #include <linux/part_stat.h> #include <trace/events/block.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" #include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { return !list_empty_careful(&hctx->dispatch) || sbitmap_any_bit_set(&hctx->ctx_map) || blk_mq_sched_has_work(hctx); } /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; if (!sbitmap_test_bit(&hctx->ctx_map, bit)) sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { const int bit = ctx->index_hw[hctx->type]; sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { struct block_device *part; unsigned int inflight[2]; }; static bool blk_mq_check_inflight(struct request *rq, void *priv) { struct mq_inflight *mi = priv; if (rq->part && blk_do_io_stat(rq) && (!mi->part->bd_partno || rq->part == mi->part) && blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) mi->inflight[rq_data_dir(rq)]++; return true; } unsigned int blk_mq_in_flight(struct request_queue *q, struct block_device *part) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); return mi.inflight[0] + mi.inflight[1]; } void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); inflight[0] = mi.inflight[0]; inflight[1] = mi.inflight[1]; } void blk_freeze_queue_start(struct request_queue *q) { mutex_lock(&q->mq_freeze_lock); if (++q->mq_freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); mutex_unlock(&q->mq_freeze_lock); if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } else { mutex_unlock(&q->mq_freeze_lock); } } EXPORT_SYMBOL_GPL(blk_freeze_queue_start); void blk_mq_freeze_queue_wait(struct request_queue *q) { wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout) { return wait_event_timeout(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter), timeout); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); /* * Guarantee no request is in use, so we can change any data structure of * the queue afterward. */ void blk_freeze_queue(struct request_queue *q) { /* * In the !blk_mq case we are only calling this to kill the * q_usage_counter, otherwise this increases the freeze depth * and waits for it to return to zero. For this reason there is * no blk_unfreeze_queue(), and blk_freeze_queue() is not * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); blk_mq_freeze_queue_wait(q); } void blk_mq_freeze_queue(struct request_queue *q) { /* * ...just an alias to keep freeze and unfreeze actions balanced * in the blk_mq_* namespace */ blk_freeze_queue(q); } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { mutex_lock(&q->mq_freeze_lock); if (force_atomic) q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } mutex_unlock(&q->mq_freeze_lock); } void blk_mq_unfreeze_queue(struct request_queue *q) { __blk_mq_unfreeze_queue(q, false); } EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* * FIXME: replace the scsi_internal_device_*block_nowait() calls in the * mpt3sas driver such that this function can be removed. */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->queue_lock, flags); if (!q->quiesce_depth++) blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done * @set: tag_set to wait on * * Note: it is driver's responsibility for making sure that quiesce has * been started on or more of the request_queues of the tag_set. This * function only waits for the quiesce on those request_queues that had * the quiesce flag set using blk_mq_quiesce_queue_nowait. */ void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) { if (set->flags & BLK_MQ_F_BLOCKING) synchronize_srcu(set->srcu); else synchronize_rcu(); } EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * * Note: this function does not prevent that the struct request end_io() * callback function is invoked. Once this function is returned, we make * sure no dispatch can happen until the queue is unquiesced via * blk_mq_unquiesce_queue(). */ void blk_mq_quiesce_queue(struct request_queue *q) { blk_mq_quiesce_queue_nowait(q); /* nothing to wait for non-mq queues */ if (queue_is_mq(q)) blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); /* * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() * @q: request queue. * * This function recovers queue into the state before quiescing * which is done by blk_mq_quiesce_queue. */ void blk_mq_unquiesce_queue(struct request_queue *q) { unsigned long flags; bool run_queue = false; spin_lock_irqsave(&q->queue_lock, flags); if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { ; } else if (!--q->quiesce_depth) { blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); run_queue = true; } spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ if (run_queue) blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_quiesce_queue_nowait(q); } blk_mq_wait_quiesce_done(set); mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) { struct request_queue *q; mutex_lock(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { if (!blk_queue_skip_tagset_quiesce(q)) blk_mq_unquiesce_queue(q); } mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); /* Set start and alloc time when the allocated request is actually used */ static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; #ifdef CONFIG_BLK_RQ_ALLOC_TIME if (blk_queue_rq_alloc_time(rq->q)) rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; else rq->alloc_time_ns = 0; #endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; rq->q = q; rq->mq_ctx = ctx; rq->mq_hctx = hctx; rq->cmd_flags = data->cmd_flags; if (data->flags & BLK_MQ_REQ_PM) data->rq_flags |= RQF_PM; if (blk_queue_io_stat(q)) data->rq_flags |= RQF_IO_STAT; rq->rq_flags = data->rq_flags; if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; rq->part = NULL; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif rq->end_io = NULL; rq->end_io_data = NULL; blk_crypto_rq_set_defaults(rq); INIT_LIST_HEAD(&rq->queuelist); /* tag was already set */ WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); } return rq; } static inline struct request * __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; struct request *rq; unsigned long tag_mask; int i, nr = 0; tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); if (unlikely(!tag_mask)) return NULL; tags = blk_mq_tags_from_data(data); for (i = 0; tag_mask; i++) { if (!(tag_mask & (1UL << i))) continue; tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add(data->cached_rq, rq); nr++; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; return rq_list_pop(data->cached_rq); } static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; u64 alloc_time_ns = 0; struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = ktime_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; if (q->elevator) { /* * All requests use scheduler tags when an I/O scheduler is * enabled for the queue. */ data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the * dispatch list. */ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && !blk_op_is_passthrough(data->cmd_flags)) { struct elevator_mq_ops *ops = &q->elevator->type->ops; WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); data->rq_flags |= RQF_USE_SCHED; if (ops->limit_depth) ops->limit_depth(data->cmd_flags, data); } } retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_tag_busy(data->hctx); if (data->flags & BLK_MQ_REQ_RESERVED) data->rq_flags |= RQF_RESV; /* * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { rq = __blk_mq_alloc_requests_batch(data); if (rq) { blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } data->nr_tags = 1; } /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug * should have migrated us to an online CPU by now. */ tag = blk_mq_get_tag(data); if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; /* * Give up the CPU and sleep for a random short time to * ensure that thread using a realtime scheduling class * are migrated off the CPU, and thus off the hctx that * is going away. */ msleep(3); goto retry; } if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, struct blk_plug *plug, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, .nr_tags = plug->nr_ios, .cached_rq = &plug->cached_rq, }; struct request *rq; if (blk_queue_enter(q, flags)) return NULL; plug->nr_ios = 1; rq = __blk_mq_alloc_requests(&data); if (unlikely(!rq)) blk_queue_exit(q); return rq; } static struct request *blk_mq_alloc_cached_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct blk_plug *plug = current->plug; struct request *rq; if (!plug) return NULL; if (rq_list_empty(plug->cached_rq)) { if (plug->nr_ios == 1) return NULL; rq = blk_mq_rq_cache_fill(q, plug, opf, flags); if (!rq) return NULL; } else { rq = rq_list_peek(&plug->cached_rq); if (!rq || rq->q != q) return NULL; if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) return NULL; plug->cached_rq = rq_list_next(rq); blk_mq_rq_time_init(rq, 0); } rq->cmd_flags = opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags) { struct request *rq; rq = blk_mq_alloc_cached_request(q, opf, flags); if (!rq) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, .nr_tags = 1, }; int ret; ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); } EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, .cmd_flags = opf, .nr_tags = 1, }; u64 alloc_time_ns = 0; struct request *rq; unsigned int cpu; unsigned int tag; int ret; /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) alloc_time_ns = ktime_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a * different hardware context. No need to complicate the low level * allocator for this for the rare use case of a command tied to * a specific queue. */ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) return ERR_PTR(-EIO); ret = blk_queue_enter(q, flags); if (ret) return ERR_PTR(ret); /* * Check if the hardware context is actually mapped to anything. * If not tell the caller that it should skip this queue. */ ret = -EXDEV; data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; if (!(data.rq_flags & RQF_SCHED_TAGS)) blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; return rq; out_queue_exit: blk_queue_exit(q); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* * For postflush request that may need to be * completed twice, we should clear this flag * to avoid double finish_request() on the rq. */ rq->rq_flags &= ~RQF_USE_SCHED; } } static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; if (rq->tag != BLK_MQ_NO_TAG) { blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); blk_queue_exit(q); } void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); void blk_mq_free_plug_rqs(struct blk_plug *plug) { struct request *rq; while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) blk_mq_free_request(rq); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->q->disk ? rq->q->disk->disk_name : "?", (__force unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { if (unlikely(error)) { bio->bi_status = error; } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. */ if (bio->bi_iter.bi_size != nbytes) bio->bi_status = BLK_STS_IOERR; else bio->bi_iter.bi_sector = rq->__sector; } bio_advance(bio, nbytes); if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); part_stat_add(req->part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } static void blk_print_req_error(struct request *req, blk_status_t status) { printk_ratelimited(KERN_ERR "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", blk_status_to_str(status), req->q->disk ? req->q->disk->disk_name : "?", blk_rq_pos(req), (__force u32)req_op(req), blk_op_str(req_op(req)), (__force u32)(req->cmd_flags & ~REQ_OP_MASK), req->nr_phys_segments, IOPRIO_PRIO_CLASS(req->ioprio)); } /* * Fully end IO on a request. Does not support partial completions, or * errors. */ static void blk_complete_request(struct request *req) { const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; int total_bytes = blk_rq_bytes(req); struct bio *bio = req->bio; trace_block_rq_complete(req, BLK_STS_OK, total_bytes); if (!bio) return; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) req->q->integrity.profile->complete_fn(req, total_bytes); #endif /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ blk_crypto_rq_put_keyslot(req); blk_account_io_completion(req, total_bytes); do { struct bio *next = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); if (req_op(req) == REQ_OP_ZONE_APPEND) bio->bi_iter.bi_sector = req->__sector; if (!is_flush) bio_endio(bio); bio = next; } while (bio); /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ if (!req->end_io) { req->bio = NULL; req->__data_len = 0; } } /** * blk_update_request - Complete multiple bytes without completing the request * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete for @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function * except in the consistency check at the end of this function. * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { int total_bytes; trace_block_rq_complete(req, error, nr_bytes); if (!req->bio) return false; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) req->q->integrity.profile->complete_fn(req, nr_bytes); #endif /* * Upper layers may call blk_crypto_evict_key() anytime after the last * bio_endio(). Therefore, the keyslot must be released before that. */ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET)) && !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { trace_block_io_done(req); /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if (blk_do_io_stat(req) && req->part && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); part_stat_lock(); update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); } } static inline void blk_account_io_start(struct request *req) { trace_block_io_start(req); if (blk_do_io_stat(req)) { /* * All non-passthrough requests are created from a bio with one * exception: when a flush command that is part of a flush sequence * generated by the state machine in blk-flush.c is cloned onto the * lower device by dm-multipath we can get here without a bio. */ if (req->bio) req->part = req->bio->bi_bdev; else req->part = req->q->disk->part0; part_stat_lock(); update_io_ticks(req->part, jiffies, false); part_stat_unlock(); } } static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) { if (rq->rq_flags & RQF_STATS) blk_stat_add(rq, now); blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); } inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, ktime_get_ns()); blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } } EXPORT_SYMBOL(__blk_mq_end_request); void blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_update_request(rq, error, blk_rq_bytes(rq))) BUG(); __blk_mq_end_request(rq, error); } EXPORT_SYMBOL(blk_mq_end_request); #define TAG_COMP_BATCH 32 static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, int *tag_array, int nr_tags) { struct request_queue *q = hctx->queue; blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); } void blk_mq_end_request_batch(struct io_comp_batch *iob) { int tags[TAG_COMP_BATCH], nr_tags = 0; struct blk_mq_hw_ctx *cur_hctx = NULL; struct request *rq; u64 now = 0; if (iob->need_ts) now = ktime_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); prefetch(rq->rq_next); blk_complete_request(rq); if (iob->need_ts) __blk_mq_end_request_acct(rq, now); blk_mq_finish_request(rq); rq_qos_done(rq->q, rq); /* * If end_io handler returns NONE, then it still has * ownership of the request. */ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) continue; WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (!req_ref_put_and_test(rq)) continue; blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { if (cur_hctx) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); nr_tags = 0; cur_hctx = rq->mq_hctx; } tags[nr_tags++] = rq->tag; } if (nr_tags) blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); } EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); struct request *rq, *next; llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); } static __latent_entropy void blk_done_softirq(struct softirq_action *h) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } static void __blk_mq_complete_request_remote(void *data) { __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) { int cpu = raw_smp_processor_id(); if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; /* * With force threaded interrupts enabled, raising softirq from an SMP * function call will always result in waking the ksoftirqd thread. * This is probably worse than completing the request on a different * cache domain. */ if (force_irqthreads()) return false; /* same CPU or cache domain? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && cpus_share_cache(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ return cpu_online(rq->mq_ctx->cpu); } static void blk_mq_complete_send_ipi(struct request *rq) { unsigned int cpu; cpu = rq->mq_ctx->cpu; if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); } static void blk_mq_raise_softirq(struct request *rq) { struct llist_head *list; preempt_disable(); list = this_cpu_ptr(&blk_cpu_done); if (llist_add(&rq->ipi_list, list)) raise_softirq(BLOCK_SOFTIRQ); preempt_enable(); } bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * For request which hctx has only one ctx mapping, * or a polled request, always complete locally, * it's pointless to redirect the completion. */ if ((rq->mq_hctx->nr_ctx == 1 && rq->mq_ctx->cpu == raw_smp_processor_id()) || rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { blk_mq_complete_send_ipi(rq); return true; } if (rq->q->nr_hw_queues == 1) { blk_mq_raise_softirq(rq); return true; } return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: * Complete a request by scheduling the ->complete_rq operation. **/ void blk_mq_complete_request(struct request *rq) { if (!blk_mq_complete_request_remote(rq)) rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started * * Function used by device drivers to notify the block layer that a request * is going to be processed now, so blk layer can do proper initializations * such as starting the timeout timer. */ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); } WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); rq->mq_hctx->tags->rqs[rq->tag] = rq; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); #endif if (rq->bio && rq->bio->bi_opf & REQ_POLLED) WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging * requests. */ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) { if (plug->multiple_queues) return BLK_MAX_REQUEST_COUNT * 2; return BLK_MAX_REQUEST_COUNT; } static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) { struct request *last = rq_list_peek(&plug->mq_list); if (!plug->rq_count) { trace_block_plug(rq->q); } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || (!blk_queue_nomerges(rq->q) && blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { blk_mq_flush_plug_list(plug, false); last = NULL; trace_block_plug(rq->q); } if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; /* * Any request allocated from sched tags can't be issued to * ->queue_rqs() directly */ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq->rq_next = NULL; rq_list_add(&plug->mq_list, rq); plug->rq_count++; } /** * blk_execute_rq_nowait - insert a request to I/O scheduler for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution. Don't wait for completion. * * Note: * This function will invoke @done directly if the queue is dead. */ void blk_execute_rq_nowait(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); blk_account_io_start(rq); /* * As plugging can be enabled for passthrough requests on a zoned * device, directly accessing the plug instead of using blk_mq_plug() * should not have any consequences. */ if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; } blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); struct blk_rq_wait { struct completion done; blk_status_t ret; }; static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) { struct blk_rq_wait *wait = rq->end_io_data; wait->ret = ret; complete(&wait->done); return RQ_END_IO_NONE; } bool blk_rq_is_poll(struct request *rq) { if (!rq->mq_hctx) return false; if (rq->mq_hctx->type != HCTX_TYPE_POLL) return false; return true; } EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } /** * blk_execute_rq - insert a request into queue for execution * @rq: request to insert * @at_head: insert request at head or tail of queue * * Description: * Insert a fully prepared request at the back of the I/O scheduler queue * for execution and wait for completion. * Return: The blk_status_t result provided to blk_mq_end_request(). */ blk_status_t blk_execute_rq(struct request *rq, bool at_head) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_rq_wait wait = { .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), }; WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); rq->end_io_data = &wait; rq->end_io = blk_end_sync_rq; blk_account_io_start(rq); blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); if (blk_rq_is_poll(rq)) { blk_rq_poll_completion(rq, &wait.done); } else { /* * Prevent hang_check timer from firing at us during very long * I/O */ unsigned long hang_check = sysctl_hung_task_timeout_secs; if (hang_check) while (!wait_for_completion_io_timeout(&wait.done, hang_check * (HZ/2))) ; else wait_for_completion_io(&wait.done); } return wait.ret; } EXPORT_SYMBOL(blk_execute_rq); static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { WRITE_ONCE(rq->state, MQ_RQ_IDLE); rq->rq_flags &= ~RQF_TIMED_OUT; } } void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); spin_lock_irqsave(&q->requeue_lock, flags); list_add_tail(&rq->queuelist, &q->requeue_list); spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); LIST_HEAD(flush_list); struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); while (!list_empty(&rq_list)) { rq = list_entry(rq_list.next, struct request, queuelist); /* * If RQF_DONTPREP ist set, the request has been started by the * driver already and might have driver-specific data allocated * already. Insert it into the hctx dispatch list to avoid * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) { list_del_init(&rq->queuelist); blk_mq_request_bypass_insert(rq, 0); } else { list_del_init(&rq->queuelist); blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } } while (!list_empty(&flush_list)) { rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* * If we find a request that isn't idle we know the queue is busy * as it's checked in the iter. * Return false to stop the iteration. */ if (blk_mq_request_started(rq)) { bool *busy = priv; *busy = true; return false; } return true; } bool blk_mq_queue_inflight(struct request_queue *q) { bool busy = false; blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); return busy; } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); } blk_add_timer(req); } struct blk_expired_data { bool has_timedout_rq; unsigned long next; unsigned long timeout_start; }; static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) return false; if (rq->rq_flags & RQF_TIMED_OUT) return false; deadline = READ_ONCE(rq->deadline); if (time_after_eq(expired->timeout_start, deadline)) return true; if (expired->next == 0) expired->next = deadline; else if (time_after(expired->next, deadline)) expired->next = deadline; return false; } void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) { if (rq->end_io(rq, 0) == RQ_END_IO_FREE) blk_mq_free_request(rq); } else if (req_ref_put_and_test(rq)) { __blk_mq_free_request(rq); } } static bool blk_mq_check_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; /* * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot * be reallocated underneath the timeout handler's processing, then * the expire check is reliable. If the request is not expired, then * it was completed and reallocated as a new request after returning * from blk_mq_check_expired(). */ if (blk_mq_req_expired(rq, expired)) { expired->has_timedout_rq = true; return false; } return true; } static bool blk_mq_handle_expired(struct request *rq, void *priv) { struct blk_expired_data *expired = priv; if (blk_mq_req_expired(rq, expired)) blk_mq_rq_timed_out(rq); return true; } static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); struct blk_expired_data expired = { .timeout_start = jiffies, }; struct blk_mq_hw_ctx *hctx; unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting * completion, since the timeout code would not be able to * acquire the queue reference here. * * That's why we don't use blk_queue_enter here; instead, we use * percpu_ref_tryget directly, because we need to be able to * obtain a reference even in the short window between the queue * starting to freeze, by dropping the first reference in * blk_freeze_queue_start, and the moment the last request is * consumed, marked by the instant q_usage_counter reaches * zero. */ if (!percpu_ref_tryget(&q->q_usage_counter)) return; /* check if there is any timed-out request */ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); if (expired.has_timedout_rq) { /* * Before walking tags, we must ensure any submit started * before the current time has finished. Since the submit * uses srcu or rcu, wait for a synchronization point to * ensure all running submits have finished */ blk_mq_wait_quiesce_done(q->tag_set); expired.next = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); } if (expired.next != 0) { mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. If * we end up here it means that no requests are pending and * also that no request has been pending for a while. Mark * each hctx as idle. */ queue_for_each_hw_ctx(q, hctx, i) { /* the hctx may be unmapped, so check it here */ if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); } } blk_queue_exit(q); } struct flush_busy_ctx_data { struct blk_mq_hw_ctx *hctx; struct list_head *list; }; static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; } /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { struct flush_busy_ctx_data data = { .hctx = hctx, .list = list, }; sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); } EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); struct dispatch_rq_data { struct blk_mq_hw_ctx *hctx; struct request *rq; }; static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) { struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; enum hctx_type type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); return !dispatch_data->rq; } struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, }; __sbitmap_for_each_set(&hctx->ctx_map, off, dispatch_rq_from_ctx, &data); return data.rq; } bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) return false; } tag = __sbitmap_queue_get(bt); if (tag == BLK_MQ_NO_TAG) return false; rq->tag = tag + tag_offset; blk_mq_inc_active_requests(rq->mq_hctx); return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key) { struct blk_mq_hw_ctx *hctx; hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { struct sbitmap_queue *sbq; list_del_init(&wait->entry); sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); blk_mq_run_hw_queue(hctx, true); return 1; } /* * Mark us waiting for a tag. For shared tags, this involves hooking us into * the tag wakeups. For non-shared tags, we can simply mark us needing a * restart. For both cases, take care to check the condition again after * marking us as waiting. */ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. * * Don't clear RESTART here, someone else could have set it. * At most this will cost an extra queue run. */ return blk_mq_get_driver_tag(rq); } wait = &hctx->dispatch_wait; if (!list_empty_careful(&wait->entry)) return false; if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) sbq = &hctx->tags->breserved_tags; else sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); spin_lock(&hctx->dispatch_wait_lock); if (!list_empty(&wait->entry)) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } atomic_inc(&sbq->ws_active); wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. */ ret = blk_mq_get_driver_tag(rq); if (!ret) { spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return false; } /* * We got a tag, remove ourselves from the wait queue to ensure * someone else gets the wakeup. */ list_del_init(&wait->entry); atomic_dec(&sbq->ws_active); spin_unlock(&hctx->dispatch_wait_lock); spin_unlock_irq(&wq->lock); return true; } #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 /* * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): * - EWMA is one simple way to compute running average value * - weight(7/8 and 1/8) is applied so that it can decrease exponentially * - take 4 as factor for avoiding to get too small(0) result, and this * factor doesn't matter because EWMA decreases exponentially */ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; ewma = hctx->dispatch_busy; if (!ewma && !busy) return; ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; if (busy) ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; hctx->dispatch_busy = ewma; } #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } static void blk_mq_handle_zone_resource(struct request *rq, struct list_head *zone_list) { /* * If we end up here it is because we cannot dispatch a request to a * specific zone due to LLD level zone-write locking or other zone * related resource not being available. In this case, set the request * aside in zone_list for retrying it later. */ list_add(&rq->queuelist, zone_list); __blk_mq_requeue_request(rq); } enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, PREP_DISPATCH_NO_BUDGET, }; static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, bool need_budget) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; int budget_token = -1; if (need_budget) { budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) { blk_mq_put_driver_tag(rq); return PREP_DISPATCH_NO_BUDGET; } blk_mq_set_rq_budget_token(rq, budget_token); } if (!blk_mq_get_driver_tag(rq)) { /* * The initial allocation attempt failed, so we need to * rerun the hardware queue when a tag is freed. The * waitqueue takes care of that. If the queue is run * before we add this entry back on the dispatch list, * we'll re-run it below. */ if (!blk_mq_mark_tag_wait(hctx, rq)) { /* * All budgets not got from this function will be put * together during handling partial dispatch */ if (need_budget) blk_mq_put_dispatch_budget(rq->q, budget_token); return PREP_DISPATCH_NO_TAG; } } return PREP_DISPATCH_OK; } /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ static void blk_mq_release_budgets(struct request_queue *q, struct list_head *list) { struct request *rq; list_for_each_entry(rq, list, queuelist) { int budget_token = blk_mq_get_rq_budget_token(rq); if (budget_token >= 0) blk_mq_put_dispatch_budget(q, budget_token); } } /* * blk_mq_commit_rqs will notify driver using bd->last that there is no * more requests. (See comment in struct blk_mq_ops for commit_rqs for * details) * Attention, we should explicitly call this in unusual cases: * 1) did not queue everything initially scheduled to queue * 2) the last attempt to queue a request failed */ static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, bool from_schedule) { if (hctx->queue->mq_ops->commit_rqs && queued) { trace_block_unplug(hctx->queue, queued, !from_schedule); hctx->queue->mq_ops->commit_rqs(hctx); } } /* * Returns true if we did some work AND can potentially do more. */ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, unsigned int nr_budgets) { enum prep_dispatch prep; struct request_queue *q = hctx->queue; struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; LIST_HEAD(zone_list); bool needs_resource = false; if (list_empty(list)) return false; /* * Now process all the entries, sending them to the driver. */ queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); WARN_ON_ONCE(hctx != rq->mq_hctx); prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); if (prep != PREP_DISPATCH_OK) break; list_del_init(&rq->queuelist); bd.rq = rq; bd.last = list_empty(list); /* * once the request is queued to lld, no need to cover the * budget any more */ if (nr_budgets) nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: needs_resource = true; fallthrough; case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; case BLK_STS_ZONE_RESOURCE: /* * Move the request to zone_list and keep going through * the dispatch list to find more requests the drive can * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); needs_resource = true; break; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ if (!list_empty(list) || ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? Stuff them into hctx->dispatch, * that is where we will continue on next queue run. */ if (!list_empty(list)) { bool needs_restart; /* For non-shared tags, the RESTART check will suffice */ bool no_tag = prep == PREP_DISPATCH_NO_TAG && ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || blk_mq_is_shared_tags(hctx->flags)); if (nr_budgets) blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* * Order adding requests to hctx->dispatch and checking * SCHED_RESTART flag. The pair of this smp_mb() is the one * in blk_mq_sched_restart(). Avoid restart code path to * miss the new added requests to hctx->dispatch, meantime * SCHED_RESTART is observed here. */ smp_mb(); /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. * * If 'no_tag' is set, that means that we failed getting * a driver tag with an I/O scheduler attached. If our dispatch * waitqueue is no longer active, ensure that we run the queue * AFTER adding our entries back to the list. * * If no I/O scheduler has been configured it is possible that * the hardware queue got stopped and restarted before requests * were pushed back onto the dispatch list. Rerun the queue to * avoid starvation. Notes: * - blk_mq_run_hw_queue() checks whether or not a queue has * been stopped before rerunning a queue. * - Some but not all block drivers stop a queue before * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq * and dm-rq. * * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do * similar if we couldn't get budget or couldn't lock a zone * and SCHED_RESTART is set. */ needs_restart = blk_mq_sched_needs_restart(hctx); if (prep == PREP_DISPATCH_NO_BUDGET) needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; } blk_mq_update_dispatch_busy(hctx, false); return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) { int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); if (cpu >= nr_cpu_ids) cpu = cpumask_first(hctx->cpumask); return cpu; } /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. * For now we just round-robin here, switching for every * BLK_MQ_CPU_WORK_BATCH queued items. */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { bool tried = false; int next_cpu = hctx->next_cpu; if (hctx->queue->nr_hw_queues == 1) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { select_cpu: next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, cpu_online_mask); if (next_cpu >= nr_cpu_ids) next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } /* * Do unbound schedule if we can't find a online CPU for this hctx, * and it should only happen in the path of handling CPU DEAD. */ if (!cpu_online(next_cpu)) { if (!tried) { tried = true; goto select_cpu; } /* * Make sure to re-select CPU next time once after CPUs * in hctx->cpumask become online again. */ hctx->next_cpu = next_cpu; hctx->next_cpu_batch = 1; return WORK_CPU_UNBOUND; } hctx->next_cpu = next_cpu; return next_cpu; } /** * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. * @msecs: Milliseconds of delay to wait before running the queue. * * Run a hardware queue asynchronously with a delay of @msecs. */ void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); /** * blk_mq_run_hw_queue - Start to run a hardware queue. * @hctx: Pointer to the hardware queue to run. * @async: If we want to run the queue asynchronously. * * Check if the request queue is not in a quiesced state and if there are * pending requests to be sent. If this is true, run the queue to send requests * to hardware. */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { bool need_run; /* * We can't run the queue inline with interrupts disabled. */ WARN_ON_ONCE(!async && in_interrupt()); might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even __blk_mq_hctx_has_pending() can't be called safely. * * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ __blk_mq_run_dispatch_ops(hctx->queue, false, need_run = !blk_queue_quiesced(hctx->queue) && blk_mq_hctx_has_pending(hctx)); if (!need_run) return; if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { blk_mq_delay_run_hw_queue(hctx, 0); return; } blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); /* * Return prefered queue to dispatch from (if any) for non-mq aware IO * scheduler. */ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) { struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); /* * If the IO scheduler does not respect hardware queues when * dispatching, we just don't bother with multiple HW queues and * dispatch from hctx for the current CPU since running multiple queues * just causes lock contention inside the scheduler and pointless cache * bouncing. */ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; if (!blk_mq_hctx_stopped(hctx)) return hctx; return NULL; } /** * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. * @msecs: Milliseconds of delay to wait before running the queues. */ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; unsigned long i; sq_hctx = NULL; if (blk_queue_sq_sched(q)) sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; /* * If there is already a run_work pending, leave the * pending delay untouched. Otherwise, a hctx can stall * if another hctx is re-delaying the other's work * before the work executes. */ if (delayed_work_pending(&hctx->run_work)) continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the * scheduler. */ if (!sq_hctx || sq_hctx == hctx || !list_empty_careful(&hctx->dispatch)) blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queue() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { cancel_delayed_work(&hctx->run_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } EXPORT_SYMBOL(blk_mq_stop_hw_queue); /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and * BLK_STS_RESOURCE is usually returned. * * We do not guarantee that dispatch can be drained or blocked * after blk_mq_stop_hw_queues() returns. Please use * blk_mq_quiesce_queue() for that requirement. */ void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_stop_hw_queues); void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); } EXPORT_SYMBOL(blk_mq_start_hw_queues); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { if (!blk_mq_hctx_stopped(hctx)) return; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); blk_mq_run_hw_queue(hctx, async); } EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async || (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); blk_mq_run_dispatch_ops(hctx->queue, blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); } static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* * Try to issue requests directly if the hw queue isn't busy to save an * extra enqueue & dequeue to the sw queue. */ if (!hctx->dispatch_busy && !run_queue_async) { blk_mq_run_dispatch_ops(hctx->queue, blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); trace_block_rq_insert(rq); if (rq->cmd_flags & REQ_NOWAIT) run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); out: blk_mq_run_hw_queue(hctx, run_queue_async); } static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_rq_is_passthrough(rq)) { /* * Passthrough request have to be added to hctx->dispatch * directly. The device may be in a situation where it can't * handle FS request, and always returns BLK_STS_RESOURCE for * them, which gets them added to hctx->dispatch. * * If a passthrough request is required to unblock the queues, * and it is added to the scheduler queue, there is no chance to * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( * hctx->dispatch) directly and there is at most one in-flight * flush request for each hw queue, so it doesn't matter to add * flush request to tail or front of the dispatch queue. * * Secondly in case of NCQ, flush request belongs to non-NCQ * command, and queueing it will fail when there is any * in-flight normal IO request(NCQ command). When adding flush * rq to the front of hctx->dispatch, it is easier to introduce * extra time to flush rq's latency because of S_SCHED_RESTART * compared with adding to the tail of dispatch queue, then * chance of flush merge is increased, and less flush requests * will be issued to controller. It is observed that ~10% time * is saved in blktests block/004 on disk attached to AHCI/NCQ * drive when adding flush rq to the front of hctx->dispatch. * * Simply queue flush rq to the front of hctx->dispatch so that * intensive flush workloads can benefit in case of NCQ HW. */ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); } else if (q->elevator) { LIST_HEAD(list); WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); list_add(&rq->queuelist, &list); q->elevator->type->ops.insert_requests(hctx, &list, flags); } else { trace_block_rq_insert(rq); spin_lock(&ctx->lock); if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); else list_add_tail(&rq->queuelist, &ctx->rq_lists[hctx->type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { int err; if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; blk_rq_bio_prep(rq, bio, nr_segs); /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; blk_status_t ret; /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we * previously would have done. */ ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_update_dispatch_busy(hctx, true); __blk_mq_requeue_request(rq); break; default: blk_mq_update_dispatch_busy(hctx, false); break; } return ret; } static bool blk_mq_get_budget_and_tag(struct request *rq) { int budget_token; budget_token = blk_mq_get_dispatch_budget(rq->q); if (budget_token < 0) return false; blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(rq->q, budget_token); return false; } return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so * we can try send it another time in the future. Requests inserted at this * queue have higher priority. */ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq) { blk_status_t ret; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); return; } if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); return; } ret = __blk_mq_issue_directly(hctx, rq, true); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); break; default: blk_mq_end_request(rq, ret); break; } } static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { blk_mq_insert_request(rq, 0); return BLK_STS_OK; } if (!blk_mq_get_budget_and_tag(rq)) return BLK_STS_RESOURCE; return __blk_mq_issue_directly(hctx, rq, last); } static void blk_mq_plug_issue_direct(struct blk_plug *plug) { struct blk_mq_hw_ctx *hctx = NULL; struct request *rq; int queued = 0; blk_status_t ret = BLK_STS_OK; while ((rq = rq_list_pop(&plug->mq_list))) { bool last = rq_list_empty(plug->mq_list); if (hctx != rq->mq_hctx) { if (hctx) { blk_mq_commit_rqs(hctx, queued, false); queued = 0; } hctx = rq->mq_hctx; } ret = blk_mq_request_issue_directly(rq, last); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static void __blk_mq_flush_plug_list(struct request_queue *q, struct blk_plug *plug) { if (blk_queue_quiesced(q)) return; q->mq_ops->queue_rqs(&plug->mq_list); } static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) { struct blk_mq_hw_ctx *this_hctx = NULL; struct blk_mq_ctx *this_ctx = NULL; struct request *requeue_list = NULL; struct request **requeue_lastp = &requeue_list; unsigned int depth = 0; bool is_passthrough = false; LIST_HEAD(list); do { struct request *rq = rq_list_pop(&plug->mq_list); if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; is_passthrough = blk_rq_is_passthrough(rq); } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_lastp, rq); continue; } list_add(&rq->queuelist, &list); depth++; } while (!rq_list_empty(plug->mq_list)); plug->mq_list = requeue_list; trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); /* passthrough requests should never be issued to the I/O scheduler */ if (is_passthrough) { spin_lock(&this_hctx->lock); list_splice_tail_init(&list, &this_hctx->dispatch); spin_unlock(&this_hctx->lock); blk_mq_run_hw_queue(this_hctx, from_sched); } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); } else { blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); } percpu_ref_put(&this_hctx->queue->q_usage_counter); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; /* * We may have been called recursively midway through handling * plug->mq_list via a schedule() in the driver's queue_rq() callback. * To avoid mq_list changing under our feet, clear rq_count early and * bail out specifically if rq_count is 0 rather than checking * whether the mq_list is empty. */ if (plug->rq_count == 0) return; plug->rq_count = 0; if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { struct request_queue *q; rq = rq_list_peek(&plug->mq_list); q = rq->q; /* * Peek first request and see if we have a ->queue_rqs() hook. * If we do, we can dispatch the whole plug list in one go. We * already know at this point that all requests belong to the * same queue, caller must ensure that's the case. */ if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); if (rq_list_empty(plug->mq_list)) return; } blk_mq_run_dispatch_ops(q, blk_mq_plug_issue_direct(plug)); if (rq_list_empty(plug->mq_list)) return; } do { blk_mq_dispatch_plug_list(plug, from_schedule); } while (!rq_list_empty(plug->mq_list)); } static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); switch (ret) { case BLK_STS_OK: queued++; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: blk_mq_request_bypass_insert(rq, 0); if (list_empty(list)) blk_mq_run_hw_queue(hctx, false); goto out; default: blk_mq_end_request(rq, ret); break; } } out: if (ret != BLK_STS_OK) blk_mq_commit_rqs(hctx, queued, false); } static bool blk_mq_attempt_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { if (blk_attempt_plug_merge(q, bio, nr_segs)) return true; if (blk_mq_sched_bio_merge(q, bio, nr_segs)) return true; } return false; } static struct request *blk_mq_get_new_requests(struct request_queue *q, struct blk_plug *plug, struct bio *bio, unsigned int nsegs) { struct blk_mq_alloc_data data = { .q = q, .nr_tags = 1, .cmd_flags = bio->bi_opf, }; struct request *rq; if (unlikely(bio_queue_enter(bio))) return NULL; if (blk_mq_attempt_bio_merge(q, bio, nsegs)) goto queue_exit; rq_qos_throttle(q, bio); if (plug) { data.nr_tags = plug->nr_ios; plug->nr_ios = 1; data.cached_rq = &plug->cached_rq; } rq = __blk_mq_alloc_requests(&data); if (rq) return rq; rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) bio_wouldblock_error(bio); queue_exit: blk_queue_exit(q); return NULL; } static inline struct request *blk_mq_get_cached_request(struct request_queue *q, struct blk_plug *plug, struct bio **bio, unsigned int nsegs) { struct request *rq; enum hctx_type type, hctx_type; if (!plug) return NULL; rq = rq_list_peek(&plug->cached_rq); if (!rq || rq->q != q) return NULL; if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { *bio = NULL; return NULL; } type = blk_mq_get_hctx_type((*bio)->bi_opf); hctx_type = rq->mq_hctx->type; if (type != hctx_type && !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) return NULL; if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) return NULL; /* * If any qos ->throttle() end up blocking, we will have flushed the * plug and hence killed the cached_rq list as well. Pop this entry * before we throttle. */ plug->cached_rq = rq_list_next(rq); rq_qos_throttle(q, *bio); blk_mq_rq_time_init(rq, 0); rq->cmd_flags = (*bio)->bi_opf; INIT_LIST_HEAD(&rq->queuelist); return rq; } static void bio_set_ioprio(struct bio *bio) { /* Nobody set ioprio so far? Initialize it based on task's nice value */ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) bio->bi_ioprio = get_current_ioprio(); blkcg_set_ioprio(bio); } /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The * request may not be queued directly to hardware if: * * This request can be merged with another one * * We want to place request at plug queue for possible future merging * * There is an IO scheduler active at this queue * * It will not queue the request if there is an error with the bio, or at the * request creation. */ void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; struct request *rq; unsigned int nr_segs = 1; blk_status_t ret; bio = blk_queue_bounce(bio, q); if (bio_may_exceed_limits(bio, &q->limits)) { bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); if (!bio) return; } if (!bio_integrity_prep(bio)) return; bio_set_ioprio(bio); rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); if (!rq) { if (!bio) return; rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) return; } trace_block_getrq(bio); rq_qos_track(q, rq, bio); blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); return; } if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; if (plug) { blk_add_rq_to_plug(plug, rq); return; } hctx = rq->mq_hctx; if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } } #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * The queue settings related to segment counting may differ from the * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > max_segments) { printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) return ret; blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, ktime_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio, *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } bio = NULL; } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: if (bio) bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); #endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, struct blk_mq_tags *tags) { struct page *page; unsigned long flags; /* * There is no need to clear mapping if driver tags is not initialized * or the mapping belongs to the driver tags. */ if (!drv_tags || drv_tags == tags) return; list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } } /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&drv_tags->lock, flags); spin_unlock_irqrestore(&drv_tags->lock, flags); } void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { struct blk_mq_tags *drv_tags; struct page *page; if (list_empty(&tags->page_list)) return; if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else drv_tags = set->tags[hctx_idx]; if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { struct request *rq = tags->static_rqs[i]; if (!rq) continue; set->ops->exit_request(set, rq, hctx_idx); tags->static_rqs[i] = NULL; } } blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); /* * Remove kmemleak object previously allocated in * blk_mq_alloc_rqs(). */ kmemleak_free(page_address(page)); __free_pages(page, page->private); } } void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; blk_mq_free_tags(tags); } static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, unsigned int hctx_idx) { int i; for (i = 0; i < set->nr_maps; i++) { unsigned int start = set->map[i].queue_offset; unsigned int end = start + set->map[i].nr_queues; if (hctx_idx >= start && hctx_idx < end) break; } if (i >= set->nr_maps) i = HCTX_TYPE_DEFAULT; return i; } static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, unsigned int hctx_idx) { enum hctx_type type = hctx_idx_to_type(set, hctx_idx); return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); } static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; if (node == NUMA_NO_NODE) node = set->numa_node; tags = blk_mq_init_tags(nr_tags, reserved_tags, node, BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->static_rqs) goto err_free_rqs; return tags; err_free_rqs: kfree(tags->rqs); err_free_tags: blk_mq_free_tags(tags); return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, int node) { int ret; if (set->ops->init_request) { ret = set->ops->init_request(set, rq, hctx_idx, node); if (ret) return ret; } WRITE_ONCE(rq->state, MQ_RQ_IDLE); return 0; } static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; if (node == NUMA_NO_NODE) node = set->numa_node; INIT_LIST_HEAD(&tags->page_list); /* * rq_size is the size of the request plus driver payload, rounded * to the cacheline size */ rq_size = round_up(sizeof(struct request) + set->cmd_size, cache_line_size()); left = rq_size * depth; for (i = 0; i < depth; ) { int this_order = max_order; struct page *page; int to_do; void *p; while (this_order && left < order_to_size(this_order - 1)) this_order--; do { page = alloc_pages_node(node, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, this_order); if (page) break; if (!this_order--) break; if (order_to_size(this_order) < rq_size) break; } while (1); if (!page) goto fail; page->private = this_order; list_add_tail(&page->lru, &tags->page_list); p = page_address(page); /* * Allow kmemleak to scan these pages as they contain pointers * to additional allocations like via ops->init_request(). */ kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); entries_per_page = order_to_size(this_order) / rq_size; to_do = min(entries_per_page, depth - i); left -= to_do * rq_size; for (j = 0; j < to_do; j++) { struct request *rq = p; tags->static_rqs[i] = rq; if (blk_mq_init_request(set, rq, hctx_idx, node)) { tags->static_rqs[i] = NULL; goto fail; } p += rq_size; i++; } } return 0; fail: blk_mq_free_rqs(set, tags, hctx_idx); return -ENOMEM; } struct rq_iter_data { struct blk_mq_hw_ctx *hctx; bool has_rq; }; static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; if (rq->mq_hctx != iter_data->hctx) return true; iter_data->has_rq = true; return false; } static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->sched_tags ? hctx->sched_tags : hctx->tags; struct rq_iter_data data = { .hctx = hctx, }; blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); return data.has_rq; } static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; return true; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (!cpumask_test_cpu(cpu, hctx->cpumask) || !blk_mq_last_cpu_in_hctx(cpu, hctx)) return 0; /* * Prevent new request from being allocated on the current hctx. * * The smp_mb__after_atomic() Pairs with the implied barrier in * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is * seen once we return from the tag allocator. */ set_bit(BLK_MQ_S_INACTIVE, &hctx->state); smp_mb__after_atomic(); /* * Try to grab a reference to the queue and wait for any outstanding * requests. If we could not grab a reference the queue has been * frozen and there are no requests. */ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { while (blk_mq_hctx_has_requests(hctx)) msleep(5); percpu_ref_put(&hctx->queue->q_usage_counter); } return 0; } static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); if (cpumask_test_cpu(cpu, hctx->cpumask)) clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); return 0; } /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it * gets run. */ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); if (!cpumask_test_cpu(cpu, hctx->cpumask)) return 0; ctx = __blk_mq_get_ctx(hctx->queue, cpu); type = hctx->type; spin_lock(&ctx->lock); if (!list_empty(&ctx->rq_lists[type])) { list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); if (list_empty(&tmp)) return 0; spin_lock(&hctx->lock); list_splice_tail_init(&tmp, &hctx->dispatch); spin_unlock(&hctx->lock); blk_mq_run_hw_queue(hctx, true); return 0; } static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) { if (!(hctx->flags & BLK_MQ_F_STACKING)) cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); } /* * Before freeing hw queue, clearing the flush request reference in * tags->rqs[] for avoiding potential UAF. */ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, unsigned int queue_depth, struct request *flush_rq) { int i; unsigned long flags; /* The hw queue may not be mapped yet */ if (!tags) return; WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); /* * Wait until all pending iteration is done. * * Request reference is cleared and it is guaranteed to be observed * after the ->lock is released. */ spin_lock_irqsave(&tags->lock, flags); spin_unlock_irqrestore(&tags->lock, flags); } /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { struct request *flush_rq = hctx->fq->flush_rq; if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); if (blk_queue_init_done(q)) blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], set->queue_depth, flush_rq); if (set->ops->exit_request) set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); blk_mq_remove_cpuhp(hctx); xa_erase(&q->hctx_table, hctx_idx); spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); } static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; blk_mq_exit_hctx(q, set, hctx, i); } } static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) { hctx->queue_num = hctx_idx; if (!(hctx->flags & BLK_MQ_F_STACKING)) cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, &hctx->cpuhp_online); cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); hctx->tags = set->tags[hctx_idx]; if (set->ops->init_hctx && set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto unregister_cpu_notifier; if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) goto exit_flush_rq; return 0; exit_flush_rq: if (set->ops->exit_request) set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); unregister_cpu_notifier: blk_mq_remove_cpuhp(hctx); return -1; } static struct blk_mq_hw_ctx * blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, int node) { struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) goto free_hctx; atomic_set(&hctx->nr_active, 0); if (node == NUMA_NO_NODE) node = set->numa_node; hctx->numa_node = node; INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); /* * Allocate space for all possible cpus to avoid allocation at * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), gfp, node); if (!hctx->ctxs) goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), gfp, node, false, false)) goto free_ctxs; hctx->nr_ctx = 0; spin_lock_init(&hctx->dispatch_wait_lock); init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); INIT_LIST_HEAD(&hctx->dispatch_wait.entry); hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); if (!hctx->fq) goto free_bitmap; blk_mq_hctx_kobj_init(hctx); return hctx; free_bitmap: sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); free_cpumask: free_cpumask_var(hctx->cpumask); free_hctx: kfree(hctx); fail_alloc_hctx: return NULL; } static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { struct blk_mq_tag_set *set = q->tag_set; unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) INIT_LIST_HEAD(&__ctx->rq_lists[k]); __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) hctx->numa_node = cpu_to_node(i); } } } struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { struct blk_mq_tags *tags; int ret; tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); if (!tags) return NULL; ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); if (ret) { blk_mq_free_rq_map(tags); return NULL; } return tags; } static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, int hctx_idx) { if (blk_mq_is_shared_tags(set->flags)) { set->tags[hctx_idx] = set->shared_tags; return true; } set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, set->queue_depth); return set->tags[hctx_idx]; } void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { if (tags) { blk_mq_free_rqs(set, tags, hctx_idx); blk_mq_free_rq_map(tags); } } static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx) { if (!blk_mq_is_shared_tags(set->flags)) blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) { unsigned int j, hctx_idx; unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; queue_for_each_hw_ctx(q, hctx, i) { cpumask_clear(hctx->cpumask); hctx->nr_ctx = 0; hctx->dispatch_from = NULL; } /* * Map software to hardware queues. * * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { ctx = per_cpu_ptr(q->queue_ctx, i); for (j = 0; j < set->nr_maps; j++) { if (!set->map[j].nr_queues) { ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); continue; } hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. In this * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ set->map[j].mq_map[i] = 0; } hctx = blk_mq_map_queue_type(q, j, i); ctx->hctxs[j] = hctx; /* * If the CPU is already set in the mask, then we've * mapped this one already. This can happen if * devices share queues across queue maps. */ if (cpumask_test_cpu(i, hctx->cpumask)) continue; cpumask_set_cpu(i, hctx->cpumask); hctx->type = j; ctx->index_hw[hctx->type] = hctx->nr_ctx; hctx->ctxs[hctx->nr_ctx++] = ctx; /* * If the nr_ctx type overflows, we have exceeded the * amount of sw queues we can support. */ BUG_ON(!hctx->nr_ctx); } for (; j < HCTX_MAX_TYPES; j++) ctx->hctxs[j] = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, i); } queue_for_each_hw_ctx(q, hctx, i) { /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. */ if (!hctx->nr_ctx) { /* Never unmap queue 0. We need it as a * fallback in case of a new remap fails * allocation */ if (i) __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; } hctx->tags = set->tags[i]; WARN_ON(!hctx->tags); /* * Set the map size to the number of mapped software queues. * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts */ hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; } } /* * Caller needs to ensure that we're either frozen/quiesced, or that * the queue isn't live yet. */ static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; } else { blk_mq_tag_idle(hctx); hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; } } } static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, bool shared) { struct request_queue *q; lockdep_assert_held(&set->tag_list_lock); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_freeze_queue(q); queue_set_hctx_shared(q, shared); blk_mq_unfreeze_queue(q); } } static void blk_mq_del_queue_tag_set(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); } static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { mutex_lock(&set->tag_list_lock); /* * Check to see if we're transitioning to shared (from 1 to 2 queues). */ if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ blk_mq_update_tag_set_shared(set, true); } if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } /* All allocations will be freed in release handler of q->mq_kobj */ static int blk_mq_alloc_ctxs(struct request_queue *q) { struct blk_mq_ctxs *ctxs; int cpu; ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); if (!ctxs) return -ENOMEM; ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); if (!ctxs->queue_ctx) goto fail; for_each_possible_cpu(cpu) { struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); ctx->ctxs = ctxs; } q->mq_kobj = &ctxs->kobj; q->queue_ctx = ctxs->queue_ctx; return 0; fail: kfree(ctxs); return -ENOMEM; } /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free * and headache because q->mq_kobj shouldn't have been introduced, * but we can't group ctx/kctx kobj without it. */ void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); /* all hctx are in .unused_hctx_list now */ list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { list_del_init(&hctx->hctx_list); kobject_put(&hctx->kobj); } xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); } static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { struct request_queue *q; int ret; q = blk_alloc_queue(set->numa_node); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { blk_put_queue(q); return ERR_PTR(ret); } return q; } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { return blk_mq_init_queue_data(set, NULL); } EXPORT_SYMBOL(blk_mq_init_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * * This shuts down a request queue allocated by blk_mq_init_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping * the reference from blk_mq_init_queue() by calling blk_put_queue(). * * Context: can sleep */ void blk_mq_destroy_queue(struct request_queue *q) { WARN_ON_ONCE(!queue_is_mq(q)); WARN_ON_ONCE(blk_queue_registered(q)); might_sleep(); blk_queue_flag_set(QUEUE_FLAG_DYING, q); blk_queue_start_drain(q); blk_mq_freeze_queue_wait(q); blk_sync_queue(q); blk_mq_cancel_work_sync(q); blk_mq_exit_queue(q); } EXPORT_SYMBOL(blk_mq_destroy_queue); struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; q = blk_mq_init_queue_data(set, queuedata); if (IS_ERR(q)) return ERR_CAST(q); disk = __alloc_disk_node(q, set->numa_node, lkclass); if (!disk) { blk_mq_destroy_queue(q); blk_put_queue(q); return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; } EXPORT_SYMBOL(__blk_mq_alloc_disk); struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass) { struct gendisk *disk; if (!blk_get_queue(q)) return NULL; disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); if (!disk) blk_put_queue(q); return disk; } EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, int hctx_idx, int node) { struct blk_mq_hw_ctx *hctx = NULL, *tmp; /* reuse dead hctx first */ spin_lock(&q->unused_hctx_lock); list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { if (tmp->numa_node == node) { hctx = tmp; break; } } if (hctx) list_del_init(&hctx->hctx_list); spin_unlock(&q->unused_hctx_lock); if (!hctx) hctx = blk_mq_alloc_hctx(q, set, node); if (!hctx) goto fail; if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) goto free_hctx; return hctx; free_hctx: kobject_put(&hctx->kobj); fail: return NULL; } static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); WARN_ON_ONCE(!hctx); } } /* * Increasing nr_hw_queues fails. Free the newly allocated * hctxs and keep the previous q->nr_hw_queues. */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; } else { j = i; q->nr_hw_queues = set->nr_hw_queues; } xa_for_each_start(&q->hctx_table, j, hctx, j) blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } static void blk_mq_update_poll_flag(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; if (set->nr_maps > HCTX_TYPE_POLL && set->map[HCTX_TYPE_POLL].nr_queues) blk_queue_flag_set(QUEUE_FLAG_POLL, q); else blk_queue_flag_clear(QUEUE_FLAG_POLL, q); } int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); xa_init(&q->hctx_table); blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); return 0; err_hctxs: blk_mq_release(q); err_exit: q->mq_ops = NULL; return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { struct blk_mq_tag_set *set = q->tag_set; /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; if (blk_mq_is_shared_tags(set->flags)) { set->shared_tags = blk_mq_alloc_map_and_rqs(set, BLK_MQ_NO_HCTX_IDX, set->queue_depth); if (!set->shared_tags) return -ENOMEM; } for (i = 0; i < set->nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } return 0; out_unwind: while (--i >= 0) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } return -ENOMEM; } /* * Allocate the request maps associated with this tag_set. Note that this * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; depth = set->queue_depth; do { err = __blk_mq_alloc_rq_maps(set); if (!err) break; set->queue_depth >>= 1; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { err = -ENOMEM; break; } } while (set->queue_depth); if (!set->queue_depth || err) { pr_err("blk-mq: failed to allocate request map\n"); return -ENOMEM; } if (depth != set->queue_depth) pr_info("blk-mq: reduced tag depth (%u -> %u)\n", depth, set->queue_depth); return 0; } static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the * number of hardware queues. */ if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; if (set->ops->map_queues && !is_kdump_kernel()) { int i; /* * transport .map_queues is usually done in the following * way: * * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; int i; if (set->nr_hw_queues >= new_nr_hw_queues) goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!new_tags) return -ENOMEM; if (set->tags) memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { if (!__blk_mq_alloc_map_and_rqs(set, i)) { while (--i >= set->nr_hw_queues) __blk_mq_free_map_and_rqs(set, i); return -ENOMEM; } cond_resched(); } done: set->nr_hw_queues = new_nr_hw_queues; return 0; } /* * Alloc a tag set to be associated with one or more request queues. * May fail with EINVAL for various error conditions. May adjust the * requested depth down, if it's too large. In that case, the set * value will be stored in set->queue_depth. */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) return -EINVAL; if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) return -EINVAL; if (!set->ops->queue_rq) return -EINVAL; if (!set->ops->get_budget ^ !set->ops->put_budget) return -EINVAL; if (set->queue_depth > BLK_MQ_MAX_DEPTH) { pr_info("blk-mq: reduced tag depth to %u\n", BLK_MQ_MAX_DEPTH); set->queue_depth = BLK_MQ_MAX_DEPTH; } if (!set->nr_maps) set->nr_maps = 1; else if (set->nr_maps > HCTX_MAX_TYPES) return -EINVAL; /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 1 queue and * 64 tags to prevent using too much memory. */ if (is_kdump_kernel()) { set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = min(64U, set->queue_depth); } /* * There is no use for more h/w queues than cpus if we just have * a single map */ if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; if (set->flags & BLK_MQ_F_BLOCKING) { set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); if (!set->srcu) return -ENOMEM; ret = init_srcu_struct(set->srcu); if (ret) goto out_free_srcu; } ret = -ENOMEM; set->tags = kcalloc_node(set->nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) goto out_cleanup_srcu; for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; } blk_mq_update_queue_map(set); ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); set->map[i].mq_map = NULL; } kfree(set->tags); set->tags = NULL; out_cleanup_srcu: if (set->flags & BLK_MQ_F_BLOCKING) cleanup_srcu_struct(set->srcu); out_free_srcu: if (set->flags & BLK_MQ_F_BLOCKING) kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); /* allocate and initialize a tagset for a simple single-queue device */ int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int queue_depth, unsigned int set_flags) { memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; return blk_mq_alloc_tag_set(set); } EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); if (blk_mq_is_shared_tags(set->flags)) { blk_mq_free_map_and_rqs(set, set->shared_tags, BLK_MQ_NO_HCTX_IDX); } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); set->map[j].mq_map = NULL; } kfree(set->tags); set->tags = NULL; if (set->flags & BLK_MQ_F_BLOCKING) { cleanup_srcu_struct(set->srcu); kfree(set->srcu); } } EXPORT_SYMBOL(blk_mq_free_tag_set); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; int ret; unsigned long i; if (!set) return -EINVAL; if (q->nr_requests == nr) return 0; blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); ret = 0; queue_for_each_hw_ctx(q, hctx, i) { if (!hctx->tags) continue; /* * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, nr, true); } else { ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, false); } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } if (!ret) { q->nr_requests = nr; if (blk_mq_is_shared_tags(set->flags)) { if (q->elevator) blk_mq_tag_update_sched_shared_tags(q); else blk_mq_tag_resize_shared_tags(set, nr); } } blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); return ret; } /* * request_queue and elevator_type pair. * It is just used by __blk_mq_update_nr_hw_queues to cache * the elevator_type associated with a request_queue. */ struct blk_mq_qe_pair { struct list_head node; struct request_queue *q; struct elevator_type *type; }; /* * Cache the elevator_type in qe pair list and switch the * io scheduler to 'none' */ static bool blk_mq_elv_switch_none(struct list_head *head, struct request_queue *q) { struct blk_mq_qe_pair *qe; qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!qe) return false; /* q->elevator needs protection from ->sysfs_lock */ mutex_lock(&q->sysfs_lock); /* the check has to be done with holding sysfs_lock */ if (!q->elevator) { kfree(qe); goto unlock; } INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; /* keep a reference to the elevator module as we'll switch back */ __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); unlock: mutex_unlock(&q->sysfs_lock); return true; } static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, struct request_queue *q) { struct blk_mq_qe_pair *qe; list_for_each_entry(qe, head, node) if (qe->q == q) return qe; return NULL; } static void blk_mq_elv_switch_back(struct list_head *head, struct request_queue *q) { struct blk_mq_qe_pair *qe; struct elevator_type *t; qe = blk_lookup_qe_pair(head, q); if (!qe) return; t = qe->type; list_del(&qe->node); kfree(qe); mutex_lock(&q->sysfs_lock); elevator_switch(q, t); /* drop the reference acquired in blk_mq_elv_switch_none */ elevator_put(t); mutex_unlock(&q->sysfs_lock); } static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { struct request_queue *q; LIST_HEAD(head); int prev_nr_hw_queues = set->nr_hw_queues; int i; lockdep_assert_held(&set->tag_list_lock); if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1) return; if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) return; list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_freeze_queue(q); /* * Switch IO scheduler to 'none', cleaning up the data associated * with the previous scheduler. We will switch back once we are done * updating the new sw to hw queue mappings. */ list_for_each_entry(q, &set->tag_list, tag_set_list) if (!blk_mq_elv_switch_none(&head, q)) goto switch_back; list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); blk_mq_sysfs_unregister_hctxs(q); } if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); for (; i < set->nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); set->nr_hw_queues = prev_nr_hw_queues; goto fallback; } blk_mq_map_swqueue(q); } reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); } switch_back: list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_elv_switch_back(&head, q); list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); /* Free the excess tags when nr_hw_queues shrink. */ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) { mutex_lock(&set->tag_list_lock); __blk_mq_update_nr_hw_queues(set, nr_hw_queues); mutex_unlock(&set->tag_list_lock); } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob, unsigned int flags) { long state = get_current_state(); int ret; do { ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); if (task_is_running(current)) return 1; if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); __set_current_state(TASK_RUNNING); return 0; } int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, unsigned int flags) { struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); return blk_hctx_poll(q, hctx, iob, flags); } int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, unsigned int poll_flags) { struct request_queue *q = rq->q; int ret; if (!blk_rq_is_poll(rq)) return 0; if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); blk_queue_exit(q); return ret; } EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; } EXPORT_SYMBOL(blk_mq_rq_cpu); void blk_mq_cancel_work_sync(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; unsigned long i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); } static int __init blk_mq_init(void) { int i; for_each_possible_cpu(i) init_llist_head(&per_cpu(blk_cpu_done, i)); for_each_possible_cpu(i) INIT_CSD(&per_cpu(blk_cpu_csd, i), __blk_mq_complete_request_remote, NULL); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, "block/softirq:dead", NULL, blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", blk_mq_hctx_notify_online, blk_mq_hctx_notify_offline); return 0; } subsys_initcall(blk_mq_init);
13 10 11 10 10 5 2 3 1 1 3 3 1 5 4 2 1 5 4 4 4 4 3 1 1 12 1 4 4 3 3 3 2 4 4 4 4 1 2 1 4 1 2 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 // SPDX-License-Identifier: GPL-2.0-or-later /* * Copyright (c) 2014 Jiri Pirko <jiri@resnulli.us> */ #include <linux/module.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/if_vlan.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> #include <net/tc_wrapper.h> #include <linux/tc_act/tc_vlan.h> #include <net/tc_act/tc_vlan.h> static struct tc_action_ops act_vlan_ops; TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; int action; int err; u16 tci; tcf_lastuse_update(&v->tcf_tm); tcf_action_update_bstats(&v->common, skb); /* Ensure 'data' points at mac_header prior calling vlan manipulating * functions. */ if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); action = READ_ONCE(v->tcf_action); p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { case TCA_VLAN_ACT_POP: err = skb_vlan_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH: err = skb_vlan_push(skb, p->tcfv_push_proto, p->tcfv_push_vid | (p->tcfv_push_prio << VLAN_PRIO_SHIFT)); if (err) goto drop; break; case TCA_VLAN_ACT_MODIFY: /* No-op if no vlan tag (either hw-accel or in-payload) */ if (!skb_vlan_tagged(skb)) goto out; /* extract existing tag (and guarantee no hw-accel tag) */ if (skb_vlan_tag_present(skb)) { tci = skb_vlan_tag_get(skb); __vlan_hwaccel_clear_tag(skb); } else { /* in-payload vlan tag, pop it */ err = __skb_vlan_pop(skb, &tci); if (err) goto drop; } /* replace the vid */ tci = (tci & ~VLAN_VID_MASK) | p->tcfv_push_vid; /* replace prio bits, if tcfv_push_prio specified */ if (p->tcfv_push_prio_exists) { tci &= ~VLAN_PRIO_MASK; tci |= p->tcfv_push_prio << VLAN_PRIO_SHIFT; } /* put updated tci as hwaccel tag */ __vlan_hwaccel_put_tag(skb, p->tcfv_push_proto, tci); break; case TCA_VLAN_ACT_POP_ETH: err = skb_eth_pop(skb); if (err) goto drop; break; case TCA_VLAN_ACT_PUSH_ETH: err = skb_eth_push(skb, p->tcfv_push_dst, p->tcfv_push_src); if (err) goto drop; break; default: BUG(); } out: if (skb_at_tc_ingress(skb)) skb_pull_rcsum(skb, skb->mac_len); return action; drop: tcf_action_inc_drop_qstats(&v->common); return TC_ACT_SHOT; } static const struct nla_policy vlan_policy[TCA_VLAN_MAX + 1] = { [TCA_VLAN_UNSPEC] = { .strict_start_type = TCA_VLAN_PUSH_ETH_DST }, [TCA_VLAN_PARMS] = { .len = sizeof(struct tc_vlan) }, [TCA_VLAN_PUSH_VLAN_ID] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PROTOCOL] = { .type = NLA_U16 }, [TCA_VLAN_PUSH_VLAN_PRIORITY] = { .type = NLA_U8 }, [TCA_VLAN_PUSH_ETH_DST] = NLA_POLICY_ETH_ADDR, [TCA_VLAN_PUSH_ETH_SRC] = NLA_POLICY_ETH_ADDR, }; static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; bool push_prio_exists = false; struct tcf_vlan_params *p; struct tc_vlan *parm; struct tcf_vlan *v; int action; u16 push_vid = 0; __be16 push_proto = 0; u8 push_prio = 0; bool exists = false; int ret = 0, err; u32 index; if (!nla) return -EINVAL; err = nla_parse_nested_deprecated(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL); if (err < 0) return err; if (!tb[TCA_VLAN_PARMS]) return -EINVAL; parm = nla_data(tb[TCA_VLAN_PARMS]); index = parm->index; err = tcf_idr_check_alloc(tn, &index, a, bind); if (err < 0) return err; exists = err; if (exists && bind) return 0; switch (parm->v_action) { case TCA_VLAN_ACT_POP: break; case TCA_VLAN_ACT_PUSH: case TCA_VLAN_ACT_MODIFY: if (!tb[TCA_VLAN_PUSH_VLAN_ID]) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } push_vid = nla_get_u16(tb[TCA_VLAN_PUSH_VLAN_ID]); if (push_vid >= VLAN_VID_MASK) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -ERANGE; } if (tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]) { push_proto = nla_get_be16(tb[TCA_VLAN_PUSH_VLAN_PROTOCOL]); switch (push_proto) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EPROTONOSUPPORT; } } else { push_proto = htons(ETH_P_8021Q); } push_prio_exists = !!tb[TCA_VLAN_PUSH_VLAN_PRIORITY]; if (push_prio_exists) push_prio = nla_get_u8(tb[TCA_VLAN_PUSH_VLAN_PRIORITY]); break; case TCA_VLAN_ACT_POP_ETH: break; case TCA_VLAN_ACT_PUSH_ETH: if (!tb[TCA_VLAN_PUSH_ETH_DST] || !tb[TCA_VLAN_PUSH_ETH_SRC]) { if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } break; default: if (exists) tcf_idr_release(*a, bind); else tcf_idr_cleanup(tn, index); return -EINVAL; } action = parm->v_action; if (!exists) { ret = tcf_idr_create_from_flags(tn, index, est, a, &act_vlan_ops, bind, flags); if (ret) { tcf_idr_cleanup(tn, index); return ret; } ret = ACT_P_CREATED; } else if (!(flags & TCA_ACT_FLAGS_REPLACE)) { tcf_idr_release(*a, bind); return -EEXIST; } err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack); if (err < 0) goto release_idr; v = to_vlan(*a); p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) { err = -ENOMEM; goto put_chain; } p->tcfv_action = action; p->tcfv_push_vid = push_vid; p->tcfv_push_prio = push_prio; p->tcfv_push_prio_exists = push_prio_exists || action == TCA_VLAN_ACT_PUSH; p->tcfv_push_proto = push_proto; if (action == TCA_VLAN_ACT_PUSH_ETH) { nla_memcpy(&p->tcfv_push_dst, tb[TCA_VLAN_PUSH_ETH_DST], ETH_ALEN); nla_memcpy(&p->tcfv_push_src, tb[TCA_VLAN_PUSH_ETH_SRC], ETH_ALEN); } spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); spin_unlock_bh(&v->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); if (p) kfree_rcu(p, rcu); return ret; put_chain: if (goto_ch) tcf_chain_put_by_act(goto_ch); release_idr: tcf_idr_release(*a, bind); return err; } static void tcf_vlan_cleanup(struct tc_action *a) { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; p = rcu_dereference_protected(v->vlan_p, 1); if (p) kfree_rcu(p, rcu); } static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { unsigned char *b = skb_tail_pointer(skb); struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; struct tc_vlan opt = { .index = v->tcf_index, .refcnt = refcount_read(&v->tcf_refcnt) - ref, .bindcnt = atomic_read(&v->tcf_bindcnt) - bind, }; struct tcf_t t; spin_lock_bh(&v->tcf_lock); opt.action = v->tcf_action; p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; if ((p->tcfv_action == TCA_VLAN_ACT_PUSH || p->tcfv_action == TCA_VLAN_ACT_MODIFY) && (nla_put_u16(skb, TCA_VLAN_PUSH_VLAN_ID, p->tcfv_push_vid) || nla_put_be16(skb, TCA_VLAN_PUSH_VLAN_PROTOCOL, p->tcfv_push_proto) || (p->tcfv_push_prio_exists && nla_put_u8(skb, TCA_VLAN_PUSH_VLAN_PRIORITY, p->tcfv_push_prio)))) goto nla_put_failure; if (p->tcfv_action == TCA_VLAN_ACT_PUSH_ETH) { if (nla_put(skb, TCA_VLAN_PUSH_ETH_DST, ETH_ALEN, p->tcfv_push_dst)) goto nla_put_failure; if (nla_put(skb, TCA_VLAN_PUSH_ETH_SRC, ETH_ALEN, p->tcfv_push_src)) goto nla_put_failure; } tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; spin_unlock_bh(&v->tcf_lock); return skb->len; nla_put_failure: spin_unlock_bh(&v->tcf_lock); nlmsg_trim(skb, b); return -1; } static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { struct tcf_vlan *v = to_vlan(a); struct tcf_t *tm = &v->tcf_tm; tcf_action_update_stats(a, bytes, packets, drops, hw); tm->lastuse = max_t(u64, tm->lastuse, lastuse); } static size_t tcf_vlan_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_vlan)) + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_ID */ + nla_total_size(sizeof(u16)) /* TCA_VLAN_PUSH_VLAN_PROTOCOL */ + nla_total_size(sizeof(u8)); /* TCA_VLAN_PUSH_VLAN_PRIORITY */ } static int tcf_vlan_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) { if (bind) { struct flow_action_entry *entry = entry_data; switch (tcf_vlan_action(act)) { case TCA_VLAN_ACT_PUSH: entry->id = FLOW_ACTION_VLAN_PUSH; entry->vlan.vid = tcf_vlan_push_vid(act); entry->vlan.proto = tcf_vlan_push_proto(act); entry->vlan.prio = tcf_vlan_push_prio(act); break; case TCA_VLAN_ACT_POP: entry->id = FLOW_ACTION_VLAN_POP; break; case TCA_VLAN_ACT_MODIFY: entry->id = FLOW_ACTION_VLAN_MANGLE; entry->vlan.vid = tcf_vlan_push_vid(act); entry->vlan.proto = tcf_vlan_push_proto(act); entry->vlan.prio = tcf_vlan_push_prio(act); break; case TCA_VLAN_ACT_POP_ETH: entry->id = FLOW_ACTION_VLAN_POP_ETH; break; case TCA_VLAN_ACT_PUSH_ETH: entry->id = FLOW_ACTION_VLAN_PUSH_ETH; tcf_vlan_push_eth(entry->vlan_push_eth.src, entry->vlan_push_eth.dst, act); break; default: NL_SET_ERR_MSG_MOD(extack, "Unsupported vlan action mode offload"); return -EOPNOTSUPP; } *index_inc = 1; } else { struct flow_offload_action *fl_action = entry_data; switch (tcf_vlan_action(act)) { case TCA_VLAN_ACT_PUSH: fl_action->id = FLOW_ACTION_VLAN_PUSH; break; case TCA_VLAN_ACT_POP: fl_action->id = FLOW_ACTION_VLAN_POP; break; case TCA_VLAN_ACT_MODIFY: fl_action->id = FLOW_ACTION_VLAN_MANGLE; break; case TCA_VLAN_ACT_POP_ETH: fl_action->id = FLOW_ACTION_VLAN_POP_ETH; break; case TCA_VLAN_ACT_PUSH_ETH: fl_action->id = FLOW_ACTION_VLAN_PUSH_ETH; break; default: return -EOPNOTSUPP; } } return 0; } static struct tc_action_ops act_vlan_ops = { .kind = "vlan", .id = TCA_ID_VLAN, .owner = THIS_MODULE, .act = tcf_vlan_act, .dump = tcf_vlan_dump, .init = tcf_vlan_init, .cleanup = tcf_vlan_cleanup, .stats_update = tcf_vlan_stats_update, .get_fill_size = tcf_vlan_get_fill_size, .offload_act_setup = tcf_vlan_offload_act_setup, .size = sizeof(struct tcf_vlan), }; static __net_init int vlan_init_net(struct net *net) { struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id); return tc_action_net_init(net, tn, &act_vlan_ops); } static void __net_exit vlan_exit_net(struct list_head *net_list) { tc_action_net_exit(net_list, act_vlan_ops.net_id); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit_batch = vlan_exit_net, .id = &act_vlan_ops.net_id, .size = sizeof(struct tc_action_net), }; static int __init vlan_init_module(void) { return tcf_register_action(&act_vlan_ops, &vlan_net_ops); } static void __exit vlan_cleanup_module(void) { tcf_unregister_action(&act_vlan_ops, &vlan_net_ops); } module_init(vlan_init_module); module_exit(vlan_cleanup_module); MODULE_AUTHOR("Jiri Pirko <jiri@resnulli.us>"); MODULE_DESCRIPTION("vlan manipulation actions"); MODULE_LICENSE("GPL v2");
34215 34211 34208 34205 34211 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 /* * Stack trace management functions * * Copyright (C) 2006-2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> */ #include <linux/sched.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> #include <linux/stacktrace.h> #include <linux/export.h> #include <linux/uaccess.h> #include <asm/stacktrace.h> #include <asm/unwind.h> void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task, struct pt_regs *regs) { struct unwind_state state; unsigned long addr; if (regs && !consume_entry(cookie, regs->ip)) return; for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); unwind_next_frame(&state)) { addr = unwind_get_return_address(&state); if (!addr || !consume_entry(cookie, addr)) break; } } int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie, struct task_struct *task) { struct unwind_state state; struct pt_regs *regs; unsigned long addr; for (unwind_start(&state, task, NULL, NULL); !unwind_done(&state) && !unwind_error(&state); unwind_next_frame(&state)) { regs = unwind_get_entry_regs(&state, NULL); if (regs) { /* Success path for user tasks */ if (user_mode(regs)) return 0; /* * Kernel mode registers on the stack indicate an * in-kernel interrupt or exception (e.g., preemption * or a page fault), which can make frame pointers * unreliable. */ if (IS_ENABLED(CONFIG_FRAME_POINTER)) return -EINVAL; } addr = unwind_get_return_address(&state); /* * A NULL or invalid return address probably means there's some * generated code which __kernel_text_address() doesn't know * about. */ if (!addr) return -EINVAL; if (!consume_entry(cookie, addr)) return -EINVAL; } /* Check for stack corruption */ if (unwind_error(&state)) return -EINVAL; return 0; } /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; static int copy_stack_frame(const struct stack_frame_user __user *fp, struct stack_frame_user *frame) { int ret; if (!__access_ok(fp, sizeof(*frame))) return 0; ret = 1; pagefault_disable(); if (__get_user(frame->next_fp, &fp->next_fp) || __get_user(frame->ret_addr, &fp->ret_addr)) ret = 0; pagefault_enable(); return ret; } void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie, const struct pt_regs *regs) { const void __user *fp = (const void __user *)regs->bp; if (!consume_entry(cookie, regs->ip)) return; while (1) { struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; if (!copy_stack_frame(fp, &frame)) break; if ((unsigned long)fp < regs->sp) break; if (!frame.ret_addr) break; if (!consume_entry(cookie, frame.ret_addr)) break; fp = frame.next_fp; } }
45 7 1 16 2 2 2 2 2 2 69 65 4 64 62 1 12 12 3 12 3 71 69 65 2 63 31 31 31 31 2 1 15 15 6 6 3 10 14 11 10 5 5 5 5 4 4 5 10 5 10 15 15 6 10 10 5 2 1 2 22 1 1 1 1 23 23 23 19 2 2 31 31 5 5 5 4 5 5 5 3 1 4 3 31 4 4 5 4 4 5 31 31 31 31 31 31 5 5 5 5 3 4 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/pnode.c * * (C) Copyright IBM Corporation 2005. * Author : Ram Pai (linuxram@us.ibm.com) */ #include <linux/mnt_namespace.h> #include <linux/mount.h> #include <linux/fs.h> #include <linux/nsproxy.h> #include <uapi/linux/mount.h> #include "internal.h" #include "pnode.h" /* return the next shared peer mount of @p */ static inline struct mount *next_peer(struct mount *p) { return list_entry(p->mnt_share.next, struct mount, mnt_share); } static inline struct mount *first_slave(struct mount *p) { return list_entry(p->mnt_slave_list.next, struct mount, mnt_slave); } static inline struct mount *last_slave(struct mount *p) { return list_entry(p->mnt_slave_list.prev, struct mount, mnt_slave); } static inline struct mount *next_slave(struct mount *p) { return list_entry(p->mnt_slave.next, struct mount, mnt_slave); } static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) { struct mount *m = mnt; do { /* Check the namespace first for optimization */ if (m->mnt_ns == ns && is_path_reachable(m, m->mnt.mnt_root, root)) return m; m = next_peer(m); } while (m != mnt); return NULL; } /* * Get ID of closest dominating peer group having a representative * under the given root. * * Caller must hold namespace_sem */ int get_dominating_id(struct mount *mnt, const struct path *root) { struct mount *m; for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { struct mount *d = get_peer_under_root(m, mnt->mnt_ns, root); if (d) return d->mnt_group_id; } return 0; } static int do_make_slave(struct mount *mnt) { struct mount *master, *slave_mnt; if (list_empty(&mnt->mnt_share)) { if (IS_MNT_SHARED(mnt)) { mnt_release_group_id(mnt); CLEAR_MNT_SHARED(mnt); } master = mnt->mnt_master; if (!master) { struct list_head *p = &mnt->mnt_slave_list; while (!list_empty(p)) { slave_mnt = list_first_entry(p, struct mount, mnt_slave); list_del_init(&slave_mnt->mnt_slave); slave_mnt->mnt_master = NULL; } return 0; } } else { struct mount *m; /* * slave 'mnt' to a peer mount that has the * same root dentry. If none is available then * slave it to anything that is available. */ for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { if (m->mnt.mnt_root == mnt->mnt.mnt_root) { master = m; break; } } list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; CLEAR_MNT_SHARED(mnt); } list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); mnt->mnt_master = master; return 0; } /* * vfsmount lock must be held for write */ void change_mnt_propagation(struct mount *mnt, int type) { if (type == MS_SHARED) { set_mnt_shared(mnt); return; } do_make_slave(mnt); if (type != MS_SLAVE) { list_del_init(&mnt->mnt_slave); mnt->mnt_master = NULL; if (type == MS_UNBINDABLE) mnt->mnt.mnt_flags |= MNT_UNBINDABLE; else mnt->mnt.mnt_flags &= ~MNT_UNBINDABLE; } } /* * get the next mount in the propagation tree. * @m: the mount seen last * @origin: the original mount from where the tree walk initiated * * Note that peer groups form contiguous segments of slave lists. * We rely on that in get_source() to be able to find out if * vfsmount found while iterating with propagation_next() is * a peer of one we'd found earlier. */ static struct mount *propagation_next(struct mount *m, struct mount *origin) { /* are there any slaves of this mount? */ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); while (1) { struct mount *master = m->mnt_master; if (master == origin->mnt_master) { struct mount *next = next_peer(m); return (next == origin) ? NULL : next; } else if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); /* back at master */ m = master; } } static struct mount *skip_propagation_subtree(struct mount *m, struct mount *origin) { /* * Advance m such that propagation_next will not return * the slaves of m. */ if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) m = last_slave(m); return m; } static struct mount *next_group(struct mount *m, struct mount *origin) { while (1) { while (1) { struct mount *next; if (!IS_MNT_NEW(m) && !list_empty(&m->mnt_slave_list)) return first_slave(m); next = next_peer(m); if (m->mnt_group_id == origin->mnt_group_id) { if (next == origin) return NULL; } else if (m->mnt_slave.next != &next->mnt_slave) break; m = next; } /* m is the last peer */ while (1) { struct mount *master = m->mnt_master; if (m->mnt_slave.next != &master->mnt_slave_list) return next_slave(m); m = next_peer(master); if (master->mnt_group_id == origin->mnt_group_id) break; if (master->mnt_slave.next == &m->mnt_slave) break; m = master; } if (m == origin) return NULL; } } /* all accesses are serialized by namespace_sem */ static struct mount *last_dest, *first_source, *last_source, *dest_master; static struct hlist_head *list; static inline bool peers(const struct mount *m1, const struct mount *m2) { return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; } static int propagate_one(struct mount *m, struct mountpoint *dest_mp) { struct mount *child; int type; /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) return 0; /* skip if mountpoint isn't covered by it */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; bool done; for (n = m; ; n = p) { p = n->mnt_master; if (p == dest_master || IS_MNT_MARKED(p)) break; } do { struct mount *parent = last_source->mnt_parent; if (peers(last_source, first_source)) break; done = parent->mnt_master == p; if (done && peers(n, parent)) break; last_source = last_source->mnt_master; } while (!done); type = CL_SLAVE; /* beginning of peer group among the slaves? */ if (IS_MNT_SHARED(m)) type |= CL_MAKE_SHARED; } child = copy_tree(last_source, last_source->mnt.mnt_root, type); if (IS_ERR(child)) return PTR_ERR(child); read_seqlock_excl(&mount_lock); mnt_set_mountpoint(m, dest_mp, child); if (m->mnt_master != dest_master) SET_MNT_MARK(m->mnt_master); read_sequnlock_excl(&mount_lock); last_dest = m; last_source = child; hlist_add_head(&child->mnt_hash, list); return count_mounts(m->mnt_ns, child); } /* * mount 'source_mnt' under the destination 'dest_mnt' at * dentry 'dest_dentry'. And propagate that mount to * all the peer and slave mounts of 'dest_mnt'. * Link all the new mounts into a propagation tree headed at * source_mnt. Also link all the new mounts using ->mnt_list * headed at source_mnt's ->mnt_list * * @dest_mnt: destination mount. * @dest_dentry: destination dentry. * @source_mnt: source mount. * @tree_list : list of heads of trees to be attached. */ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, struct mount *source_mnt, struct hlist_head *tree_list) { struct mount *m, *n; int ret = 0; /* * we don't want to bother passing tons of arguments to * propagate_one(); everything is serialized by namespace_sem, * so globals will do just fine. */ last_dest = dest_mnt; first_source = source_mnt; last_source = source_mnt; list = tree_list; dest_master = dest_mnt->mnt_master; /* all peers of dest_mnt, except dest_mnt itself */ for (n = next_peer(dest_mnt); n != dest_mnt; n = next_peer(n)) { ret = propagate_one(n, dest_mp); if (ret) goto out; } /* all slave groups */ for (m = next_group(dest_mnt, dest_mnt); m; m = next_group(m, dest_mnt)) { /* everything in that slave group */ n = m; do { ret = propagate_one(n, dest_mp); if (ret) goto out; n = next_peer(n); } while (n != m); } out: read_seqlock_excl(&mount_lock); hlist_for_each_entry(n, tree_list, mnt_hash) { m = n->mnt_parent; if (m->mnt_master != dest_mnt->mnt_master) CLEAR_MNT_MARK(m->mnt_master); } read_sequnlock_excl(&mount_lock); return ret; } static struct mount *find_topper(struct mount *mnt) { /* If there is exactly one mount covering mnt completely return it. */ struct mount *child; if (!list_is_singular(&mnt->mnt_mounts)) return NULL; child = list_first_entry(&mnt->mnt_mounts, struct mount, mnt_child); if (child->mnt_mountpoint != mnt->mnt.mnt_root) return NULL; return child; } /* * return true if the refcount is greater than count */ static inline int do_refcount_check(struct mount *mnt, int count) { return mnt_get_count(mnt) > count; } /** * propagation_would_overmount - check whether propagation from @from * would overmount @to * @from: shared mount * @to: mount to check * @mp: future mountpoint of @to on @from * * If @from propagates mounts to @to, @from and @to must either be peers * or one of the masters in the hierarchy of masters of @to must be a * peer of @from. * * If the root of the @to mount is equal to the future mountpoint @mp of * the @to mount on @from then @to will be overmounted by whatever is * propagated to it. * * Context: This function expects namespace_lock() to be held and that * @mp is stable. * Return: If @from overmounts @to, true is returned, false if not. */ bool propagation_would_overmount(const struct mount *from, const struct mount *to, const struct mountpoint *mp) { if (!IS_MNT_SHARED(from)) return false; if (IS_MNT_NEW(to)) return false; if (to->mnt.mnt_root != mp->m_dentry) return false; for (const struct mount *m = to; m; m = m->mnt_master) { if (peers(from, m)) return true; } return false; } /* * check if the mount 'mnt' can be unmounted successfully. * @mnt: the mount to be checked for unmount * NOTE: unmounting 'mnt' would naturally propagate to all * other mounts its parent propagates to. * Check if any of these mounts that **do not have submounts** * have more references than 'refcnt'. If so return busy. * * vfsmount lock must be held for write */ int propagate_mount_busy(struct mount *mnt, int refcnt) { struct mount *m, *child, *topper; struct mount *parent = mnt->mnt_parent; if (mnt == parent) return do_refcount_check(mnt, refcnt); /* * quickly check if the current mount can be unmounted. * If not, we don't have to go checking for all other * mounts */ if (!list_empty(&mnt->mnt_mounts) || do_refcount_check(mnt, refcnt)) return 1; for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { int count = 1; child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; /* Is there exactly one mount on the child that covers * it completely whose reference should be ignored? */ topper = find_topper(child); if (topper) count += 1; else if (!list_empty(&child->mnt_mounts)) continue; if (do_refcount_check(child, count)) return 1; } return 0; } /* * Clear MNT_LOCKED when it can be shown to be safe. * * mount_lock lock must be held for write */ void propagate_mount_unlock(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m, *child; BUG_ON(parent == mnt); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (child) child->mnt.mnt_flags &= ~MNT_LOCKED; } } static void umount_one(struct mount *mnt, struct list_head *to_umount) { CLEAR_MNT_MARK(mnt); mnt->mnt.mnt_flags |= MNT_UMOUNT; list_del_init(&mnt->mnt_child); list_del_init(&mnt->mnt_umounting); list_move_tail(&mnt->mnt_list, to_umount); } /* * NOTE: unmounting 'mnt' naturally propagates to all other mounts its * parent propagates to. */ static bool __propagate_umount(struct mount *mnt, struct list_head *to_umount, struct list_head *to_restore) { bool progress = false; struct mount *child; /* * The state of the parent won't change if this mount is * already unmounted or marked as without children. */ if (mnt->mnt.mnt_flags & (MNT_UMOUNT | MNT_MARKED)) goto out; /* Verify topper is the only grandchild that has not been * speculatively unmounted. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (child->mnt_mountpoint == mnt->mnt.mnt_root) continue; if (!list_empty(&child->mnt_umounting) && IS_MNT_MARKED(child)) continue; /* Found a mounted child */ goto children; } /* Mark mounts that can be unmounted if not locked */ SET_MNT_MARK(mnt); progress = true; /* If a mount is without children and not locked umount it. */ if (!IS_MNT_LOCKED(mnt)) { umount_one(mnt, to_umount); } else { children: list_move_tail(&mnt->mnt_umounting, to_restore); } out: return progress; } static void umount_list(struct list_head *to_umount, struct list_head *to_restore) { struct mount *mnt, *child, *tmp; list_for_each_entry(mnt, to_umount, mnt_list) { list_for_each_entry_safe(child, tmp, &mnt->mnt_mounts, mnt_child) { /* topper? */ if (child->mnt_mountpoint == mnt->mnt.mnt_root) list_move_tail(&child->mnt_umounting, to_restore); else umount_one(child, to_umount); } } } static void restore_mounts(struct list_head *to_restore) { /* Restore mounts to a clean working state */ while (!list_empty(to_restore)) { struct mount *mnt, *parent; struct mountpoint *mp; mnt = list_first_entry(to_restore, struct mount, mnt_umounting); CLEAR_MNT_MARK(mnt); list_del_init(&mnt->mnt_umounting); /* Should this mount be reparented? */ mp = mnt->mnt_mp; parent = mnt->mnt_parent; while (parent->mnt.mnt_flags & MNT_UMOUNT) { mp = parent->mnt_mp; parent = parent->mnt_parent; } if (parent != mnt->mnt_parent) mnt_change_mountpoint(parent, mp, mnt); } } static void cleanup_umount_visitations(struct list_head *visited) { while (!list_empty(visited)) { struct mount *mnt = list_first_entry(visited, struct mount, mnt_umounting); list_del_init(&mnt->mnt_umounting); } } /* * collect all mounts that receive propagation from the mount in @list, * and return these additional mounts in the same list. * @list: the list of mounts to be unmounted. * * vfsmount lock must be held for write */ int propagate_umount(struct list_head *list) { struct mount *mnt; LIST_HEAD(to_restore); LIST_HEAD(to_umount); LIST_HEAD(visited); /* Find candidates for unmounting */ list_for_each_entry_reverse(mnt, list, mnt_list) { struct mount *parent = mnt->mnt_parent; struct mount *m; /* * If this mount has already been visited it is known that it's * entire peer group and all of their slaves in the propagation * tree for the mountpoint has already been visited and there is * no need to visit them again. */ if (!list_empty(&mnt->mnt_umounting)) continue; list_add_tail(&mnt->mnt_umounting, &visited); for (m = propagation_next(parent, parent); m; m = propagation_next(m, parent)) { struct mount *child = __lookup_mnt(&m->mnt, mnt->mnt_mountpoint); if (!child) continue; if (!list_empty(&child->mnt_umounting)) { /* * If the child has already been visited it is * know that it's entire peer group and all of * their slaves in the propgation tree for the * mountpoint has already been visited and there * is no need to visit this subtree again. */ m = skip_propagation_subtree(m, parent); continue; } else if (child->mnt.mnt_flags & MNT_UMOUNT) { /* * We have come accross an partially unmounted * mount in list that has not been visited yet. * Remember it has been visited and continue * about our merry way. */ list_add_tail(&child->mnt_umounting, &visited); continue; } /* Check the child and parents while progress is made */ while (__propagate_umount(child, &to_umount, &to_restore)) { /* Is the parent a umount candidate? */ child = child->mnt_parent; if (list_empty(&child->mnt_umounting)) break; } } } umount_list(&to_umount, &to_restore); restore_mounts(&to_restore); cleanup_umount_visitations(&visited); list_splice_tail(&to_umount, list); return 0; }
1480 1480 1479 1476 1480 1475 1 1 1 1 1 1 1 1 1 1 1 2 2 2722 989 1 1 81 81 81 12 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 // SPDX-License-Identifier: GPL-2.0-only /* * net/core/dst.c Protocol independent destination cache. * * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> * */ #include <linux/bitops.h> #include <linux/errno.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/workqueue.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/string.h> #include <linux/types.h> #include <net/net_namespace.h> #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> #include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> int dst_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb) { kfree_skb(skb); return 0; } EXPORT_SYMBOL(dst_discard_out); const struct dst_metrics dst_default_metrics = { /* This initializer is needed to force linker to place this variable * into const section. Otherwise it might end into bss section. * We really want to avoid false sharing on this variable, and catch * any writes on it. */ .refcnt = REFCOUNT_INIT(1), }; EXPORT_SYMBOL(dst_default_metrics); void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags) { dst->dev = dev; netdev_hold(dev, &dst->dev_tracker, GFP_ATOMIC); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif dst->input = dst_discard; dst->output = dst_discard_out; dst->error = 0; dst->obsolete = initial_obsolete; dst->header_len = 0; dst->trailer_len = 0; #ifdef CONFIG_IP_ROUTE_CLASSID dst->tclassid = 0; #endif dst->lwtstate = NULL; rcuref_init(&dst->__rcuref, 1); INIT_LIST_HEAD(&dst->rt_uncached); dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } EXPORT_SYMBOL(dst_init); void *dst_alloc(struct dst_ops *ops, struct net_device *dev, int initial_obsolete, unsigned short flags) { struct dst_entry *dst; if (ops->gc && !(flags & DST_NOCOUNT) && dst_entries_get_fast(ops) > ops->gc_thresh) ops->gc(ops); dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC); if (!dst) return NULL; dst_init(dst, ops, dev, initial_obsolete, flags); return dst; } EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { struct dst_entry *child = NULL; smp_rmb(); #ifdef CONFIG_XFRM if (dst->xfrm) { struct xfrm_dst *xdst = (struct xfrm_dst *) dst; child = xdst->child; } #endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); if (dst->ops->destroy) dst->ops->destroy(dst); netdev_put(dst->dev, &dst->dev_tracker); lwtstate_put(dst->lwtstate); if (dst->flags & DST_METADATA) metadata_dst_free((struct metadata_dst *)dst); else kmem_cache_free(dst->ops->kmem_cachep, dst); dst = child; if (dst) dst_release_immediate(dst); return NULL; } EXPORT_SYMBOL(dst_destroy); static void dst_destroy_rcu(struct rcu_head *head) { struct dst_entry *dst = container_of(head, struct dst_entry, rcu_head); dst = dst_destroy(dst); } /* Operations to mark dst as DEAD and clean up the net device referenced * by dst: * 1. put the dst under blackhole interface and discard all tx/rx packets * on this route. * 2. release the net_device * This function should be called when removing routes from the fib tree * in preparation for a NETDEV_DOWN/NETDEV_UNREGISTER event and also to * make the next dst_ops->check() fail. */ void dst_dev_put(struct dst_entry *dst) { struct net_device *dev = dst->dev; dst->obsolete = DST_OBSOLETE_DEAD; if (dst->ops->ifdown) dst->ops->ifdown(dst, dev); dst->input = dst_discard; dst->output = dst_discard_out; dst->dev = blackhole_netdev; netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } EXPORT_SYMBOL(dst_dev_put); void dst_release(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); } EXPORT_SYMBOL(dst_release); void dst_release_immediate(struct dst_entry *dst) { if (dst && rcuref_put(&dst->__rcuref)) dst_destroy(dst); } EXPORT_SYMBOL(dst_release_immediate); u32 *dst_cow_metrics_generic(struct dst_entry *dst, unsigned long old) { struct dst_metrics *p = kmalloc(sizeof(*p), GFP_ATOMIC); if (p) { struct dst_metrics *old_p = (struct dst_metrics *)__DST_METRICS_PTR(old); unsigned long prev, new; refcount_set(&p->refcnt, 1); memcpy(p->metrics, old_p->metrics, sizeof(p->metrics)); new = (unsigned long) p; prev = cmpxchg(&dst->_metrics, old, new); if (prev != old) { kfree(p); p = (struct dst_metrics *)__DST_METRICS_PTR(prev); if (prev & DST_METRICS_READ_ONLY) p = NULL; } else if (prev & DST_METRICS_REFCOUNTED) { if (refcount_dec_and_test(&old_p->refcnt)) kfree(old_p); } } BUILD_BUG_ON(offsetof(struct dst_metrics, metrics) != 0); return (u32 *)p; } EXPORT_SYMBOL(dst_cow_metrics_generic); /* Caller asserts that dst_metrics_read_only(dst) is false. */ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) { unsigned long prev, new; new = ((unsigned long) &dst_default_metrics) | DST_METRICS_READ_ONLY; prev = cmpxchg(&dst->_metrics, old, new); if (prev == old) kfree(__DST_METRICS_PTR(old)); } EXPORT_SYMBOL(__dst_destroy_metrics_generic); struct dst_entry *dst_blackhole_check(struct dst_entry *dst, u32 cookie) { return NULL; } u32 *dst_blackhole_cow_metrics(struct dst_entry *dst, unsigned long old) { return NULL; } struct neighbour *dst_blackhole_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { return NULL; } void dst_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, u32 mtu, bool confirm_neigh) { } EXPORT_SYMBOL_GPL(dst_blackhole_update_pmtu); void dst_blackhole_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb) { } EXPORT_SYMBOL_GPL(dst_blackhole_redirect); unsigned int dst_blackhole_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst->dev->mtu; } EXPORT_SYMBOL_GPL(dst_blackhole_mtu); static struct dst_ops dst_blackhole_ops = { .family = AF_UNSPEC, .neigh_lookup = dst_blackhole_neigh_lookup, .check = dst_blackhole_check, .cow_metrics = dst_blackhole_cow_metrics, .update_pmtu = dst_blackhole_update_pmtu, .redirect = dst_blackhole_redirect, .mtu = dst_blackhole_mtu, }; static void __metadata_dst_init(struct metadata_dst *md_dst, enum metadata_type type, u8 optslen) { struct dst_entry *dst; dst = &md_dst->dst; dst_init(dst, &dst_blackhole_ops, NULL, DST_OBSOLETE_NONE, DST_METADATA | DST_NOCOUNT); memset(dst + 1, 0, sizeof(*md_dst) + optslen - sizeof(*dst)); md_dst->type = type; } struct metadata_dst *metadata_dst_alloc(u8 optslen, enum metadata_type type, gfp_t flags) { struct metadata_dst *md_dst; md_dst = kmalloc(sizeof(*md_dst) + optslen, flags); if (!md_dst) return NULL; __metadata_dst_init(md_dst, type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc); void metadata_dst_free(struct metadata_dst *md_dst) { #ifdef CONFIG_DST_CACHE if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif if (md_dst->type == METADATA_XFRM) dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); struct metadata_dst __percpu * metadata_dst_alloc_percpu(u8 optslen, enum metadata_type type, gfp_t flags) { int cpu; struct metadata_dst __percpu *md_dst; md_dst = __alloc_percpu_gfp(sizeof(struct metadata_dst) + optslen, __alignof__(struct metadata_dst), flags); if (!md_dst) return NULL; for_each_possible_cpu(cpu) __metadata_dst_init(per_cpu_ptr(md_dst, cpu), type, optslen); return md_dst; } EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); #ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); #endif if (one_md_dst->type == METADATA_XFRM) dst_release(one_md_dst->u.xfrm_info.dst_orig); } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu);
4 1 3 1 4 4 1 3 1 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 // SPDX-License-Identifier: GPL-2.0-or-later /* Kernel module to match Segment Routing Header (SRH) parameters. */ /* Author: * Ahmed Abdelsalam <amsalam20@gmail.com> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/skbuff.h> #include <linux/ipv6.h> #include <linux/types.h> #include <net/ipv6.h> #include <net/seg6.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_ipv6/ip6t_srh.h> #include <linux/netfilter_ipv6/ip6_tables.h> /* Test a struct->mt_invflags and a boolean for inequality */ #define NF_SRH_INVF(ptr, flag, boolean) \ ((boolean) ^ !!((ptr)->mt_invflags & (flag))) static bool srh_mt6(const struct sk_buff *skb, struct xt_action_param *par) { const struct ip6t_srh *srhinfo = par->matchinfo; struct ipv6_sr_hdr *srh; struct ipv6_sr_hdr _srh; int hdrlen, srhoff = 0; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return false; srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); if (!srh) return false; hdrlen = ipv6_optlen(srh); if (skb->len - srhoff < hdrlen) return false; if (srh->type != IPV6_SRCRT_TYPE_4) return false; if (srh->segments_left > srh->first_segment) return false; /* Next Header matching */ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, !(srh->nexthdr == srhinfo->next_hdr))) return false; /* Header Extension Length matching */ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, !(srh->hdrlen == srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, !(srh->hdrlen > srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, !(srh->hdrlen < srhinfo->hdr_len))) return false; /* Segments Left matching */ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, !(srh->segments_left == srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, !(srh->segments_left > srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, !(srh->segments_left < srhinfo->segs_left))) return false; /** * Last Entry matching * Last_Entry field was introduced in revision 6 of the SRH draft. * It was called First_Segment in the previous revision */ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, !(srh->first_segment == srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, !(srh->first_segment > srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, !(srh->first_segment < srhinfo->last_entry))) return false; /** * Tag matchig * Tag field was introduced in revision 6 of the SRH draft. */ if (srhinfo->mt_flags & IP6T_SRH_TAG) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, !(srh->tag == srhinfo->tag))) return false; return true; } static bool srh1_mt6(const struct sk_buff *skb, struct xt_action_param *par) { int hdrlen, psidoff, nsidoff, lsidoff, srhoff = 0; const struct ip6t_srh1 *srhinfo = par->matchinfo; struct in6_addr *psid, *nsid, *lsid; struct in6_addr _psid, _nsid, _lsid; struct ipv6_sr_hdr *srh; struct ipv6_sr_hdr _srh; if (ipv6_find_hdr(skb, &srhoff, IPPROTO_ROUTING, NULL, NULL) < 0) return false; srh = skb_header_pointer(skb, srhoff, sizeof(_srh), &_srh); if (!srh) return false; hdrlen = ipv6_optlen(srh); if (skb->len - srhoff < hdrlen) return false; if (srh->type != IPV6_SRCRT_TYPE_4) return false; if (srh->segments_left > srh->first_segment) return false; /* Next Header matching */ if (srhinfo->mt_flags & IP6T_SRH_NEXTHDR) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NEXTHDR, !(srh->nexthdr == srhinfo->next_hdr))) return false; /* Header Extension Length matching */ if (srhinfo->mt_flags & IP6T_SRH_LEN_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_EQ, !(srh->hdrlen == srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_GT, !(srh->hdrlen > srhinfo->hdr_len))) return false; if (srhinfo->mt_flags & IP6T_SRH_LEN_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LEN_LT, !(srh->hdrlen < srhinfo->hdr_len))) return false; /* Segments Left matching */ if (srhinfo->mt_flags & IP6T_SRH_SEGS_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_EQ, !(srh->segments_left == srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_GT, !(srh->segments_left > srhinfo->segs_left))) return false; if (srhinfo->mt_flags & IP6T_SRH_SEGS_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_SEGS_LT, !(srh->segments_left < srhinfo->segs_left))) return false; /** * Last Entry matching * Last_Entry field was introduced in revision 6 of the SRH draft. * It was called First_Segment in the previous revision */ if (srhinfo->mt_flags & IP6T_SRH_LAST_EQ) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_EQ, !(srh->first_segment == srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_GT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_GT, !(srh->first_segment > srhinfo->last_entry))) return false; if (srhinfo->mt_flags & IP6T_SRH_LAST_LT) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LAST_LT, !(srh->first_segment < srhinfo->last_entry))) return false; /** * Tag matchig * Tag field was introduced in revision 6 of the SRH draft */ if (srhinfo->mt_flags & IP6T_SRH_TAG) if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_TAG, !(srh->tag == srhinfo->tag))) return false; /* Previous SID matching */ if (srhinfo->mt_flags & IP6T_SRH_PSID) { if (srh->segments_left == srh->first_segment) return false; psidoff = srhoff + sizeof(struct ipv6_sr_hdr) + ((srh->segments_left + 1) * sizeof(struct in6_addr)); psid = skb_header_pointer(skb, psidoff, sizeof(_psid), &_psid); if (!psid) return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_PSID, ipv6_masked_addr_cmp(psid, &srhinfo->psid_msk, &srhinfo->psid_addr))) return false; } /* Next SID matching */ if (srhinfo->mt_flags & IP6T_SRH_NSID) { if (srh->segments_left == 0) return false; nsidoff = srhoff + sizeof(struct ipv6_sr_hdr) + ((srh->segments_left - 1) * sizeof(struct in6_addr)); nsid = skb_header_pointer(skb, nsidoff, sizeof(_nsid), &_nsid); if (!nsid) return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_NSID, ipv6_masked_addr_cmp(nsid, &srhinfo->nsid_msk, &srhinfo->nsid_addr))) return false; } /* Last SID matching */ if (srhinfo->mt_flags & IP6T_SRH_LSID) { lsidoff = srhoff + sizeof(struct ipv6_sr_hdr); lsid = skb_header_pointer(skb, lsidoff, sizeof(_lsid), &_lsid); if (!lsid) return false; if (NF_SRH_INVF(srhinfo, IP6T_SRH_INV_LSID, ipv6_masked_addr_cmp(lsid, &srhinfo->lsid_msk, &srhinfo->lsid_addr))) return false; } return true; } static int srh_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_srh *srhinfo = par->matchinfo; if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { pr_info_ratelimited("unknown srh match flags %X\n", srhinfo->mt_flags); return -EINVAL; } if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { pr_info_ratelimited("unknown srh invflags %X\n", srhinfo->mt_invflags); return -EINVAL; } return 0; } static int srh1_mt6_check(const struct xt_mtchk_param *par) { const struct ip6t_srh1 *srhinfo = par->matchinfo; if (srhinfo->mt_flags & ~IP6T_SRH_MASK) { pr_info_ratelimited("unknown srh match flags %X\n", srhinfo->mt_flags); return -EINVAL; } if (srhinfo->mt_invflags & ~IP6T_SRH_INV_MASK) { pr_info_ratelimited("unknown srh invflags %X\n", srhinfo->mt_invflags); return -EINVAL; } return 0; } static struct xt_match srh_mt6_reg[] __read_mostly = { { .name = "srh", .revision = 0, .family = NFPROTO_IPV6, .match = srh_mt6, .matchsize = sizeof(struct ip6t_srh), .checkentry = srh_mt6_check, .me = THIS_MODULE, }, { .name = "srh", .revision = 1, .family = NFPROTO_IPV6, .match = srh1_mt6, .matchsize = sizeof(struct ip6t_srh1), .checkentry = srh1_mt6_check, .me = THIS_MODULE, } }; static int __init srh_mt6_init(void) { return xt_register_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } static void __exit srh_mt6_exit(void) { xt_unregister_matches(srh_mt6_reg, ARRAY_SIZE(srh_mt6_reg)); } module_init(srh_mt6_init); module_exit(srh_mt6_exit); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Xtables: IPv6 Segment Routing Header match"); MODULE_AUTHOR("Ahmed Abdelsalam <amsalam20@gmail.com>");
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SEGMENT_H #define _ASM_X86_SEGMENT_H #include <linux/const.h> #include <asm/alternative.h> #include <asm/ibt.h> /* * Constructor for a conventional segment GDT (or LDT) entry. * This is a macro so it can be used in initializers. */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) /* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 #define GDT_ENTRY_BOOT_DS 3 #define GDT_ENTRY_BOOT_TSS 4 #define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) #define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) /* * Bottom two bits of selector give the ring * privilege level */ #define SEGMENT_RPL_MASK 0x3 /* * When running on Xen PV, the actual privilege level of the kernel is 1, * not 0. Testing the Requested Privilege Level in a segment selector to * determine whether the context is user mode or kernel mode with * SEGMENT_RPL_MASK is wrong because the PV kernel's privilege level * matches the 0x3 mask. * * Testing with USER_SEGMENT_RPL_MASK is valid for both native and Xen PV * kernels because privilege level 2 is never used. */ #define USER_SEGMENT_RPL_MASK 0x2 /* User mode is privilege level 3: */ #define USER_RPL 0x3 /* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ #define SEGMENT_TI_MASK 0x4 /* LDT segment has TI set ... */ #define SEGMENT_LDT 0x4 /* ... GDT has it cleared */ #define SEGMENT_GDT 0x0 #define GDT_ENTRY_INVALID_SEG 0 #if defined(CONFIG_X86_32) && !defined(BUILD_VDSO32_64) /* * The layout of the per-CPU GDT under Linux: * * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] * 28 - VDSO getcpu * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ #define GDT_ENTRY_TLS_MIN 6 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) #define GDT_ENTRY_KERNEL_CS 12 #define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 #define GDT_ENTRY_DEFAULT_USER_DS 15 #define GDT_ENTRY_TSS 16 #define GDT_ENTRY_LDT 17 #define GDT_ENTRY_PNPBIOS_CS32 18 #define GDT_ENTRY_PNPBIOS_CS16 19 #define GDT_ENTRY_PNPBIOS_DS 20 #define GDT_ENTRY_PNPBIOS_TS1 21 #define GDT_ENTRY_PNPBIOS_TS2 22 #define GDT_ENTRY_APMBIOS_BASE 23 #define GDT_ENTRY_ESPFIX_SS 26 #define GDT_ENTRY_PERCPU 27 #define GDT_ENTRY_CPUNODE 28 #define GDT_ENTRY_DOUBLEFAULT_TSS 31 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 32 /* * Segment selector values corresponding to the above entries: */ #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __USER32_CS __USER_CS #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) /* segment for calling fn: */ #define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) /* code segment for BIOS: */ #define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */ #define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) /* data segment for BIOS: */ #define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) /* transfer data segment: */ #define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) /* another data segment: */ #define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) #ifdef CONFIG_SMP # define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else # define __KERNEL_PERCPU 0 #endif #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #else /* 64-bit: */ #include <asm/cache.h> #define GDT_ENTRY_KERNEL32_CS 1 #define GDT_ENTRY_KERNEL_CS 2 #define GDT_ENTRY_KERNEL_DS 3 /* * We cannot use the same code segment descriptor for user and kernel mode, * not even in long flat mode, because of different DPL. * * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes * selectors: * * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, * * ss = STAR.SYSRET_CS+8 (in either case) * * thus USER_DS should be between 32-bit and 64-bit code selectors: */ #define GDT_ENTRY_DEFAULT_USER32_CS 4 #define GDT_ENTRY_DEFAULT_USER_DS 5 #define GDT_ENTRY_DEFAULT_USER_CS 6 /* Needs two entries */ #define GDT_ENTRY_TSS 8 /* Needs two entries */ #define GDT_ENTRY_LDT 10 #define GDT_ENTRY_TLS_MIN 12 #define GDT_ENTRY_TLS_MAX 14 #define GDT_ENTRY_CPUNODE 15 /* * Number of entries in the GDT table: */ #define GDT_ENTRIES 16 /* * Segment selector values corresponding to the above entries: * * Note, selectors also need to have a correct RPL, * expressed with the +3 value for user-space selectors: */ #define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) #define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) #define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) #define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __CPUNODE_SEG (GDT_ENTRY_CPUNODE*8 + 3) #endif #define IDT_ENTRIES 256 #define NUM_EXCEPTION_VECTORS 32 /* Bitmask of exception vectors which push an error code on the stack: */ #define EXCEPTION_ERRCODE_MASK 0x20027d00 #define GDT_SIZE (GDT_ENTRIES*8) #define GDT_ENTRY_TLS_ENTRIES 3 #define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) /* Bit size and mask of CPU number stored in the per CPU data (and TSC_AUX) */ #define VDSO_CPUNODE_BITS 12 #define VDSO_CPUNODE_MASK 0xfff #ifndef __ASSEMBLY__ /* Helper functions to store/load CPU and node numbers */ static inline unsigned long vdso_encode_cpunode(int cpu, unsigned long node) { return (node << VDSO_CPUNODE_BITS) | cpu; } static inline void vdso_read_cpunode(unsigned *cpu, unsigned *node) { unsigned int p; /* * Load CPU and node number from the GDT. LSL is faster than RDTSCP * and works on all CPUs. This is volatile so that it orders * correctly with respect to barrier() and to keep GCC from cleverly * hoisting it out of the calling function. * * If RDPID is available, use it. */ alternative_io ("lsl %[seg],%[p]", ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ X86_FEATURE_RDPID, [p] "=a" (p), [seg] "r" (__CPUNODE_SEG)); if (cpu) *cpu = (p & VDSO_CPUNODE_MASK); if (node) *node = (p >> VDSO_CPUNODE_BITS); } #endif /* !__ASSEMBLY__ */ #ifdef __KERNEL__ /* * early_idt_handler_array is an array of entry points referenced in the * early IDT. For simplicity, it's a real array with one entry point * every nine bytes. That leaves room for an optional 'push $0' if the * vector has no error code (two bytes), a 'push $vector_number' (two * bytes), and a jump to the common entry code (up to five bytes). */ #define EARLY_IDT_HANDLER_SIZE (9 + ENDBR_INSN_SIZE) /* * xen_early_idt_handler_array is for Xen pv guests: for each entry in * early_idt_handler_array it contains a prequel in the form of * pop %rcx; pop %r11; jmp early_idt_handler_array[i]; summing up to * max 8 bytes. */ #define XEN_EARLY_IDT_HANDLER_SIZE (8 + ENDBR_INSN_SIZE) #ifndef __ASSEMBLY__ extern const char early_idt_handler_array[NUM_EXCEPTION_VECTORS][EARLY_IDT_HANDLER_SIZE]; extern void early_ignore_irq(void); #ifdef CONFIG_XEN_PV extern const char xen_early_idt_handler_array[NUM_EXCEPTION_VECTORS][XEN_EARLY_IDT_HANDLER_SIZE]; #endif /* * Load a segment. Fall back on loading the zero segment if something goes * wrong. This variant assumes that loading zero fully clears the segment. * This is always the case on Intel CPUs and, even on 64-bit AMD CPUs, any * failure to fully clear the cached descriptor is only observable for * FS and GS. */ #define __loadsegment_simple(seg, value) \ do { \ unsigned short __val = (value); \ \ asm volatile(" \n" \ "1: movl %k0,%%" #seg " \n" \ _ASM_EXTABLE_TYPE_REG(1b, 1b, EX_TYPE_ZERO_REG, %k0)\ : "+r" (__val) : : "memory"); \ } while (0) #define __loadsegment_ss(value) __loadsegment_simple(ss, (value)) #define __loadsegment_ds(value) __loadsegment_simple(ds, (value)) #define __loadsegment_es(value) __loadsegment_simple(es, (value)) #ifdef CONFIG_X86_32 /* * On 32-bit systems, the hidden parts of FS and GS are unobservable if * the selector is NULL, so there's no funny business here. */ #define __loadsegment_fs(value) __loadsegment_simple(fs, (value)) #define __loadsegment_gs(value) __loadsegment_simple(gs, (value)) #else static inline void __loadsegment_fs(unsigned short value) { asm volatile(" \n" "1: movw %0, %%fs \n" "2: \n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_CLEAR_FS) : : "rm" (value) : "memory"); } /* __loadsegment_gs is intentionally undefined. Use load_gs_index instead. */ #endif #define loadsegment(seg, value) __loadsegment_ ## seg (value) /* * Save a segment register away: */ #define savesegment(seg, value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */
272 271 112 113 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 /* * Copyright (C) 2016 Red Hat * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. * * Authors: * Rob Clark <robdclark@gmail.com> */ #include <linux/stdarg.h> #include <linux/io.h> #include <linux/moduleparam.h> #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/dynamic_debug.h> #include <drm/drm.h> #include <drm/drm_drv.h> #include <drm/drm_print.h> /* * __drm_debug: Enable debug output. * Bitmask of DRM_UT_x. See include/drm/drm_print.h for details. */ unsigned long __drm_debug; EXPORT_SYMBOL(__drm_debug); MODULE_PARM_DESC(debug, "Enable debug output, where each bit enables a debug category.\n" "\t\tBit 0 (0x01) will enable CORE messages (drm core code)\n" "\t\tBit 1 (0x02) will enable DRIVER messages (drm controller code)\n" "\t\tBit 2 (0x04) will enable KMS messages (modesetting code)\n" "\t\tBit 3 (0x08) will enable PRIME messages (prime code)\n" "\t\tBit 4 (0x10) will enable ATOMIC messages (atomic code)\n" "\t\tBit 5 (0x20) will enable VBL messages (vblank code)\n" "\t\tBit 7 (0x80) will enable LEASE messages (leasing code)\n" "\t\tBit 8 (0x100) will enable DP messages (displayport code)"); #if !defined(CONFIG_DRM_USE_DYNAMIC_DEBUG) module_param_named(debug, __drm_debug, ulong, 0600); #else /* classnames must match vals of enum drm_debug_category */ DECLARE_DYNDBG_CLASSMAP(drm_debug_classes, DD_CLASS_TYPE_DISJOINT_BITS, 0, "DRM_UT_CORE", "DRM_UT_DRIVER", "DRM_UT_KMS", "DRM_UT_PRIME", "DRM_UT_ATOMIC", "DRM_UT_VBL", "DRM_UT_STATE", "DRM_UT_LEASE", "DRM_UT_DP", "DRM_UT_DRMRES"); static struct ddebug_class_param drm_debug_bitmap = { .bits = &__drm_debug, .flags = "p", .map = &drm_debug_classes, }; module_param_cb(debug, &param_ops_dyndbg_classes, &drm_debug_bitmap, 0600); #endif void __drm_puts_coredump(struct drm_printer *p, const char *str) { struct drm_print_iterator *iterator = p->arg; ssize_t len; if (!iterator->remain) return; if (iterator->offset < iterator->start) { ssize_t copy; len = strlen(str); if (iterator->offset + len <= iterator->start) { iterator->offset += len; return; } copy = len - (iterator->start - iterator->offset); if (copy > iterator->remain) copy = iterator->remain; /* Copy out the bit of the string that we need */ memcpy(iterator->data, str + (iterator->start - iterator->offset), copy); iterator->offset = iterator->start + copy; iterator->remain -= copy; } else { ssize_t pos = iterator->offset - iterator->start; len = min_t(ssize_t, strlen(str), iterator->remain); memcpy(iterator->data + pos, str, len); iterator->offset += len; iterator->remain -= len; } } EXPORT_SYMBOL(__drm_puts_coredump); void __drm_printfn_coredump(struct drm_printer *p, struct va_format *vaf) { struct drm_print_iterator *iterator = p->arg; size_t len; char *buf; if (!iterator->remain) return; /* Figure out how big the string will be */ len = snprintf(NULL, 0, "%pV", vaf); /* This is the easiest path, we've already advanced beyond the offset */ if (iterator->offset + len <= iterator->start) { iterator->offset += len; return; } /* Then check if we can directly copy into the target buffer */ if ((iterator->offset >= iterator->start) && (len < iterator->remain)) { ssize_t pos = iterator->offset - iterator->start; snprintf(((char *) iterator->data) + pos, iterator->remain, "%pV", vaf); iterator->offset += len; iterator->remain -= len; return; } /* * Finally, hit the slow path and make a temporary string to copy over * using _drm_puts_coredump */ buf = kmalloc(len + 1, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!buf) return; snprintf(buf, len + 1, "%pV", vaf); __drm_puts_coredump(p, (const char *) buf); kfree(buf); } EXPORT_SYMBOL(__drm_printfn_coredump); void __drm_puts_seq_file(struct drm_printer *p, const char *str) { seq_puts(p->arg, str); } EXPORT_SYMBOL(__drm_puts_seq_file); void __drm_printfn_seq_file(struct drm_printer *p, struct va_format *vaf) { seq_printf(p->arg, "%pV", vaf); } EXPORT_SYMBOL(__drm_printfn_seq_file); void __drm_printfn_info(struct drm_printer *p, struct va_format *vaf) { dev_info(p->arg, "[" DRM_NAME "] %pV", vaf); } EXPORT_SYMBOL(__drm_printfn_info); void __drm_printfn_debug(struct drm_printer *p, struct va_format *vaf) { /* pr_debug callsite decorations are unhelpful here */ printk(KERN_DEBUG "%s %pV", p->prefix, vaf); } EXPORT_SYMBOL(__drm_printfn_debug); void __drm_printfn_err(struct drm_printer *p, struct va_format *vaf) { pr_err("*ERROR* %s %pV", p->prefix, vaf); } EXPORT_SYMBOL(__drm_printfn_err); /** * drm_puts - print a const string to a &drm_printer stream * @p: the &drm printer * @str: const string * * Allow &drm_printer types that have a constant string * option to use it. */ void drm_puts(struct drm_printer *p, const char *str) { if (p->puts) p->puts(p, str); else drm_printf(p, "%s", str); } EXPORT_SYMBOL(drm_puts); /** * drm_printf - print to a &drm_printer stream * @p: the &drm_printer * @f: format string */ void drm_printf(struct drm_printer *p, const char *f, ...) { va_list args; va_start(args, f); drm_vprintf(p, f, &args); va_end(args); } EXPORT_SYMBOL(drm_printf); /** * drm_print_bits - print bits to a &drm_printer stream * * Print bits (in flag fields for example) in human readable form. * * @p: the &drm_printer * @value: field value. * @bits: Array with bit names. * @nbits: Size of bit names array. */ void drm_print_bits(struct drm_printer *p, unsigned long value, const char * const bits[], unsigned int nbits) { bool first = true; unsigned int i; if (WARN_ON_ONCE(nbits > BITS_PER_TYPE(value))) nbits = BITS_PER_TYPE(value); for_each_set_bit(i, &value, nbits) { if (WARN_ON_ONCE(!bits[i])) continue; drm_printf(p, "%s%s", first ? "" : ",", bits[i]); first = false; } if (first) drm_printf(p, "(none)"); } EXPORT_SYMBOL(drm_print_bits); void drm_dev_printk(const struct device *dev, const char *level, const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; if (dev) dev_printk(level, dev, "[" DRM_NAME ":%ps] %pV", __builtin_return_address(0), &vaf); else printk("%s" "[" DRM_NAME ":%ps] %pV", level, __builtin_return_address(0), &vaf); va_end(args); } EXPORT_SYMBOL(drm_dev_printk); void __drm_dev_dbg(struct _ddebug *desc, const struct device *dev, enum drm_debug_category category, const char *format, ...) { struct va_format vaf; va_list args; if (!__drm_debug_enabled(category)) return; /* we know we are printing for either syslog, tracefs, or both */ va_start(args, format); vaf.fmt = format; vaf.va = &args; if (dev) dev_printk(KERN_DEBUG, dev, "[" DRM_NAME ":%ps] %pV", __builtin_return_address(0), &vaf); else printk(KERN_DEBUG "[" DRM_NAME ":%ps] %pV", __builtin_return_address(0), &vaf); va_end(args); } EXPORT_SYMBOL(__drm_dev_dbg); void ___drm_dbg(struct _ddebug *desc, enum drm_debug_category category, const char *format, ...) { struct va_format vaf; va_list args; if (!__drm_debug_enabled(category)) return; va_start(args, format); vaf.fmt = format; vaf.va = &args; printk(KERN_DEBUG "[" DRM_NAME ":%ps] %pV", __builtin_return_address(0), &vaf); va_end(args); } EXPORT_SYMBOL(___drm_dbg); void __drm_err(const char *format, ...) { struct va_format vaf; va_list args; va_start(args, format); vaf.fmt = format; vaf.va = &args; printk(KERN_ERR "[" DRM_NAME ":%ps] *ERROR* %pV", __builtin_return_address(0), &vaf); va_end(args); } EXPORT_SYMBOL(__drm_err); /** * drm_print_regset32 - print the contents of registers to a * &drm_printer stream. * * @p: the &drm printer * @regset: the list of registers to print. * * Often in driver debug, it's useful to be able to either capture the * contents of registers in the steady state using debugfs or at * specific points during operation. This lets the driver have a * single list of registers for both. */ void drm_print_regset32(struct drm_printer *p, struct debugfs_regset32 *regset) { int namelen = 0; int i; for (i = 0; i < regset->nregs; i++) namelen = max(namelen, (int)strlen(regset->regs[i].name)); for (i = 0; i < regset->nregs; i++) { drm_printf(p, "%*s = 0x%08x\n", namelen, regset->regs[i].name, readl(regset->base + regset->regs[i].offset)); } } EXPORT_SYMBOL(drm_print_regset32);
73 43 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0-only */ /* * pcm_local.h - a local header file for snd-pcm module. * * Copyright (c) Takashi Sakamoto <o-takashi@sakamocchi.jp> */ #ifndef __SOUND_CORE_PCM_LOCAL_H #define __SOUND_CORE_PCM_LOCAL_H extern const struct snd_pcm_hw_constraint_list snd_pcm_known_rates; void snd_interval_mul(const struct snd_interval *a, const struct snd_interval *b, struct snd_interval *c); void snd_interval_div(const struct snd_interval *a, const struct snd_interval *b, struct snd_interval *c); void snd_interval_muldivk(const struct snd_interval *a, const struct snd_interval *b, unsigned int k, struct snd_interval *c); void snd_interval_mulkdiv(const struct snd_interval *a, unsigned int k, const struct snd_interval *b, struct snd_interval *c); int snd_pcm_hw_constraint_mask(struct snd_pcm_runtime *runtime, snd_pcm_hw_param_t var, u_int32_t mask); int pcm_lib_apply_appl_ptr(struct snd_pcm_substream *substream, snd_pcm_uframes_t appl_ptr); int snd_pcm_update_state(struct snd_pcm_substream *substream, struct snd_pcm_runtime *runtime); int snd_pcm_update_hw_ptr(struct snd_pcm_substream *substream); void snd_pcm_playback_silence(struct snd_pcm_substream *substream, snd_pcm_uframes_t new_hw_ptr); static inline snd_pcm_uframes_t snd_pcm_avail(struct snd_pcm_substream *substream) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) return snd_pcm_playback_avail(substream->runtime); else return snd_pcm_capture_avail(substream->runtime); } static inline snd_pcm_uframes_t snd_pcm_hw_avail(struct snd_pcm_substream *substream) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) return snd_pcm_playback_hw_avail(substream->runtime); else return snd_pcm_capture_hw_avail(substream->runtime); } #ifdef CONFIG_SND_PCM_TIMER void snd_pcm_timer_resolution_change(struct snd_pcm_substream *substream); void snd_pcm_timer_init(struct snd_pcm_substream *substream); void snd_pcm_timer_done(struct snd_pcm_substream *substream); #else static inline void snd_pcm_timer_resolution_change(struct snd_pcm_substream *substream) {} static inline void snd_pcm_timer_init(struct snd_pcm_substream *substream) {} static inline void snd_pcm_timer_done(struct snd_pcm_substream *substream) {} #endif void __snd_pcm_xrun(struct snd_pcm_substream *substream); void snd_pcm_group_init(struct snd_pcm_group *group); void snd_pcm_sync_stop(struct snd_pcm_substream *substream, bool sync_irq); #define PCM_RUNTIME_CHECK(sub) snd_BUG_ON(!(sub) || !(sub)->runtime) /* loop over all PCM substreams */ #define for_each_pcm_substream(pcm, str, subs) \ for ((str) = 0; (str) < 2; (str)++) \ for ((subs) = (pcm)->streams[str].substream; (subs); \ (subs) = (subs)->next) static inline void snd_pcm_dma_buffer_sync(struct snd_pcm_substream *substream, enum snd_dma_sync_mode mode) { if (substream->runtime->info & SNDRV_PCM_INFO_EXPLICIT_SYNC) snd_dma_buffer_sync(snd_pcm_get_dma_buf(substream), mode); } #endif /* __SOUND_CORE_PCM_LOCAL_H */
238 238 244 168 379 378 360 109 22 18 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2016 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. */ #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/random.h> #include <linux/hrtimer.h> #include <linux/ktime.h> #include <linux/string.h> #include <linux/net.h> #include <linux/siphash.h> #include <net/secure_seq.h> #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET) #include <linux/in6.h> #include <net/tcp.h> static siphash_aligned_key_t net_secret; static siphash_aligned_key_t ts_secret; #define EPHEMERAL_PORT_SHUFFLE_PERIOD (10 * HZ) static __always_inline void net_secret_init(void) { net_get_random_once(&net_secret, sizeof(net_secret)); } static __always_inline void ts_secret_init(void) { net_get_random_once(&ts_secret, sizeof(ts_secret)); } #endif #ifdef CONFIG_INET static u32 seq_scale(u32 seq) { /* * As close as possible to RFC 793, which * suggests using a 250 kHz clock. * Further reading shows this assumes 2 Mb/s networks. * For 10 Mb/s Ethernet, a 1 MHz clock is appropriate. * For 10 Gb/s Ethernet, a 1 GHz clock should be ok, but * we also need to limit the resolution so that the u32 seq * overlaps less than one time per MSL (2 minutes). * Choosing a clock of 64 ns period is OK. (period of 274 s) */ return seq + (ktime_get_real_ns() >> 6); } #endif #if IS_ENABLED(CONFIG_IPV6) u32 secure_tcpv6_ts_off(const struct net *net, const __be32 *saddr, const __be32 *daddr) { const struct { struct in6_addr saddr; struct in6_addr daddr; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, }; if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); return siphash(&combined, offsetofend(typeof(combined), daddr), &ts_secret); } EXPORT_SYMBOL(secure_tcpv6_ts_off); u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, __be16 sport, __be16 dport) { const struct { struct in6_addr saddr; struct in6_addr daddr; __be16 sport; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, .sport = sport, .dport = dport }; u32 hash; net_secret_init(); hash = siphash(&combined, offsetofend(typeof(combined), dport), &net_secret); return seq_scale(hash); } EXPORT_SYMBOL(secure_tcpv6_seq); u64 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) { const struct { struct in6_addr saddr; struct in6_addr daddr; unsigned int timeseed; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, .timeseed = jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, .dport = dport, }; net_secret_init(); return siphash(&combined, offsetofend(typeof(combined), dport), &net_secret); } EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET u32 secure_tcp_ts_off(const struct net *net, __be32 saddr, __be32 daddr) { if (READ_ONCE(net->ipv4.sysctl_tcp_timestamps) != 1) return 0; ts_secret_init(); return siphash_2u32((__force u32)saddr, (__force u32)daddr, &ts_secret); } /* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, * it would be easy enough to have the former function use siphash_4u32, passing * the arguments as separate u32. */ u32 secure_tcp_seq(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { u32 hash; net_secret_init(); hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, &net_secret); return seq_scale(hash); } EXPORT_SYMBOL_GPL(secure_tcp_seq); u64 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport) { net_secret_init(); return siphash_4u32((__force u32)saddr, (__force u32)daddr, (__force u16)dport, jiffies / EPHEMERAL_PORT_SHUFFLE_PERIOD, &net_secret); } EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral); #endif #if IS_ENABLED(CONFIG_IP_DCCP) u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport) { u64 seq; net_secret_init(); seq = siphash_3u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; } EXPORT_SYMBOL(secure_dccp_sequence_number); #if IS_ENABLED(CONFIG_IPV6) u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, __be16 sport, __be16 dport) { const struct { struct in6_addr saddr; struct in6_addr daddr; __be16 sport; __be16 dport; } __aligned(SIPHASH_ALIGNMENT) combined = { .saddr = *(struct in6_addr *)saddr, .daddr = *(struct in6_addr *)daddr, .sport = sport, .dport = dport }; u64 seq; net_secret_init(); seq = siphash(&combined, offsetofend(typeof(combined), dport), &net_secret); seq += ktime_get_real_ns(); seq &= (1ull << 48) - 1; return seq; } EXPORT_SYMBOL(secure_dccpv6_sequence_number); #endif #endif
6 5 6 4 4 4 7 3 2 10 5 3 93 93 93 6 6 5 8 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1994 Linus Torvalds * * Cyrix stuff, June 1998 by: * - Rafael R. Reilova (moved everything from head.S), * <rreilova@ececs.uc.edu> * - Channing Corn (tests & fixes), * - Andrew D. Balsa (code cleanup). */ #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/nospec.h> #include <linux/prctl.h> #include <linux/sched/smt.h> #include <linux/pgtable.h> #include <linux/bpf.h> #include <asm/spec-ctrl.h> #include <asm/cmdline.h> #include <asm/bugs.h> #include <asm/processor.h> #include <asm/processor-flags.h> #include <asm/fpu/api.h> #include <asm/msr.h> #include <asm/vmx.h> #include <asm/paravirt.h> #include <asm/intel-family.h> #include <asm/e820/api.h> #include <asm/hypervisor.h> #include <asm/tlbflush.h> #include <asm/cpu.h> #include "cpu.h" static void __init spectre_v1_select_mitigation(void); static void __init spectre_v2_select_mitigation(void); static void __init retbleed_select_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); static void __init ssb_select_mitigation(void); static void __init l1tf_select_mitigation(void); static void __init mds_select_mitigation(void); static void __init md_clear_update_mitigation(void); static void __init md_clear_select_mitigation(void); static void __init taa_select_mitigation(void); static void __init mmio_select_mitigation(void); static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); static void __init gds_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; EXPORT_SYMBOL_GPL(x86_spec_ctrl_base); /* The current value of the SPEC_CTRL MSR with task-specific bits set */ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; EXPORT_SYMBOL_GPL(x86_pred_cmd); static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); wrmsrl(MSR_IA32_SPEC_CTRL, val); } /* * Keep track of the SPEC_CTRL MSR value for the current task, which may differ * from x86_spec_ctrl_base due to STIBP/SSB in __speculation_ctrl_update(). */ void update_spec_ctrl_cond(u64 val) { if (this_cpu_read(x86_spec_ctrl_current) == val) return; this_cpu_write(x86_spec_ctrl_current, val); /* * When KERNEL_IBRS this MSR is written on return-to-user, unless * forced the update can be delayed until that time. */ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) wrmsrl(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) { return this_cpu_read(x86_spec_ctrl_current); } EXPORT_SYMBOL_GPL(spec_ctrl_current); /* * AMD specific MSR info for Speculative Store Bypass control. * x86_amd_ls_cfg_ssbd_mask is initialized in identify_boot_cpu(). */ u64 __ro_after_init x86_amd_ls_cfg_base; u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask; /* Control conditional STIBP in switch_to() */ DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp); /* Control conditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb); /* Control unconditional IBPB in switch_mm() */ DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb); /* Control MDS CPU buffer clear before returning to user space */ DEFINE_STATIC_KEY_FALSE(mds_user_clear); EXPORT_SYMBOL_GPL(mds_user_clear); /* Control MDS CPU buffer clear before idling (halt, mwait) */ DEFINE_STATIC_KEY_FALSE(mds_idle_clear); EXPORT_SYMBOL_GPL(mds_idle_clear); /* * Controls whether l1d flush based mitigations are enabled, * based on hw features and admin setting via boot parameter * defaults to false */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); /* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); EXPORT_SYMBOL_GPL(mmio_stale_data_clear); void __init cpu_select_mitigations(void) { /* * Read the SPEC_CTRL MSR to account for reserved bits which may * have unknown values. AMD64_LS_CFG MSR is cached in the early AMD * init code as it is not enumerated and depends on the family. */ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); /* * Previously running kernel (kexec), may have some controls * turned ON. Clear them and let the mitigations setup below * rediscover them based on configuration. */ x86_spec_ctrl_base &= ~SPEC_CTRL_MITIGATIONS_MASK; } /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); /* * retbleed_select_mitigation() relies on the state set by * spectre_v2_select_mitigation(); specifically it wants to know about * spectre_v2=ibrs. */ retbleed_select_mitigation(); /* * spectre_v2_user_select_mitigation() relies on the state set by * retbleed_select_mitigation(); specifically the STIBP selection is * forced for UNRET or IBPB. */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); md_clear_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); /* * srso_select_mitigation() depends and must run after * retbleed_select_mitigation(). */ srso_select_mitigation(); gds_select_mitigation(); } /* * NOTE: This function is *only* called for SVM, since Intel uses * MSR_IA32_SPEC_CTRL for SSBD. */ void x86_virt_spec_ctrl(u64 guest_virt_spec_ctrl, bool setguest) { u64 guestval, hostval; struct thread_info *ti = current_thread_info(); /* * If SSBD is not handled in MSR_SPEC_CTRL on AMD, update * MSR_AMD64_L2_CFG or MSR_VIRT_SPEC_CTRL if supported. */ if (!static_cpu_has(X86_FEATURE_LS_CFG_SSBD) && !static_cpu_has(X86_FEATURE_VIRT_SSBD)) return; /* * If the host has SSBD mitigation enabled, force it in the host's * virtual MSR value. If its not permanently enabled, evaluate * current's TIF_SSBD thread flag. */ if (static_cpu_has(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE)) hostval = SPEC_CTRL_SSBD; else hostval = ssbd_tif_to_spec_ctrl(ti->flags); /* Sanitize the guest value */ guestval = guest_virt_spec_ctrl & SPEC_CTRL_SSBD; if (hostval != guestval) { unsigned long tif; tif = setguest ? ssbd_spec_ctrl_to_tif(guestval) : ssbd_spec_ctrl_to_tif(hostval); speculation_ctrl_update(tif); } } EXPORT_SYMBOL_GPL(x86_virt_spec_ctrl); static void x86_amd_ssb_disable(void) { u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) wrmsrl(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt #define pr_fmt(fmt) "MDS: " fmt /* Default mitigation for MDS-affected CPUs */ static enum mds_mitigations mds_mitigation __ro_after_init = MDS_MITIGATION_FULL; static bool mds_nosmt __ro_after_init = false; static const char * const mds_strings[] = { [MDS_MITIGATION_OFF] = "Vulnerable", [MDS_MITIGATION_FULL] = "Mitigation: Clear CPU buffers", [MDS_MITIGATION_VMWERV] = "Vulnerable: Clear CPU buffers attempted, no microcode", }; static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { mds_mitigation = MDS_MITIGATION_OFF; return; } if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; static_branch_enable(&mds_user_clear); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); } } static int __init mds_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MDS)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) mds_mitigation = MDS_MITIGATION_OFF; else if (!strcmp(str, "full")) mds_mitigation = MDS_MITIGATION_FULL; else if (!strcmp(str, "full,nosmt")) { mds_mitigation = MDS_MITIGATION_FULL; mds_nosmt = true; } return 0; } early_param("mds", mds_cmdline); #undef pr_fmt #define pr_fmt(fmt) "TAA: " fmt enum taa_mitigations { TAA_MITIGATION_OFF, TAA_MITIGATION_UCODE_NEEDED, TAA_MITIGATION_VERW, TAA_MITIGATION_TSX_DISABLED, }; /* Default mitigation for TAA-affected CPUs */ static enum taa_mitigations taa_mitigation __ro_after_init = TAA_MITIGATION_VERW; static bool taa_nosmt __ro_after_init; static const char * const taa_strings[] = { [TAA_MITIGATION_OFF] = "Vulnerable", [TAA_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", [TAA_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; static void __init taa_select_mitigation(void) { u64 ia32_cap; if (!boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_OFF; return; } /* TSX previously disabled by tsx=off */ if (!boot_cpu_has(X86_FEATURE_RTM)) { taa_mitigation = TAA_MITIGATION_TSX_DISABLED; return; } if (cpu_mitigations_off()) { taa_mitigation = TAA_MITIGATION_OFF; return; } /* * TAA mitigation via VERW is turned off if both * tsx_async_abort=off and mds=off are specified. */ if (taa_mitigation == TAA_MITIGATION_OFF && mds_mitigation == MDS_MITIGATION_OFF) return; if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) taa_mitigation = TAA_MITIGATION_VERW; else taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. * A microcode update fixes this behavior to clear CPU buffers. It also * adds support for MSR_IA32_TSX_CTRL which is enumerated by the * ARCH_CAP_TSX_CTRL_MSR bit. * * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode * update is required. */ ia32_cap = x86_read_arch_cap_msr(); if ( (ia32_cap & ARCH_CAP_MDS_NO) && !(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)) taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; /* * TSX is enabled, select alternate mitigation for TAA which is * the same as MDS. Enable MDS static branch to clear CPU buffers. * * For guests that can't determine whether the correct microcode is * present on host, enable the mitigation for UCODE_NEEDED as well. */ static_branch_enable(&mds_user_clear); if (taa_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } static int __init tsx_async_abort_parse_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_TAA)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) { taa_mitigation = TAA_MITIGATION_OFF; } else if (!strcmp(str, "full")) { taa_mitigation = TAA_MITIGATION_VERW; } else if (!strcmp(str, "full,nosmt")) { taa_mitigation = TAA_MITIGATION_VERW; taa_nosmt = true; } return 0; } early_param("tsx_async_abort", tsx_async_abort_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "MMIO Stale Data: " fmt enum mmio_mitigations { MMIO_MITIGATION_OFF, MMIO_MITIGATION_UCODE_NEEDED, MMIO_MITIGATION_VERW, }; /* Default mitigation for Processor MMIO Stale Data vulnerabilities */ static enum mmio_mitigations mmio_mitigation __ro_after_init = MMIO_MITIGATION_VERW; static bool mmio_nosmt __ro_after_init = false; static const char * const mmio_strings[] = { [MMIO_MITIGATION_OFF] = "Vulnerable", [MMIO_MITIGATION_UCODE_NEEDED] = "Vulnerable: Clear CPU buffers attempted, no microcode", [MMIO_MITIGATION_VERW] = "Mitigation: Clear CPU buffers", }; static void __init mmio_select_mitigation(void) { u64 ia32_cap; if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } if (mmio_mitigation == MMIO_MITIGATION_OFF) return; ia32_cap = x86_read_arch_cap_msr(); /* * Enable CPU buffer clear mitigation for host and VMM, if also affected * by MDS or TAA. Otherwise, enable mitigation for VMM only. */ if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM))) static_branch_enable(&mds_user_clear); else static_branch_enable(&mmio_stale_data_clear); /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can * be propagated to uncore buffers, clearing the Fill buffers on idle * is required irrespective of SMT state. */ if (!(ia32_cap & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); /* * Check if the system has the right microcode. * * CPU Fill buffer clear mitigation is enumerated by either an explicit * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS * affected systems. */ if ((ia32_cap & ARCH_CAP_FB_CLEAR) || (boot_cpu_has(X86_FEATURE_MD_CLEAR) && boot_cpu_has(X86_FEATURE_FLUSH_L1D) && !(ia32_cap & ARCH_CAP_MDS_NO))) mmio_mitigation = MMIO_MITIGATION_VERW; else mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } static int __init mmio_stale_data_parse_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) { mmio_mitigation = MMIO_MITIGATION_OFF; } else if (!strcmp(str, "full")) { mmio_mitigation = MMIO_MITIGATION_VERW; } else if (!strcmp(str, "full,nosmt")) { mmio_mitigation = MMIO_MITIGATION_VERW; mmio_nosmt = true; } return 0; } early_param("mmio_stale_data", mmio_stale_data_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "" fmt static void __init md_clear_update_mitigation(void) { if (cpu_mitigations_off()) return; if (!static_key_enabled(&mds_user_clear)) goto out; /* * mds_user_clear is now enabled. Update MDS, TAA and MMIO Stale Data * mitigation, if necessary. */ if (mds_mitigation == MDS_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MDS)) { mds_mitigation = MDS_MITIGATION_FULL; mds_select_mitigation(); } if (taa_mitigation == TAA_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_TAA)) { taa_mitigation = TAA_MITIGATION_VERW; taa_select_mitigation(); } if (mmio_mitigation == MMIO_MITIGATION_OFF && boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { mmio_mitigation = MMIO_MITIGATION_VERW; mmio_select_mitigation(); } out: if (boot_cpu_has_bug(X86_BUG_MDS)) pr_info("MDS: %s\n", mds_strings[mds_mitigation]); if (boot_cpu_has_bug(X86_BUG_TAA)) pr_info("TAA: %s\n", taa_strings[taa_mitigation]); if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) pr_info("MMIO Stale Data: Unknown: No mitigations\n"); } static void __init md_clear_select_mitigation(void) { mds_select_mitigation(); taa_select_mitigation(); mmio_select_mitigation(); /* * As MDS, TAA and MMIO Stale Data mitigations are inter-related, update * and print their mitigation after MDS, TAA and MMIO Stale Data * mitigation selection is done. */ md_clear_update_mitigation(); } #undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, SRBDS_MITIGATION_HYPERVISOR, }; static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", [SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [SRBDS_MITIGATION_FULL] = "Mitigation: Microcode", [SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled", [SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", }; static bool srbds_off; void update_srbds_msr(void) { u64 mcu_ctrl; if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) return; if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED) return; /* * A MDS_NO CPU for which SRBDS mitigation is not needed due to TSX * being disabled and it hasn't received the SRBDS MSR microcode. */ if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: case SRBDS_MITIGATION_TSX_OFF: mcu_ctrl |= RNGDS_MITG_DIS; break; case SRBDS_MITIGATION_FULL: mcu_ctrl &= ~RNGDS_MITG_DIS; break; default: break; } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { u64 ia32_cap; if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return; /* * Check to see if this is one of the MDS_NO systems supporting TSX that * are only exposed to SRBDS when TSX is enabled or when CPU is affected * by Processor MMIO Stale Data vulnerability. */ ia32_cap = x86_read_arch_cap_msr(); if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM) && !boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) srbds_mitigation = SRBDS_MITIGATION_TSX_OFF; else if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; else if (cpu_mitigations_off() || srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } static int __init srbds_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!boot_cpu_has_bug(X86_BUG_SRBDS)) return 0; srbds_off = !strcmp(str, "off"); return 0; } early_param("srbds", srbds_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "L1D Flush : " fmt enum l1d_flush_mitigations { L1D_FLUSH_OFF = 0, L1D_FLUSH_ON, }; static enum l1d_flush_mitigations l1d_flush_mitigation __initdata = L1D_FLUSH_OFF; static void __init l1d_flush_select_mitigation(void) { if (!l1d_flush_mitigation || !boot_cpu_has(X86_FEATURE_FLUSH_L1D)) return; static_branch_enable(&switch_mm_cond_l1d_flush); pr_info("Conditional flush on switch_mm() enabled\n"); } static int __init l1d_flush_parse_cmdline(char *str) { if (!strcmp(str, "on")) l1d_flush_mitigation = L1D_FLUSH_ON; return 0; } early_param("l1d_flush", l1d_flush_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "GDS: " fmt enum gds_mitigations { GDS_MITIGATION_OFF, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, GDS_MITIGATION_FULL_LOCKED, GDS_MITIGATION_HYPERVISOR, }; #if IS_ENABLED(CONFIG_GDS_FORCE_MITIGATION) static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FORCE; #else static enum gds_mitigations gds_mitigation __ro_after_init = GDS_MITIGATION_FULL; #endif static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", [GDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [GDS_MITIGATION_FORCE] = "Mitigation: AVX disabled, no microcode", [GDS_MITIGATION_FULL] = "Mitigation: Microcode", [GDS_MITIGATION_FULL_LOCKED] = "Mitigation: Microcode (locked)", [GDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status", }; bool gds_ucode_mitigated(void) { return (gds_mitigation == GDS_MITIGATION_FULL || gds_mitigation == GDS_MITIGATION_FULL_LOCKED); } EXPORT_SYMBOL_GPL(gds_ucode_mitigated); void update_gds_msr(void) { u64 mcu_ctrl_after; u64 mcu_ctrl; switch (gds_mitigation) { case GDS_MITIGATION_OFF: rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: /* * The LOCKED state comes from the boot CPU. APs might not have * the same state. Make sure the mitigation is enabled on all * CPUs. */ case GDS_MITIGATION_FULL: rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: return; } wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. */ rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } static void __init gds_select_mitigation(void) { u64 mcu_ctrl; if (!boot_cpu_has_bug(X86_BUG_GDS)) return; if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; goto out; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ /* No microcode */ if (!(x86_read_arch_cap_msr() & ARCH_CAP_GDS_CTRL)) { if (gds_mitigation == GDS_MITIGATION_FORCE) { /* * This only needs to be done on the boot CPU so do it * here rather than in update_gds_msr() */ setup_clear_cpu_cap(X86_FEATURE_AVX); pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); } else { gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; } goto out; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); /* * The mitigation is selected from the boot CPU. All other CPUs * _should_ have the same state. If the boot CPU isn't locked * but others are then update_gds_msr() will WARN() of the state * mismatch. If the boot CPU is locked update_gds_msr() will * ensure the other CPUs have the mitigation enabled. */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } update_gds_msr(); out: pr_info("%s\n", gds_strings[gds_mitigation]); } static int __init gds_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!boot_cpu_has_bug(X86_BUG_GDS)) return 0; if (!strcmp(str, "off")) gds_mitigation = GDS_MITIGATION_OFF; else if (!strcmp(str, "force")) gds_mitigation = GDS_MITIGATION_FORCE; return 0; } early_param("gather_data_sampling", gds_parse_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Spectre V1 : " fmt enum spectre_v1_mitigation { SPECTRE_V1_MITIGATION_NONE, SPECTRE_V1_MITIGATION_AUTO, }; static enum spectre_v1_mitigation spectre_v1_mitigation __ro_after_init = SPECTRE_V1_MITIGATION_AUTO; static const char * const spectre_v1_strings[] = { [SPECTRE_V1_MITIGATION_NONE] = "Vulnerable: __user pointer sanitization and usercopy barriers only; no swapgs barriers", [SPECTRE_V1_MITIGATION_AUTO] = "Mitigation: usercopy/swapgs barriers and __user pointer sanitization", }; /* * Does SMAP provide full mitigation against speculative kernel access to * userspace? */ static bool smap_works_speculatively(void) { if (!boot_cpu_has(X86_FEATURE_SMAP)) return false; /* * On CPUs which are vulnerable to Meltdown, SMAP does not * prevent speculative access to user data in the L1 cache. * Consider SMAP to be non-functional as a mitigation on these * CPUs. */ if (boot_cpu_has(X86_BUG_CPU_MELTDOWN)) return false; return true; } static void __init spectre_v1_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; return; } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* * With Spectre v1, a user can speculatively control either * path of a conditional swapgs with a user-controlled GS * value. The mitigation is to add lfences to both code paths. * * If FSGSBASE is enabled, the user can put a kernel address in * GS, in which case SMAP provides no protection. * * If FSGSBASE is disabled, the user can only put a user space * address in GS. That makes an attack harder, but still * possible if there's no SMAP protection. */ if (boot_cpu_has(X86_FEATURE_FSGSBASE) || !smap_works_speculatively()) { /* * Mitigation can be provided from SWAPGS itself or * PTI as the CR3 write in the Meltdown mitigation * is serializing. * * If neither is there, mitigate with an LFENCE to * stop speculation through swapgs. */ if (boot_cpu_has_bug(X86_BUG_SWAPGS) && !boot_cpu_has(X86_FEATURE_PTI)) setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_USER); /* * Enable lfences in the kernel entry (non-swapgs) * paths, to prevent user entry from speculatively * skipping swapgs. */ setup_force_cpu_cap(X86_FEATURE_FENCE_SWAPGS_KERNEL); } } pr_info("%s\n", spectre_v1_strings[spectre_v1_mitigation]); } static int __init nospectre_v1_cmdline(char *str) { spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; return 0; } early_param("nospectre_v1", nospectre_v1_cmdline); enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, RETBLEED_MITIGATION_EIBRS, RETBLEED_MITIGATION_STUFF, }; enum retbleed_mitigation_cmd { RETBLEED_CMD_OFF, RETBLEED_CMD_AUTO, RETBLEED_CMD_UNRET, RETBLEED_CMD_IBPB, RETBLEED_CMD_STUFF, }; static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", [RETBLEED_MITIGATION_IBPB] = "Mitigation: IBPB", [RETBLEED_MITIGATION_IBRS] = "Mitigation: IBRS", [RETBLEED_MITIGATION_EIBRS] = "Mitigation: Enhanced IBRS", [RETBLEED_MITIGATION_STUFF] = "Mitigation: Stuffing", }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = RETBLEED_MITIGATION_NONE; static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = RETBLEED_CMD_AUTO; static int __ro_after_init retbleed_nosmt = false; static int __init retbleed_parse_cmdline(char *str) { if (!str) return -EINVAL; while (str) { char *next = strchr(str, ','); if (next) { *next = 0; next++; } if (!strcmp(str, "off")) { retbleed_cmd = RETBLEED_CMD_OFF; } else if (!strcmp(str, "auto")) { retbleed_cmd = RETBLEED_CMD_AUTO; } else if (!strcmp(str, "unret")) { retbleed_cmd = RETBLEED_CMD_UNRET; } else if (!strcmp(str, "ibpb")) { retbleed_cmd = RETBLEED_CMD_IBPB; } else if (!strcmp(str, "stuff")) { retbleed_cmd = RETBLEED_CMD_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { setup_force_cpu_bug(X86_BUG_RETBLEED); } else { pr_err("Ignoring unknown retbleed option (%s).", str); } str = next; } return 0; } early_param("retbleed", retbleed_parse_cmdline); #define RETBLEED_UNTRAIN_MSG "WARNING: BTB untrained return thunk mitigation is only effective on AMD/Hygon!\n" #define RETBLEED_INTEL_MSG "WARNING: Spectre v2 mitigation leaves CPU vulnerable to RETBleed attacks, data leaks possible!\n" static void __init retbleed_select_mitigation(void) { bool mitigate_smt = false; if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) return; switch (retbleed_cmd) { case RETBLEED_CMD_OFF: return; case RETBLEED_CMD_UNRET: if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else { pr_err("WARNING: kernel not compiled with CPU_UNRET_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); goto do_cmd_auto; } else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); goto do_cmd_auto; } break; case RETBLEED_CMD_STUFF: if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING) && spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else { if (IS_ENABLED(CONFIG_CALL_DEPTH_TRACKING)) pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); else pr_err("WARNING: kernel not compiled with CALL_DEPTH_TRACKING.\n"); goto do_cmd_auto; } break; do_cmd_auto: case RETBLEED_CMD_AUTO: if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { if (IS_ENABLED(CONFIG_CPU_UNRET_ENTRY)) retbleed_mitigation = RETBLEED_MITIGATION_UNRET; else if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY) && boot_cpu_has(X86_FEATURE_IBPB)) retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } /* * The Intel mitigation (IBRS or eIBRS) was already selected in * spectre_v2_select_mitigation(). 'retbleed_mitigation' will * be set accordingly below. */ break; } switch (retbleed_mitigation) { case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); x86_return_thunk = retbleed_return_thunk; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) pr_err(RETBLEED_UNTRAIN_MSG); mitigate_smt = true; break; case RETBLEED_MITIGATION_IBPB: setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); mitigate_smt = true; break; case RETBLEED_MITIGATION_STUFF: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); x86_return_thunk = call_depth_return_thunk; break; default: break; } if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); /* * Let IBRS trump all on Intel without affecting the effects of the * retbleed= cmdline option except for call depth based stuffing */ if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { switch (spectre_v2_enabled) { case SPECTRE_V2_IBRS: retbleed_mitigation = RETBLEED_MITIGATION_IBRS; break; case SPECTRE_V2_EIBRS: case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_EIBRS_LFENCE: retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; break; default: if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) pr_err(RETBLEED_INTEL_MSG); } } pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_user_mitigation spectre_v2_user_stibp __ro_after_init = SPECTRE_V2_USER_NONE; static enum spectre_v2_user_mitigation spectre_v2_user_ibpb __ro_after_init = SPECTRE_V2_USER_NONE; #ifdef CONFIG_RETPOLINE static bool spectre_v2_bad_module; bool retpoline_module_ok(bool has_retpoline) { if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) return true; pr_err("System may be vulnerable to spectre v2\n"); spectre_v2_bad_module = true; return false; } static inline const char *spectre_v2_module_string(void) { return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; } #else static inline const char *spectre_v2_module_string(void) { return ""; } #endif #define SPECTRE_V2_LFENCE_MSG "WARNING: LFENCE mitigation is not recommended for this CPU, data leaks possible!\n" #define SPECTRE_V2_EIBRS_EBPF_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS on, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG "WARNING: Unprivileged eBPF is enabled with eIBRS+LFENCE mitigation and SMT, data leaks possible via Spectre v2 BHB attacks!\n" #define SPECTRE_V2_IBRS_PERF_MSG "WARNING: IBRS mitigation selected on Enhanced IBRS CPU, this may cause unnecessary performance loss\n" #ifdef CONFIG_BPF_SYSCALL void unpriv_ebpf_notify(int new_state) { if (new_state) return; /* Unprivileged eBPF is enabled */ switch (spectre_v2_enabled) { case SPECTRE_V2_EIBRS: pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); break; case SPECTRE_V2_EIBRS_LFENCE: if (sched_smt_active()) pr_err(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); break; default: break; } } #endif static inline bool match_option(const char *arg, int arglen, const char *opt) { int len = strlen(opt); return len == arglen && !strncmp(arg, opt, len); } /* The kernel command line selection for spectre v2 */ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_NONE, SPECTRE_V2_CMD_AUTO, SPECTRE_V2_CMD_FORCE, SPECTRE_V2_CMD_RETPOLINE, SPECTRE_V2_CMD_RETPOLINE_GENERIC, SPECTRE_V2_CMD_RETPOLINE_LFENCE, SPECTRE_V2_CMD_EIBRS, SPECTRE_V2_CMD_EIBRS_RETPOLINE, SPECTRE_V2_CMD_EIBRS_LFENCE, SPECTRE_V2_CMD_IBRS, }; enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, SPECTRE_V2_USER_CMD_FORCE, SPECTRE_V2_USER_CMD_PRCTL, SPECTRE_V2_USER_CMD_PRCTL_IBPB, SPECTRE_V2_USER_CMD_SECCOMP, SPECTRE_V2_USER_CMD_SECCOMP_IBPB, }; static const char * const spectre_v2_user_strings[] = { [SPECTRE_V2_USER_NONE] = "User space: Vulnerable", [SPECTRE_V2_USER_STRICT] = "User space: Mitigation: STIBP protection", [SPECTRE_V2_USER_STRICT_PREFERRED] = "User space: Mitigation: STIBP always-on protection", [SPECTRE_V2_USER_PRCTL] = "User space: Mitigation: STIBP via prctl", [SPECTRE_V2_USER_SECCOMP] = "User space: Mitigation: STIBP via seccomp and prctl", }; static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; } v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, { "prctl", SPECTRE_V2_USER_CMD_PRCTL, false }, { "prctl,ibpb", SPECTRE_V2_USER_CMD_PRCTL_IBPB, false }, { "seccomp", SPECTRE_V2_USER_CMD_SECCOMP, false }, { "seccomp,ibpb", SPECTRE_V2_USER_CMD_SECCOMP_IBPB, false }, }; static void __init spec_v2_user_print_cond(const char *reason, bool secure) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { char arg[20]; int ret, i; switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; case SPECTRE_V2_CMD_FORCE: return SPECTRE_V2_USER_CMD_FORCE; default: break; } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) return SPECTRE_V2_USER_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { spec_v2_user_print_cond(v2_user_options[i].option, v2_user_options[i].secure); return v2_user_options[i].cmd; } } pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); return SPECTRE_V2_USER_CMD_AUTO; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) { return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } static void __init spectre_v2_user_select_mitigation(void) { enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; bool smt_possible = IS_ENABLED(CONFIG_SMP); enum spectre_v2_user_cmd cmd; if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; if (cpu_smt_control == CPU_SMT_FORCE_DISABLED || cpu_smt_control == CPU_SMT_NOT_SUPPORTED) smt_possible = false; cmd = spectre_v2_parse_user_cmdline(); switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: goto set_mode; case SPECTRE_V2_USER_CMD_FORCE: mode = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: mode = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: if (IS_ENABLED(CONFIG_SECCOMP)) mode = SPECTRE_V2_USER_SECCOMP; else mode = SPECTRE_V2_USER_PRCTL; break; } /* Initialize Indirect Branch Prediction Barrier */ if (boot_cpu_has(X86_FEATURE_IBPB)) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB); spectre_v2_user_ibpb = mode; switch (cmd) { case SPECTRE_V2_USER_CMD_NONE: break; case SPECTRE_V2_USER_CMD_FORCE: case SPECTRE_V2_USER_CMD_PRCTL_IBPB: case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: static_branch_enable(&switch_mm_always_ibpb); spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_PRCTL: case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_SECCOMP: static_branch_enable(&switch_mm_cond_ibpb); break; } pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", static_key_enabled(&switch_mm_always_ibpb) ? "always-on" : "conditional"); } /* * If no STIBP, Intel enhanced IBRS is enabled, or SMT impossible, STIBP * is not required. * * Intel's Enhanced IBRS also protects against cross-thread branch target * injection in user-mode as the IBRS bit remains always set which * implicitly enables cross-thread protections. However, in legacy IBRS * mode, the IBRS bit is set only on kernel entry and cleared on return * to userspace. AMD Automatic IBRS also does not protect userspace. * These modes therefore disable the implicit cross-thread protection, * so allow for STIBP to be selected in those cases. */ if (!boot_cpu_has(X86_FEATURE_STIBP) || !smt_possible || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && !boot_cpu_has(X86_FEATURE_AUTOIBRS))) return; /* * At this point, an STIBP mode other than "off" has been set. * If STIBP support is not being forced, check if STIBP always-on * is preferred. */ if (mode != SPECTRE_V2_USER_STRICT && boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) mode = SPECTRE_V2_USER_STRICT_PREFERRED; if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { if (mode != SPECTRE_V2_USER_STRICT && mode != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); mode = SPECTRE_V2_USER_STRICT_PREFERRED; } spectre_v2_user_stibp = mode; set_mode: pr_info("%s\n", spectre_v2_user_strings[mode]); } static const char * const spectre_v2_strings[] = { [SPECTRE_V2_NONE] = "Vulnerable", [SPECTRE_V2_RETPOLINE] = "Mitigation: Retpolines", [SPECTRE_V2_LFENCE] = "Mitigation: LFENCE", [SPECTRE_V2_EIBRS] = "Mitigation: Enhanced / Automatic IBRS", [SPECTRE_V2_EIBRS_LFENCE] = "Mitigation: Enhanced / Automatic IBRS + LFENCE", [SPECTRE_V2_EIBRS_RETPOLINE] = "Mitigation: Enhanced / Automatic IBRS + Retpolines", [SPECTRE_V2_IBRS] = "Mitigation: IBRS", }; static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; } mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,lfence", SPECTRE_V2_CMD_RETPOLINE_LFENCE, false }, { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, { "eibrs", SPECTRE_V2_CMD_EIBRS, false }, { "eibrs,lfence", SPECTRE_V2_CMD_EIBRS_LFENCE, false }, { "eibrs,retpoline", SPECTRE_V2_CMD_EIBRS_RETPOLINE, false }, { "auto", SPECTRE_V2_CMD_AUTO, false }, { "ibrs", SPECTRE_V2_CMD_IBRS, false }, }; static void __init spec_v2_print_cond(const char *reason, bool secure) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) != secure) pr_info("%s selected on command line.\n", reason); } static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; char arg[20]; int ret, i; if (cmdline_find_option_bool(boot_command_line, "nospectre_v2") || cpu_mitigations_off()) return SPECTRE_V2_CMD_NONE; ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, sizeof(arg)); if (ret < 0) return SPECTRE_V2_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { if (!match_option(arg, ret, mitigation_options[i].option)) continue; cmd = mitigation_options[i].cmd; break; } if (i >= ARRAY_SIZE(mitigation_options)) { pr_err("unknown option (%s). Switching to AUTO select\n", arg); return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE || cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !IS_ENABLED(CONFIG_RETPOLINE)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_EIBRS || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_RETPOLINE) && !boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { pr_err("%s selected but CPU doesn't have Enhanced or Automatic IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if ((cmd == SPECTRE_V2_CMD_RETPOLINE_LFENCE || cmd == SPECTRE_V2_CMD_EIBRS_LFENCE) && !boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { pr_err("%s selected, but CPU doesn't have a serializing LFENCE. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && !IS_ENABLED(CONFIG_CPU_IBRS_ENTRY)) { pr_err("%s selected but not compiled in. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { pr_err("%s selected but not Intel CPU. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && !boot_cpu_has(X86_FEATURE_IBRS)) { pr_err("%s selected but CPU doesn't have IBRS. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } if (cmd == SPECTRE_V2_CMD_IBRS && cpu_feature_enabled(X86_FEATURE_XENPV)) { pr_err("%s selected but running as XenPV guest. Switching to AUTO select\n", mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } spec_v2_print_cond(mitigation_options[i].option, mitigation_options[i].secure); return cmd; } static enum spectre_v2_mitigation __init spectre_v2_select_retpoline(void) { if (!IS_ENABLED(CONFIG_RETPOLINE)) { pr_err("Kernel not compiled with retpoline; no mitigation available!"); return SPECTRE_V2_NONE; } return SPECTRE_V2_RETPOLINE; } /* Disable in-kernel use of non-RSB RET predictors */ static void __init spec_ctrl_disable_kernel_rrsba(void) { u64 ia32_cap; if (!boot_cpu_has(X86_FEATURE_RRSBA_CTRL)) return; ia32_cap = x86_read_arch_cap_msr(); if (ia32_cap & ARCH_CAP_RRSBA) { x86_spec_ctrl_base |= SPEC_CTRL_RRSBA_DIS_S; update_spec_ctrl(x86_spec_ctrl_base); } } static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) { /* * Similar to context switches, there are two types of RSB attacks * after VM exit: * * 1) RSB underflow * * 2) Poisoned RSB entry * * When retpoline is enabled, both are mitigated by filling/clearing * the RSB. * * When IBRS is enabled, while #1 would be mitigated by the IBRS branch * prediction isolation protections, RSB still needs to be cleared * because of #2. Note that SMEP provides no protection here, unlike * user-space-poisoned RSB entries. * * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB * bug is present then a LITE version of RSB protection is required, * just a single call needs to retire before a RET is executed. */ switch (mode) { case SPECTRE_V2_NONE: return; case SPECTRE_V2_EIBRS_LFENCE: case SPECTRE_V2_EIBRS: if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); } return; case SPECTRE_V2_EIBRS_RETPOLINE: case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_LFENCE: case SPECTRE_V2_IBRS: setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); return; } pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); dump_stack(); } static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; /* * If the CPU is not affected and the command line mode is NONE or AUTO * then nothing to do. */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) return; switch (cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { mode = SPECTRE_V2_EIBRS; break; } if (IS_ENABLED(CONFIG_CPU_IBRS_ENTRY) && boot_cpu_has_bug(X86_BUG_RETBLEED) && retbleed_cmd != RETBLEED_CMD_OFF && retbleed_cmd != RETBLEED_CMD_STUFF && boot_cpu_has(X86_FEATURE_IBRS) && boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { mode = SPECTRE_V2_IBRS; break; } mode = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); mode = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: mode = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: mode = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: mode = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: mode = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: mode = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: mode = SPECTRE_V2_EIBRS_RETPOLINE; break; } if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); if (spectre_v2_in_ibrs_mode(mode)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { x86_spec_ctrl_base |= SPEC_CTRL_IBRS; update_spec_ctrl(x86_spec_ctrl_base); } } switch (mode) { case SPECTRE_V2_NONE: case SPECTRE_V2_EIBRS: break; case SPECTRE_V2_IBRS: setup_force_cpu_cap(X86_FEATURE_KERNEL_IBRS); if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) pr_warn(SPECTRE_V2_IBRS_PERF_MSG); break; case SPECTRE_V2_LFENCE: case SPECTRE_V2_EIBRS_LFENCE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE_LFENCE); fallthrough; case SPECTRE_V2_RETPOLINE: case SPECTRE_V2_EIBRS_RETPOLINE: setup_force_cpu_cap(X86_FEATURE_RETPOLINE); break; } /* * Disable alternate RSB predictions in kernel when indirect CALLs and * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ if (mode == SPECTRE_V2_EIBRS_LFENCE || mode == SPECTRE_V2_EIBRS_RETPOLINE || mode == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); /* * If Spectre v2 protection has been enabled, fill the RSB during a * context switch. In general there are two types of RSB attacks * across context switches, for which the CALLs/RETs may be unbalanced. * * 1) RSB underflow * * Some Intel parts have "bottomless RSB". When the RSB is empty, * speculated return targets may come from the branch predictor, * which could have a user-poisoned BTB or BHB entry. * * AMD has it even worse: *all* returns are speculated from the BTB, * regardless of the state of the RSB. * * When IBRS or eIBRS is enabled, the "user -> kernel" attack * scenario is mitigated by the IBRS branch prediction isolation * properties, so the RSB buffer filling wouldn't be necessary to * protect against this type of attack. * * The "user -> user" attack scenario is mitigated by RSB filling. * * 2) Poisoned RSB entry * * If the 'next' in-kernel return stack is shorter than 'prev', * 'next' could be tricked into speculating with a user-poisoned RSB * entry. * * The "user -> kernel" attack scenario is mitigated by SMEP and * eIBRS. * * The "user -> user" scenario, also known as SpectreBHB, requires * RSB clearing. * * So to mitigate all cases, unconditionally fill RSB on context * switches. * * FIXME: Is this pointless for retbleed-affected AMD? */ setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); spectre_v2_determine_rsb_fill_type_at_vmexit(mode); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS * and Enhanced IBRS protect firmware too, so enable IBRS around * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because * the user might select retpoline on the kernel command line and if * the CPU supports Enhanced IBRS, kernel might un-intentionally not * enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { if (retbleed_cmd != RETBLEED_CMD_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } /* Set up IBPB and STIBP depending on the general spectre V2 command */ spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) { u64 val = spec_ctrl_current() | (x86_spec_ctrl_base & SPEC_CTRL_STIBP); update_spec_ctrl(val); } /* Update x86_spec_ctrl_base in case SMT state changed. */ static void update_stibp_strict(void) { u64 mask = x86_spec_ctrl_base & ~SPEC_CTRL_STIBP; if (sched_smt_active()) mask |= SPEC_CTRL_STIBP; if (mask == x86_spec_ctrl_base) return; pr_info("Update user space SMT mitigation: STIBP %s\n", mask & SPEC_CTRL_STIBP ? "always-on" : "off"); x86_spec_ctrl_base = mask; on_each_cpu(update_stibp_msr, NULL, 1); } /* Update the static key controlling the evaluation of TIF_SPEC_IB */ static void update_indir_branch_cond(void) { if (sched_smt_active()) static_branch_enable(&switch_to_cond_stibp); else static_branch_disable(&switch_to_cond_stibp); } #undef pr_fmt #define pr_fmt(fmt) fmt /* Update the static key controlling the MDS CPU buffer clear in idle */ static void update_mds_branch_idle(void) { u64 ia32_cap = x86_read_arch_cap_msr(); /* * Enable the idle clearing if SMT is active on CPUs which are * affected only by MSBDS and not any other MDS variant. * * The other variants cannot be mitigated when SMT is enabled, so * clearing the buffers on idle just to prevent the Store Buffer * repartitioning leak would be a window dressing exercise. */ if (!boot_cpu_has_bug(X86_BUG_MSBDS_ONLY)) return; if (sched_smt_active()) { static_branch_enable(&mds_idle_clear); } else if (mmio_mitigation == MMIO_MITIGATION_OFF || (ia32_cap & ARCH_CAP_FBSDP_NO)) { static_branch_disable(&mds_idle_clear); } } #define MDS_MSG_SMT "MDS CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/mds.html for more details.\n" #define TAA_MSG_SMT "TAA CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/tsx_async_abort.html for more details.\n" #define MMIO_MSG_SMT "MMIO Stale Data CPU bug present and SMT on, data leak possible. See https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/processor_mmio_stale_data.html for more details.\n" void cpu_bugs_smt_update(void) { mutex_lock(&spec_ctrl_mutex); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) pr_warn_once(SPECTRE_V2_EIBRS_LFENCE_EBPF_SMT_MSG); switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: break; case SPECTRE_V2_USER_STRICT: case SPECTRE_V2_USER_STRICT_PREFERRED: update_stibp_strict(); break; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: update_indir_branch_cond(); break; } switch (mds_mitigation) { case MDS_MITIGATION_FULL: case MDS_MITIGATION_VMWERV: if (sched_smt_active() && !boot_cpu_has(X86_BUG_MSBDS_ONLY)) pr_warn_once(MDS_MSG_SMT); update_mds_branch_idle(); break; case MDS_MITIGATION_OFF: break; } switch (taa_mitigation) { case TAA_MITIGATION_VERW: case TAA_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(TAA_MSG_SMT); break; case TAA_MITIGATION_TSX_DISABLED: case TAA_MITIGATION_OFF: break; } switch (mmio_mitigation) { case MMIO_MITIGATION_VERW: case MMIO_MITIGATION_UCODE_NEEDED: if (sched_smt_active()) pr_warn_once(MMIO_MSG_SMT); break; case MMIO_MITIGATION_OFF: break; } mutex_unlock(&spec_ctrl_mutex); } #undef pr_fmt #define pr_fmt(fmt) "Speculative Store Bypass: " fmt static enum ssb_mitigation ssb_mode __ro_after_init = SPEC_STORE_BYPASS_NONE; /* The kernel command line selection */ enum ssb_mitigation_cmd { SPEC_STORE_BYPASS_CMD_NONE, SPEC_STORE_BYPASS_CMD_AUTO, SPEC_STORE_BYPASS_CMD_ON, SPEC_STORE_BYPASS_CMD_PRCTL, SPEC_STORE_BYPASS_CMD_SECCOMP, }; static const char * const ssb_strings[] = { [SPEC_STORE_BYPASS_NONE] = "Vulnerable", [SPEC_STORE_BYPASS_DISABLE] = "Mitigation: Speculative Store Bypass disabled", [SPEC_STORE_BYPASS_PRCTL] = "Mitigation: Speculative Store Bypass disabled via prctl", [SPEC_STORE_BYPASS_SECCOMP] = "Mitigation: Speculative Store Bypass disabled via prctl and seccomp", }; static const struct { const char *option; enum ssb_mitigation_cmd cmd; } ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ { "prctl", SPEC_STORE_BYPASS_CMD_PRCTL }, /* Disable Speculative Store Bypass via prctl */ { "seccomp", SPEC_STORE_BYPASS_CMD_SECCOMP }, /* Disable Speculative Store Bypass via prctl and seccomp */ }; static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) { enum ssb_mitigation_cmd cmd = SPEC_STORE_BYPASS_CMD_AUTO; char arg[20]; int ret, i; if (cmdline_find_option_bool(boot_command_line, "nospec_store_bypass_disable") || cpu_mitigations_off()) { return SPEC_STORE_BYPASS_CMD_NONE; } else { ret = cmdline_find_option(boot_command_line, "spec_store_bypass_disable", arg, sizeof(arg)); if (ret < 0) return SPEC_STORE_BYPASS_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(ssb_mitigation_options); i++) { if (!match_option(arg, ret, ssb_mitigation_options[i].option)) continue; cmd = ssb_mitigation_options[i].cmd; break; } if (i >= ARRAY_SIZE(ssb_mitigation_options)) { pr_err("unknown option (%s). Switching to AUTO select\n", arg); return SPEC_STORE_BYPASS_CMD_AUTO; } } return cmd; } static enum ssb_mitigation __init __ssb_select_mitigation(void) { enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) return mode; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) return mode; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: /* * Choose prctl+seccomp as the default mode if seccomp is * enabled. */ if (IS_ENABLED(CONFIG_SECCOMP)) mode = SPEC_STORE_BYPASS_SECCOMP; else mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ if (mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may * use a completely different MSR and bit dependent on family. */ if (!static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) && !static_cpu_has(X86_FEATURE_AMD_SSBD)) { x86_amd_ssb_disable(); } else { x86_spec_ctrl_base |= SPEC_CTRL_SSBD; update_spec_ctrl(x86_spec_ctrl_base); } } return mode; } static void ssb_select_mitigation(void) { ssb_mode = __ssb_select_mitigation(); if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt #define pr_fmt(fmt) "Speculation prctl: " fmt static void task_update_spec_tif(struct task_struct *tsk) { /* Force the update of the real TIF bits */ set_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE); /* * Immediately update the speculation control MSRs for the current * task, but for a non-current task delay setting the CPU * mitigation until it is scheduled next. * * This can only happen for SECCOMP mitigation. For PRCTL it's * always the current task. */ if (tsk == current) speculation_ctrl_update_current(); } static int l1d_flush_prctl_set(struct task_struct *task, unsigned long ctrl) { if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) return -EPERM; switch (ctrl) { case PR_SPEC_ENABLE: set_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); return 0; case PR_SPEC_DISABLE: clear_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH); return 0; default: return -ERANGE; } } static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) { if (ssb_mode != SPEC_STORE_BYPASS_PRCTL && ssb_mode != SPEC_STORE_BYPASS_SECCOMP) return -ENXIO; switch (ctrl) { case PR_SPEC_ENABLE: /* If speculation is force disabled, enable is not allowed */ if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE_NOEXEC: if (task_spec_ssb_force_disable(task)) return -EPERM; task_set_spec_ssb_disable(task); task_set_spec_ssb_noexec(task); task_update_spec_tif(task); break; default: return -ERANGE; } return 0; } static bool is_spec_ib_user_controlled(void) { return spectre_v2_user_ibpb == SPECTRE_V2_USER_PRCTL || spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP; } static int ib_prctl_set(struct task_struct *task, unsigned long ctrl) { switch (ctrl) { case PR_SPEC_ENABLE: if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return 0; /* * With strict mode for both IBPB and STIBP, the instruction * code paths avoid checking this task flag and instead, * unconditionally run the instruction. However, STIBP and IBPB * are independent and either can be set to conditionally * enabled regardless of the mode of the other. * * If either is set to conditional, allow the task flag to be * updated, unless it was force-disabled by a previous prctl * call. Currently, this is possible on an AMD CPU which has the * feature X86_FEATURE_AMD_STIBP_ALWAYS_ON. In this case, if the * kernel is booted with 'spectre_v2_user=seccomp', then * spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP and * spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED. */ if (!is_spec_ib_user_controlled() || task_spec_ib_force_disable(task)) return -EPERM; task_clear_spec_ib_disable(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: case PR_SPEC_FORCE_DISABLE: /* * Indirect branch speculation is always allowed when * mitigation is force disabled. */ if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return -EPERM; if (!is_spec_ib_user_controlled()) return 0; task_set_spec_ib_disable(task); if (ctrl == PR_SPEC_FORCE_DISABLE) task_set_spec_ib_force_disable(task); task_update_spec_tif(task); if (task == current) indirect_branch_prediction_barrier(); break; default: return -ERANGE; } return 0; } int arch_prctl_spec_ctrl_set(struct task_struct *task, unsigned long which, unsigned long ctrl) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_set(task, ctrl); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_set(task, ctrl); case PR_SPEC_L1D_FLUSH: return l1d_flush_prctl_set(task, ctrl); default: return -ENODEV; } } #ifdef CONFIG_SECCOMP void arch_seccomp_spec_mitigate(struct task_struct *task) { if (ssb_mode == SPEC_STORE_BYPASS_SECCOMP) ssb_prctl_set(task, PR_SPEC_FORCE_DISABLE); if (spectre_v2_user_ibpb == SPECTRE_V2_USER_SECCOMP || spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) ib_prctl_set(task, PR_SPEC_FORCE_DISABLE); } #endif static int l1d_flush_prctl_get(struct task_struct *task) { if (!static_branch_unlikely(&switch_mm_cond_l1d_flush)) return PR_SPEC_FORCE_DISABLE; if (test_ti_thread_flag(&task->thread_info, TIF_SPEC_L1D_FLUSH)) return PR_SPEC_PRCTL | PR_SPEC_ENABLE; else return PR_SPEC_PRCTL | PR_SPEC_DISABLE; } static int ssb_prctl_get(struct task_struct *task) { switch (ssb_mode) { case SPEC_STORE_BYPASS_NONE: if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) return PR_SPEC_ENABLE; return PR_SPEC_NOT_AFFECTED; case SPEC_STORE_BYPASS_DISABLE: return PR_SPEC_DISABLE; case SPEC_STORE_BYPASS_SECCOMP: case SPEC_STORE_BYPASS_PRCTL: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ssb_noexec(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; } BUG(); } static int ib_prctl_get(struct task_struct *task) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return PR_SPEC_NOT_AFFECTED; if (spectre_v2_user_ibpb == SPECTRE_V2_USER_NONE && spectre_v2_user_stibp == SPECTRE_V2_USER_NONE) return PR_SPEC_ENABLE; else if (is_spec_ib_user_controlled()) { if (task_spec_ib_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; if (task_spec_ib_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; } else if (spectre_v2_user_ibpb == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED) return PR_SPEC_DISABLE; else return PR_SPEC_NOT_AFFECTED; } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) { switch (which) { case PR_SPEC_STORE_BYPASS: return ssb_prctl_get(task); case PR_SPEC_INDIRECT_BRANCH: return ib_prctl_get(task); case PR_SPEC_L1D_FLUSH: return l1d_flush_prctl_get(task); default: return -ENODEV; } } void x86_spec_ctrl_setup_ap(void) { if (boot_cpu_has(X86_FEATURE_MSR_SPEC_CTRL)) update_spec_ctrl(x86_spec_ctrl_base); if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) x86_amd_ssb_disable(); } bool itlb_multihit_kvm_mitigation; EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); #undef pr_fmt #define pr_fmt(fmt) "L1TF: " fmt /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = L1TF_MITIGATION_FLUSH; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif enum vmx_l1d_flush_state l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO; EXPORT_SYMBOL_GPL(l1tf_vmx_mitigation); /* * These CPUs all support 44bits physical address space internally in the * cache but CPUID can report a smaller number of physical address bits. * * The L1TF mitigation uses the top most address bit for the inversion of * non present PTEs. When the installed memory reaches into the top most * address bit due to memory holes, which has been observed on machines * which report 36bits physical address bits and have 32G RAM installed, * then the mitigation range check in l1tf_select_mitigation() triggers. * This is a false positive because the mitigation is still possible due to * the fact that the cache uses 44bit internally. Use the cache bits * instead of the reported physical bits and adjust them on the affected * machines to 44bit if the reported bits are less than 44. */ static void override_cache_bits(struct cpuinfo_x86 *c) { if (c->x86 != 6) return; switch (c->x86_model) { case INTEL_FAM6_NEHALEM: case INTEL_FAM6_WESTMERE: case INTEL_FAM6_SANDYBRIDGE: case INTEL_FAM6_IVYBRIDGE: case INTEL_FAM6_HASWELL: case INTEL_FAM6_HASWELL_L: case INTEL_FAM6_HASWELL_G: case INTEL_FAM6_BROADWELL: case INTEL_FAM6_BROADWELL_G: case INTEL_FAM6_SKYLAKE_L: case INTEL_FAM6_SKYLAKE: case INTEL_FAM6_KABYLAKE_L: case INTEL_FAM6_KABYLAKE: if (c->x86_cache_bits < 44) c->x86_cache_bits = 44; break; } } static void __init l1tf_select_mitigation(void) { u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; if (cpu_mitigations_off()) l1tf_mitigation = L1TF_MITIGATION_OFF; else if (cpu_mitigations_auto_nosmt()) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: cpu_smt_disable(false); break; case L1TF_MITIGATION_FULL_FORCE: cpu_smt_disable(true); break; } #if CONFIG_PGTABLE_LEVELS == 2 pr_warn("Kernel not compiled for PAE. No mitigation for L1TF\n"); return; #endif half_pa = (u64)l1tf_pfn_limit() << PAGE_SHIFT; if (l1tf_mitigation != L1TF_MITIGATION_OFF && e820__mapped_any(half_pa, ULLONG_MAX - half_pa, E820_TYPE_RAM)) { pr_warn("System has more than MAX_PA/2 memory. L1TF mitigation not effective.\n"); pr_info("You may make it effective by booting the kernel with mem=%llu parameter.\n", half_pa); pr_info("However, doing so will make a part of your RAM unusable.\n"); pr_info("Reading https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html might help you decide.\n"); return; } setup_force_cpu_cap(X86_FEATURE_L1TF_PTEINV); } static int __init l1tf_cmdline(char *str) { if (!boot_cpu_has_bug(X86_BUG_L1TF)) return 0; if (!str) return -EINVAL; if (!strcmp(str, "off")) l1tf_mitigation = L1TF_MITIGATION_OFF; else if (!strcmp(str, "flush,nowarn")) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOWARN; else if (!strcmp(str, "flush")) l1tf_mitigation = L1TF_MITIGATION_FLUSH; else if (!strcmp(str, "flush,nosmt")) l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; else if (!strcmp(str, "full")) l1tf_mitigation = L1TF_MITIGATION_FULL; else if (!strcmp(str, "full,force")) l1tf_mitigation = L1TF_MITIGATION_FULL_FORCE; return 0; } early_param("l1tf", l1tf_cmdline); #undef pr_fmt #define pr_fmt(fmt) "Speculative Return Stack Overflow: " fmt enum srso_mitigation { SRSO_MITIGATION_NONE, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, SRSO_MITIGATION_SAFE_RET, SRSO_MITIGATION_IBPB, SRSO_MITIGATION_IBPB_ON_VMEXIT, }; enum srso_mitigation_cmd { SRSO_CMD_OFF, SRSO_CMD_MICROCODE, SRSO_CMD_SAFE_RET, SRSO_CMD_IBPB, SRSO_CMD_IBPB_ON_VMEXIT, }; static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", [SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED] = "Vulnerable: Safe RET, no microcode", [SRSO_MITIGATION_MICROCODE] = "Vulnerable: Microcode, no safe RET", [SRSO_MITIGATION_SAFE_RET] = "Mitigation: Safe RET", [SRSO_MITIGATION_IBPB] = "Mitigation: IBPB", [SRSO_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT only" }; static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; static int __init srso_parse_cmdline(char *str) { if (!str) return -EINVAL; if (!strcmp(str, "off")) srso_cmd = SRSO_CMD_OFF; else if (!strcmp(str, "microcode")) srso_cmd = SRSO_CMD_MICROCODE; else if (!strcmp(str, "safe-ret")) srso_cmd = SRSO_CMD_SAFE_RET; else if (!strcmp(str, "ibpb")) srso_cmd = SRSO_CMD_IBPB; else if (!strcmp(str, "ibpb-vmexit")) srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); return 0; } early_param("spec_rstack_overflow", srso_parse_cmdline); #define SRSO_NOTICE "WARNING: See https://kernel.org/doc/html/latest/admin-guide/hw-vuln/srso.html for mitigation options." static void __init srso_select_mitigation(void) { bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (cpu_mitigations_off()) return; if (!boot_cpu_has_bug(X86_BUG_SRSO)) { if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; } if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. * * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); return; } if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { srso_mitigation = SRSO_MITIGATION_IBPB; goto out; } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); /* may be overwritten by SRSO_CMD_SAFE_RET below */ srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } switch (srso_cmd) { case SRSO_CMD_OFF: if (boot_cpu_has(X86_FEATURE_SBPB)) x86_pred_cmd = PRED_CMD_SBPB; return; case SRSO_CMD_MICROCODE: if (has_microcode) { srso_mitigation = SRSO_MITIGATION_MICROCODE; pr_warn(SRSO_NOTICE); } break; case SRSO_CMD_SAFE_RET: if (IS_ENABLED(CONFIG_CPU_SRSO)) { /* * Enable the return thunk for generated code * like ftrace, static_call, etc. */ setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); if (boot_cpu_data.x86 == 0x19) { setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); x86_return_thunk = srso_alias_return_thunk; } else { setup_force_cpu_cap(X86_FEATURE_SRSO); x86_return_thunk = srso_return_thunk; } if (has_microcode) srso_mitigation = SRSO_MITIGATION_SAFE_RET; else srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); } break; case SRSO_CMD_IBPB: if (IS_ENABLED(CONFIG_CPU_IBPB_ENTRY)) { if (has_microcode) { setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); srso_mitigation = SRSO_MITIGATION_IBPB; } } else { pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n"); } break; case SRSO_CMD_IBPB_ON_VMEXIT: if (IS_ENABLED(CONFIG_CPU_SRSO)) { if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) { setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; } } else { pr_err("WARNING: kernel not compiled with CPU_SRSO.\n"); } break; } out: pr_info("%s\n", srso_strings[srso_mitigation]); } #undef pr_fmt #define pr_fmt(fmt) fmt #ifdef CONFIG_SYSFS #define L1TF_DEFAULT_MSG "Mitigation: PTE Inversion" #if IS_ENABLED(CONFIG_KVM_INTEL) static const char * const l1tf_vmx_states[] = { [VMENTER_L1D_FLUSH_AUTO] = "auto", [VMENTER_L1D_FLUSH_NEVER] = "vulnerable", [VMENTER_L1D_FLUSH_COND] = "conditional cache flushes", [VMENTER_L1D_FLUSH_ALWAYS] = "cache flushes", [VMENTER_L1D_FLUSH_EPT_DISABLED] = "EPT disabled", [VMENTER_L1D_FLUSH_NOT_REQUIRED] = "flush not necessary" }; static ssize_t l1tf_show_state(char *buf) { if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_EPT_DISABLED || (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER && sched_smt_active())) { return sysfs_emit(buf, "%s; VMX: %s\n", L1TF_DEFAULT_MSG, l1tf_vmx_states[l1tf_vmx_mitigation]); } return sysfs_emit(buf, "%s; VMX: %s, SMT %s\n", L1TF_DEFAULT_MSG, l1tf_vmx_states[l1tf_vmx_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t itlb_multihit_show_state(char *buf) { if (!boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) || !boot_cpu_has(X86_FEATURE_VMX)) return sysfs_emit(buf, "KVM: Mitigation: VMX unsupported\n"); else if (!(cr4_read_shadow() & X86_CR4_VMXE)) return sysfs_emit(buf, "KVM: Mitigation: VMX disabled\n"); else if (itlb_multihit_kvm_mitigation) return sysfs_emit(buf, "KVM: Mitigation: Split huge pages\n"); else return sysfs_emit(buf, "KVM: Vulnerable\n"); } #else static ssize_t l1tf_show_state(char *buf) { return sysfs_emit(buf, "%s\n", L1TF_DEFAULT_MSG); } static ssize_t itlb_multihit_show_state(char *buf) { return sysfs_emit(buf, "Processor vulnerable\n"); } #endif static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sysfs_emit(buf, "%s; SMT Host state unknown\n", mds_strings[mds_mitigation]); } if (boot_cpu_has(X86_BUG_MSBDS_ONLY)) { return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], (mds_mitigation == MDS_MITIGATION_OFF ? "vulnerable" : sched_smt_active() ? "mitigated" : "disabled")); } return sysfs_emit(buf, "%s; SMT %s\n", mds_strings[mds_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t tsx_async_abort_show_state(char *buf) { if ((taa_mitigation == TAA_MITIGATION_TSX_DISABLED) || (taa_mitigation == TAA_MITIGATION_OFF)) return sysfs_emit(buf, "%s\n", taa_strings[taa_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sysfs_emit(buf, "%s; SMT Host state unknown\n", taa_strings[taa_mitigation]); } return sysfs_emit(buf, "%s; SMT %s\n", taa_strings[taa_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static ssize_t mmio_stale_data_show_state(char *buf) { if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) return sysfs_emit(buf, "Unknown: No mitigations\n"); if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { return sysfs_emit(buf, "%s; SMT Host state unknown\n", mmio_strings[mmio_mitigation]); } return sysfs_emit(buf, "%s; SMT %s\n", mmio_strings[mmio_mitigation], sched_smt_active() ? "vulnerable" : "disabled"); } static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && !boot_cpu_has(X86_FEATURE_AUTOIBRS)) return ""; switch (spectre_v2_user_stibp) { case SPECTRE_V2_USER_NONE: return ", STIBP: disabled"; case SPECTRE_V2_USER_STRICT: return ", STIBP: forced"; case SPECTRE_V2_USER_STRICT_PREFERRED: return ", STIBP: always-on"; case SPECTRE_V2_USER_PRCTL: case SPECTRE_V2_USER_SECCOMP: if (static_key_enabled(&switch_to_cond_stibp)) return ", STIBP: conditional"; } return ""; } static char *ibpb_state(void) { if (boot_cpu_has(X86_FEATURE_IBPB)) { if (static_key_enabled(&switch_mm_always_ibpb)) return ", IBPB: always-on"; if (static_key_enabled(&switch_mm_cond_ibpb)) return ", IBPB: conditional"; return ", IBPB: disabled"; } return ""; } static char *pbrsb_eibrs_state(void) { if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) return ", PBRSB-eIBRS: SW sequence"; else return ", PBRSB-eIBRS: Vulnerable"; } else { return ", PBRSB-eIBRS: Not affected"; } } static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) return sysfs_emit(buf, "Vulnerable: LFENCE\n"); if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) return sysfs_emit(buf, "Vulnerable: eIBRS with unprivileged eBPF\n"); if (sched_smt_active() && unprivileged_ebpf_enabled() && spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sysfs_emit(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); return sysfs_emit(buf, "%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", pbrsb_eibrs_state(), spectre_v2_module_string()); } static ssize_t srbds_show_state(char *buf) { return sysfs_emit(buf, "%s\n", srbds_strings[srbds_mitigation]); } static ssize_t retbleed_show_state(char *buf) { if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) return sysfs_emit(buf, "Vulnerable: untrained return thunk / IBPB on non-AMD based uarch\n"); return sysfs_emit(buf, "%s; SMT %s\n", retbleed_strings[retbleed_mitigation], !sched_smt_active() ? "disabled" : spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT || spectre_v2_user_stibp == SPECTRE_V2_USER_STRICT_PREFERRED ? "enabled with STIBP protection" : "vulnerable"); } return sysfs_emit(buf, "%s\n", retbleed_strings[retbleed_mitigation]); } static ssize_t srso_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_SRSO_NO)) return sysfs_emit(buf, "Mitigation: SMT disabled\n"); return sysfs_emit(buf, "%s\n", srso_strings[srso_mitigation]); } static ssize_t gds_show_state(char *buf) { return sysfs_emit(buf, "%s\n", gds_strings[gds_mitigation]); } static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr, char *buf, unsigned int bug) { if (!boot_cpu_has_bug(bug)) return sysfs_emit(buf, "Not affected\n"); switch (bug) { case X86_BUG_CPU_MELTDOWN: if (boot_cpu_has(X86_FEATURE_PTI)) return sysfs_emit(buf, "Mitigation: PTI\n"); if (hypervisor_is_type(X86_HYPER_XEN_PV)) return sysfs_emit(buf, "Unknown (XEN PV detected, hypervisor mitigation required)\n"); break; case X86_BUG_SPECTRE_V1: return sysfs_emit(buf, "%s\n", spectre_v1_strings[spectre_v1_mitigation]); case X86_BUG_SPECTRE_V2: return spectre_v2_show_state(buf); case X86_BUG_SPEC_STORE_BYPASS: return sysfs_emit(buf, "%s\n", ssb_strings[ssb_mode]); case X86_BUG_L1TF: if (boot_cpu_has(X86_FEATURE_L1TF_PTEINV)) return l1tf_show_state(buf); break; case X86_BUG_MDS: return mds_show_state(buf); case X86_BUG_TAA: return tsx_async_abort_show_state(buf); case X86_BUG_ITLB_MULTIHIT: return itlb_multihit_show_state(buf); case X86_BUG_SRBDS: return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: return retbleed_show_state(buf); case X86_BUG_SRSO: return srso_show_state(buf); case X86_BUG_GDS: return gds_show_state(buf); default: break; } return sysfs_emit(buf, "Vulnerable\n"); } ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_CPU_MELTDOWN); } ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V1); } ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SPECTRE_V2); } ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SPEC_STORE_BYPASS); } ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_L1TF); } ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_MDS); } ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_TAA); } ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); } ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS); } ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); else return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_RETBLEED); } ssize_t cpu_show_spec_rstack_overflow(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_SRSO); } ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_GDS); } #endif
24 24 24 24 8865 8869 4479 4469 11 77 11 11 76 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 // SPDX-License-Identifier: GPL-2.0 #include <linux/err.h> #include <linux/bug.h> #include <linux/atomic.h> #include <linux/errseq.h> #include <linux/log2.h> /* * An errseq_t is a way of recording errors in one place, and allowing any * number of "subscribers" to tell whether it has changed since a previous * point where it was sampled. * * It's implemented as an unsigned 32-bit value. The low order bits are * designated to hold an error code (between 0 and -MAX_ERRNO). The upper bits * are used as a counter. This is done with atomics instead of locking so that * these functions can be called from any context. * * The general idea is for consumers to sample an errseq_t value. That value * can later be used to tell whether any new errors have occurred since that * sampling was done. * * Note that there is a risk of collisions if new errors are being recorded * frequently, since we have so few bits to use as a counter. * * To mitigate this, one bit is used as a flag to tell whether the value has * been sampled since a new value was recorded. That allows us to avoid bumping * the counter if no one has sampled it since the last time an error was * recorded. * * A new errseq_t should always be zeroed out. A errseq_t value of all zeroes * is the special (but common) case where there has never been an error. An all * zero value thus serves as the "epoch" if one wishes to know whether there * has ever been an error set since it was first initialized. */ /* The low bits are designated for error code (max of MAX_ERRNO) */ #define ERRSEQ_SHIFT ilog2(MAX_ERRNO + 1) /* This bit is used as a flag to indicate whether the value has been seen */ #define ERRSEQ_SEEN (1 << ERRSEQ_SHIFT) /* The lowest bit of the counter */ #define ERRSEQ_CTR_INC (1 << (ERRSEQ_SHIFT + 1)) /** * errseq_set - set a errseq_t for later reporting * @eseq: errseq_t field that should be set * @err: error to set (must be between -1 and -MAX_ERRNO) * * This function sets the error in @eseq, and increments the sequence counter * if the last sequence was sampled at some point in the past. * * Any error set will always overwrite an existing error. * * Return: The previous value, primarily for debugging purposes. The * return value should not be used as a previously sampled value in later * calls as it will not have the SEEN flag set. */ errseq_t errseq_set(errseq_t *eseq, int err) { errseq_t cur, old; /* MAX_ERRNO must be able to serve as a mask */ BUILD_BUG_ON_NOT_POWER_OF_2(MAX_ERRNO + 1); /* * Ensure the error code actually fits where we want it to go. If it * doesn't then just throw a warning and don't record anything. We * also don't accept zero here as that would effectively clear a * previous error. */ old = READ_ONCE(*eseq); if (WARN(unlikely(err == 0 || (unsigned int)-err > MAX_ERRNO), "err = %d\n", err)) return old; for (;;) { errseq_t new; /* Clear out error bits and set new error */ new = (old & ~(MAX_ERRNO|ERRSEQ_SEEN)) | -err; /* Only increment if someone has looked at it */ if (old & ERRSEQ_SEEN) new += ERRSEQ_CTR_INC; /* If there would be no change, then call it done */ if (new == old) { cur = new; break; } /* Try to swap the new value into place */ cur = cmpxchg(eseq, old, new); /* * Call it success if we did the swap or someone else beat us * to it for the same value. */ if (likely(cur == old || cur == new)) break; /* Raced with an update, try again */ old = cur; } return cur; } EXPORT_SYMBOL(errseq_set); /** * errseq_sample() - Grab current errseq_t value. * @eseq: Pointer to errseq_t to be sampled. * * This function allows callers to initialise their errseq_t variable. * If the error has been "seen", new callers will not see an old error. * If there is an unseen error in @eseq, the caller of this function will * see it the next time it checks for an error. * * Context: Any context. * Return: The current errseq value. */ errseq_t errseq_sample(errseq_t *eseq) { errseq_t old = READ_ONCE(*eseq); /* If nobody has seen this error yet, then we can be the first. */ if (!(old & ERRSEQ_SEEN)) old = 0; return old; } EXPORT_SYMBOL(errseq_sample); /** * errseq_check() - Has an error occurred since a particular sample point? * @eseq: Pointer to errseq_t value to be checked. * @since: Previously-sampled errseq_t from which to check. * * Grab the value that eseq points to, and see if it has changed @since * the given value was sampled. The @since value is not advanced, so there * is no need to mark the value as seen. * * Return: The latest error set in the errseq_t or 0 if it hasn't changed. */ int errseq_check(errseq_t *eseq, errseq_t since) { errseq_t cur = READ_ONCE(*eseq); if (likely(cur == since)) return 0; return -(cur & MAX_ERRNO); } EXPORT_SYMBOL(errseq_check); /** * errseq_check_and_advance() - Check an errseq_t and advance to current value. * @eseq: Pointer to value being checked and reported. * @since: Pointer to previously-sampled errseq_t to check against and advance. * * Grab the eseq value, and see whether it matches the value that @since * points to. If it does, then just return 0. * * If it doesn't, then the value has changed. Set the "seen" flag, and try to * swap it into place as the new eseq value. Then, set that value as the new * "since" value, and return whatever the error portion is set to. * * Note that no locking is provided here for concurrent updates to the "since" * value. The caller must provide that if necessary. Because of this, callers * may want to do a lockless errseq_check before taking the lock and calling * this. * * Return: Negative errno if one has been stored, or 0 if no new error has * occurred. */ int errseq_check_and_advance(errseq_t *eseq, errseq_t *since) { int err = 0; errseq_t old, new; /* * Most callers will want to use the inline wrapper to check this, * so that the common case of no error is handled without needing * to take the lock that protects the "since" value. */ old = READ_ONCE(*eseq); if (old != *since) { /* * Set the flag and try to swap it into place if it has * changed. * * We don't care about the outcome of the swap here. If the * swap doesn't occur, then it has either been updated by a * writer who is altering the value in some way (updating * counter or resetting the error), or another reader who is * just setting the "seen" flag. Either outcome is OK, and we * can advance "since" and return an error based on what we * have. */ new = old | ERRSEQ_SEEN; if (new != old) cmpxchg(eseq, old, new); *since = new; err = -(new & MAX_ERRNO); } return err; } EXPORT_SYMBOL(errseq_check_and_advance);
123 19 1440 2 22 1890 1440 1439 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "elevator.h" #include "blk-mq.h" #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_rqs(struct request_queue *q); static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) __blk_mq_sched_restart(hctx); } static inline bool bio_mergeable(struct bio *bio) { return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } } static inline void blk_mq_sched_requeue_request(struct request *rq) { if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } #endif
1639 1641 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ /* * Kernel Electric-Fence (KFENCE). For more info please see * Documentation/dev-tools/kfence.rst. * * Copyright (C) 2020, Google LLC. */ #ifndef MM_KFENCE_KFENCE_H #define MM_KFENCE_KFENCE_H #include <linux/mm.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/types.h> #include "../slab.h" /* for struct kmem_cache */ /* * Get the canary byte pattern for @addr. Use a pattern that varies based on the * lower 3 bits of the address, to detect memory corruptions with higher * probability, where similar constants are used. */ #define KFENCE_CANARY_PATTERN_U8(addr) ((u8)0xaa ^ (u8)((unsigned long)(addr) & 0x7)) /* * Define a continuous 8-byte canary starting from a multiple of 8. The canary * of each byte is only related to the lowest three bits of its address, so the * canary of every 8 bytes is the same. 64-bit memory can be filled and checked * at a time instead of byte by byte to improve performance. */ #define KFENCE_CANARY_PATTERN_U64 ((u64)0xaaaaaaaaaaaaaaaa ^ (u64)(le64_to_cpu(0x0706050403020100))) /* Maximum stack depth for reports. */ #define KFENCE_STACK_DEPTH 64 /* KFENCE object states. */ enum kfence_object_state { KFENCE_OBJECT_UNUSED, /* Object is unused. */ KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */ KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */ }; /* Alloc/free tracking information. */ struct kfence_track { pid_t pid; int cpu; u64 ts_nsec; int num_stack_entries; unsigned long stack_entries[KFENCE_STACK_DEPTH]; }; /* KFENCE metadata per guarded allocation. */ struct kfence_metadata { struct list_head list; /* Freelist node; access under kfence_freelist_lock. */ struct rcu_head rcu_head; /* For delayed freeing. */ /* * Lock protecting below data; to ensure consistency of the below data, * since the following may execute concurrently: __kfence_alloc(), * __kfence_free(), kfence_handle_page_fault(). However, note that we * cannot grab the same metadata off the freelist twice, and multiple * __kfence_alloc() cannot run concurrently on the same metadata. */ raw_spinlock_t lock; /* The current state of the object; see above. */ enum kfence_object_state state; /* * Allocated object address; cannot be calculated from size, because of * alignment requirements. * * Invariant: ALIGN_DOWN(addr, PAGE_SIZE) is constant. */ unsigned long addr; /* * The size of the original allocation. */ size_t size; /* * The kmem_cache cache of the last allocation; NULL if never allocated * or the cache has already been destroyed. */ struct kmem_cache *cache; /* * In case of an invalid access, the page that was unprotected; we * optimistically only store one address. */ unsigned long unprotected_page; /* Allocation and free stack information. */ struct kfence_track alloc_track; struct kfence_track free_track; /* For updating alloc_covered on frees. */ u32 alloc_stack_hash; #ifdef CONFIG_MEMCG struct obj_cgroup *objcg; #endif }; #define KFENCE_METADATA_SIZE PAGE_ALIGN(sizeof(struct kfence_metadata) * \ CONFIG_KFENCE_NUM_OBJECTS) extern struct kfence_metadata *kfence_metadata; static inline struct kfence_metadata *addr_to_metadata(unsigned long addr) { long index; /* The checks do not affect performance; only called from slow-paths. */ if (!is_kfence_address((void *)addr)) return NULL; /* * May be an invalid index if called with an address at the edge of * __kfence_pool, in which case we would report an "invalid access" * error. */ index = (addr - (unsigned long)__kfence_pool) / (PAGE_SIZE * 2) - 1; if (index < 0 || index >= CONFIG_KFENCE_NUM_OBJECTS) return NULL; return &kfence_metadata[index]; } /* KFENCE error types for report generation. */ enum kfence_error_type { KFENCE_ERROR_OOB, /* Detected a out-of-bounds access. */ KFENCE_ERROR_UAF, /* Detected a use-after-free access. */ KFENCE_ERROR_CORRUPTION, /* Detected a memory corruption on free. */ KFENCE_ERROR_INVALID, /* Invalid access of unknown type. */ KFENCE_ERROR_INVALID_FREE, /* Invalid free. */ }; void kfence_report_error(unsigned long address, bool is_write, struct pt_regs *regs, const struct kfence_metadata *meta, enum kfence_error_type type); void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *meta); #endif /* MM_KFENCE_KFENCE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 /* SPDX-License-Identifier: GPL-2.0-only */ /* * This file is part of UBIFS. * * Copyright (C) 2006-2008 Nokia Corporation * * Authors: Artem Bityutskiy (Битюцкий Артём) * Adrian Hunter */ /* * This file contains miscellaneous helper functions. */ #ifndef __UBIFS_MISC_H__ #define __UBIFS_MISC_H__ /** * ubifs_zn_dirty - check if znode is dirty. * @znode: znode to check * * This helper function returns %1 if @znode is dirty and %0 otherwise. */ static inline int ubifs_zn_dirty(const struct ubifs_znode *znode) { return !!test_bit(DIRTY_ZNODE, &znode->flags); } /** * ubifs_zn_obsolete - check if znode is obsolete. * @znode: znode to check * * This helper function returns %1 if @znode is obsolete and %0 otherwise. */ static inline int ubifs_zn_obsolete(const struct ubifs_znode *znode) { return !!test_bit(OBSOLETE_ZNODE, &znode->flags); } /** * ubifs_zn_cow - check if znode has to be copied on write. * @znode: znode to check * * This helper function returns %1 if @znode is has COW flag set and %0 * otherwise. */ static inline int ubifs_zn_cow(const struct ubifs_znode *znode) { return !!test_bit(COW_ZNODE, &znode->flags); } /** * ubifs_wake_up_bgt - wake up background thread. * @c: UBIFS file-system description object */ static inline void ubifs_wake_up_bgt(struct ubifs_info *c) { if (c->bgt && !c->need_bgt) { c->need_bgt = 1; wake_up_process(c->bgt); } } /** * ubifs_tnc_find_child - find next child in znode. * @znode: znode to search at * @start: the zbranch index to start at * * This helper function looks for znode child starting at index @start. Returns * the child or %NULL if no children were found. */ static inline struct ubifs_znode * ubifs_tnc_find_child(struct ubifs_znode *znode, int start) { while (start < znode->child_cnt) { if (znode->zbranch[start].znode) return znode->zbranch[start].znode; start += 1; } return NULL; } /** * ubifs_inode - get UBIFS inode information by VFS 'struct inode' object. * @inode: the VFS 'struct inode' pointer */ static inline struct ubifs_inode *ubifs_inode(const struct inode *inode) { return container_of(inode, struct ubifs_inode, vfs_inode); } /** * ubifs_compr_present - check if compressor was compiled in. * @compr_type: compressor type to check * @c: the UBIFS file-system description object * * This function returns %1 of compressor of type @compr_type is present, and * %0 if not. */ static inline int ubifs_compr_present(struct ubifs_info *c, int compr_type) { ubifs_assert(c, compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); return !!ubifs_compressors[compr_type]->capi_name; } /** * ubifs_compr_name - get compressor name string by its type. * @compr_type: compressor type * @c: the UBIFS file-system description object * * This function returns compressor type string. */ static inline const char *ubifs_compr_name(struct ubifs_info *c, int compr_type) { ubifs_assert(c, compr_type >= 0 && compr_type < UBIFS_COMPR_TYPES_CNT); return ubifs_compressors[compr_type]->name; } /** * ubifs_wbuf_sync - synchronize write-buffer. * @wbuf: write-buffer to synchronize * * This is the same as 'ubifs_wbuf_sync_nolock()' but it does not assume * that the write-buffer is already locked. */ static inline int ubifs_wbuf_sync(struct ubifs_wbuf *wbuf) { int err; mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead); err = ubifs_wbuf_sync_nolock(wbuf); mutex_unlock(&wbuf->io_mutex); return err; } /** * ubifs_encode_dev - encode device node IDs. * @dev: UBIFS device node information * @rdev: device IDs to encode * * This is a helper function which encodes major/minor numbers of a device node * into UBIFS device node description. We use standard Linux "new" and "huge" * encodings. */ static inline int ubifs_encode_dev(union ubifs_dev_desc *dev, dev_t rdev) { dev->new = cpu_to_le32(new_encode_dev(rdev)); return sizeof(dev->new); } /** * ubifs_add_dirt - add dirty space to LEB properties. * @c: the UBIFS file-system description object * @lnum: LEB to add dirty space for * @dirty: dirty space to add * * This is a helper function which increased amount of dirty LEB space. Returns * zero in case of success and a negative error code in case of failure. */ static inline int ubifs_add_dirt(struct ubifs_info *c, int lnum, int dirty) { return ubifs_update_one_lp(c, lnum, LPROPS_NC, dirty, 0, 0); } /** * ubifs_return_leb - return LEB to lprops. * @c: the UBIFS file-system description object * @lnum: LEB to return * * This helper function cleans the "taken" flag of a logical eraseblock in the * lprops. Returns zero in case of success and a negative error code in case of * failure. */ static inline int ubifs_return_leb(struct ubifs_info *c, int lnum) { return ubifs_change_one_lp(c, lnum, LPROPS_NC, LPROPS_NC, 0, LPROPS_TAKEN, 0); } /** * ubifs_idx_node_sz - return index node size. * @c: the UBIFS file-system description object * @child_cnt: number of children of this index node */ static inline int ubifs_idx_node_sz(const struct ubifs_info *c, int child_cnt) { return UBIFS_IDX_NODE_SZ + (UBIFS_BRANCH_SZ + c->key_len + c->hash_len) * child_cnt; } /** * ubifs_idx_branch - return pointer to an index branch. * @c: the UBIFS file-system description object * @idx: index node * @bnum: branch number */ static inline struct ubifs_branch *ubifs_idx_branch(const struct ubifs_info *c, const struct ubifs_idx_node *idx, int bnum) { return (struct ubifs_branch *)((void *)idx->branches + (UBIFS_BRANCH_SZ + c->key_len + c->hash_len) * bnum); } /** * ubifs_idx_key - return pointer to an index key. * @c: the UBIFS file-system description object * @idx: index node */ static inline void *ubifs_idx_key(const struct ubifs_info *c, const struct ubifs_idx_node *idx) { return (void *)((struct ubifs_branch *)idx->branches)->key; } /** * ubifs_tnc_lookup - look up a file-system node. * @c: UBIFS file-system description object * @key: node key to lookup * @node: the node is returned here * * This function look up and reads node with key @key. The caller has to make * sure the @node buffer is large enough to fit the node. Returns zero in case * of success, %-ENOENT if the node was not found, and a negative error code in * case of failure. */ static inline int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, void *node) { return ubifs_tnc_locate(c, key, node, NULL, NULL); } /** * ubifs_get_lprops - get reference to LEB properties. * @c: the UBIFS file-system description object * * This function locks lprops. Lprops have to be unlocked by * 'ubifs_release_lprops()'. */ static inline void ubifs_get_lprops(struct ubifs_info *c) { mutex_lock(&c->lp_mutex); } /** * ubifs_release_lprops - release lprops lock. * @c: the UBIFS file-system description object * * This function has to be called after each 'ubifs_get_lprops()' call to * unlock lprops. */ static inline void ubifs_release_lprops(struct ubifs_info *c) { ubifs_assert(c, mutex_is_locked(&c->lp_mutex)); ubifs_assert(c, c->lst.empty_lebs >= 0 && c->lst.empty_lebs <= c->main_lebs); mutex_unlock(&c->lp_mutex); } /** * ubifs_next_log_lnum - switch to the next log LEB. * @c: UBIFS file-system description object * @lnum: current log LEB * * This helper function returns the log LEB number which goes next after LEB * 'lnum'. */ static inline int ubifs_next_log_lnum(const struct ubifs_info *c, int lnum) { lnum += 1; if (lnum > c->log_last) lnum = UBIFS_LOG_LNUM; return lnum; } static inline int ubifs_xattr_max_cnt(struct ubifs_info *c) { int max_xattrs = (c->leb_size / 2) / UBIFS_INO_NODE_SZ; ubifs_assert(c, max_xattrs < c->max_orphans); return max_xattrs; } const char *ubifs_assert_action_name(struct ubifs_info *c); #endif /* __UBIFS_MISC_H__ */
17 17 17 80 79 78 78 79 15 17 16 14 14 19 6 5 5 6 6 2 3 1 1 1 20 20 19 19 3 14 5 4 2 12 5 14 20 88 6 6 85 19 5 5 9 9 9 8 9 7 6 8 7 3 3 3 2 1 5 4 4 4 9 4 9 4 4 4 4 8 90 91 89 1 90 1 89 1 82 81 81 78 7 15 82 5 15 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 // SPDX-License-Identifier: GPL-2.0-or-later #include <linux/sched/task.h> #include <linux/sched/signal.h> #include <linux/freezer.h> #include "futex.h" /* * READ this before attempting to hack on futexes! * * Basic futex operation and ordering guarantees * ============================================= * * The waiter reads the futex value in user space and calls * futex_wait(). This function computes the hash bucket and acquires * the hash bucket lock. After that it reads the futex user space value * again and verifies that the data has not changed. If it has not changed * it enqueues itself into the hash bucket, releases the hash bucket lock * and schedules. * * The waker side modifies the user space value of the futex and calls * futex_wake(). This function computes the hash bucket and acquires the * hash bucket lock. Then it looks for waiters on that futex in the hash * bucket and wakes them. * * In futex wake up scenarios where no tasks are blocked on a futex, taking * the hb spinlock can be avoided and simply return. In order for this * optimization to work, ordering guarantees must exist so that the waiter * being added to the list is acknowledged when the list is concurrently being * checked by the waker, avoiding scenarios like the following: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * uval = *futex; * *futex = newval; * sys_futex(WAKE, futex); * futex_wake(futex); * if (queue_empty()) * return; * if (uval == val) * lock(hash_bucket(futex)); * queue(); * unlock(hash_bucket(futex)); * schedule(); * * This would cause the waiter on CPU 0 to wait forever because it * missed the transition of the user space value from val to newval * and the waker did not find the waiter in the hash bucket queue. * * The correct serialization ensures that a waiter either observes * the changed user space value before blocking or is woken by a * concurrent waker: * * CPU 0 CPU 1 * val = *futex; * sys_futex(WAIT, futex, val); * futex_wait(futex, val); * * waiters++; (a) * smp_mb(); (A) <-- paired with -. * | * lock(hash_bucket(futex)); | * | * uval = *futex; | * | *futex = newval; * | sys_futex(WAKE, futex); * | futex_wake(futex); * | * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); * schedule(); if (waiters) * lock(hash_bucket(futex)); * else wake_waiters(futex); * waiters--; (b) unlock(hash_bucket(futex)); * * Where (A) orders the waiters increment and the futex value read through * atomic operations (see futex_hb_waiters_inc) and where (B) orders the write * to futex and the waiters read (see futex_hb_waiters_pending()). * * This yields the following case (where X:=waiters, Y:=futex): * * X = Y = 0 * * w[X]=1 w[Y]=1 * MB MB * r[Y]=y r[X]=x * * Which guarantees that x==0 && y==0 is impossible; which translates back into * the guarantee that we cannot both miss the futex variable change and the * enqueue. * * Note that a new waiter is accounted for in (a) even when it is possible that * the wait call can return error, in which case we backtrack from it in (b). * Refer to the comment in futex_q_lock(). * * Similarly, in order to account for waiters being requeued on another * address we always increment the waiters for the destination bucket before * acquiring the lock. It then decrements them again after releasing it - * the code that actually moves the futex(es) between hash buckets (requeue_futex) * will do the additional required waiter count housekeeping. This is done for * double_lock_hb() and double_unlock_hb(), respectively. */ bool __futex_wake_mark(struct futex_q *q) { if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return false; __futex_unqueue(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL * is written, without taking any locks. This is possible in the event * of a spurious wakeup, for example. A memory barrier is required here * to prevent the following store to lock_ptr from getting ahead of the * plist_del in __futex_unqueue(). */ smp_store_release(&q->lock_ptr, NULL); return true; } /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. Callers * must ensure to later call wake_up_q() for the actual * wakeups to occur. */ void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q) { struct task_struct *p = q->task; get_task_struct(p); if (!__futex_wake_mark(q)) { put_task_struct(p); return; } /* * Queue the task for later wakeup for after we've released * the hb->lock. */ wake_q_add_safe(wake_q, p); } /* * Wake up waiters matching bitset queued on this futex (uaddr). */ int futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; DEFINE_WAKE_Q(wake_q); int ret; if (!bitset) return -EINVAL; ret = get_futex_key(uaddr, flags, &key, FUTEX_READ); if (unlikely(ret != 0)) return ret; if ((flags & FLAGS_STRICT) && !nr_wake) return 0; hb = futex_hash(&key); /* Make sure we really have tasks to wakeup */ if (!futex_hb_waiters_pending(hb)) return ret; spin_lock(&hb->lock); plist_for_each_entry_safe(this, next, &hb->chain, list) { if (futex_match (&this->key, &key)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } /* Check if one of the bits is set in both bitsets */ if (!(this->bitset & bitset)) continue; this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } spin_unlock(&hb->lock); wake_up_q(&wake_q); return ret; } static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { if (oparg < 0 || oparg > 31) { char comm[sizeof(current->comm)]; /* * kill this print and return -EINVAL when userspace * is sane again */ pr_info_ratelimited("futex_wake_op: %s tries to shift op by %d; fix this program\n", get_task_comm(comm, current), oparg); oparg &= 31; } oparg = 1 << oparg; } pagefault_disable(); ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr); pagefault_enable(); if (ret) return ret; switch (cmp) { case FUTEX_OP_CMP_EQ: return oldval == cmparg; case FUTEX_OP_CMP_NE: return oldval != cmparg; case FUTEX_OP_CMP_LT: return oldval < cmparg; case FUTEX_OP_CMP_GE: return oldval >= cmparg; case FUTEX_OP_CMP_LE: return oldval <= cmparg; case FUTEX_OP_CMP_GT: return oldval > cmparg; default: return -ENOSYS; } } /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ int futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, int nr_wake, int nr_wake2, int op) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags, &key1, FUTEX_READ); if (unlikely(ret != 0)) return ret; ret = get_futex_key(uaddr2, flags, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) return ret; hb1 = futex_hash(&key1); hb2 = futex_hash(&key2); retry_private: double_lock_hb(hb1, hb2); op_ret = futex_atomic_op_inuser(op, uaddr2); if (unlikely(op_ret < 0)) { double_unlock_hb(hb1, hb2); if (!IS_ENABLED(CONFIG_MMU) || unlikely(op_ret != -EFAULT && op_ret != -EAGAIN)) { /* * we don't get EFAULT from MMU faults if we don't have * an MMU, but we might get them from range checking */ ret = op_ret; return ret; } if (op_ret == -EFAULT) { ret = fault_in_user_writeable(uaddr2); if (ret) return ret; } cond_resched(); if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } plist_for_each_entry_safe(this, next, &hb1->chain, list) { if (futex_match (&this->key, &key1)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++ret >= nr_wake) break; } } if (op_ret > 0) { op_ret = 0; plist_for_each_entry_safe(this, next, &hb2->chain, list) { if (futex_match (&this->key, &key2)) { if (this->pi_state || this->rt_waiter) { ret = -EINVAL; goto out_unlock; } this->wake(&wake_q, this); if (++op_ret >= nr_wake2) break; } } ret += op_ret; } out_unlock: double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); return ret; } static long futex_wait_restart(struct restart_block *restart); /** * futex_wait_queue() - futex_queue() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout */ void futex_wait_queue(struct futex_hash_bucket *hb, struct futex_q *q, struct hrtimer_sleeper *timeout) { /* * The task state is guaranteed to be set before another task can * wake it. set_current_state() is implemented using smp_store_mb() and * futex_queue() calls spin_unlock() upon completion, both serializing * access to the hash list and forcing another memory barrier. */ set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); futex_queue(q, hb); /* Arm the timer */ if (timeout) hrtimer_sleeper_start_expires(timeout, HRTIMER_MODE_ABS); /* * If we have been removed from the hash list, then another task * has tried to wake us, and we can skip the call to schedule(). */ if (likely(!plist_node_empty(&q->list))) { /* * If the timer has already expired, current will already be * flagged for rescheduling. Only call schedule if there * is no timeout, or if it has yet to expire. */ if (!timeout || timeout->task) schedule(); } __set_current_state(TASK_RUNNING); } /** * futex_unqueue_multiple - Remove various futexes from their hash bucket * @v: The list of futexes to unqueue * @count: Number of futexes in the list * * Helper to unqueue a list of futexes. This can't fail. * * Return: * - >=0 - Index of the last futex that was awoken; * - -1 - No futex was awoken */ int futex_unqueue_multiple(struct futex_vector *v, int count) { int ret = -1, i; for (i = 0; i < count; i++) { if (!futex_unqueue(&v[i].q)) ret = i; } return ret; } /** * futex_wait_multiple_setup - Prepare to wait and enqueue multiple futexes * @vs: The futex list to wait on * @count: The size of the list * @woken: Index of the last woken futex, if any. Used to notify the * caller that it can return this index to userspace (return parameter) * * Prepare multiple futexes in a single step and enqueue them. This may fail if * the futex list is invalid or if any futex was already awoken. On success the * task is ready to interruptible sleep. * * Return: * - 1 - One of the futexes was woken by another thread * - 0 - Success * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL */ int futex_wait_multiple_setup(struct futex_vector *vs, int count, int *woken) { struct futex_hash_bucket *hb; bool retry = false; int ret, i; u32 uval; /* * Enqueuing multiple futexes is tricky, because we need to enqueue * each futex on the list before dealing with the next one to avoid * deadlocking on the hash bucket. But, before enqueuing, we need to * make sure that current->state is TASK_INTERRUPTIBLE, so we don't * lose any wake events, which cannot be done before the get_futex_key * of the next key, because it calls get_user_pages, which can sleep. * Thus, we fetch the list of futexes keys in two steps, by first * pinning all the memory keys in the futex key, and only then we read * each key and queue the corresponding futex. * * Private futexes doesn't need to recalculate hash in retry, so skip * get_futex_key() when retrying. */ retry: for (i = 0; i < count; i++) { if (!(vs[i].w.flags & FLAGS_SHARED) && retry) continue; ret = get_futex_key(u64_to_user_ptr(vs[i].w.uaddr), vs[i].w.flags, &vs[i].q.key, FUTEX_READ); if (unlikely(ret)) return ret; } set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); for (i = 0; i < count; i++) { u32 __user *uaddr = (u32 __user *)(unsigned long)vs[i].w.uaddr; struct futex_q *q = &vs[i].q; u32 val = vs[i].w.val; hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); if (!ret && uval == val) { /* * The bucket lock can't be held while dealing with the * next futex. Queue each futex at this moment so hb can * be unlocked. */ futex_queue(q, hb); continue; } futex_q_unlock(hb); __set_current_state(TASK_RUNNING); /* * Even if something went wrong, if we find out that a futex * was woken, we don't return error and return this index to * userspace */ *woken = futex_unqueue_multiple(vs, i); if (*woken >= 0) return 1; if (ret) { /* * If we need to handle a page fault, we need to do so * without any lock and any enqueued futex (otherwise * we could lose some wakeup). So we do it here, after * undoing all the work done so far. In success, we * retry all the work. */ if (get_user(uval, uaddr)) return -EFAULT; retry = true; goto retry; } if (uval != val) return -EWOULDBLOCK; } return 0; } /** * futex_sleep_multiple - Check sleeping conditions and sleep * @vs: List of futexes to wait for * @count: Length of vs * @to: Timeout * * Sleep if and only if the timeout hasn't expired and no futex on the list has * been woken up. */ static void futex_sleep_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { if (to && !to->task) return; for (; count; count--, vs++) { if (!READ_ONCE(vs->q.lock_ptr)) return; } schedule(); } /** * futex_wait_multiple - Prepare to wait on and enqueue several futexes * @vs: The list of futexes to wait on * @count: The number of objects * @to: Timeout before giving up and returning to userspace * * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function * sleeps on a group of futexes and returns on the first futex that is * wake, or after the timeout has elapsed. * * Return: * - >=0 - Hint to the futex that was awoken * - <0 - On error */ int futex_wait_multiple(struct futex_vector *vs, unsigned int count, struct hrtimer_sleeper *to) { int ret, hint = 0; if (to) hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS); while (1) { ret = futex_wait_multiple_setup(vs, count, &hint); if (ret) { if (ret > 0) { /* A futex was woken during setup */ ret = hint; } return ret; } futex_sleep_multiple(vs, count, to); __set_current_state(TASK_RUNNING); ret = futex_unqueue_multiple(vs, count); if (ret >= 0) return ret; if (to && !to->task) return -ETIMEDOUT; else if (signal_pending(current)) return -ERESTARTSYS; /* * The final case is a spurious wakeup, for * which just retry. */ } } /** * futex_wait_setup() - Prepare to wait on a futex * @uaddr: the futex userspace address * @val: the expected value * @flags: futex flags (FLAGS_SHARED, etc.) * @q: the associated futex_q * @hb: storage for hash_bucket pointer to be returned to caller * * Setup the futex_q and locate the hash_bucket. Get the futex value and * compare it with the expected value. Handle atomic faults internally. * Return with the hb lock held on success, and unlocked on failure. * * Return: * - 0 - uaddr contains val and hb has been locked; * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, struct futex_q *q, struct futex_hash_bucket **hb) { u32 uval; int ret; /* * Access the page AFTER the hash-bucket is locked. * Order is important: * * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } * * The basic logical guarantee of a futex is that it blocks ONLY * if cond(var) is known to be true at the time of blocking, for * any cond. If we locked the hash-bucket after testing *uaddr, that * would open a race condition where we could block indefinitely with * cond(var) false, which would violate the guarantee. * * On the other hand, we insert q and release the hash-bucket only * after testing *uaddr. This guarantees that futex_wait() will NOT * absorb a wakeup if *uaddr does not match the desired values * while the syscall executes. */ retry: ret = get_futex_key(uaddr, flags, &q->key, FUTEX_READ); if (unlikely(ret != 0)) return ret; retry_private: *hb = futex_q_lock(q); ret = futex_get_value_locked(&uval, uaddr); if (ret) { futex_q_unlock(*hb); ret = get_user(uval, uaddr); if (ret) return ret; if (!(flags & FLAGS_SHARED)) goto retry_private; goto retry; } if (uval != val) { futex_q_unlock(*hb); ret = -EWOULDBLOCK; } return ret; } int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, struct hrtimer_sleeper *to, u32 bitset) { struct futex_q q = futex_q_init; struct futex_hash_bucket *hb; int ret; if (!bitset) return -EINVAL; q.bitset = bitset; retry: /* * Prepare to wait on uaddr. On success, it holds hb->lock and q * is initialized. */ ret = futex_wait_setup(uaddr, val, flags, &q, &hb); if (ret) return ret; /* futex_queue and wait for wakeup, timeout, or a signal. */ futex_wait_queue(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ if (!futex_unqueue(&q)) return 0; if (to && !to->task) return -ETIMEDOUT; /* * We expect signal_pending(current), but we might be the * victim of a spurious wakeup as well. */ if (!signal_pending(current)) goto retry; return -ERESTARTSYS; } int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { struct hrtimer_sleeper timeout, *to; struct restart_block *restart; int ret; to = futex_setup_timer(abs_time, &timeout, flags, current->timer_slack_ns); ret = __futex_wait(uaddr, flags, val, to, bitset); /* No timeout, nothing to clean up. */ if (!to) return ret; hrtimer_cancel(&to->timer); destroy_hrtimer_on_stack(&to->timer); if (ret == -ERESTARTSYS) { restart = &current->restart_block; restart->futex.uaddr = uaddr; restart->futex.val = val; restart->futex.time = *abs_time; restart->futex.bitset = bitset; restart->futex.flags = flags | FLAGS_HAS_TIMEOUT; return set_restart_fn(restart, futex_wait_restart); } return ret; } static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = restart->futex.uaddr; ktime_t t, *tp = NULL; if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { t = restart->futex.time; tp = &t; } restart->fn = do_no_restart_syscall; return (long)futex_wait(uaddr, restart->futex.flags, restart->futex.val, tp, restart->futex.bitset); }
16 16 16 16 16 16 26 26 26 86 117 136 144 145 105 105 102 104 145 69 69 69 68 69 69 69 72 72 68 67 141 118 68 141 79 8 8 8 8 118 144 144 73 2 2 2 15 41 52 14 144 145 72 73 2 73 113 38 144 144 133 117 144 4 145 145 140 141 73 73 73 141 81 141 141 144 8 8 8 115 8 56 1 1 25 25 8 2 2 2 2 1 2 2 2 1 2 11 1 2 1 1 1 11 138 107 108 78 6 73 73 72 101 103 103 101 103 102 1 103 103 103 102 103 102 60 60 59 60 60 55 52 51 51 50 50 50 50 49 49 34 34 34 34 34 34 25 15 15 26 26 29 26 28 26 29 26 26 26 26 26 19 19 26 30 30 30 16 16 16 16 16 16 11 11 16 9 9 9 9 9 9 1 9 9 9 9 9 9 9 9 8 8 8 8 8 8 8 2 2 2 2 1 1 41 1 34 25 11 25 2 2 2 43 2 42 2 42 9 9 9 53 53 53 52 47 47 47 43 1 1 1 41 43 43 42 42 42 42 1 42 42 34 32 30 29 30 29 26 26 26 25 25 11 11 25 15 15 13 13 14 13 14 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2007 Oracle. All rights reserved. */ #include <linux/fs.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/writeback.h> #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> #include <linux/timekeeping.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" #include "locking.h" #include "tree-log.h" #include "volumes.h" #include "dev-replace.h" #include "qgroup.h" #include "block-group.h" #include "space-info.h" #include "zoned.h" #include "fs.h" #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" #include "scrub.h" static struct kmem_cache *btrfs_trans_handle_cachep; #define BTRFS_ROOT_TRANS_TAG 0 /* * Transaction states and transitions * * No running transaction (fs tree blocks are not modified) * | * | To next stage: * | Call start_transaction() variants. Except btrfs_join_transaction_nostart(). * V * Transaction N [[TRANS_STATE_RUNNING]] * | * | New trans handles can be attached to transaction N by calling all * | start_transaction() variants. * | * | To next stage: * | Call btrfs_commit_transaction() on any trans handle attached to * | transaction N * V * Transaction N [[TRANS_STATE_COMMIT_PREP]] * | * | If there are simultaneous calls to btrfs_commit_transaction() one will win * | the race and the rest will wait for the winner to commit the transaction. * | * | The winner will wait for previous running transaction to completely finish * | if there is one. * | * Transaction N [[TRANS_STATE_COMMIT_START]] * | * | Then one of the following happens: * | - Wait for all other trans handle holders to release. * | The btrfs_commit_transaction() caller will do the commit work. * | - Wait for current transaction to be committed by others. * | Other btrfs_commit_transaction() caller will do the commit work. * | * | At this stage, only btrfs_join_transaction*() variants can attach * | to this running transaction. * | All other variants will wait for current one to finish and attach to * | transaction N+1. * | * | To next stage: * | Caller is chosen to commit transaction N, and all other trans handle * | haven been released. * V * Transaction N [[TRANS_STATE_COMMIT_DOING]] * | * | The heavy lifting transaction work is started. * | From running delayed refs (modifying extent tree) to creating pending * | snapshots, running qgroups. * | In short, modify supporting trees to reflect modifications of subvolume * | trees. * | * | At this stage, all start_transaction() calls will wait for this * | transaction to finish and attach to transaction N+1. * | * | To next stage: * | Until all supporting trees are updated. * V * Transaction N [[TRANS_STATE_UNBLOCKED]] * | Transaction N+1 * | All needed trees are modified, thus we only [[TRANS_STATE_RUNNING]] * | need to write them back to disk and update | * | super blocks. | * | | * | At this stage, new transaction is allowed to | * | start. | * | All new start_transaction() calls will be | * | attached to transid N+1. | * | | * | To next stage: | * | Until all tree blocks are super blocks are | * | written to block devices | * V | * Transaction N [[TRANS_STATE_COMPLETED]] V * All tree blocks and super blocks are written. Transaction N+1 * This transaction is finished and all its [[TRANS_STATE_COMMIT_START]] * data structures will be cleaned up. | Life goes on */ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { [TRANS_STATE_RUNNING] = 0U, [TRANS_STATE_COMMIT_PREP] = 0U, [TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH), [TRANS_STATE_COMMIT_DOING] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOSTART), [TRANS_STATE_UNBLOCKED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), [TRANS_STATE_SUPER_COMMITTED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), [TRANS_STATE_COMPLETED] = (__TRANS_START | __TRANS_ATTACH | __TRANS_JOIN | __TRANS_JOIN_NOLOCK | __TRANS_JOIN_NOSTART), }; void btrfs_put_transaction(struct btrfs_transaction *transaction) { WARN_ON(refcount_read(&transaction->use_count) == 0); if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.href_root.rb_root)); WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.dirty_extent_root)); if (transaction->delayed_refs.pending_csums) btrfs_err(transaction->fs_info, "pending csums is %llu", transaction->delayed_refs.pending_csums); /* * If any block groups are found in ->deleted_bgs then it's * because the transaction was aborted and a commit did not * happen (things failed before writing the new superblock * and calling btrfs_finish_extent_commit()), so we can not * discard the physical locations of the block groups. */ while (!list_empty(&transaction->deleted_bgs)) { struct btrfs_block_group *cache; cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); list_del_init(&cache->bg_list); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } WARN_ON(!list_empty(&transaction->dev_update_list)); kfree(transaction); } } static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; /* * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ ASSERT(cur_trans->state == TRANS_STATE_COMMIT_DOING); down_write(&fs_info->commit_root_sem); if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) fs_info->last_reloc_trans = trans->transid; list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } /* We can free old roots now. */ spin_lock(&cur_trans->dropped_roots_lock); while (!list_empty(&cur_trans->dropped_roots)) { root = list_first_entry(&cur_trans->dropped_roots, struct btrfs_root, root_list); list_del_init(&root->root_list); spin_unlock(&cur_trans->dropped_roots_lock); btrfs_free_log(trans, root); btrfs_drop_and_free_fs_root(fs_info, root); spin_lock(&cur_trans->dropped_roots_lock); } spin_unlock(&cur_trans->dropped_roots_lock); up_write(&fs_info->commit_root_sem); } static inline void extwriter_counter_inc(struct btrfs_transaction *trans, unsigned int type) { if (type & TRANS_EXTWRITERS) atomic_inc(&trans->num_extwriters); } static inline void extwriter_counter_dec(struct btrfs_transaction *trans, unsigned int type) { if (type & TRANS_EXTWRITERS) atomic_dec(&trans->num_extwriters); } static inline void extwriter_counter_init(struct btrfs_transaction *trans, unsigned int type) { atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0)); } static inline int extwriter_counter_read(struct btrfs_transaction *trans) { return atomic_read(&trans->num_extwriters); } /* * To be called after doing the chunk btree updates right after allocating a new * chunk (after btrfs_chunk_alloc_add_chunk_item() is called), when removing a * chunk after all chunk btree updates and after finishing the second phase of * chunk allocation (btrfs_create_pending_block_groups()) in case some block * group had its chunk item insertion delayed to the second phase. */ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->chunk_bytes_reserved) return; btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, trans->chunk_bytes_reserved, NULL); trans->chunk_bytes_reserved = 0; } /* * either allocate a new transaction or hop into the existing one */ static noinline int join_transaction(struct btrfs_fs_info *fs_info, unsigned int type) { struct btrfs_transaction *cur_trans; spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); return -EROFS; } cur_trans = fs_info->running_transaction; if (cur_trans) { if (TRANS_ABORTED(cur_trans)) { spin_unlock(&fs_info->trans_lock); return cur_trans->aborted; } if (btrfs_blocked_trans_types[cur_trans->state] & type) { spin_unlock(&fs_info->trans_lock); return -EBUSY; } refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); return 0; } spin_unlock(&fs_info->trans_lock); /* * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the * current transaction, and commit it. If there is no transaction, just * return ENOENT. */ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART) return -ENOENT; /* * JOIN_NOLOCK only happens during the transaction commit, so * it is impossible that ->running_transaction is NULL */ BUG_ON(type == TRANS_JOIN_NOLOCK); cur_trans = kmalloc(sizeof(*cur_trans), GFP_NOFS); if (!cur_trans) return -ENOMEM; btrfs_lockdep_acquire(fs_info, btrfs_trans_num_writers); btrfs_lockdep_acquire(fs_info, btrfs_trans_num_extwriters); spin_lock(&fs_info->trans_lock); if (fs_info->running_transaction) { /* * someone started a transaction after we unlocked. Make sure * to redo the checks above */ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); goto loop; } else if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); return -EROFS; } cur_trans->fs_info = fs_info; atomic_set(&cur_trans->pending_ordered, 0); init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); init_waitqueue_head(&cur_trans->commit_wait); cur_trans->state = TRANS_STATE_RUNNING; /* * One for this trans handle, one so it will live on until we * commit the transaction. */ refcount_set(&cur_trans->use_count, 2); cur_trans->flags = 0; cur_trans->start_time = ktime_get_seconds(); memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs)); cur_trans->delayed_refs.href_root = RB_ROOT_CACHED; cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); /* * although the tree mod log is per file system and not per transaction, * the log must never go across transaction boundaries. */ smp_mb(); if (!list_empty(&fs_info->tree_mod_seq_list)) WARN(1, KERN_ERR "BTRFS: tree_mod_seq_list not empty when creating a fresh transaction\n"); if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) WARN(1, KERN_ERR "BTRFS: tree_mod_log rb tree not empty when creating a fresh transaction\n"); atomic64_set(&fs_info->tree_mod_seq, 0); spin_lock_init(&cur_trans->delayed_refs.lock); INIT_LIST_HEAD(&cur_trans->pending_snapshots); INIT_LIST_HEAD(&cur_trans->dev_update_list); INIT_LIST_HEAD(&cur_trans->switch_commits); INIT_LIST_HEAD(&cur_trans->dirty_bgs); INIT_LIST_HEAD(&cur_trans->io_bgs); INIT_LIST_HEAD(&cur_trans->dropped_roots); mutex_init(&cur_trans->cache_write_mutex); spin_lock_init(&cur_trans->dirty_bgs_lock); INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, IO_TREE_FS_PINNED_EXTENTS); btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; cur_trans->aborted = 0; spin_unlock(&fs_info->trans_lock); return 0; } /* * This does all the record keeping required to make sure that a shareable root * is properly recorded in a given transaction. This is required to make sure * the old root from before we joined the transaction is deleted when the * transaction commits. */ static int record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root, int force) { struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { WARN_ON(!force && root->commit_root != root->node); /* * see below for IN_TRANS_SETUP usage rules * we have the reloc mutex held now, so there * is only one writer in this function */ set_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); /* make sure readers find IN_TRANS_SETUP before * they find our root->last_trans update */ smp_wmb(); spin_lock(&fs_info->fs_roots_radix_lock); if (root->last_trans == trans->transid && !force) { spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } radix_tree_tag_set(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); root->last_trans = trans->transid; /* this is pretty tricky. We don't want to * take the relocation lock in btrfs_record_root_in_trans * unless we're really doing the first setup for this root in * this transaction. * * Normally we'd use root->last_trans as a flag to decide * if we want to take the expensive mutex. * * But, we have to set root->last_trans before we * init the relocation root, otherwise, we trip over warnings * in ctree.c. The solution used here is to flag ourselves * with root IN_TRANS_SETUP. When this is 1, we're still * fixing up the reloc trees and everyone must wait. * * When this is zero, they can trust root->last_trans and fly * through btrfs_record_root_in_trans without having to take the * lock. smp_wmb() makes sure that all the writes above are * done before we pop in the zero below */ ret = btrfs_init_reloc_root(trans, root); smp_mb__before_atomic(); clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } return ret; } void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; /* Add ourselves to the transaction dropped list */ spin_lock(&cur_trans->dropped_roots_lock); list_add_tail(&root->root_list, &cur_trans->dropped_roots); spin_unlock(&cur_trans->dropped_roots_lock); /* Make sure we don't try to update the root at commit time */ spin_lock(&fs_info->fs_roots_radix_lock); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); } int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; /* * see record_root_in_trans for comments about IN_TRANS_SETUP usage * and barriers */ smp_rmb(); if (root->last_trans == trans->transid && !test_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state)) return 0; mutex_lock(&fs_info->reloc_mutex); ret = record_root_in_trans(trans, root, 0); mutex_unlock(&fs_info->reloc_mutex); return ret; } static inline int is_transaction_blocked(struct btrfs_transaction *trans) { return (trans->state >= TRANS_STATE_COMMIT_START && trans->state < TRANS_STATE_UNBLOCKED && !TRANS_ABORTED(trans)); } /* wait for commit against the current transaction to become unblocked * when this is done, it is safe to start a new transaction, but the current * transaction might not be fully on disk. */ static void wait_current_trans(struct btrfs_fs_info *fs_info) { struct btrfs_transaction *cur_trans; spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); wait_event(fs_info->transaction_wait, cur_trans->state >= TRANS_STATE_UNBLOCKED || TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } else { spin_unlock(&fs_info->trans_lock); } } static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) return 0; if (type == TRANS_START) return 1; return 0; } static inline bool need_reserve_reloc_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; if (!fs_info->reloc_ctl || !test_bit(BTRFS_ROOT_SHAREABLE, &root->state) || root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID || root->reloc_root) return false; return true; } static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush, u64 num_bytes, u64 *delayed_refs_bytes) { struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info; u64 extra_delayed_refs_bytes = 0; u64 bytes; int ret; /* * If there's a gap between the size of the delayed refs reserve and * its reserved space, than some tasks have added delayed refs or bumped * its size otherwise (due to block group creation or removal, or block * group item update). Also try to allocate that gap in order to prevent * using (and possibly abusing) the global reserve when committing the * transaction. */ if (flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { spin_lock(&delayed_refs_rsv->lock); if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) extra_delayed_refs_bytes = delayed_refs_rsv->size - delayed_refs_rsv->reserved; spin_unlock(&delayed_refs_rsv->lock); } bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes; /* * We want to reserve all the bytes we may need all at once, so we only * do 1 enospc flushing cycle per transaction start. */ ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); if (ret == 0) { if (extra_delayed_refs_bytes > 0) btrfs_migrate_to_delayed_refs_rsv(fs_info, extra_delayed_refs_bytes); return 0; } if (extra_delayed_refs_bytes > 0) { bytes -= extra_delayed_refs_bytes; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); if (ret == 0) return 0; } /* * If we are an emergency flush, which can steal from the global block * reserve, then attempt to not reserve space for the delayed refs, as * we will consume space for them from the global block reserve. */ if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { bytes -= *delayed_refs_bytes; *delayed_refs_bytes = 0; ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush); } return ret; } static struct btrfs_trans_handle * start_transaction(struct btrfs_root *root, unsigned int num_items, unsigned int type, enum btrfs_reserve_flush_enum flush, bool enforce_qgroups) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *h; struct btrfs_transaction *cur_trans; u64 num_bytes = 0; u64 qgroup_reserved = 0; u64 delayed_refs_bytes = 0; bool reloc_reserved = false; bool do_chunk_alloc = false; int ret; if (BTRFS_FS_ERROR(fs_info)) return ERR_PTR(-EROFS); if (current->journal_info) { WARN_ON(type & TRANS_EXTWRITERS); h = current->journal_info; refcount_inc(&h->use_count); WARN_ON(refcount_read(&h->use_count) > 2); h->orig_rsv = h->block_rsv; h->block_rsv = NULL; goto got_it; } /* * Do the reservation before we join the transaction so we can do all * the appropriate flushing if need be. */ if (num_items && root != fs_info->chunk_root) { qgroup_reserved = num_items * fs_info->nodesize; /* * Use prealloc for now, as there might be a currently running * transaction that could free this reserved space prematurely * by committing. */ ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserved, enforce_qgroups, false); if (ret) return ERR_PTR(ret); num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items); /* * If we plan to insert/update/delete "num_items" from a btree, * we will also generate delayed refs for extent buffers in the * respective btree paths, so reserve space for the delayed refs * that will be generated by the caller as it modifies btrees. * Try to reserve them to avoid excessive use of the global * block reserve. */ delayed_refs_bytes = btrfs_calc_delayed_ref_bytes(fs_info, num_items); /* * Do the reservation for the relocation root creation */ if (need_reserve_reloc_root(root)) { num_bytes += fs_info->nodesize; reloc_reserved = true; } ret = btrfs_reserve_trans_metadata(fs_info, flush, num_bytes, &delayed_refs_bytes); if (ret) goto reserve_fail; btrfs_block_rsv_add_bytes(trans_rsv, num_bytes, true); if (trans_rsv->space_info->force_alloc) do_chunk_alloc = true; } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL && !btrfs_block_rsv_full(delayed_refs_rsv)) { /* * Some people call with btrfs_start_transaction(root, 0) * because they can be throttled, but have some other mechanism * for reserving space. We still want these guys to refill the * delayed block_rsv so just add 1 items worth of reservation * here. */ ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); if (ret) goto reserve_fail; } again: h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS); if (!h) { ret = -ENOMEM; goto alloc_fail; } /* * If we are JOIN_NOLOCK we're already committing a transaction and * waiting on this guy, so we don't need to do the sb_start_intwrite * because we're already holding a ref. We need this because we could * have raced in and did an fsync() on a file which can kick a commit * and then we deadlock with somebody doing a freeze. * * If we are ATTACH, it means we just want to catch the current * transaction and commit it, so we needn't do sb_start_intwrite(). */ if (type & __TRANS_FREEZABLE) sb_start_intwrite(fs_info->sb); if (may_wait_transaction(fs_info, type)) wait_current_trans(fs_info); do { ret = join_transaction(fs_info, type); if (ret == -EBUSY) { wait_current_trans(fs_info); if (unlikely(type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)) ret = -ENOENT; } } while (ret == -EBUSY); if (ret < 0) goto join_fail; cur_trans = fs_info->running_transaction; h->transid = cur_trans->transid; h->transaction = cur_trans; refcount_set(&h->use_count, 1); h->fs_info = root->fs_info; h->type = type; INIT_LIST_HEAD(&h->new_bgs); btrfs_init_metadata_block_rsv(fs_info, &h->delayed_rsv, BTRFS_BLOCK_RSV_DELOPS); smp_mb(); if (cur_trans->state >= TRANS_STATE_COMMIT_START && may_wait_transaction(fs_info, type)) { current->journal_info = h; btrfs_commit_transaction(h); goto again; } if (num_bytes) { trace_btrfs_space_reservation(fs_info, "transaction", h->transid, num_bytes, 1); h->block_rsv = trans_rsv; h->bytes_reserved = num_bytes; if (delayed_refs_bytes > 0) { trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", h->transid, delayed_refs_bytes, 1); h->delayed_refs_bytes_reserved = delayed_refs_bytes; btrfs_block_rsv_add_bytes(&h->delayed_rsv, delayed_refs_bytes, true); delayed_refs_bytes = 0; } h->reloc_reserved = reloc_reserved; } /* * Now that we have found a transaction to be a part of, convert the * qgroup reservation from prealloc to pertrans. A different transaction * can't race in and free our pertrans out from under us. */ if (qgroup_reserved) btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved); got_it: if (!current->journal_info) current->journal_info = h; /* * If the space_info is marked ALLOC_FORCE then we'll get upgraded to * ALLOC_FORCE the first run through, and then we won't allocate for * anybody else who races in later. We don't care about the return * value here. */ if (do_chunk_alloc && num_bytes) { u64 flags = h->block_rsv->space_info->flags; btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), CHUNK_ALLOC_NO_FORCE); } /* * btrfs_record_root_in_trans() needs to alloc new extents, and may * call btrfs_join_transaction() while we're also starting a * transaction. * * Thus it need to be called after current->journal_info initialized, * or we can deadlock. */ ret = btrfs_record_root_in_trans(h, root); if (ret) { /* * The transaction handle is fully initialized and linked with * other structures so it needs to be ended in case of errors, * not just freed. */ btrfs_end_transaction(h); return ERR_PTR(ret); } return h; join_fail: if (type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); kmem_cache_free(btrfs_trans_handle_cachep, h); alloc_fail: if (num_bytes) btrfs_block_rsv_release(fs_info, trans_rsv, num_bytes, NULL); if (delayed_refs_bytes) btrfs_space_info_free_bytes_may_use(fs_info, trans_rsv->space_info, delayed_refs_bytes); reserve_fail: btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved); return ERR_PTR(ret); } struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL, true); } struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv( struct btrfs_root *root, unsigned int num_items) { return start_transaction(root, num_items, TRANS_START, BTRFS_RESERVE_FLUSH_ALL_STEAL, false); } struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN, BTRFS_RESERVE_NO_FLUSH, true); } struct btrfs_trans_handle *btrfs_join_transaction_spacecache(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOLOCK, BTRFS_RESERVE_NO_FLUSH, true); } /* * Similar to regular join but it never starts a transaction when none is * running or when there's a running one at a state >= TRANS_STATE_UNBLOCKED. * This is similar to btrfs_attach_transaction() but it allows the join to * happen if the transaction commit already started but it's not yet in the * "doing" phase (the state is < TRANS_STATE_COMMIT_DOING). */ struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_JOIN_NOSTART, BTRFS_RESERVE_NO_FLUSH, true); } /* * Catch the running transaction. * * It is used when we want to commit the current the transaction, but * don't want to start a new one. * * Note: If this function return -ENOENT, it just means there is no * running transaction. But it is possible that the inactive transaction * is still in the memory, not fully on disk. If you hope there is no * inactive transaction in the fs when -ENOENT is returned, you should * invoke * btrfs_attach_transaction_barrier() */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { return start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); } /* * Catch the running transaction. * * It is similar to the above function, the difference is this one * will wait for all the inactive transactions until they fully * complete. */ struct btrfs_trans_handle * btrfs_attach_transaction_barrier(struct btrfs_root *root) { struct btrfs_trans_handle *trans; trans = start_transaction(root, 0, TRANS_ATTACH, BTRFS_RESERVE_NO_FLUSH, true); if (trans == ERR_PTR(-ENOENT)) { int ret; ret = btrfs_wait_for_commit(root->fs_info, 0); if (ret) return ERR_PTR(ret); } return trans; } /* Wait for a transaction commit to reach at least the given state. */ static noinline void wait_for_commit(struct btrfs_transaction *commit, const enum btrfs_trans_state min_state) { struct btrfs_fs_info *fs_info = commit->fs_info; u64 transid = commit->transid; bool put = false; /* * At the moment this function is called with min_state either being * TRANS_STATE_COMPLETED or TRANS_STATE_SUPER_COMMITTED. */ if (min_state == TRANS_STATE_COMPLETED) btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); else btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); while (1) { wait_event(commit->commit_wait, commit->state >= min_state); if (put) btrfs_put_transaction(commit); if (min_state < TRANS_STATE_COMPLETED) break; /* * A transaction isn't really completed until all of the * previous transactions are completed, but with fsync we can * end up with SUPER_COMMITTED transactions before a COMPLETED * transaction. Wait for those. */ spin_lock(&fs_info->trans_lock); commit = list_first_entry_or_null(&fs_info->trans_list, struct btrfs_transaction, list); if (!commit || commit->transid > transid) { spin_unlock(&fs_info->trans_lock); break; } refcount_inc(&commit->use_count); put = true; spin_unlock(&fs_info->trans_lock); } } int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) { struct btrfs_transaction *cur_trans = NULL, *t; int ret = 0; if (transid) { if (transid <= btrfs_get_last_trans_committed(fs_info)) goto out; /* find specified transaction */ spin_lock(&fs_info->trans_lock); list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; refcount_inc(&cur_trans->use_count); ret = 0; break; } if (t->transid > transid) { ret = 0; break; } } spin_unlock(&fs_info->trans_lock); /* * The specified transaction doesn't exist, or we * raced with btrfs_commit_transaction */ if (!cur_trans) { if (transid > btrfs_get_last_trans_committed(fs_info)) ret = -EINVAL; goto out; } } else { /* find newest transaction that is committing | committed */ spin_lock(&fs_info->trans_lock); list_for_each_entry_reverse(t, &fs_info->trans_list, list) { if (t->state >= TRANS_STATE_COMMIT_START) { if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; refcount_inc(&cur_trans->use_count); break; } } spin_unlock(&fs_info->trans_lock); if (!cur_trans) goto out; /* nothing committing|committed */ } wait_for_commit(cur_trans, TRANS_STATE_COMPLETED); ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); out: return ret; } void btrfs_throttle(struct btrfs_fs_info *fs_info) { wait_current_trans(fs_info); } bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; if (cur_trans->state >= TRANS_STATE_COMMIT_START || test_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) return true; if (btrfs_check_space_for_delayed_refs(trans->fs_info)) return true; return !!btrfs_block_rsv_check(&trans->fs_info->global_block_rsv, 50); } static void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; if (!trans->block_rsv) { ASSERT(!trans->bytes_reserved); ASSERT(!trans->delayed_refs_bytes_reserved); return; } if (!trans->bytes_reserved) { ASSERT(!trans->delayed_refs_bytes_reserved); return; } ASSERT(trans->block_rsv == &fs_info->trans_block_rsv); trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 0); btrfs_block_rsv_release(fs_info, trans->block_rsv, trans->bytes_reserved, NULL); trans->bytes_reserved = 0; if (!trans->delayed_refs_bytes_reserved) return; trace_btrfs_space_reservation(fs_info, "local_delayed_refs_rsv", trans->transid, trans->delayed_refs_bytes_reserved, 0); btrfs_block_rsv_release(fs_info, &trans->delayed_rsv, trans->delayed_refs_bytes_reserved, NULL); trans->delayed_refs_bytes_reserved = 0; } static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, int throttle) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; int err = 0; if (refcount_read(&trans->use_count) > 1) { refcount_dec(&trans->use_count); trans->block_rsv = trans->orig_rsv; return 0; } btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; btrfs_create_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb); WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); extwriter_counter_dec(cur_trans, trans->type); cond_wake_up(&cur_trans->writer_wait); btrfs_lockdep_release(info, btrfs_trans_num_extwriters); btrfs_lockdep_release(info, btrfs_trans_num_writers); btrfs_put_transaction(cur_trans); if (current->journal_info == trans) current->journal_info = NULL; if (throttle) btrfs_run_delayed_iputs(info); if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) err = trans->aborted; else err = -EROFS; } kmem_cache_free(btrfs_trans_handle_cachep, trans); return err; } int btrfs_end_transaction(struct btrfs_trans_handle *trans) { return __btrfs_end_transaction(trans, 0); } int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans) { return __btrfs_end_transaction(trans, 1); } /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of * those extents are sent to disk but does not wait on them */ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages, int mark) { int err = 0; int werr = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (find_first_extent_bit(dirty_pages, start, &start, &end, mark, &cached_state)) { bool wait_writeback = false; err = convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. So when it happens, ignore the error * and wait for writeback of this range to finish - because we * failed to set the bit EXTENT_NEED_WAIT for the range, a call * to __btrfs_wait_marked_extents() would not know that * writeback for this range started and therefore wouldn't * wait for it to finish - we don't want to commit a * superblock that points to btree nodes/leafs for which * writeback hasn't finished yet (and without errors). * We cleanup any entries left in the io tree when committing * the transaction (through extent_io_tree_release()). */ if (err == -ENOMEM) { err = 0; wait_writeback = true; } if (!err) err = filemap_fdatawrite_range(mapping, start, end); if (err) werr = err; else if (wait_writeback) werr = filemap_fdatawait_range(mapping, start, end); free_extent_state(cached_state); cached_state = NULL; cond_resched(); start = end + 1; } return werr; } /* * when btree blocks are allocated, they have some corresponding bits set for * them in one of two extent_io trees. This is used to make sure all of * those extents are on disk for transaction or log commit. We wait * on all the pages and clear them from the dirty pages state tree */ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { int err = 0; int werr = 0; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; while (find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries * left in the io tree. For a log commit, we don't remove them * after committing the log because the tree can be accessed * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ err = clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, &cached_state); if (err == -ENOMEM) err = 0; if (!err) err = filemap_fdatawait_range(mapping, start, end); if (err) werr = err; free_extent_state(cached_state); cached_state = NULL; cond_resched(); start = end + 1; } if (err) werr = err; return werr; } static int btrfs_wait_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { bool errors = false; int err; err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if (test_and_clear_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags)) errors = true; if (errors && !err) err = -EIO; return err; } int btrfs_wait_tree_log_extents(struct btrfs_root *log_root, int mark) { struct btrfs_fs_info *fs_info = log_root->fs_info; struct extent_io_tree *dirty_pages = &log_root->dirty_log_pages; bool errors = false; int err; ASSERT(log_root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID); err = __btrfs_wait_marked_extents(fs_info, dirty_pages); if ((mark & EXTENT_DIRTY) && test_and_clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags)) errors = true; if ((mark & EXTENT_NEW) && test_and_clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags)) errors = true; if (errors && !err) err = -EIO; return err; } /* * When btree blocks are allocated the corresponding extents are marked dirty. * This function ensures such extents are persisted on disk for transaction or * log commit. * * @trans: transaction whose dirty pages we'd like to write */ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) { int ret; int ret2; struct extent_io_tree *dirty_pages = &trans->transaction->dirty_pages; struct btrfs_fs_info *fs_info = trans->fs_info; struct blk_plug plug; blk_start_plug(&plug); ret = btrfs_write_marked_extents(fs_info, dirty_pages, EXTENT_DIRTY); blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; else if (ret2) return ret2; else return 0; } /* * this is used to update the root pointer in the tree of tree roots. * * But, in the case of the extent allocation tree, updating the root * pointer may allocate blocks which may change the root of the extent * allocation tree. * * So, this loops and repeats and makes sure the cowonly root didn't * change while the root pointer was being updated in the metadata. */ static int update_cowonly_root(struct btrfs_trans_handle *trans, struct btrfs_root *root) { int ret; u64 old_root_bytenr; u64 old_root_used; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_root *tree_root = fs_info->tree_root; old_root_used = btrfs_root_used(&root->root_item); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start && old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); ret = btrfs_update_root(trans, tree_root, &root->root_key, &root->root_item); if (ret) return ret; old_root_used = btrfs_root_used(&root->root_item); } return 0; } /* * update all the cowonly tree roots on disk * * The error handling in this function may not be obvious. Any of the * failures will cause the file system to go offline. We still need * to clean up the delayed refs. */ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; struct list_head *next; struct extent_buffer *eb; int ret; /* * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret) return ret; ret = btrfs_run_dev_stats(trans); if (ret) return ret; ret = btrfs_run_dev_replace(trans); if (ret) return ret; ret = btrfs_run_qgroups(trans); if (ret) return ret; ret = btrfs_setup_space_cache(trans); if (ret) return ret; again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; next = fs_info->dirty_cowonly_roots.next; list_del_init(next); root = list_entry(next, struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); list_add_tail(&root->dirty_list, &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; } /* Now flush any delayed refs generated by updating all of the roots */ ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; while (!list_empty(dirty_bgs) || !list_empty(io_bgs)) { ret = btrfs_write_dirty_block_groups(trans); if (ret) return ret; /* * We're writing the dirty block groups, which could generate * delayed refs, which could generate more dirty block groups, * so we want to keep this flushing in this loop to make sure * everything gets run. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) return ret; } if (!list_empty(&fs_info->dirty_cowonly_roots)) goto again; /* Update dev-replace pointer once everything is committed */ fs_info->dev_replace.committed_cursor_left = fs_info->dev_replace.cursor_left_last_write_of_item; return 0; } /* * If we had a pending drop we need to see if there are any others left in our * dead roots list, and if not clear our bit and wake any waiters. */ void btrfs_maybe_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { /* * We put the drop in progress roots at the front of the list, so if the * first entry doesn't have UNFINISHED_DROP set we can wake everybody * up. */ spin_lock(&fs_info->trans_lock); if (!list_empty(&fs_info->dead_roots)) { struct btrfs_root *root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) { spin_unlock(&fs_info->trans_lock); return; } } spin_unlock(&fs_info->trans_lock); btrfs_wake_unfinished_drop(fs_info); } /* * dead roots are old snapshots that need to be deleted. This allocates * a dirty root struct and adds it into the list of dead roots that need to * be deleted */ void btrfs_add_dead_root(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; spin_lock(&fs_info->trans_lock); if (list_empty(&root->root_list)) { btrfs_grab_root(root); /* We want to process the partially complete drops first. */ if (test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state)) list_add(&root->root_list, &fs_info->dead_roots); else list_add_tail(&root->root_list, &fs_info->dead_roots); } spin_unlock(&fs_info->trans_lock); } /* * Update each subvolume root and its relocation root, if it exists, in the tree * of tree roots. Also free log roots if they exist. */ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *gang[8]; int i; int ret; /* * At this point no one can be using this transaction to modify any tree * and no one can start another transaction to modify any tree either. */ ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING); spin_lock(&fs_info->fs_roots_radix_lock); while (1) { ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, (void **)gang, 0, ARRAY_SIZE(gang), BTRFS_ROOT_TRANS_TAG); if (ret == 0) break; for (i = 0; i < ret; i++) { struct btrfs_root *root = gang[i]; int ret2; /* * At this point we can neither have tasks logging inodes * from a root nor trying to commit a log tree. */ ASSERT(atomic_read(&root->log_writers) == 0); ASSERT(atomic_read(&root->log_commit[0]) == 0); ASSERT(atomic_read(&root->log_commit[1]) == 0); radix_tree_tag_clear(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, BTRFS_ROOT_TRANS_TAG); spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); ret2 = btrfs_update_reloc_root(trans, root); if (ret2) return ret2; /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_mb__after_atomic(); if (root->commit_root != root->node) { list_add_tail(&root->dirty_list, &trans->transaction->switch_commits); btrfs_set_root_node(&root->root_item, root->node); } ret2 = btrfs_update_root(trans, fs_info->tree_root, &root->root_key, &root->root_item); if (ret2) return ret2; spin_lock(&fs_info->fs_roots_radix_lock); btrfs_qgroup_free_meta_all_pertrans(root); } } spin_unlock(&fs_info->fs_roots_radix_lock); return 0; } /* * Do all special snapshot related qgroup dirty hack. * * Will do all needed qgroup inherit and dirty hack like switch commit * roots inside one transaction and write all btree into disk, to make * qgroup works. */ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *src, struct btrfs_root *parent, struct btrfs_qgroup_inherit *inherit, u64 dst_objectid) { struct btrfs_fs_info *fs_info = src->fs_info; int ret; /* * Save some performance in the case that qgroups are not enabled. If * this check races with the ioctl, rescan will kick in anyway. */ if (!btrfs_qgroup_full_accounting(fs_info)) return 0; /* * Ensure dirty @src will be committed. Or, after coming * commit_fs_roots() and switch_commit_roots(), any dirty but not * recorded root will never be updated again, causing an outdated root * item. */ ret = record_root_in_trans(trans, src, 1); if (ret) return ret; /* * btrfs_qgroup_inherit relies on a consistent view of the usage for the * src root, so we must run the delayed refs here. * * However this isn't particularly fool proof, because there's no * synchronization keeping us from changing the tree after this point * before we do the qgroup_inherit, or even from making changes while * we're doing the qgroup_inherit. But that's a problem for the future, * for now flush the delayed refs to narrow the race window where the * qgroup counters could end up wrong. */ ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } ret = commit_fs_roots(trans); if (ret) goto out; ret = btrfs_qgroup_account_extents(trans); if (ret < 0) goto out; /* Now qgroup are all updated, we can inherit it to new qgroups */ ret = btrfs_qgroup_inherit(trans, src->root_key.objectid, dst_objectid, parent->root_key.objectid, inherit); if (ret < 0) goto out; /* * Now we do a simplified commit transaction, which will: * 1) commit all subvolume and extent tree * To ensure all subvolume and extent tree have a valid * commit_root to accounting later insert_dir_item() * 2) write all btree blocks onto disk * This is to make sure later btree modification will be cowed * Or commit_root can be populated and cause wrong qgroup numbers * In this simplified commit, we don't really care about other trees * like chunk and root tree, as they won't affect qgroup. * And we don't write super to avoid half committed status. */ ret = commit_cowonly_roots(trans); if (ret) goto out; switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); if (ret) btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction for qgroup"); out: /* * Force parent root to be updated, as we recorded it before so its * last_trans == cur_transid. * Or it won't be committed again onto disk after later * insert_dir_item() */ if (!ret) ret = record_root_in_trans(trans, parent, 1); return ret; } /* * new snapshots need to be created at a very specific time in the * transaction commit. This does the actual creation. * * Note: * If the error which may affect the commitment of the current transaction * happens, we should return the error number. If the error which just affect * the creation of the pending snapshots, just return 0. */ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_pending_snapshot *pending) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_key key; struct btrfs_root_item *new_root_item; struct btrfs_root *tree_root = fs_info->tree_root; struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; struct inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; int ret = 0; u64 to_reserve = 0; u64 index = 0; u64 objectid; u64 root_flags; unsigned int nofs_flags; struct fscrypt_name fname; ASSERT(pending->path); path = pending->path; ASSERT(pending->root_item); new_root_item = pending->root_item; /* * We're inside a transaction and must make sure that any potential * allocations with GFP_KERNEL in fscrypt won't recurse back to * filesystem. */ nofs_flags = memalloc_nofs_save(); pending->error = fscrypt_setup_filename(parent_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); if (pending->error) goto free_pending; pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is * accounted by later btrfs_qgroup_inherit(). */ btrfs_set_skip_qgroup(trans, objectid); btrfs_reloc_pre_snapshot(pending, &to_reserve); if (to_reserve > 0) { pending->error = btrfs_block_rsv_add(fs_info, &pending->block_rsv, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) goto clear_skip_qgroup; } key.objectid = objectid; key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; trans->bytes_reserved = trans->block_rsv->reserved; trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); parent_root = BTRFS_I(parent_inode)->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; cur_time = current_time(parent_inode); /* * insert the directory item */ ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; } else if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); btrfs_abort_transaction(trans, ret); goto fail; } btrfs_release_path(path); ret = btrfs_create_qgroup(trans, objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* * pull in the delayed directory update * and the delayed inode item * otherwise we corrupt the FS during * snapshot */ ret = btrfs_run_delayed_items(trans); if (ret) { /* Transaction aborted */ btrfs_abort_transaction(trans, ret); goto fail; } ret = record_root_in_trans(trans, root, 0); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); btrfs_check_and_init_root_item(new_root_item); root_flags = btrfs_root_flags(new_root_item); if (pending->readonly) root_flags |= BTRFS_ROOT_SUBVOL_RDONLY; else root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY; btrfs_set_root_flags(new_root_item, root_flags); btrfs_set_root_generation_v2(new_root_item, trans->transid); generate_random_guid(new_root_item->uuid); memcpy(new_root_item->parent_uuid, root->root_item.uuid, BTRFS_UUID_SIZE); if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) { memset(new_root_item->received_uuid, 0, sizeof(new_root_item->received_uuid)); memset(&new_root_item->stime, 0, sizeof(new_root_item->stime)); memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime)); btrfs_set_root_stransid(new_root_item, 0); btrfs_set_root_rtransid(new_root_item, 0); } btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec); btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_copy_root(trans, root, old, &tmp, objectid); /* clean up in any case */ btrfs_tree_unlock(old); free_extent_buffer(old); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); btrfs_set_root_node(new_root_item, tmp); /* record when the snapshot was created in key.offset */ key.offset = trans->transid; ret = btrfs_insert_root(trans, tree_root, &key, new_root_item); btrfs_tree_unlock(tmp); free_extent_buffer(tmp); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* * insert root back/forward references */ ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } key.offset = (u64)-1; pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev); if (IS_ERR(pending->snap)) { ret = PTR_ERR(pending->snap); pending->snap = NULL; btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_reloc_post_snapshot(trans, pending); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } /* * Do special qgroup accounting for snapshot, as we do some qgroup * snapshot hack to do fast snapshot. * To co-operate with that hack, we do hack again. * Or snapshot will be greatly slowed down by a subtree qgroup rescan */ if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_FULL) ret = qgroup_account_snapshot(trans, root, parent_root, pending->inherit, objectid); else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) ret = btrfs_qgroup_inherit(trans, root->root_key.objectid, objectid, parent_root->root_key.objectid, pending->inherit); if (ret < 0) goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + fname.disk_name.len * 2); inode_set_mtime_to_ts(parent_inode, inode_set_ctime_current(parent_inode)); ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } ret = btrfs_uuid_tree_add(trans, new_root_item->uuid, BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) { ret = btrfs_uuid_tree_add(trans, new_root_item->received_uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, objectid); if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); goto fail; } } fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); free_fname: fscrypt_free_filename(&fname); free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); pending->path = NULL; return ret; } /* * create all the snapshots we've scheduled for creation */ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans) { struct btrfs_pending_snapshot *pending, *next; struct list_head *head = &trans->transaction->pending_snapshots; int ret = 0; list_for_each_entry_safe(pending, next, head, list) { list_del(&pending->list); ret = create_pending_snapshot(trans, pending); if (ret) break; } return ret; } static void update_super_roots(struct btrfs_fs_info *fs_info) { struct btrfs_root_item *root_item; struct btrfs_super_block *super; super = fs_info->super_copy; root_item = &fs_info->chunk_root->root_item; super->chunk_root = root_item->bytenr; super->chunk_root_generation = root_item->generation; super->chunk_root_level = root_item->level; root_item = &fs_info->tree_root->root_item; super->root = root_item->bytenr; super->generation = root_item->generation; super->root_level = root_item->level; if (btrfs_test_opt(fs_info, SPACE_CACHE)) super->cache_generation = root_item->generation; else if (test_bit(BTRFS_FS_CLEANUP_SPACE_CACHE_V1, &fs_info->flags)) super->cache_generation = 0; if (test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags)) super->uuid_tree_generation = root_item->generation; } int btrfs_transaction_in_commit(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; int ret = 0; spin_lock(&info->trans_lock); trans = info->running_transaction; if (trans) ret = (trans->state >= TRANS_STATE_COMMIT_START); spin_unlock(&info->trans_lock); return ret; } int btrfs_transaction_blocked(struct btrfs_fs_info *info) { struct btrfs_transaction *trans; int ret = 0; spin_lock(&info->trans_lock); trans = info->running_transaction; if (trans) ret = is_transaction_blocked(trans); spin_unlock(&info->trans_lock); return ret; } void btrfs_commit_transaction_async(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans; /* Kick the transaction kthread. */ set_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); /* take transaction reference */ cur_trans = trans->transaction; refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); /* * Wait for the current transaction commit to start and block * subsequent transaction joins */ btrfs_might_wait_for_state(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); wait_event(fs_info->transaction_blocked_wait, cur_trans->state >= TRANS_STATE_COMMIT_START || TRANS_ABORTED(cur_trans)); btrfs_put_transaction(cur_trans); } static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; WARN_ON(refcount_read(&trans->use_count) > 1); btrfs_abort_transaction(trans, err); spin_lock(&fs_info->trans_lock); /* * If the transaction is removed from the list, it means this * transaction has been committed successfully, so it is impossible * to call the cleanup function. */ BUG_ON(list_empty(&cur_trans->list)); if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); /* * The thread has already released the lockdep map as reader * already in btrfs_commit_transaction(). */ btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); spin_lock(&fs_info->trans_lock); } /* * Now that we know no one else is still using the transaction we can * remove the transaction from the list of transactions. This avoids * the transaction kthread from cleaning up the transaction while some * other task is still using it, which could result in a use-after-free * on things like log trees, as it forces the transaction kthread to * wait for this transaction to be cleaned up by us. */ list_del_init(&cur_trans->list); spin_unlock(&fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, fs_info); spin_lock(&fs_info->trans_lock); if (cur_trans == fs_info->running_transaction) fs_info->running_transaction = NULL; spin_unlock(&fs_info->trans_lock); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); trace_btrfs_transaction_commit(fs_info); if (current->journal_info == trans) current->journal_info = NULL; /* * If relocation is running, we can't cancel scrub because that will * result in a deadlock. Before relocating a block group, relocation * pauses scrub, then starts and commits a transaction before unpausing * scrub. If the transaction commit is being done by the relocation * task or triggered by another task and the relocation task is waiting * for the commit, and we end up here due to an error in the commit * path, then calling btrfs_scrub_cancel() will deadlock, as we are * asking for scrub to stop while having it asked to be paused higher * above in relocation code. */ if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) btrfs_scrub_cancel(fs_info); kmem_cache_free(btrfs_trans_handle_cachep, trans); } /* * Release reserved delayed ref space of all pending block groups of the * transaction and remove them from the list */ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); list_del_init(&block_group->bg_list); } } static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info) { /* * We use try_to_writeback_inodes_sb() here because if we used * btrfs_start_delalloc_roots we would deadlock with fs freeze. * Currently are holding the fs freeze lock, if we do an async flush * we'll do btrfs_join_transaction() and deadlock because we need to * wait for the fs freeze lock. Using the direct flushing we benefit * from already being in a transaction and our join_transaction doesn't * have to re-take the fs freeze lock. * * Note that try_to_writeback_inodes_sb() will only trigger writeback * if it can read lock sb->s_umount. It will always be able to lock it, * except when the filesystem is being unmounted or being frozen, but in * those cases sync_filesystem() is called, which results in calling * writeback_inodes_sb() while holding a write lock on sb->s_umount. * Note that we don't call writeback_inodes_sb() directly, because it * will emit a warning if sb->s_umount is not locked. */ if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) try_to_writeback_inodes_sb(fs_info->sb, WB_REASON_SYNC); return 0; } static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) { if (btrfs_test_opt(fs_info, FLUSHONCOMMIT)) btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); } /* * Add a pending snapshot associated with the given transaction handle to the * respective handle. This must be called after the transaction commit started * and while holding fs_info->trans_lock. * This serves to guarantee a caller of btrfs_commit_transaction() that it can * safely free the pending snapshot pointer in case btrfs_commit_transaction() * returns an error. */ static void add_pending_snapshot(struct btrfs_trans_handle *trans) { struct btrfs_transaction *cur_trans = trans->transaction; if (!trans->pending_snapshot) return; lockdep_assert_held(&trans->fs_info->trans_lock); ASSERT(cur_trans->state >= TRANS_STATE_COMMIT_PREP); list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) { fs_info->commit_stats.commit_count++; fs_info->commit_stats.last_commit_dur = interval; fs_info->commit_stats.max_commit_dur = max_t(u64, fs_info->commit_stats.max_commit_dur, interval); fs_info->commit_stats.total_commit_dur += interval; } int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; ktime_t start_time; ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto lockdep_trans_commit_start_release; } btrfs_trans_release_metadata(trans); trans->block_rsv = NULL; /* * We only want one transaction commit doing the flushing so we do not * waste a bunch of time on lock contention on the extent root node. */ if (!test_and_set_bit(BTRFS_DELAYED_REFS_FLUSHING, &cur_trans->delayed_refs.flags)) { /* * Make a pass through all the delayed refs we have so far. * Any running threads may add more while we are here. */ ret = btrfs_run_delayed_refs(trans, 0); if (ret) goto lockdep_trans_commit_start_release; } btrfs_create_pending_block_groups(trans); if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) { int run_it = 0; /* this mutex is also taken before trying to set * block groups readonly. We need to make sure * that nobody has set a block group readonly * after a extents from that block group have been * allocated for cache files. btrfs_set_block_group_ro * will wait for the transaction to commit if it * finds BTRFS_TRANS_DIRTY_BG_RUN set. * * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure * only one process starts all the block group IO. It wouldn't * hurt to have more than one go through, but there's no * real advantage to it either. */ mutex_lock(&fs_info->ro_block_group_mutex); if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) run_it = 1; mutex_unlock(&fs_info->ro_block_group_mutex); if (run_it) { ret = btrfs_start_dirty_block_groups(trans); if (ret) goto lockdep_trans_commit_start_release; } } spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_PREP) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; add_pending_snapshot(trans); spin_unlock(&fs_info->trans_lock); refcount_inc(&cur_trans->use_count); if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans, want_state); if (TRANS_ABORTED(cur_trans)) ret = cur_trans->aborted; btrfs_put_transaction(cur_trans); return ret; } cur_trans->state = TRANS_STATE_COMMIT_PREP; wake_up(&fs_info->transaction_blocked_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); if (cur_trans->list.prev != &fs_info->trans_list) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans, want_state); ret = READ_ONCE(prev_trans->aborted); btrfs_put_transaction(prev_trans); if (ret) goto lockdep_release; spin_lock(&fs_info->trans_lock); } } else { /* * The previous transaction was aborted and was already removed * from the list of transactions at fs_info->trans_list. So we * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ if (BTRFS_FS_ERROR(fs_info)) { spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; } } cur_trans->state = TRANS_STATE_COMMIT_START; wake_up(&fs_info->transaction_blocked_wait); spin_unlock(&fs_info->trans_lock); /* * Get the time spent on the work done by the commit thread and not * the time spent waiting on a previous commit */ start_time = ktime_get_ns(); extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); if (ret) goto lockdep_release; ret = btrfs_run_delayed_items(trans); if (ret) goto lockdep_release; /* * The thread has started/joined the transaction thus it holds the * lockdep map as a reader. It has to release it before acquiring the * lockdep map as a writer. */ btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_might_wait_for_event(fs_info, btrfs_trans_num_extwriters); wait_event(cur_trans->writer_wait, extwriter_counter_read(cur_trans) == 0); /* some pending stuffs might be added after the previous flush. */ ret = btrfs_run_delayed_items(trans); if (ret) { btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; } btrfs_wait_delalloc_flush(fs_info); /* * Wait for all ordered extents started by a fast fsync that joined this * transaction. Otherwise if this transaction commits before the ordered * extents complete we lose logged data after a power failure. */ btrfs_might_wait_for_event(fs_info, btrfs_trans_pending_ordered); wait_event(cur_trans->pending_wait, atomic_read(&cur_trans->pending_ordered) == 0); btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we * commit the transaction. We could have started a join before setting * COMMIT_DOING so make sure to wait for num_writers to == 1 again. */ spin_lock(&fs_info->trans_lock); add_pending_snapshot(trans); cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); /* * The thread has started/joined the transaction thus it holds the * lockdep map as a reader. It has to release it before acquiring the * lockdep map as a writer. */ btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); btrfs_might_wait_for_event(fs_info, btrfs_trans_num_writers); wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); /* * Make lockdep happy by acquiring the state locks after * btrfs_trans_num_writers is released. If we acquired the state locks * before releasing the btrfs_trans_num_writers lock then lockdep would * complain because we did not follow the reverse order unlocking rule. */ btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); /* * We've started the commit, clear the flag in case we were triggered to * do an async commit but somebody else started before the transaction * kthread could do the work. */ clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags); if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); goto scrub_continue; } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving * extents around in the middle of the commit */ mutex_lock(&fs_info->reloc_mutex); /* * We needn't worry about the delayed items because we will * deal with them in create_pending_snapshot(), which is the * core function of the snapshot creation. */ ret = create_pending_snapshots(trans); if (ret) goto unlock_reloc; /* * We insert the dir indexes of the snapshots and update the inode * of the snapshots' parents after the snapshot creation, so there * are some delayed items which are not dealt with. Now deal with * them. * * We needn't worry that this operation will corrupt the snapshots, * because all the tree which are snapshoted will be forced to COW * the nodes and leaves. */ ret = btrfs_run_delayed_items(trans); if (ret) goto unlock_reloc; ret = btrfs_run_delayed_refs(trans, U64_MAX); if (ret) goto unlock_reloc; /* * make sure none of the code above managed to slip in a * delayed item */ btrfs_assert_delayed_root_empty(fs_info); WARN_ON(cur_trans != trans->transaction); ret = commit_fs_roots(trans); if (ret) goto unlock_reloc; /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ btrfs_free_log_root_tree(trans, fs_info); /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. */ ret = btrfs_qgroup_account_extents(trans); if (ret < 0) goto unlock_reloc; ret = commit_cowonly_roots(trans); if (ret) goto unlock_reloc; /* * The tasks which save the space cache and inode cache may also * update ->aborted, check it. */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; goto unlock_reloc; } cur_trans = fs_info->running_transaction; btrfs_set_root_node(&fs_info->tree_root->root_item, fs_info->tree_root->node); list_add_tail(&fs_info->tree_root->dirty_list, &cur_trans->switch_commits); btrfs_set_root_node(&fs_info->chunk_root->root_item, fs_info->chunk_root->node); list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { btrfs_set_root_node(&fs_info->block_group_root->root_item, fs_info->block_group_root->node); list_add_tail(&fs_info->block_group_root->dirty_list, &cur_trans->switch_commits); } switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); btrfs_set_super_log_root(fs_info->super_copy, 0); btrfs_set_super_log_root_level(fs_info->super_copy, 0); memcpy(fs_info->super_for_commit, fs_info->super_copy, sizeof(*fs_info->super_copy)); btrfs_commit_device_sizes(cur_trans); clear_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); clear_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); btrfs_trans_release_chunk_metadata(trans); /* * Before changing the transaction state to TRANS_STATE_UNBLOCKED and * setting fs_info->running_transaction to NULL, lock tree_log_mutex to * make sure that before we commit our superblock, no other task can * start a new transaction and commit a log tree before we commit our * superblock. Anyone trying to commit a log tree locks this mutex before * writing its superblock. */ mutex_lock(&fs_info->tree_log_mutex); spin_lock(&fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; fs_info->running_transaction = NULL; spin_unlock(&fs_info->trans_lock); mutex_unlock(&fs_info->reloc_mutex); wake_up(&fs_info->transaction_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); /* If we have features changed, wake up the cleaner to update sysfs. */ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && fs_info->cleaner_kthread) wake_up_process(fs_info->cleaner_kthread); ret = btrfs_write_and_wait_transaction(trans); if (ret) { btrfs_handle_fs_error(fs_info, ret, "Error while writing out transaction"); mutex_unlock(&fs_info->tree_log_mutex); goto scrub_continue; } ret = write_all_supers(fs_info, 0); /* * the super is written, we can safely allow the tree-loggers * to go about their business */ mutex_unlock(&fs_info->tree_log_mutex); if (ret) goto scrub_continue; /* * We needn't acquire the lock here because there is no other task * which can change it. */ cur_trans->state = TRANS_STATE_SUPER_COMMITTED; wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_finish_extent_commit(trans); if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); btrfs_set_last_trans_committed(fs_info, cur_trans->transid); /* * We needn't acquire the lock here because there is no other task * which can change it. */ cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); spin_lock(&fs_info->trans_lock); list_del_init(&cur_trans->list); spin_unlock(&fs_info->trans_lock); btrfs_put_transaction(cur_trans); btrfs_put_transaction(cur_trans); if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(fs_info->sb); trace_btrfs_transaction_commit(fs_info); interval = ktime_get_ns() - start_time; btrfs_scrub_continue(fs_info); if (current->journal_info == trans) current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); update_commit_stats(fs_info, interval); return ret; unlock_reloc: mutex_unlock(&fs_info->reloc_mutex); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); scrub_continue: btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMPLETED); btrfs_scrub_continue(fs_info); cleanup_transaction: btrfs_trans_release_metadata(trans); btrfs_cleanup_pending_block_groups(trans); btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); if (current->journal_info == trans) current->journal_info = NULL; cleanup_transaction(trans, ret); return ret; lockdep_release: btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); goto cleanup_transaction; lockdep_trans_commit_start_release: btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); btrfs_end_transaction(trans); return ret; } /* * return < 0 if error * 0 if there are no more dead_roots at the time of call * 1 there are more to be processed, call me again * * The return value indicates there are certainly more snapshots to delete, but * if there comes a new one during processing, it may return 0. We don't mind, * because btrfs_commit_super will poke cleaner thread and it will process it a * few seconds later. */ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) { struct btrfs_root *root; int ret; spin_lock(&fs_info->trans_lock); if (list_empty(&fs_info->dead_roots)) { spin_unlock(&fs_info->trans_lock); return 0; } root = list_first_entry(&fs_info->dead_roots, struct btrfs_root, root_list); list_del_init(&root->root_list); spin_unlock(&fs_info->trans_lock); btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid); btrfs_kill_all_delayed_nodes(root); if (btrfs_header_backref_rev(root->node) < BTRFS_MIXED_BACKREF_REV) ret = btrfs_drop_snapshot(root, 0, 0); else ret = btrfs_drop_snapshot(root, 1, 0); btrfs_put_root(root); return (ret < 0) ? 0 : 1; } /* * We only mark the transaction aborted and then set the file system read-only. * This will prevent new transactions from starting or trying to join this * one. * * This means that error recovery at the call site is limited to freeing * any local memory allocations and passing the error code up without * further cleanup. The transaction should complete as it normally would * in the call path but will return -EIO. * * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. */ void __cold __btrfs_abort_transaction(struct btrfs_trans_handle *trans, const char *function, unsigned int line, int error, bool first_hit) { struct btrfs_fs_info *fs_info = trans->fs_info; WRITE_ONCE(trans->aborted, error); WRITE_ONCE(trans->transaction->aborted, error); if (first_hit && error == -ENOSPC) btrfs_dump_space_info_for_trans_abort(fs_info); /* Wake up anybody who may be waiting on this transaction */ wake_up(&fs_info->transaction_wait); wake_up(&fs_info->transaction_blocked_wait); __btrfs_handle_fs_error(fs_info, function, line, error, NULL); } int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", sizeof(struct btrfs_trans_handle), 0, SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); if (!btrfs_trans_handle_cachep) return -ENOMEM; return 0; } void __cold btrfs_transaction_exit(void) { kmem_cache_destroy(btrfs_trans_handle_cachep); }
1 1 1 1 1 1 73 73 69 73 73 73 5 3 3 17 17 5 5 5 5 5 5 3 5 2 2 2 2 6 4 3 3 3 2 2 3 3 3 3 3 7 6 5 5 5 1 1 1 1 1 2 1 1 10 9 8 8 8 1 1 1 1 1 1 1 1 4 3 2 2 2 2 2 2 2 2 4 3 2 1 1 1 1 1 2 2 1 2 1 3 1 1 1 5 5 1 2 2 3 1 1 1 1 1 21 2 21 2 2 5 3 1 1 1 3 5 2 5 1 1 5 1 1 1 1 27 27 27 7 7 7 2 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 1 1 1 1 1 1 5 5 5 5 1 1 6 6 6 6 11 12 12 12 32 32 32 2 3 3 2 3 6 5 1 1 6 6 5 6 20 20 20 15 5 5 5 5 5 2 1 1 1 1 1 1 32 30 29 3 4 3 1 122 122 120 123 105 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Copyright (C) 2011 ProFUSION Embedded Systems Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ /* Bluetooth HCI core. */ #include <linux/export.h> #include <linux/rfkill.h> #include <linux/debugfs.h> #include <linux/crypto.h> #include <linux/kcov.h> #include <linux/property.h> #include <linux/suspend.h> #include <linux/wait.h> #include <asm/unaligned.h> #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_core.h> #include <net/bluetooth/l2cap.h> #include <net/bluetooth/mgmt.h> #include "hci_request.h" #include "hci_debugfs.h" #include "smp.h" #include "leds.h" #include "msft.h" #include "aosp.h" #include "hci_codec.h" static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); static void hci_tx_work(struct work_struct *work); /* HCI device list */ LIST_HEAD(hci_dev_list); DEFINE_RWLOCK(hci_dev_list_lock); /* HCI callback list */ LIST_HEAD(hci_cb_list); DEFINE_MUTEX(hci_cb_list_lock); /* HCI ID Numbering */ static DEFINE_IDA(hci_index_ida); static int hci_scan_req(struct hci_request *req, unsigned long opt) { __u8 scan = opt; BT_DBG("%s %x", req->hdev->name, scan); /* Inquiry and Page scans */ hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); return 0; } static int hci_auth_req(struct hci_request *req, unsigned long opt) { __u8 auth = opt; BT_DBG("%s %x", req->hdev->name, auth); /* Authentication */ hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth); return 0; } static int hci_encrypt_req(struct hci_request *req, unsigned long opt) { __u8 encrypt = opt; BT_DBG("%s %x", req->hdev->name, encrypt); /* Encryption */ hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt); return 0; } static int hci_linkpol_req(struct hci_request *req, unsigned long opt) { __le16 policy = cpu_to_le16(opt); BT_DBG("%s %x", req->hdev->name, policy); /* Default link policy */ hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy); return 0; } /* Get HCI device by index. * Device is held on return. */ struct hci_dev *hci_dev_get(int index) { struct hci_dev *hdev = NULL, *d; BT_DBG("%d", index); if (index < 0) return NULL; read_lock(&hci_dev_list_lock); list_for_each_entry(d, &hci_dev_list, list) { if (d->id == index) { hdev = hci_dev_hold(d); break; } } read_unlock(&hci_dev_list_lock); return hdev; } /* ---- Inquiry support ---- */ bool hci_discovery_active(struct hci_dev *hdev) { struct discovery_state *discov = &hdev->discovery; switch (discov->state) { case DISCOVERY_FINDING: case DISCOVERY_RESOLVING: return true; default: return false; } } void hci_discovery_set_state(struct hci_dev *hdev, int state) { int old_state = hdev->discovery.state; BT_DBG("%s state %u -> %u", hdev->name, hdev->discovery.state, state); if (old_state == state) return; hdev->discovery.state = state; switch (state) { case DISCOVERY_STOPPED: hci_update_passive_scan(hdev); if (old_state != DISCOVERY_STARTING) mgmt_discovering(hdev, 0); break; case DISCOVERY_STARTING: break; case DISCOVERY_FINDING: mgmt_discovering(hdev, 1); break; case DISCOVERY_RESOLVING: break; case DISCOVERY_STOPPING: break; } } void hci_inquiry_cache_flush(struct hci_dev *hdev) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *p, *n; list_for_each_entry_safe(p, n, &cache->all, all) { list_del(&p->all); kfree(p); } INIT_LIST_HEAD(&cache->unknown); INIT_LIST_HEAD(&cache->resolve); } struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->all, all) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_unknown(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p, %pMR", cache, bdaddr); list_for_each_entry(e, &cache->unknown, list) { if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } struct inquiry_entry *hci_inquiry_cache_lookup_resolve(struct hci_dev *hdev, bdaddr_t *bdaddr, int state) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *e; BT_DBG("cache %p bdaddr %pMR state %d", cache, bdaddr, state); list_for_each_entry(e, &cache->resolve, list) { if (!bacmp(bdaddr, BDADDR_ANY) && e->name_state == state) return e; if (!bacmp(&e->data.bdaddr, bdaddr)) return e; } return NULL; } void hci_inquiry_cache_update_resolve(struct hci_dev *hdev, struct inquiry_entry *ie) { struct discovery_state *cache = &hdev->discovery; struct list_head *pos = &cache->resolve; struct inquiry_entry *p; list_del(&ie->list); list_for_each_entry(p, &cache->resolve, list) { if (p->name_state != NAME_PENDING && abs(p->data.rssi) >= abs(ie->data.rssi)) break; pos = &p->list; } list_add(&ie->list, pos); } u32 hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data, bool name_known) { struct discovery_state *cache = &hdev->discovery; struct inquiry_entry *ie; u32 flags = 0; BT_DBG("cache %p, %pMR", cache, &data->bdaddr); hci_remove_remote_oob_data(hdev, &data->bdaddr, BDADDR_BREDR); if (!data->ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; ie = hci_inquiry_cache_lookup(hdev, &data->bdaddr); if (ie) { if (!ie->data.ssp_mode) flags |= MGMT_DEV_FOUND_LEGACY_PAIRING; if (ie->name_state == NAME_NEEDED && data->rssi != ie->data.rssi) { ie->data.rssi = data->rssi; hci_inquiry_cache_update_resolve(hdev, ie); } goto update; } /* Entry not in the cache. Add new one. */ ie = kzalloc(sizeof(*ie), GFP_KERNEL); if (!ie) { flags |= MGMT_DEV_FOUND_CONFIRM_NAME; goto done; } list_add(&ie->all, &cache->all); if (name_known) { ie->name_state = NAME_KNOWN; } else { ie->name_state = NAME_NOT_KNOWN; list_add(&ie->list, &cache->unknown); } update: if (name_known && ie->name_state != NAME_KNOWN && ie->name_state != NAME_PENDING) { ie->name_state = NAME_KNOWN; list_del(&ie->list); } memcpy(&ie->data, data, sizeof(*data)); ie->timestamp = jiffies; cache->timestamp = jiffies; if (ie->name_state == NAME_NOT_KNOWN) flags |= MGMT_DEV_FOUND_CONFIRM_NAME; done: return flags; } static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) { struct discovery_state *cache = &hdev->discovery; struct inquiry_info *info = (struct inquiry_info *) buf; struct inquiry_entry *e; int copied = 0; list_for_each_entry(e, &cache->all, all) { struct inquiry_data *data = &e->data; if (copied >= num) break; bacpy(&info->bdaddr, &data->bdaddr); info->pscan_rep_mode = data->pscan_rep_mode; info->pscan_period_mode = data->pscan_period_mode; info->pscan_mode = data->pscan_mode; memcpy(info->dev_class, data->dev_class, 3); info->clock_offset = data->clock_offset; info++; copied++; } BT_DBG("cache %p, copied %d", cache, copied); return copied; } static int hci_inq_req(struct hci_request *req, unsigned long opt) { struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt; struct hci_dev *hdev = req->hdev; struct hci_cp_inquiry cp; BT_DBG("%s", hdev->name); if (test_bit(HCI_INQUIRY, &hdev->flags)) return 0; /* Start Inquiry */ memcpy(&cp.lap, &ir->lap, 3); cp.length = ir->length; cp.num_rsp = ir->num_rsp; hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); return 0; } int hci_inquiry(void __user *arg) { __u8 __user *ptr = arg; struct hci_inquiry_req ir; struct hci_dev *hdev; int err = 0, do_inquiry = 0, max_rsp; long timeo; __u8 *buf; if (copy_from_user(&ir, ptr, sizeof(ir))) return -EFAULT; hdev = hci_dev_get(ir.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } /* Restrict maximum inquiry length to 60 seconds */ if (ir.length > 60) { err = -EINVAL; goto done; } hci_dev_lock(hdev); if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || inquiry_cache_empty(hdev) || ir.flags & IREQ_CACHE_FLUSH) { hci_inquiry_cache_flush(hdev); do_inquiry = 1; } hci_dev_unlock(hdev); timeo = ir.length * msecs_to_jiffies(2000); if (do_inquiry) { err = hci_req_sync(hdev, hci_inq_req, (unsigned long) &ir, timeo, NULL); if (err < 0) goto done; /* Wait until Inquiry procedure finishes (HCI_INQUIRY flag is * cleared). If it is interrupted by a signal, return -EINTR. */ if (wait_on_bit(&hdev->flags, HCI_INQUIRY, TASK_INTERRUPTIBLE)) { err = -EINTR; goto done; } } /* for unlimited number of responses we will use buffer with * 255 entries */ max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp; /* cache_dump can't sleep. Therefore we allocate temp buffer and then * copy it to the user space. */ buf = kmalloc_array(max_rsp, sizeof(struct inquiry_info), GFP_KERNEL); if (!buf) { err = -ENOMEM; goto done; } hci_dev_lock(hdev); ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); hci_dev_unlock(hdev); BT_DBG("num_rsp %d", ir.num_rsp); if (!copy_to_user(ptr, &ir, sizeof(ir))) { ptr += sizeof(ir); if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * ir.num_rsp)) err = -EFAULT; } else err = -EFAULT; kfree(buf); done: hci_dev_put(hdev); return err; } static int hci_dev_do_open(struct hci_dev *hdev) { int ret = 0; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); ret = hci_dev_open_sync(hdev); hci_req_sync_unlock(hdev); return ret; } /* ---- HCI ioctl helpers ---- */ int hci_dev_open(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; /* Devices that are marked as unconfigured can only be powered * up as user channel. Trying to bring them up as normal devices * will result into a failure. Only user channel operation is * possible. * * When this function is called for a user channel, the flag * HCI_USER_CHANNEL will be set first before attempting to * open the device. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED) && !hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EOPNOTSUPP; goto done; } /* We need to ensure that no other power on/off work is pending * before proceeding to call hci_dev_do_open. This is * particularly important if the setup procedure has not yet * completed. */ if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); /* After this call it is guaranteed that the setup procedure * has finished. This means that error conditions like RFKILL * or no valid public or static random address apply. */ flush_workqueue(hdev->req_workqueue); /* For controllers not using the management interface and that * are brought up using legacy ioctl, set the HCI_BONDABLE bit * so that pairing works for them. Once the management interface * is in use this bit will be cleared again and userspace has * to explicitly enable it. */ if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !hci_dev_test_flag(hdev, HCI_MGMT)) hci_dev_set_flag(hdev, HCI_BONDABLE); err = hci_dev_do_open(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_do_close(struct hci_dev *hdev) { int err; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); err = hci_dev_close_sync(hdev); hci_req_sync_unlock(hdev); return err; } int hci_dev_close(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } cancel_work_sync(&hdev->power_on); if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) cancel_delayed_work(&hdev->power_off); err = hci_dev_do_close(hdev); done: hci_dev_put(hdev); return err; } static int hci_dev_do_reset(struct hci_dev *hdev) { int ret; BT_DBG("%s %p", hdev->name, hdev); hci_req_sync_lock(hdev); /* Drop queues */ skb_queue_purge(&hdev->rx_q); skb_queue_purge(&hdev->cmd_q); /* Cancel these to avoid queueing non-chained pending work */ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); /* Wait for * * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) * queue_delayed_work(&hdev->{cmd,ncmd}_timer) * * inside RCU section to see the flag or complete scheduling. */ synchronize_rcu(); /* Explicitly cancel works in case scheduled after setting the flag. */ cancel_delayed_work(&hdev->cmd_timer); cancel_delayed_work(&hdev->ncmd_timer); /* Avoid potential lockdep warnings from the *_flush() calls by * ensuring the workqueue is empty up front. */ drain_workqueue(hdev->workqueue); hci_dev_lock(hdev); hci_inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); hci_dev_unlock(hdev); if (hdev->flush) hdev->flush(hdev); hci_dev_clear_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); atomic_set(&hdev->cmd_cnt, 1); hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0; hdev->iso_cnt = 0; ret = hci_reset_sync(hdev); hci_req_sync_unlock(hdev); return ret; } int hci_dev_reset(__u16 dev) { struct hci_dev *hdev; int err; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (!test_bit(HCI_UP, &hdev->flags)) { err = -ENETDOWN; goto done; } if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } err = hci_dev_do_reset(hdev); done: hci_dev_put(hdev); return err; } int hci_dev_reset_stat(__u16 dev) { struct hci_dev *hdev; int ret = 0; hdev = hci_dev_get(dev); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { ret = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { ret = -EOPNOTSUPP; goto done; } memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); done: hci_dev_put(hdev); return ret; } static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan) { bool conn_changed, discov_changed; BT_DBG("%s scan 0x%02x", hdev->name, scan); if ((scan & SCAN_PAGE)) conn_changed = !hci_dev_test_and_set_flag(hdev, HCI_CONNECTABLE); else conn_changed = hci_dev_test_and_clear_flag(hdev, HCI_CONNECTABLE); if ((scan & SCAN_INQUIRY)) { discov_changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE); } else { hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE); discov_changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE); } if (!hci_dev_test_flag(hdev, HCI_MGMT)) return; if (conn_changed || discov_changed) { /* In case this was disabled through mgmt */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) hci_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } } int hci_dev_cmd(unsigned int cmd, void __user *arg) { struct hci_dev *hdev; struct hci_dev_req dr; int err = 0; if (copy_from_user(&dr, arg, sizeof(dr))) return -EFAULT; hdev = hci_dev_get(dr.dev_id); if (!hdev) return -ENODEV; if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { err = -EBUSY; goto done; } if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { err = -EOPNOTSUPP; goto done; } if (hdev->dev_type != HCI_PRIMARY) { err = -EOPNOTSUPP; goto done; } if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) { err = -EOPNOTSUPP; goto done; } switch (cmd) { case HCISETAUTH: err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, HCI_INIT_TIMEOUT, NULL); break; case HCISETENCRYPT: if (!lmp_encrypt_capable(hdev)) { err = -EOPNOTSUPP; break; } if (!test_bit(HCI_AUTH, &hdev->flags)) { /* Auth must be enabled first */ err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt, HCI_INIT_TIMEOUT, NULL); if (err) break; } err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt, HCI_INIT_TIMEOUT, NULL); break; case HCISETSCAN: err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt, HCI_INIT_TIMEOUT, NULL); /* Ensure that the connectable and discoverable states * get correctly modified as this was a non-mgmt change. */ if (!err) hci_update_passive_scan_state(hdev, dr.dev_opt); break; case HCISETLINKPOL: err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt, HCI_INIT_TIMEOUT, NULL); break; case HCISETLINKMODE: hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT); break; case HCISETPTYPE: if (hdev->pkt_type == (__u16) dr.dev_opt) break; hdev->pkt_type = (__u16) dr.dev_opt; mgmt_phy_configuration_changed(hdev, NULL); break; case HCISETACLMTU: hdev->acl_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->acl_pkts = *((__u16 *) &dr.dev_opt + 0); break; case HCISETSCOMTU: hdev->sco_mtu = *((__u16 *) &dr.dev_opt + 1); hdev->sco_pkts = *((__u16 *) &dr.dev_opt + 0); break; default: err = -EINVAL; break; } done: hci_dev_put(hdev); return err; } int hci_get_dev_list(void __user *arg) { struct hci_dev *hdev; struct hci_dev_list_req *dl; struct hci_dev_req *dr; int n = 0, size, err; __u16 dev_num; if (get_user(dev_num, (__u16 __user *) arg)) return -EFAULT; if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) return -EINVAL; size = sizeof(*dl) + dev_num * sizeof(*dr); dl = kzalloc(size, GFP_KERNEL); if (!dl) return -ENOMEM; dr = dl->dev_req; read_lock(&hci_dev_list_lock); list_for_each_entry(hdev, &hci_dev_list, list) { unsigned long flags = hdev->flags; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags &= ~BIT(HCI_UP); (dr + n)->dev_id = hdev->id; (dr + n)->dev_opt = flags; if (++n >= dev_num) break; } read_unlock(&hci_dev_list_lock); dl->dev_num = n; size = sizeof(*dl) + n * sizeof(*dr); err = copy_to_user(arg, dl, size); kfree(dl); return err ? -EFAULT : 0; } int hci_get_dev_info(void __user *arg) { struct hci_dev *hdev; struct hci_dev_info di; unsigned long flags; int err = 0; if (copy_from_user(&di, arg, sizeof(di))) return -EFAULT; hdev = hci_dev_get(di.dev_id); if (!hdev) return -ENODEV; /* When the auto-off is configured it means the transport * is running, but in that case still indicate that the * device is actually down. */ if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) flags = hdev->flags & ~BIT(HCI_UP); else flags = hdev->flags; strcpy(di.name, hdev->name); di.bdaddr = hdev->bdaddr; di.type = (hdev->bus & 0x0f) | ((hdev->dev_type & 0x03) << 4); di.flags = flags; di.pkt_type = hdev->pkt_type; if (lmp_bredr_capable(hdev)) { di.acl_mtu = hdev->acl_mtu; di.acl_pkts = hdev->acl_pkts; di.sco_mtu = hdev->sco_mtu; di.sco_pkts = hdev->sco_pkts; } else { di.acl_mtu = hdev->le_mtu; di.acl_pkts = hdev->le_pkts; di.sco_mtu = 0; di.sco_pkts = 0; } di.link_policy = hdev->link_policy; di.link_mode = hdev->link_mode; memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); memcpy(&di.features, &hdev->features, sizeof(di.features)); if (copy_to_user(arg, &di, sizeof(di))) err = -EFAULT; hci_dev_put(hdev); return err; } /* ---- Interface to HCI drivers ---- */ static int hci_rfkill_set_block(void *data, bool blocked) { struct hci_dev *hdev = data; BT_DBG("%p name %s blocked %d", hdev, hdev->name, blocked); if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return -EBUSY; if (blocked) { hci_dev_set_flag(hdev, HCI_RFKILLED); if (!hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) hci_dev_do_close(hdev); } else { hci_dev_clear_flag(hdev, HCI_RFKILLED); } return 0; } static const struct rfkill_ops hci_rfkill_ops = { .set_block = hci_rfkill_set_block, }; static void hci_power_on(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_on); int err; BT_DBG("%s", hdev->name); if (test_bit(HCI_UP, &hdev->flags) && hci_dev_test_flag(hdev, HCI_MGMT) && hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) { cancel_delayed_work(&hdev->power_off); err = hci_powered_update_sync(hdev); mgmt_power_on(hdev, err); return; } err = hci_dev_do_open(hdev); if (err < 0) { hci_dev_lock(hdev); mgmt_set_powered_failed(hdev, err); hci_dev_unlock(hdev); return; } /* During the HCI setup phase, a few error conditions are * ignored and they need to be checked now. If they are still * valid, it is important to turn the device back off. */ if (hci_dev_test_flag(hdev, HCI_RFKILLED) || hci_dev_test_flag(hdev, HCI_UNCONFIGURED) || (hdev->dev_type == HCI_PRIMARY && !bacmp(&hdev->bdaddr, BDADDR_ANY) && !bacmp(&hdev->static_addr, BDADDR_ANY))) { hci_dev_clear_flag(hdev, HCI_AUTO_OFF); hci_dev_do_close(hdev); } else if (hci_dev_test_flag(hdev, HCI_AUTO_OFF)) { queue_delayed_work(hdev->req_workqueue, &hdev->power_off, HCI_AUTO_OFF_TIMEOUT); } if (hci_dev_test_and_clear_flag(hdev, HCI_SETUP)) { /* For unconfigured devices, set the HCI_RAW flag * so that userspace can easily identify them. */ if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) set_bit(HCI_RAW, &hdev->flags); /* For fully configured devices, this will send * the Index Added event. For unconfigured devices, * it will send Unconfigued Index Added event. * * Devices with HCI_QUIRK_RAW_DEVICE are ignored * and no event will be send. */ mgmt_index_added(hdev); } else if (hci_dev_test_and_clear_flag(hdev, HCI_CONFIG)) { /* When the controller is now configured, then it * is important to clear the HCI_RAW flag. */ if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) clear_bit(HCI_RAW, &hdev->flags); /* Powering on the controller with HCI_CONFIG set only * happens with the transition from unconfigured to * configured. This will send the Index Added event. */ mgmt_index_added(hdev); } } static void hci_power_off(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, power_off.work); BT_DBG("%s", hdev->name); hci_dev_do_close(hdev); } static void hci_error_reset(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset); BT_DBG("%s", hdev->name); if (hdev->hw_error) hdev->hw_error(hdev, hdev->hw_error_code); else bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code); if (hci_dev_do_close(hdev)) return; hci_dev_do_open(hdev); } void hci_uuids_clear(struct hci_dev *hdev) { struct bt_uuid *uuid, *tmp; list_for_each_entry_safe(uuid, tmp, &hdev->uuids, list) { list_del(&uuid->list); kfree(uuid); } } void hci_link_keys_clear(struct hci_dev *hdev) { struct link_key *key, *tmp; list_for_each_entry_safe(key, tmp, &hdev->link_keys, list) { list_del_rcu(&key->list); kfree_rcu(key, rcu); } } void hci_smp_ltks_clear(struct hci_dev *hdev) { struct smp_ltk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_smp_irks_clear(struct hci_dev *hdev) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { list_del_rcu(&k->list); kfree_rcu(k, rcu); } } void hci_blocked_keys_clear(struct hci_dev *hdev) { struct blocked_key *b, *tmp; list_for_each_entry_safe(b, tmp, &hdev->blocked_keys, list) { list_del_rcu(&b->list); kfree_rcu(b, rcu); } } bool hci_is_blocked_key(struct hci_dev *hdev, u8 type, u8 val[16]) { bool blocked = false; struct blocked_key *b; rcu_read_lock(); list_for_each_entry_rcu(b, &hdev->blocked_keys, list) { if (b->type == type && !memcmp(b->val, val, sizeof(b->val))) { blocked = true; break; } } rcu_read_unlock(); return blocked; } struct link_key *hci_find_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->link_keys, list) { if (bacmp(bdaddr, &k->bdaddr) == 0) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LINKKEY, k->val)) { bt_dev_warn_ratelimited(hdev, "Link key blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } static bool hci_persistent_key(struct hci_dev *hdev, struct hci_conn *conn, u8 key_type, u8 old_key_type) { /* Legacy key */ if (key_type < 0x03) return true; /* Debug keys are insecure so don't store them persistently */ if (key_type == HCI_LK_DEBUG_COMBINATION) return false; /* Changed combination key and there's no previous one */ if (key_type == HCI_LK_CHANGED_COMBINATION && old_key_type == 0xff) return false; /* Security mode 3 case */ if (!conn) return true; /* BR/EDR key derived using SC from an LE link */ if (conn->type == LE_LINK) return true; /* Neither local nor remote side had no-bonding as requirement */ if (conn->auth_type > 0x01 && conn->remote_auth > 0x01) return true; /* Local side had dedicated bonding as requirement */ if (conn->auth_type == 0x02 || conn->auth_type == 0x03) return true; /* Remote side had dedicated bonding as requirement */ if (conn->remote_auth == 0x02 || conn->remote_auth == 0x03) return true; /* If none of the above criteria match, then don't store the key * persistently */ return false; } static u8 ltk_role(u8 type) { if (type == SMP_LTK) return HCI_ROLE_MASTER; return HCI_ROLE_SLAVE; } struct smp_ltk *hci_find_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 role) { struct smp_ltk *k; rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (addr_type != k->bdaddr_type || bacmp(bdaddr, &k->bdaddr)) continue; if (smp_ltk_is_sc(k) || ltk_role(k->type) == role) { rcu_read_unlock(); if (hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_LTK, k->val)) { bt_dev_warn_ratelimited(hdev, "LTK blocked for %pMR", &k->bdaddr); return NULL; } return k; } } rcu_read_unlock(); return NULL; } struct smp_irk *hci_find_irk_by_rpa(struct hci_dev *hdev, bdaddr_t *rpa) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (!bacmp(&irk->rpa, rpa)) { irk_to_return = irk; goto done; } } list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (smp_irk_matches(hdev, irk->val, rpa)) { bacpy(&irk->rpa, rpa); irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct smp_irk *hci_find_irk_by_addr(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *irk_to_return = NULL; struct smp_irk *irk; /* Identity Address must be public or static random */ if (addr_type == ADDR_LE_DEV_RANDOM && (bdaddr->b[5] & 0xc0) != 0xc0) return NULL; rcu_read_lock(); list_for_each_entry_rcu(irk, &hdev->identity_resolving_keys, list) { if (addr_type == irk->addr_type && bacmp(bdaddr, &irk->bdaddr) == 0) { irk_to_return = irk; goto done; } } done: if (irk_to_return && hci_is_blocked_key(hdev, HCI_BLOCKED_KEY_TYPE_IRK, irk_to_return->val)) { bt_dev_warn_ratelimited(hdev, "Identity key blocked for %pMR", &irk_to_return->bdaddr); irk_to_return = NULL; } rcu_read_unlock(); return irk_to_return; } struct link_key *hci_add_link_key(struct hci_dev *hdev, struct hci_conn *conn, bdaddr_t *bdaddr, u8 *val, u8 type, u8 pin_len, bool *persistent) { struct link_key *key, *old_key; u8 old_key_type; old_key = hci_find_link_key(hdev, bdaddr); if (old_key) { old_key_type = old_key->type; key = old_key; } else { old_key_type = conn ? conn->key_type : 0xff; key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->link_keys); } BT_DBG("%s key for %pMR type %u", hdev->name, bdaddr, type); /* Some buggy controller combinations generate a changed * combination key for legacy pairing even when there's no * previous key */ if (type == HCI_LK_CHANGED_COMBINATION && (!conn || conn->remote_auth == 0xff) && old_key_type == 0xff) { type = HCI_LK_COMBINATION; if (conn) conn->key_type = type; } bacpy(&key->bdaddr, bdaddr); memcpy(key->val, val, HCI_LINK_KEY_SIZE); key->pin_len = pin_len; if (type == HCI_LK_CHANGED_COMBINATION) key->type = old_key_type; else key->type = type; if (persistent) *persistent = hci_persistent_key(hdev, conn, type, old_key_type); return key; } struct smp_ltk *hci_add_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 type, u8 authenticated, u8 tk[16], u8 enc_size, __le16 ediv, __le64 rand) { struct smp_ltk *key, *old_key; u8 role = ltk_role(type); old_key = hci_find_ltk(hdev, bdaddr, addr_type, role); if (old_key) key = old_key; else { key = kzalloc(sizeof(*key), GFP_KERNEL); if (!key) return NULL; list_add_rcu(&key->list, &hdev->long_term_keys); } bacpy(&key->bdaddr, bdaddr); key->bdaddr_type = addr_type; memcpy(key->val, tk, sizeof(key->val)); key->authenticated = authenticated; key->ediv = ediv; key->rand = rand; key->enc_size = enc_size; key->type = type; return key; } struct smp_irk *hci_add_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type, u8 val[16], bdaddr_t *rpa) { struct smp_irk *irk; irk = hci_find_irk_by_addr(hdev, bdaddr, addr_type); if (!irk) { irk = kzalloc(sizeof(*irk), GFP_KERNEL); if (!irk) return NULL; bacpy(&irk->bdaddr, bdaddr); irk->addr_type = addr_type; list_add_rcu(&irk->list, &hdev->identity_resolving_keys); } memcpy(irk->val, val, 16); bacpy(&irk->rpa, rpa); return irk; } int hci_remove_link_key(struct hci_dev *hdev, bdaddr_t *bdaddr) { struct link_key *key; key = hci_find_link_key(hdev, bdaddr); if (!key) return -ENOENT; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&key->list); kfree_rcu(key, rcu); return 0; } int hci_remove_ltk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct smp_ltk *k, *tmp; int removed = 0; list_for_each_entry_safe(k, tmp, &hdev->long_term_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->bdaddr_type != bdaddr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); removed++; } return removed ? 0 : -ENOENT; } void hci_remove_irk(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 addr_type) { struct smp_irk *k, *tmp; list_for_each_entry_safe(k, tmp, &hdev->identity_resolving_keys, list) { if (bacmp(bdaddr, &k->bdaddr) || k->addr_type != addr_type) continue; BT_DBG("%s removing %pMR", hdev->name, bdaddr); list_del_rcu(&k->list); kfree_rcu(k, rcu); } } bool hci_bdaddr_is_paired(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) { struct smp_ltk *k; struct smp_irk *irk; u8 addr_type; if (type == BDADDR_BREDR) { if (hci_find_link_key(hdev, bdaddr)) return true; return false; } /* Convert to HCI addr type which struct smp_ltk uses */ if (type == BDADDR_LE_PUBLIC) addr_type = ADDR_LE_DEV_PUBLIC; else addr_type = ADDR_LE_DEV_RANDOM; irk = hci_get_irk(hdev, bdaddr, addr_type); if (irk) { bdaddr = &irk->bdaddr; addr_type = irk->addr_type; } rcu_read_lock(); list_for_each_entry_rcu(k, &hdev->long_term_keys, list) { if (k->bdaddr_type == addr_type && !bacmp(bdaddr, &k->bdaddr)) { rcu_read_unlock(); return true; } } rcu_read_unlock(); return false; } /* HCI command timer function */ static void hci_cmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_timer.work); if (hdev->sent_cmd) { struct hci_command_hdr *sent = (void *) hdev->sent_cmd->data; u16 opcode = __le16_to_cpu(sent->opcode); bt_dev_err(hdev, "command 0x%4.4x tx timeout", opcode); } else { bt_dev_err(hdev, "command tx timeout"); } if (hdev->cmd_timeout) hdev->cmd_timeout(hdev); atomic_set(&hdev->cmd_cnt, 1); queue_work(hdev->workqueue, &hdev->cmd_work); } /* HCI ncmd timer function */ static void hci_ncmd_timeout(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, ncmd_timer.work); bt_dev_err(hdev, "Controller not accepting commands anymore: ncmd = 0"); /* During HCI_INIT phase no events can be injected if the ncmd timer * triggers since the procedure has its own timeout handling. */ if (test_bit(HCI_INIT, &hdev->flags)) return; /* This is an irrecoverable state, inject hardware error event */ hci_reset_dev(hdev); } struct oob_data *hci_find_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; list_for_each_entry(data, &hdev->remote_oob_data, list) { if (bacmp(bdaddr, &data->bdaddr) != 0) continue; if (data->bdaddr_type != bdaddr_type) continue; return data; } return NULL; } int hci_remove_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) return -ENOENT; BT_DBG("%s removing %pMR (%u)", hdev->name, bdaddr, bdaddr_type); list_del(&data->list); kfree(data); return 0; } void hci_remote_oob_data_clear(struct hci_dev *hdev) { struct oob_data *data, *n; list_for_each_entry_safe(data, n, &hdev->remote_oob_data, list) { list_del(&data->list); kfree(data); } } int hci_add_remote_oob_data(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 bdaddr_type, u8 *hash192, u8 *rand192, u8 *hash256, u8 *rand256) { struct oob_data *data; data = hci_find_remote_oob_data(hdev, bdaddr, bdaddr_type); if (!data) { data = kmalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; bacpy(&data->bdaddr, bdaddr); data->bdaddr_type = bdaddr_type; list_add(&data->list, &hdev->remote_oob_data); } if (hash192 && rand192) { memcpy(data->hash192, hash192, sizeof(data->hash192)); memcpy(data->rand192, rand192, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x03; } else { memset(data->hash192, 0, sizeof(data->hash192)); memset(data->rand192, 0, sizeof(data->rand192)); if (hash256 && rand256) data->present = 0x02; else data->present = 0x00; } if (hash256 && rand256) { memcpy(data->hash256, hash256, sizeof(data->hash256)); memcpy(data->rand256, rand256, sizeof(data->rand256)); } else { memset(data->hash256, 0, sizeof(data->hash256)); memset(data->rand256, 0, sizeof(data->rand256)); if (hash192 && rand192) data->present = 0x01; } BT_DBG("%s for %pMR", hdev->name, bdaddr); return 0; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { if (adv_instance->instance == instance) return adv_instance; } return NULL; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *cur_instance; cur_instance = hci_find_adv_instance(hdev, instance); if (!cur_instance) return NULL; if (cur_instance == list_last_entry(&hdev->adv_instances, struct adv_info, list)) return list_first_entry(&hdev->adv_instances, struct adv_info, list); else return list_next_entry(cur_instance, list); } /* This function requires the caller holds hdev->lock */ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance) { struct adv_info *adv_instance; adv_instance = hci_find_adv_instance(hdev, instance); if (!adv_instance) return -ENOENT; BT_DBG("%s removing %dMR", hdev->name, instance); if (hdev->cur_adv_instance == instance) { if (hdev->adv_instance_timeout) { cancel_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } hdev->cur_adv_instance = 0x00; } cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); hdev->adv_instance_cnt--; return 0; } void hci_adv_instances_set_rpa_expired(struct hci_dev *hdev, bool rpa_expired) { struct adv_info *adv_instance, *n; list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) adv_instance->rpa_expired = rpa_expired; } /* This function requires the caller holds hdev->lock */ void hci_adv_instances_clear(struct hci_dev *hdev) { struct adv_info *adv_instance, *n; if (hdev->adv_instance_timeout) { cancel_delayed_work(&hdev->adv_instance_expire); hdev->adv_instance_timeout = 0; } list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) { cancel_delayed_work_sync(&adv_instance->rpa_expired_cb); list_del(&adv_instance->list); kfree(adv_instance); } hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; } static void adv_instance_rpa_expired(struct work_struct *work) { struct adv_info *adv_instance = container_of(work, struct adv_info, rpa_expired_cb.work); BT_DBG(""); adv_instance->rpa_expired = true; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, u32 min_interval, u32 max_interval, u8 mesh_handle) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); if (adv) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data)); } else { if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); adv = kzalloc(sizeof(*adv), GFP_KERNEL); if (!adv) return ERR_PTR(-ENOMEM); adv->pending = true; adv->instance = instance; list_add(&adv->list, &hdev->adv_instances); hdev->adv_instance_cnt++; } adv->flags = flags; adv->min_interval = min_interval; adv->max_interval = max_interval; adv->tx_power = tx_power; /* Defining a mesh_handle changes the timing units to ms, * rather than seconds, and ties the instance to the requested * mesh_tx queue. */ adv->mesh = mesh_handle; hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, scan_rsp_len, scan_rsp_data); adv->timeout = timeout; adv->remaining_time = timeout; if (duration == 0) adv->duration = hdev->def_multi_adv_rotation_duration; else adv->duration = duration; INIT_DELAYED_WORK(&adv->rpa_expired_cb, adv_instance_rpa_expired); BT_DBG("%s for %dMR", hdev->name, instance); return adv; } /* This function requires the caller holds hdev->lock */ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, u32 flags, u8 data_len, u8 *data, u32 min_interval, u32 max_interval) { struct adv_info *adv; adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL, 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE, min_interval, max_interval, 0); if (IS_ERR(adv)) return adv; adv->periodic = true; adv->per_adv_data_len = data_len; if (data) memcpy(adv->per_adv_data, data, data_len); return adv; } /* This function requires the caller holds hdev->lock */ int hci_set_adv_instance_data(struct hci_dev *hdev, u8 instance, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data) { struct adv_info *adv; adv = hci_find_adv_instance(hdev, instance); /* If advertisement doesn't exist, we can't modify its data */ if (!adv) return -ENOENT; if (adv_data_len && ADV_DATA_CMP(adv, adv_data, adv_data_len)) { memset(adv->adv_data, 0, sizeof(adv->adv_data)); memcpy(adv->adv_data, adv_data, adv_data_len); adv->adv_data_len = adv_data_len; adv->adv_data_changed = true; } if (scan_rsp_len && SCAN_RSP_CMP(adv, scan_rsp_data, scan_rsp_len)) { memset(adv->scan_rsp_data, 0, sizeof(adv->scan_rsp_data)); memcpy(adv->scan_rsp_data, scan_rsp_data, scan_rsp_len); adv->scan_rsp_len = scan_rsp_len; adv->scan_rsp_changed = true; } /* Mark as changed if there are flags which would affect it */ if (((adv->flags & MGMT_ADV_FLAG_APPEARANCE) && hdev->appearance) || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) adv->scan_rsp_changed = true; return 0; } /* This function requires the caller holds hdev->lock */ u32 hci_adv_instance_flags(struct hci_dev *hdev, u8 instance) { u32 flags; struct adv_info *adv; if (instance == 0x00) { /* Instance 0 always manages the "Tx Power" and "Flags" * fields */ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS; /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting * corresponds to the "connectable" instance flag. */ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) flags |= MGMT_ADV_FLAG_CONNECTABLE; if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_LIMITED_DISCOV; else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) flags |= MGMT_ADV_FLAG_DISCOV; return flags; } adv = hci_find_adv_instance(hdev, instance); /* Return 0 when we got an invalid instance identifier. */ if (!adv) return 0; return adv->flags; } bool hci_adv_instance_is_scannable(struct hci_dev *hdev, u8 instance) { struct adv_info *adv; /* Instance 0x00 always set local name */ if (instance == 0x00) return true; adv = hci_find_adv_instance(hdev, instance); if (!adv) return false; if (adv->flags & MGMT_ADV_FLAG_APPEARANCE || adv->flags & MGMT_ADV_FLAG_LOCAL_NAME) return true; return adv->scan_rsp_len ? true : false; } /* This function requires the caller holds hdev->lock */ void hci_adv_monitors_clear(struct hci_dev *hdev) { struct adv_monitor *monitor; int handle; idr_for_each_entry(&hdev->adv_monitors_idr, monitor, handle) hci_free_adv_monitor(hdev, monitor); idr_destroy(&hdev->adv_monitors_idr); } /* Frees the monitor structure and do some bookkeepings. * This function requires the caller holds hdev->lock. */ void hci_free_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { struct adv_pattern *pattern; struct adv_pattern *tmp; if (!monitor) return; list_for_each_entry_safe(pattern, tmp, &monitor->patterns, list) { list_del(&pattern->list); kfree(pattern); } if (monitor->handle) idr_remove(&hdev->adv_monitors_idr, monitor->handle); if (monitor->state != ADV_MONITOR_STATE_NOT_REGISTERED) { hdev->adv_monitors_cnt--; mgmt_adv_monitor_removed(hdev, monitor->handle); } kfree(monitor); } /* Assigns handle to a monitor, and if offloading is supported and power is on, * also attempts to forward the request to the controller. * This function requires the caller holds hci_req_sync_lock. */ int hci_add_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int min, max, handle; int status = 0; if (!monitor) return -EINVAL; hci_dev_lock(hdev); min = HCI_MIN_ADV_MONITOR_HANDLE; max = HCI_MIN_ADV_MONITOR_HANDLE + HCI_MAX_ADV_MONITOR_NUM_HANDLES; handle = idr_alloc(&hdev->adv_monitors_idr, monitor, min, max, GFP_KERNEL); hci_dev_unlock(hdev); if (handle < 0) return handle; monitor->handle = handle; if (!hdev_is_powered(hdev)) return status; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: bt_dev_dbg(hdev, "add monitor %d status %d", monitor->handle, status); /* Message was not forwarded to controller - not an error */ break; case HCI_ADV_MONITOR_EXT_MSFT: status = msft_add_monitor_pattern(hdev, monitor); bt_dev_dbg(hdev, "add monitor %d msft status %d", handle, status); break; } return status; } /* Attempts to tell the controller and free the monitor. If somehow the * controller doesn't have a corresponding handle, remove anyway. * This function requires the caller holds hci_req_sync_lock. */ static int hci_remove_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int status = 0; int handle; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ bt_dev_dbg(hdev, "remove monitor %d status %d", monitor->handle, status); goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); bt_dev_dbg(hdev, "remove monitor %d msft status %d", handle, status); break; } /* In case no matching handle registered, just free the monitor */ if (status == -ENOENT) goto free_monitor; return status; free_monitor: if (status == -ENOENT) bt_dev_warn(hdev, "Removing monitor with no matching handle %d", monitor->handle); hci_free_adv_monitor(hdev, monitor); return status; } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_single_adv_monitor(struct hci_dev *hdev, u16 handle) { struct adv_monitor *monitor = idr_find(&hdev->adv_monitors_idr, handle); if (!monitor) return -EINVAL; return hci_remove_adv_monitor(hdev, monitor); } /* This function requires the caller holds hci_req_sync_lock */ int hci_remove_all_adv_monitor(struct hci_dev *hdev) { struct adv_monitor *monitor; int idr_next_id = 0; int status = 0; while (1) { monitor = idr_get_next(&hdev->adv_monitors_idr, &idr_next_id); if (!monitor) break; status = hci_remove_adv_monitor(hdev, monitor); if (status) return status; idr_next_id++; } return status; } /* This function requires the caller holds hdev->lock */ bool hci_is_adv_monitoring(struct hci_dev *hdev) { return !idr_is_empty(&hdev->adv_monitors_idr); } int hci_get_adv_monitor_offload_ext(struct hci_dev *hdev) { if (msft_monitor_supported(hdev)) return HCI_ADV_MONITOR_EXT_MSFT; return HCI_ADV_MONITOR_EXT_NONE; } struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_irk *hci_bdaddr_list_lookup_with_irk( struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } struct bdaddr_list_with_flags * hci_bdaddr_list_lookup_with_flags(struct list_head *bdaddr_list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_flags *b; list_for_each_entry(b, bdaddr_list, list) { if (!bacmp(&b->bdaddr, bdaddr) && b->bdaddr_type == type) return b; } return NULL; } void hci_bdaddr_list_clear(struct list_head *bdaddr_list) { struct bdaddr_list *b, *n; list_for_each_entry_safe(b, n, bdaddr_list, list) { list_del(&b->list); kfree(b); } } int hci_bdaddr_list_add(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type, u8 *peer_irk, u8 *local_irk) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; if (peer_irk) memcpy(entry->peer_irk, peer_irk, 16); if (local_irk) memcpy(entry->local_irk, local_irk, 16); list_add(&entry->list, list); return 0; } int hci_bdaddr_list_add_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type, u32 flags) { struct bdaddr_list_with_flags *entry; if (!bacmp(bdaddr, BDADDR_ANY)) return -EBADF; if (hci_bdaddr_list_lookup(list, bdaddr, type)) return -EEXIST; entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return -ENOMEM; bacpy(&entry->bdaddr, bdaddr); entry->bdaddr_type = type; entry->flags = flags; list_add(&entry->list, list); return 0; } int hci_bdaddr_list_del(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } int hci_bdaddr_list_del_with_irk(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_irk *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup_with_irk(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } int hci_bdaddr_list_del_with_flags(struct list_head *list, bdaddr_t *bdaddr, u8 type) { struct bdaddr_list_with_flags *entry; if (!bacmp(bdaddr, BDADDR_ANY)) { hci_bdaddr_list_clear(list); return 0; } entry = hci_bdaddr_list_lookup_with_flags(list, bdaddr, type); if (!entry) return -ENOENT; list_del(&entry->list); kfree(entry); return 0; } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; list_for_each_entry(params, &hdev->le_conn_params, list) { if (bacmp(&params->addr, addr) == 0 && params->addr_type == addr_type) { return params; } } return NULL; } /* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; rcu_read_lock(); list_for_each_entry_rcu(param, list, action) { if (bacmp(&param->addr, addr) == 0 && param->addr_type == addr_type) { rcu_read_unlock(); return param; } } rcu_read_unlock(); return NULL; } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_del_init(struct hci_conn_params *param) { if (list_empty(&param->action)) return; list_del_rcu(&param->action); synchronize_rcu(); INIT_LIST_HEAD(&param->action); } /* This function requires the caller holds hdev->lock */ void hci_pend_le_list_add(struct hci_conn_params *param, struct list_head *list) { list_add_rcu(&param->action, list); } /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (params) return params; params = kzalloc(sizeof(*params), GFP_KERNEL); if (!params) { bt_dev_err(hdev, "out of memory"); return NULL; } bacpy(&params->addr, addr); params->addr_type = addr_type; list_add(&params->list, &hdev->le_conn_params); INIT_LIST_HEAD(&params->action); params->conn_min_interval = hdev->le_conn_min_interval; params->conn_max_interval = hdev->le_conn_max_interval; params->conn_latency = hdev->le_conn_latency; params->supervision_timeout = hdev->le_supv_timeout; params->auto_connect = HCI_AUTO_CONN_DISABLED; BT_DBG("addr %pMR (type %u)", addr, addr_type); return params; } void hci_conn_params_free(struct hci_conn_params *params) { hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } list_del(&params->list); kfree(params); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *params; params = hci_conn_params_lookup(hdev, addr, addr_type); if (!params) return; hci_conn_params_free(params); hci_update_passive_scan(hdev); BT_DBG("addr %pMR (type %u)", addr, addr_type); } /* This function requires the caller holds hdev->lock */ void hci_conn_params_clear_disabled(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) { if (params->auto_connect != HCI_AUTO_CONN_DISABLED) continue; /* If trying to establish one time connection to disabled * device, leave the params, but mark them as just once. */ if (params->explicit_connect) { params->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); } /* This function requires the caller holds hdev->lock */ static void hci_conn_params_clear_all(struct hci_dev *hdev) { struct hci_conn_params *params, *tmp; list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list) hci_conn_params_free(params); BT_DBG("All LE connection parameters were removed"); } /* Copy the Identity Address of the controller. * * If the controller has a public BD_ADDR, then by default use that one. * If this is a LE only controller without a public address, default to * the static random address. * * For debugging purposes it is possible to force controllers with a * public address to use the static random address instead. * * In case BR/EDR has been disabled on a dual-mode controller and * userspace has configured a static address, then that address * becomes the identity address instead of the public BR/EDR address. */ void hci_copy_identity_address(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 *bdaddr_type) { if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || !bacmp(&hdev->bdaddr, BDADDR_ANY) || (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && bacmp(&hdev->static_addr, BDADDR_ANY))) { bacpy(bdaddr, &hdev->static_addr); *bdaddr_type = ADDR_LE_DEV_RANDOM; } else { bacpy(bdaddr, &hdev->bdaddr); *bdaddr_type = ADDR_LE_DEV_PUBLIC; } } static void hci_clear_wake_reason(struct hci_dev *hdev) { hci_dev_lock(hdev); hdev->wake_reason = 0; bacpy(&hdev->wake_addr, BDADDR_ANY); hdev->wake_addr_type = 0; hci_dev_unlock(hdev); } static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, void *data) { struct hci_dev *hdev = container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; /* Userspace has full control of this device. Do nothing. */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) return NOTIFY_DONE; /* To avoid a potential race with hci_unregister_dev. */ hci_dev_hold(hdev); if (action == PM_SUSPEND_PREPARE) ret = hci_suspend_dev(hdev); else if (action == PM_POST_SUSPEND) ret = hci_resume_dev(hdev); if (ret) bt_dev_err(hdev, "Suspend notifier action (%lu) failed: %d", action, ret); hci_dev_put(hdev); return NOTIFY_DONE; } /* Alloc HCI device */ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) { struct hci_dev *hdev; unsigned int alloc_size; alloc_size = sizeof(*hdev); if (sizeof_priv) { /* Fixme: May need ALIGN-ment? */ alloc_size += sizeof_priv; } hdev = kzalloc(alloc_size, GFP_KERNEL); if (!hdev) return NULL; hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); hdev->esco_type = (ESCO_HV1); hdev->link_mode = (HCI_LM_ACCEPT); hdev->num_iac = 0x01; /* One IAC support is mandatory */ hdev->io_capability = 0x03; /* No Input No Output */ hdev->manufacturer = 0xffff; /* Default to internal use */ hdev->inq_tx_power = HCI_TX_POWER_INVALID; hdev->adv_tx_power = HCI_TX_POWER_INVALID; hdev->adv_instance_cnt = 0; hdev->cur_adv_instance = 0x00; hdev->adv_instance_timeout = 0; hdev->advmon_allowlist_duration = 300; hdev->advmon_no_filter_duration = 500; hdev->enable_advmon_interleave_scan = 0x00; /* Default to disable */ hdev->sniff_max_interval = 800; hdev->sniff_min_interval = 80; hdev->le_adv_channel_map = 0x07; hdev->le_adv_min_interval = 0x0800; hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = 0x0060; hdev->le_scan_window = 0x0030; hdev->le_scan_int_suspend = 0x0400; hdev->le_scan_window_suspend = 0x0012; hdev->le_scan_int_discovery = DISCOV_LE_SCAN_INT; hdev->le_scan_window_discovery = DISCOV_LE_SCAN_WIN; hdev->le_scan_int_adv_monitor = 0x0060; hdev->le_scan_window_adv_monitor = 0x0030; hdev->le_scan_int_connect = 0x0060; hdev->le_scan_window_connect = 0x0060; hdev->le_conn_min_interval = 0x0018; hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; hdev->le_supv_timeout = 0x002a; hdev->le_def_tx_len = 0x001b; hdev->le_def_tx_time = 0x0148; hdev->le_max_tx_len = 0x001b; hdev->le_max_tx_time = 0x0148; hdev->le_max_rx_len = 0x001b; hdev->le_max_rx_time = 0x0148; hdev->le_max_key_size = SMP_MAX_ENC_KEY_SIZE; hdev->le_min_key_size = SMP_MIN_ENC_KEY_SIZE; hdev->le_tx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_rx_def_phys = HCI_LE_SET_PHY_1M; hdev->le_num_of_adv_sets = HCI_MAX_ADV_INSTANCES; hdev->def_multi_adv_rotation_duration = HCI_DEFAULT_ADV_DURATION; hdev->def_le_autoconnect_timeout = HCI_LE_AUTOCONN_TIMEOUT; hdev->min_le_tx_power = HCI_TX_POWER_INVALID; hdev->max_le_tx_power = HCI_TX_POWER_INVALID; hdev->rpa_timeout = HCI_DEFAULT_RPA_TIMEOUT; hdev->discov_interleaved_timeout = DISCOV_INTERLEAVED_TIMEOUT; hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; /* default 1.28 sec page scan */ hdev->def_page_scan_type = PAGE_SCAN_TYPE_STANDARD; hdev->def_page_scan_int = 0x0800; hdev->def_page_scan_window = 0x0012; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); ida_init(&hdev->unset_handle_ida); INIT_LIST_HEAD(&hdev->mesh_pending); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->reject_list); INIT_LIST_HEAD(&hdev->accept_list); INIT_LIST_HEAD(&hdev->uuids); INIT_LIST_HEAD(&hdev->link_keys); INIT_LIST_HEAD(&hdev->long_term_keys); INIT_LIST_HEAD(&hdev->identity_resolving_keys); INIT_LIST_HEAD(&hdev->remote_oob_data); INIT_LIST_HEAD(&hdev->le_accept_list); INIT_LIST_HEAD(&hdev->le_resolv_list); INIT_LIST_HEAD(&hdev->le_conn_params); INIT_LIST_HEAD(&hdev->pend_le_conns); INIT_LIST_HEAD(&hdev->pend_le_reports); INIT_LIST_HEAD(&hdev->conn_hash.list); INIT_LIST_HEAD(&hdev->adv_instances); INIT_LIST_HEAD(&hdev->blocked_keys); INIT_LIST_HEAD(&hdev->monitored_devices); INIT_LIST_HEAD(&hdev->local_codecs); INIT_WORK(&hdev->rx_work, hci_rx_work); INIT_WORK(&hdev->cmd_work, hci_cmd_work); INIT_WORK(&hdev->tx_work, hci_tx_work); INIT_WORK(&hdev->power_on, hci_power_on); INIT_WORK(&hdev->error_reset, hci_error_reset); hci_cmd_sync_init(hdev); INIT_DELAYED_WORK(&hdev->power_off, hci_power_off); skb_queue_head_init(&hdev->rx_q); skb_queue_head_init(&hdev->cmd_q); skb_queue_head_init(&hdev->raw_q); init_waitqueue_head(&hdev->req_wait_q); INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout); INIT_DELAYED_WORK(&hdev->ncmd_timer, hci_ncmd_timeout); hci_devcd_setup(hdev); hci_request_setup(hdev); hci_init_sysfs(hdev); discovery_init(hdev); return hdev; } EXPORT_SYMBOL(hci_alloc_dev_priv); /* Free HCI device */ void hci_free_dev(struct hci_dev *hdev) { /* will free via device release */ put_device(&hdev->dev); } EXPORT_SYMBOL(hci_free_dev); /* Register HCI device */ int hci_register_dev(struct hci_dev *hdev) { int id, error; if (!hdev->open || !hdev->close || !hdev->send) return -EINVAL; /* Do not allow HCI_AMP devices to register at index 0, * so the index can be used as the AMP controller ID. */ switch (hdev->dev_type) { case HCI_PRIMARY: id = ida_simple_get(&hci_index_ida, 0, HCI_MAX_ID, GFP_KERNEL); break; case HCI_AMP: id = ida_simple_get(&hci_index_ida, 1, HCI_MAX_ID, GFP_KERNEL); break; default: return -EINVAL; } if (id < 0) return id; error = dev_set_name(&hdev->dev, "hci%u", id); if (error) return error; hdev->name = dev_name(&hdev->dev); hdev->id = id; BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); hdev->workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->workqueue) { error = -ENOMEM; goto err; } hdev->req_workqueue = alloc_ordered_workqueue("%s", WQ_HIGHPRI, hdev->name); if (!hdev->req_workqueue) { destroy_workqueue(hdev->workqueue); error = -ENOMEM; goto err; } if (!IS_ERR_OR_NULL(bt_debugfs)) hdev->debugfs = debugfs_create_dir(hdev->name, bt_debugfs); error = device_add(&hdev->dev); if (error < 0) goto err_wqueue; hci_leds_init(hdev); hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev, RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops, hdev); if (hdev->rfkill) { if (rfkill_register(hdev->rfkill) < 0) { rfkill_destroy(hdev->rfkill); hdev->rfkill = NULL; } } if (hdev->rfkill && rfkill_blocked(hdev->rfkill)) hci_dev_set_flag(hdev, HCI_RFKILLED); hci_dev_set_flag(hdev, HCI_SETUP); hci_dev_set_flag(hdev, HCI_AUTO_OFF); if (hdev->dev_type == HCI_PRIMARY) { /* Assume BR/EDR support until proven otherwise (such as * through reading supported features during init. */ hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); } write_lock(&hci_dev_list_lock); list_add(&hdev->list, &hci_dev_list); write_unlock(&hci_dev_list_lock); /* Devices that are marked for raw-only usage are unconfigured * and should not be included in normal operation. */ if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) hci_dev_set_flag(hdev, HCI_UNCONFIGURED); /* Mark Remote Wakeup connection flag as supported if driver has wakeup * callback. */ if (hdev->wakeup) hdev->conn_flags |= HCI_CONN_FLAG_REMOTE_WAKEUP; hci_sock_dev_event(hdev, HCI_DEV_REG); hci_dev_hold(hdev); error = hci_register_suspend_notifier(hdev); if (error) BT_WARN("register suspend notifier failed error:%d\n", error); queue_work(hdev->req_workqueue, &hdev->power_on); idr_init(&hdev->adv_monitors_idr); msft_register(hdev); return id; err_wqueue: debugfs_remove_recursive(hdev->debugfs); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); err: ida_simple_remove(&hci_index_ida, hdev->id); return error; } EXPORT_SYMBOL(hci_register_dev); /* Unregister HCI device */ void hci_unregister_dev(struct hci_dev *hdev) { BT_DBG("%p name %s bus %d", hdev, hdev->name, hdev->bus); mutex_lock(&hdev->unregister_lock); hci_dev_set_flag(hdev, HCI_UNREGISTER); mutex_unlock(&hdev->unregister_lock); write_lock(&hci_dev_list_lock); list_del(&hdev->list); write_unlock(&hci_dev_list_lock); cancel_work_sync(&hdev->power_on); hci_cmd_sync_clear(hdev); hci_unregister_suspend_notifier(hdev); msft_unregister(hdev); hci_dev_do_close(hdev); if (!test_bit(HCI_INIT, &hdev->flags) && !hci_dev_test_flag(hdev, HCI_SETUP) && !hci_dev_test_flag(hdev, HCI_CONFIG)) { hci_dev_lock(hdev); mgmt_index_removed(hdev); hci_dev_unlock(hdev); } /* mgmt_index_removed should take care of emptying the * pending list */ BUG_ON(!list_empty(&hdev->mgmt_pending)); hci_sock_dev_event(hdev, HCI_DEV_UNREG); if (hdev->rfkill) { rfkill_unregister(hdev->rfkill); rfkill_destroy(hdev->rfkill); } device_del(&hdev->dev); /* Actual cleanup is deferred until hci_release_dev(). */ hci_dev_put(hdev); } EXPORT_SYMBOL(hci_unregister_dev); /* Release HCI device */ void hci_release_dev(struct hci_dev *hdev) { debugfs_remove_recursive(hdev->debugfs); kfree_const(hdev->hw_info); kfree_const(hdev->fw_info); destroy_workqueue(hdev->workqueue); destroy_workqueue(hdev->req_workqueue); hci_dev_lock(hdev); hci_bdaddr_list_clear(&hdev->reject_list); hci_bdaddr_list_clear(&hdev->accept_list); hci_uuids_clear(hdev); hci_link_keys_clear(hdev); hci_smp_ltks_clear(hdev); hci_smp_irks_clear(hdev); hci_remote_oob_data_clear(hdev); hci_adv_instances_clear(hdev); hci_adv_monitors_clear(hdev); hci_bdaddr_list_clear(&hdev->le_accept_list); hci_bdaddr_list_clear(&hdev->le_resolv_list); hci_conn_params_clear_all(hdev); hci_discovery_filter_clear(hdev); hci_blocked_keys_clear(hdev); hci_codec_list_clear(&hdev->local_codecs); hci_dev_unlock(hdev); ida_destroy(&hdev->unset_handle_ida); ida_simple_remove(&hci_index_ida, hdev->id); kfree_skb(hdev->sent_cmd); kfree_skb(hdev->recv_event); kfree(hdev); } EXPORT_SYMBOL(hci_release_dev); int hci_register_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (!hdev->suspend_notifier.notifier_call && !test_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks)) { hdev->suspend_notifier.notifier_call = hci_suspend_notifier; ret = register_pm_notifier(&hdev->suspend_notifier); } return ret; } int hci_unregister_suspend_notifier(struct hci_dev *hdev) { int ret = 0; if (hdev->suspend_notifier.notifier_call) { ret = unregister_pm_notifier(&hdev->suspend_notifier); if (!ret) hdev->suspend_notifier.notifier_call = NULL; } return ret; } /* Suspend HCI device */ int hci_suspend_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Suspend should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to suspend */ if (mgmt_powering_down(hdev)) return 0; /* Cancel potentially blocking sync operation before suspend */ __hci_cmd_sync_cancel(hdev, -EHOSTDOWN); hci_req_sync_lock(hdev); ret = hci_suspend_sync(hdev); hci_req_sync_unlock(hdev); hci_clear_wake_reason(hdev); mgmt_suspending(hdev, hdev->suspend_state); hci_sock_dev_event(hdev, HCI_DEV_SUSPEND); return ret; } EXPORT_SYMBOL(hci_suspend_dev); /* Resume HCI device */ int hci_resume_dev(struct hci_dev *hdev) { int ret; bt_dev_dbg(hdev, ""); /* Resume should only act on when powered. */ if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_UNREGISTER)) return 0; /* If powering down don't attempt to resume */ if (mgmt_powering_down(hdev)) return 0; hci_req_sync_lock(hdev); ret = hci_resume_sync(hdev); hci_req_sync_unlock(hdev); mgmt_resuming(hdev, hdev->wake_reason, &hdev->wake_addr, hdev->wake_addr_type); hci_sock_dev_event(hdev, HCI_DEV_RESUME); return ret; } EXPORT_SYMBOL(hci_resume_dev); /* Reset HCI device */ int hci_reset_dev(struct hci_dev *hdev) { static const u8 hw_err[] = { HCI_EV_HARDWARE_ERROR, 0x01, 0x00 }; struct sk_buff *skb; skb = bt_skb_alloc(3, GFP_ATOMIC); if (!skb) return -ENOMEM; hci_skb_pkt_type(skb) = HCI_EVENT_PKT; skb_put_data(skb, hw_err, 3); bt_dev_err(hdev, "Injecting HCI hardware error event"); /* Send Hardware Error to upper stack */ return hci_recv_frame(hdev, skb); } EXPORT_SYMBOL(hci_reset_dev); /* Receive frame from HCI drivers */ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb) { if (!hdev || (!test_bit(HCI_UP, &hdev->flags) && !test_bit(HCI_INIT, &hdev->flags))) { kfree_skb(skb); return -ENXIO; } switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: break; case HCI_ACLDATA_PKT: /* Detect if ISO packet has been sent as ACL */ if (hci_conn_num(hdev, ISO_LINK)) { __u16 handle = __le16_to_cpu(hci_acl_hdr(skb)->handle); __u8 type; type = hci_conn_lookup_type(hdev, hci_handle(handle)); if (type == ISO_LINK) hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; } break; case HCI_SCODATA_PKT: break; case HCI_ISODATA_PKT: break; default: kfree_skb(skb); return -EINVAL; } /* Incoming skb */ bt_cb(skb)->incoming = 1; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_frame); /* Receive diagnostic message from HCI drivers */ int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb) { /* Mark as diagnostic packet */ hci_skb_pkt_type(skb) = HCI_DIAG_PKT; /* Time stamp */ __net_timestamp(skb); skb_queue_tail(&hdev->rx_q, skb); queue_work(hdev->workqueue, &hdev->rx_work); return 0; } EXPORT_SYMBOL(hci_recv_diag); void hci_set_hw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->hw_info); hdev->hw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_hw_info); void hci_set_fw_info(struct hci_dev *hdev, const char *fmt, ...) { va_list vargs; va_start(vargs, fmt); kfree_const(hdev->fw_info); hdev->fw_info = kvasprintf_const(GFP_KERNEL, fmt, vargs); va_end(vargs); } EXPORT_SYMBOL(hci_set_fw_info); /* ---- Interface to upper protocols ---- */ int hci_register_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_add_tail(&cb->list, &hci_cb_list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_register_cb); int hci_unregister_cb(struct hci_cb *cb) { BT_DBG("%p name %s", cb, cb->name); mutex_lock(&hci_cb_list_lock); list_del(&cb->list); mutex_unlock(&hci_cb_list_lock); return 0; } EXPORT_SYMBOL(hci_unregister_cb); static int hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb) { int err; BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb), skb->len); /* Time stamp */ __net_timestamp(skb); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* Get rid of skb owner, prior to sending to the driver. */ skb_orphan(skb); if (!test_bit(HCI_RUNNING, &hdev->flags)) { kfree_skb(skb); return -EINVAL; } err = hdev->send(hdev, skb); if (err < 0) { bt_dev_err(hdev, "sending frame failed (%d)", err); kfree_skb(skb); return err; } return 0; } /* Send HCI command */ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen, const void *param) { struct sk_buff *skb; BT_DBG("%s opcode 0x%4.4x plen %d", hdev->name, opcode, plen); skb = hci_prepare_cmd(hdev, opcode, plen, param); if (!skb) { bt_dev_err(hdev, "no memory for command"); return -ENOMEM; } /* Stand-alone HCI commands must be flagged as * single-command requests. */ bt_cb(skb)->hci.req_flags |= HCI_REQ_START; skb_queue_tail(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); return 0; } int __hci_cmd_send(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param) { struct sk_buff *skb; if (hci_opcode_ogf(opcode) != 0x3f) { /* A controller receiving a command shall respond with either * a Command Status Event or a Command Complete Event. * Therefore, all standard HCI commands must be sent via the * standard API, using hci_send_cmd or hci_cmd_sync helpers. * Some vendors do not comply with this rule for vendor-specific * commands and do not return any event. We want to support * unresponded commands for such cases only. */ bt_dev_err(hdev, "unresponded command not supported"); return -EINVAL; } skb = hci_prepare_cmd(hdev, opcode, plen, param); if (!skb) { bt_dev_err(hdev, "no memory for command (opcode 0x%4.4x)", opcode); return -ENOMEM; } hci_send_frame(hdev, skb); return 0; } EXPORT_SYMBOL(__hci_cmd_send); /* Get data from the previously sent command */ void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 opcode) { struct hci_command_hdr *hdr; if (!hdev->sent_cmd) return NULL; hdr = (void *) hdev->sent_cmd->data; if (hdr->opcode != cpu_to_le16(opcode)) return NULL; BT_DBG("%s opcode 0x%4.4x", hdev->name, opcode); return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; } /* Get data from last received event */ void *hci_recv_event_data(struct hci_dev *hdev, __u8 event) { struct hci_event_hdr *hdr; int offset; if (!hdev->recv_event) return NULL; hdr = (void *)hdev->recv_event->data; offset = sizeof(*hdr); if (hdr->evt != event) { /* In case of LE metaevent check the subevent match */ if (hdr->evt == HCI_EV_LE_META) { struct hci_ev_le_meta *ev; ev = (void *)hdev->recv_event->data + offset; offset += sizeof(*ev); if (ev->subevent == event) goto found; } return NULL; } found: bt_dev_dbg(hdev, "event 0x%2.2x", event); return hdev->recv_event->data + offset; } /* Send ACL data */ static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) { struct hci_acl_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ACL_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_acl_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, struct sk_buff *skb, __u16 flags) { struct hci_conn *conn = chan->conn; struct hci_dev *hdev = conn->hdev; struct sk_buff *list; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; switch (hdev->dev_type) { case HCI_PRIMARY: hci_add_acl_hdr(skb, conn->handle, flags); break; case HCI_AMP: hci_add_acl_hdr(skb, chan->handle, flags); break; default: bt_dev_err(hdev, "unknown dev_type %d", hdev->dev_type); return; } list = skb_shinfo(skb)->frag_list; if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; /* Queue all fragments atomically. We need to use spin_lock_bh * here because of 6LoWPAN links, as there this function is * called from softirq and using normal spin lock could cause * deadlocks. */ spin_lock_bh(&queue->lock); __skb_queue_tail(queue, skb); flags &= ~ACL_START; flags |= ACL_CONT; do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT; hci_add_acl_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); spin_unlock_bh(&queue->lock); } } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) { struct hci_dev *hdev = chan->conn->hdev; BT_DBG("%s chan %p flags 0x%4.4x", hdev->name, chan, flags); hci_queue_acl(chan, &chan->data_q, skb, flags); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send SCO data */ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct hci_sco_hdr hdr; BT_DBG("%s len %d", hdev->name, skb->len); hdr.handle = cpu_to_le16(conn->handle); hdr.dlen = skb->len; skb_push(skb, HCI_SCO_HDR_SIZE); skb_reset_transport_header(skb); memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE); hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* Send ISO data */ static void hci_add_iso_hdr(struct sk_buff *skb, __u16 handle, __u8 flags) { struct hci_iso_hdr *hdr; int len = skb->len; skb_push(skb, HCI_ISO_HDR_SIZE); skb_reset_transport_header(skb); hdr = (struct hci_iso_hdr *)skb_transport_header(skb); hdr->handle = cpu_to_le16(hci_handle_pack(handle, flags)); hdr->dlen = cpu_to_le16(len); } static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; struct sk_buff *list; __u16 flags; skb->len = skb_headlen(skb); skb->data_len = 0; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; list = skb_shinfo(skb)->frag_list; flags = hci_iso_flags_pack(list ? ISO_START : ISO_SINGLE, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); if (!list) { /* Non fragmented */ BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); skb_queue_tail(queue, skb); } else { /* Fragmented */ BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); skb_shinfo(skb)->frag_list = NULL; __skb_queue_tail(queue, skb); do { skb = list; list = list->next; hci_skb_pkt_type(skb) = HCI_ISODATA_PKT; flags = hci_iso_flags_pack(list ? ISO_CONT : ISO_END, 0x00); hci_add_iso_hdr(skb, conn->handle, flags); BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); __skb_queue_tail(queue, skb); } while (list); } } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) { struct hci_dev *hdev = conn->hdev; BT_DBG("%s len %d", hdev->name, skb->len); hci_queue_iso(conn, &conn->data_q, skb); queue_work(hdev->workqueue, &hdev->tx_work); } /* ---- HCI TX task (outgoing data) ---- */ /* HCI Connection scheduler */ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) { struct hci_dev *hdev; int cnt, q; if (!conn) { *quote = 0; return; } hdev = conn->hdev; switch (conn->type) { case ACL_LINK: cnt = hdev->acl_cnt; break; case AMP_LINK: cnt = hdev->block_cnt; break; case SCO_LINK: case ESCO_LINK: cnt = hdev->sco_cnt; break; case LE_LINK: cnt = hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; case ISO_LINK: cnt = hdev->iso_mtu ? hdev->iso_cnt : hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; break; default: cnt = 0; bt_dev_err(hdev, "unknown link type %d", conn->type); } q = cnt / num; *quote = q ? q : 1; } static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn = NULL, *c; unsigned int num = 0, min = ~0; /* We don't have to lock device here. Connections are always * added and removed with TX task disabled. */ rcu_read_lock(); list_for_each_entry_rcu(c, &h->list, list) { if (c->type != type || skb_queue_empty(&c->data_q)) continue; if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; num++; if (c->sent < min) { min = c->sent; conn = c; } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); hci_quote_sent(conn, num, quote); BT_DBG("conn %p quote %d", conn, *quote); return conn; } static void hci_link_tx_to(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *c; bt_dev_err(hdev, "link tx timeout"); rcu_read_lock(); /* Kill stalled connections */ list_for_each_entry_rcu(c, &h->list, list) { if (c->type == type && c->sent) { bt_dev_err(hdev, "killing stalled connection %pMR", &c->dst); /* hci_disconnect might sleep, so, we have to release * the RCU read lock before calling it. */ rcu_read_unlock(); hci_disconnect(c, HCI_ERROR_REMOTE_USER_TERM); rcu_read_lock(); } } rcu_read_unlock(); } static struct hci_chan *hci_chan_sent(struct hci_dev *hdev, __u8 type, int *quote) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_chan *chan = NULL; unsigned int num = 0, min = ~0, cur_prio = 0; struct hci_conn *conn; int conn_num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *tmp; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; conn_num++; list_for_each_entry_rcu(tmp, &conn->chan_list, list) { struct sk_buff *skb; if (skb_queue_empty(&tmp->data_q)) continue; skb = skb_peek(&tmp->data_q); if (skb->priority < cur_prio) continue; if (skb->priority > cur_prio) { num = 0; min = ~0; cur_prio = skb->priority; } num++; if (conn->sent < min) { min = conn->sent; chan = tmp; } } if (hci_conn_num(hdev, type) == conn_num) break; } rcu_read_unlock(); if (!chan) return NULL; hci_quote_sent(chan->conn, num, quote); BT_DBG("chan %p quote %d", chan, *quote); return chan; } static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) { struct hci_conn_hash *h = &hdev->conn_hash; struct hci_conn *conn; int num = 0; BT_DBG("%s", hdev->name); rcu_read_lock(); list_for_each_entry_rcu(conn, &h->list, list) { struct hci_chan *chan; if (conn->type != type) continue; if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) continue; num++; list_for_each_entry_rcu(chan, &conn->chan_list, list) { struct sk_buff *skb; if (chan->sent) { chan->sent = 0; continue; } if (skb_queue_empty(&chan->data_q)) continue; skb = skb_peek(&chan->data_q); if (skb->priority >= HCI_PRIO_MAX - 1) continue; skb->priority = HCI_PRIO_MAX - 1; BT_DBG("chan %p skb %p promoted to %d", chan, skb, skb->priority); } if (hci_conn_num(hdev, type) == num) break; } rcu_read_unlock(); } static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb) { /* Calculate count of blocks used by this packet */ return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len); } static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { unsigned long last_tx; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { case LE_LINK: last_tx = hdev->le_last_tx; break; default: last_tx = hdev->acl_last_tx; break; } /* tx timeout must be longer than maximum link supervision timeout * (40.9 seconds) */ if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) hci_link_tx_to(hdev, type); } /* Schedule SCO */ static void hci_sched_sco(struct hci_dev *hdev) { struct hci_conn *conn; struct sk_buff *skb; int quote; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, SCO_LINK)) return; while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_frame(hdev, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; } } } static void hci_sched_esco(struct hci_dev *hdev) { struct hci_conn *conn; struct sk_buff *skb; int quote; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, ESCO_LINK)) return; while (hdev->sco_cnt && (conn = hci_low_sent(hdev, ESCO_LINK, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_frame(hdev, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; } } } static void hci_sched_acl_pkt(struct hci_dev *hdev) { unsigned int cnt = hdev->acl_cnt; struct hci_chan *chan; struct sk_buff *skb; int quote; __check_timeout(hdev, cnt, ACL_LINK); while (hdev->acl_cnt && (chan = hci_chan_sent(hdev, ACL_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); hci_send_frame(hdev, skb); hdev->acl_last_tx = jiffies; hdev->acl_cnt--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev); hci_sched_esco(hdev); } } if (cnt != hdev->acl_cnt) hci_prio_recalculate(hdev, ACL_LINK); } static void hci_sched_acl_blk(struct hci_dev *hdev) { unsigned int cnt = hdev->block_cnt; struct hci_chan *chan; struct sk_buff *skb; int quote; u8 type; BT_DBG("%s", hdev->name); if (hdev->dev_type == HCI_AMP) type = AMP_LINK; else type = ACL_LINK; __check_timeout(hdev, cnt, type); while (hdev->block_cnt > 0 && (chan = hci_chan_sent(hdev, type, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote > 0 && (skb = skb_peek(&chan->data_q))) { int blocks; BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); blocks = __get_blocks(hdev, skb); if (blocks > hdev->block_cnt) return; hci_conn_enter_active_mode(chan->conn, bt_cb(skb)->force_active); hci_send_frame(hdev, skb); hdev->acl_last_tx = jiffies; hdev->block_cnt -= blocks; quote -= blocks; chan->sent += blocks; chan->conn->sent += blocks; } } if (cnt != hdev->block_cnt) hci_prio_recalculate(hdev, type); } static void hci_sched_acl(struct hci_dev *hdev) { BT_DBG("%s", hdev->name); /* No ACL link over BR/EDR controller */ if (!hci_conn_num(hdev, ACL_LINK) && hdev->dev_type == HCI_PRIMARY) return; /* No AMP link over AMP controller */ if (!hci_conn_num(hdev, AMP_LINK) && hdev->dev_type == HCI_AMP) return; switch (hdev->flow_ctl_mode) { case HCI_FLOW_CTL_MODE_PACKET_BASED: hci_sched_acl_pkt(hdev); break; case HCI_FLOW_CTL_MODE_BLOCK_BASED: hci_sched_acl_blk(hdev); break; } } static void hci_sched_le(struct hci_dev *hdev) { struct hci_chan *chan; struct sk_buff *skb; int quote, cnt, tmp; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, LE_LINK)) return; cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; __check_timeout(hdev, cnt, LE_LINK); tmp = cnt; while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) { u32 priority = (skb_peek(&chan->data_q))->priority; while (quote-- && (skb = skb_peek(&chan->data_q))) { BT_DBG("chan %p skb %p len %d priority %u", chan, skb, skb->len, skb->priority); /* Stop if priority has changed */ if (skb->priority < priority) break; skb = skb_dequeue(&chan->data_q); hci_send_frame(hdev, skb); hdev->le_last_tx = jiffies; cnt--; chan->sent++; chan->conn->sent++; /* Send pending SCO packets right away */ hci_sched_sco(hdev); hci_sched_esco(hdev); } } if (hdev->le_pkts) hdev->le_cnt = cnt; else hdev->acl_cnt = cnt; if (cnt != tmp) hci_prio_recalculate(hdev, LE_LINK); } /* Schedule CIS */ static void hci_sched_iso(struct hci_dev *hdev) { struct hci_conn *conn; struct sk_buff *skb; int quote, *cnt; BT_DBG("%s", hdev->name); if (!hci_conn_num(hdev, ISO_LINK)) return; cnt = hdev->iso_pkts ? &hdev->iso_cnt : hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; while (*cnt && (conn = hci_low_sent(hdev, ISO_LINK, &quote))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); hci_send_frame(hdev, skb); conn->sent++; if (conn->sent == ~0) conn->sent = 0; (*cnt)--; } } } static void hci_tx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, tx_work); struct sk_buff *skb; BT_DBG("%s acl %d sco %d le %d iso %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt, hdev->le_cnt, hdev->iso_cnt); if (!hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) { /* Schedule queues and send stuff to HCI driver */ hci_sched_sco(hdev); hci_sched_esco(hdev); hci_sched_iso(hdev); hci_sched_acl(hdev); hci_sched_le(hdev); } /* Send next queued raw (unknown type) packet */ while ((skb = skb_dequeue(&hdev->raw_q))) hci_send_frame(hdev, skb); } /* ----- HCI RX task (incoming data processing) ----- */ /* ACL data packet */ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_acl_hdr *hdr = (void *) skb->data; struct hci_conn *conn; __u16 handle, flags; skb_pull(skb, HCI_ACL_HDR_SIZE); handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len, handle, flags); hdev->stat.acl_rx++; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (conn) { hci_conn_enter_active_mode(conn, BT_POWER_FORCE_ACTIVE_OFF); /* Send to upper protocol */ l2cap_recv_acldata(conn, skb, flags); return; } else { bt_dev_err(hdev, "ACL packet for unknown connection handle %d", handle); } kfree_skb(skb); } /* SCO data packet */ static void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_sco_hdr *hdr = (void *) skb->data; struct hci_conn *conn; __u16 handle, flags; skb_pull(skb, HCI_SCO_HDR_SIZE); handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); BT_DBG("%s len %d handle 0x%4.4x flags 0x%4.4x", hdev->name, skb->len, handle, flags); hdev->stat.sco_rx++; hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (conn) { /* Send to upper protocol */ hci_skb_pkt_status(skb) = flags & 0x03; sco_recv_scodata(conn, skb); return; } else { bt_dev_err_ratelimited(hdev, "SCO packet for unknown connection handle %d", handle); } kfree_skb(skb); } static void hci_isodata_packet(struct hci_dev *hdev, struct sk_buff *skb) { struct hci_iso_hdr *hdr; struct hci_conn *conn; __u16 handle, flags; hdr = skb_pull_data(skb, sizeof(*hdr)); if (!hdr) { bt_dev_err(hdev, "ISO packet too small"); goto drop; } handle = __le16_to_cpu(hdr->handle); flags = hci_flags(handle); handle = hci_handle(handle); bt_dev_dbg(hdev, "len %d handle 0x%4.4x flags 0x%4.4x", skb->len, handle, flags); hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); hci_dev_unlock(hdev); if (!conn) { bt_dev_err(hdev, "ISO packet for unknown connection handle %d", handle); goto drop; } /* Send to upper protocol */ iso_recv(conn, skb, flags); return; drop: kfree_skb(skb); } static bool hci_req_is_complete(struct hci_dev *hdev) { struct sk_buff *skb; skb = skb_peek(&hdev->cmd_q); if (!skb) return true; return (bt_cb(skb)->hci.req_flags & HCI_REQ_START); } static void hci_resend_last(struct hci_dev *hdev) { struct hci_command_hdr *sent; struct sk_buff *skb; u16 opcode; if (!hdev->sent_cmd) return; sent = (void *) hdev->sent_cmd->data; opcode = __le16_to_cpu(sent->opcode); if (opcode == HCI_OP_RESET) return; skb = skb_clone(hdev->sent_cmd, GFP_KERNEL); if (!skb) return; skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status, hci_req_complete_t *req_complete, hci_req_complete_skb_t *req_complete_skb) { struct sk_buff *skb; unsigned long flags; BT_DBG("opcode 0x%04x status 0x%02x", opcode, status); /* If the completed command doesn't match the last one that was * sent we need to do special handling of it. */ if (!hci_sent_cmd_data(hdev, opcode)) { /* Some CSR based controllers generate a spontaneous * reset complete event during init and any pending * command will never be completed. In such a case we * need to resend whatever was the last sent * command. */ if (test_bit(HCI_INIT, &hdev->flags) && opcode == HCI_OP_RESET) hci_resend_last(hdev); return; } /* If we reach this point this event matches the last command sent */ hci_dev_clear_flag(hdev, HCI_CMD_PENDING); /* If the command succeeded and there's still more commands in * this request the request is not yet complete. */ if (!status && !hci_req_is_complete(hdev)) return; /* If this was the last command in a request the complete * callback would be found in hdev->sent_cmd instead of the * command queue (hdev->cmd_q). */ if (bt_cb(hdev->sent_cmd)->hci.req_flags & HCI_REQ_SKB) { *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb; return; } if (bt_cb(hdev->sent_cmd)->hci.req_complete) { *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete; return; } /* Remove all pending commands belonging to this request */ spin_lock_irqsave(&hdev->cmd_q.lock, flags); while ((skb = __skb_dequeue(&hdev->cmd_q))) { if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) { __skb_queue_head(&hdev->cmd_q, skb); break; } if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB) *req_complete_skb = bt_cb(skb)->hci.req_complete_skb; else *req_complete = bt_cb(skb)->hci.req_complete; dev_kfree_skb_irq(skb); } spin_unlock_irqrestore(&hdev->cmd_q.lock, flags); } static void hci_rx_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, rx_work); struct sk_buff *skb; BT_DBG("%s", hdev->name); /* The kcov_remote functions used for collecting packet parsing * coverage information from this background thread and associate * the coverage with the syscall's thread which originally injected * the packet. This helps fuzzing the kernel. */ for (; (skb = skb_dequeue(&hdev->rx_q)); kcov_remote_stop()) { kcov_remote_start_common(skb_get_kcov_handle(skb)); /* Send copy to monitor */ hci_send_to_monitor(hdev, skb); if (atomic_read(&hdev->promisc)) { /* Send copy to the sockets */ hci_send_to_sock(hdev, skb); } /* If the device has been opened in HCI_USER_CHANNEL, * the userspace has exclusive access to device. * When device is HCI_INIT, we still need to process * the data packets to the driver in order * to complete its setup(). */ if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && !test_bit(HCI_INIT, &hdev->flags)) { kfree_skb(skb); continue; } if (test_bit(HCI_INIT, &hdev->flags)) { /* Don't process data packets in this states. */ switch (hci_skb_pkt_type(skb)) { case HCI_ACLDATA_PKT: case HCI_SCODATA_PKT: case HCI_ISODATA_PKT: kfree_skb(skb); continue; } } /* Process frame */ switch (hci_skb_pkt_type(skb)) { case HCI_EVENT_PKT: BT_DBG("%s Event packet", hdev->name); hci_event_packet(hdev, skb); break; case HCI_ACLDATA_PKT: BT_DBG("%s ACL data packet", hdev->name); hci_acldata_packet(hdev, skb); break; case HCI_SCODATA_PKT: BT_DBG("%s SCO data packet", hdev->name); hci_scodata_packet(hdev, skb); break; case HCI_ISODATA_PKT: BT_DBG("%s ISO data packet", hdev->name); hci_isodata_packet(hdev, skb); break; default: kfree_skb(skb); break; } } } static void hci_cmd_work(struct work_struct *work) { struct hci_dev *hdev = container_of(work, struct hci_dev, cmd_work); struct sk_buff *skb; BT_DBG("%s cmd_cnt %d cmd queued %d", hdev->name, atomic_read(&hdev->cmd_cnt), skb_queue_len(&hdev->cmd_q)); /* Send queued commands */ if (atomic_read(&hdev->cmd_cnt)) { skb = skb_dequeue(&hdev->cmd_q); if (!skb) return; kfree_skb(hdev->sent_cmd); hdev->sent_cmd = skb_clone(skb, GFP_KERNEL); if (hdev->sent_cmd) { int res; if (hci_req_status_pend(hdev)) hci_dev_set_flag(hdev, HCI_CMD_PENDING); atomic_dec(&hdev->cmd_cnt); res = hci_send_frame(hdev, skb); if (res < 0) __hci_cmd_sync_cancel(hdev, -res); rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, HCI_CMD_TIMEOUT); rcu_read_unlock(); } else { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); } } }
9 10 2 12 9 9 9 35 11 11 35 34 31 30 30 12 147 147 147 146 184 184 155 4 155 155 155 12 3 12 28 124 184 13 13 11 5 13 113 115 115 115 115 100 98 44 43 61 60 21 10 12 3 3 3 45 84 75 121 121 117 118 117 116 115 114 108 109 40 47 20 20 31 31 1 3 3 3 2 2 1 1 2 2 15 13 63 30 30 35 35 15 15 15 15 15 5 15 15 4 15 15 15 15 15 4 4 15 1 1 1 15 5 5 15 2 15 2 14 15 15 15 15 15 15 15 15 15 15 15 5 5 15 4 15 15 15 15 15 39 37 4 33 9 36 34 10 32 30 7 6 29 6 5 28 28 1 28 2 28 2 28 3 3 28 3 3 28 1 1 28 8 7 7 5 7 7 5 7 27 12 7 6 6 5 4 4 3 4 4 3 4 25 3 3 25 25 24 25 1 1 1 25 9 18 15 12 13 10 8 2 1 7 3 4 3 7 7 7 4 4 4 4 3 4 1 28 9 9 9 27 3 9 9 9 9 9 5 6 2 4 5 1 8 7 4 4 7 1 7 3 3 7 5 2 3 6 2 2 2 10 1 10 8 4 3 4 10 2 6 5 5 1 3 4 4 5 2 1 1 2 2 2 2 2 14 14 13 12 5 5 4 7 6 1 5 11 6 5 8 18 18 16 16 3 8 35 35 35 11 11 11 35 34 35 35 35 35 34 35 35 35 35 25 11 14 35 9 11 2 9 11 35 9 9 9 2 33 33 11 29 28 28 28 20 18 21 17 16 16 16 16 9 15 16 2 16 1 1 15 2 2 16 3 3 16 4 13 10 10 6 6 5 7 4 3 3 1 3 3 1 3 1 1 3 7 6 5 5 3 7 1 1 7 4 3 7 2 1 1 1 1 1 1 1 2 1 5 6 5 5 8 1 8 8 4 2 155 121 137 137 146 158 158 157 154 155 9 9 5 5 7 7 7 2 7 5 5 5 2 2 2 2 5 7 7 7 7 2 7 5 5 5 5 2 2 2 2 2 2 5 41 41 41 11 31 30 4 31 24 24 4 13 13 6 13 13 13 13 13 12 13 5 13 13 27 15 37 10 22 21 20 19 18 18 14 12 3 3 9 9 162 162 160 155 155 155 5 4 4 4 4 1 4 5 2 2 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 // SPDX-License-Identifier: GPL-2.0-or-later /* * net/key/af_key.c An implementation of PF_KEYv2 sockets. * * Authors: Maxim Giryaev <gem@asplinux.ru> * David S. Miller <davem@redhat.com> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * Kazunori MIYAZAWA / USAGI Project <miyazawa@linux-ipv6.org> * Derek Atkins <derek@ihtfp.com> */ #include <linux/capability.h> #include <linux/module.h> #include <linux/kernel.h> #include <linux/socket.h> #include <linux/pfkeyv2.h> #include <linux/ipsec.h> #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/in.h> #include <linux/in6.h> #include <linux/proc_fs.h> #include <linux/init.h> #include <linux/slab.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/xfrm.h> #include <net/sock.h> #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) static unsigned int pfkey_net_id __read_mostly; struct netns_pfkey { /* List of all pfkey sockets. */ struct hlist_head table; atomic_t socks_nr; }; static DEFINE_MUTEX(pfkey_mutex); #define DUMMY_MARK 0 static const struct xfrm_mark dummy_mark = {0, 0}; struct pfkey_sock { /* struct sock must be the first member of struct pfkey_sock */ struct sock sk; int registered; int promisc; struct { uint8_t msg_version; uint32_t msg_portid; int (*dump)(struct pfkey_sock *sk); void (*done)(struct pfkey_sock *sk); union { struct xfrm_policy_walk policy; struct xfrm_state_walk state; } u; struct sk_buff *skb; } dump; struct mutex dump_lock; }; static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family); static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; } static int pfkey_can_dump(const struct sock *sk) { if (3 * atomic_read(&sk->sk_rmem_alloc) <= 2 * sk->sk_rcvbuf) return 1; return 0; } static void pfkey_terminate_dump(struct pfkey_sock *pfk) { if (pfk->dump.dump) { if (pfk->dump.skb) { kfree_skb(pfk->dump.skb); pfk->dump.skb = NULL; } pfk->dump.done(pfk); pfk->dump.dump = NULL; pfk->dump.done = NULL; } } static void pfkey_sock_destruct(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_terminate_dump(pfkey_sk(sk)); skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { pr_err("Attempt to release alive pfkey socket: %p\n", sk); return; } WARN_ON(atomic_read(&sk->sk_rmem_alloc)); WARN_ON(refcount_read(&sk->sk_wmem_alloc)); atomic_dec(&net_pfkey->socks_nr); } static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { struct net *net = sock_net(sk); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); mutex_lock(&pfkey_mutex); sk_add_node_rcu(sk, &net_pfkey->table); mutex_unlock(&pfkey_mutex); } static void pfkey_remove(struct sock *sk) { mutex_lock(&pfkey_mutex); sk_del_node_init_rcu(sk); mutex_unlock(&pfkey_mutex); } static struct proto key_proto = { .name = "KEY", .owner = THIS_MODULE, .obj_size = sizeof(struct pfkey_sock), }; static int pfkey_create(struct net *net, struct socket *sock, int protocol, int kern) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; struct pfkey_sock *pfk; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; if (sock->type != SOCK_RAW) return -ESOCKTNOSUPPORT; if (protocol != PF_KEY_V2) return -EPROTONOSUPPORT; sk = sk_alloc(net, PF_KEY, GFP_KERNEL, &key_proto, kern); if (sk == NULL) return -ENOMEM; pfk = pfkey_sk(sk); mutex_init(&pfk->dump_lock); sock->ops = &pfkey_ops; sock_init_data(sock, sk); sk->sk_family = PF_KEY; sk->sk_destruct = pfkey_sock_destruct; atomic_inc(&net_pfkey->socks_nr); pfkey_insert(sk); return 0; } static int pfkey_release(struct socket *sock) { struct sock *sk = sock->sk; if (!sk) return 0; pfkey_remove(sk); sock_orphan(sk); sock->sk = NULL; skb_queue_purge(&sk->sk_write_queue); synchronize_rcu(); sock_put(sk); return 0; } static int pfkey_broadcast_one(struct sk_buff *skb, gfp_t allocation, struct sock *sk) { int err = -ENOBUFS; if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) return err; skb = skb_clone(skb, allocation); if (skb) { skb_set_owner_r(skb, sk); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk); err = 0; } return err; } /* Send SKB to all pfkey sockets matching selected criteria. */ #define BROADCAST_ALL 0 #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; int err = -ESRCH; /* XXX Do we need something like netlink_overrun? I think * XXX PF_KEY socket apps will not mind current behavior. */ if (!skb) return -ENOMEM; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { struct pfkey_sock *pfk = pfkey_sk(sk); int err2; /* Yes, it means that if you are meant to receive this * pfkey message you receive it twice as promiscuous * socket. */ if (pfk->promisc) pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* the exact target will be processed later */ if (sk == one_sk) continue; if (broadcast_flags != BROADCAST_ALL) { if (broadcast_flags & BROADCAST_PROMISC_ONLY) continue; if ((broadcast_flags & BROADCAST_REGISTERED) && !pfk->registered) continue; if (broadcast_flags & BROADCAST_ONE) continue; } err2 = pfkey_broadcast_one(skb, GFP_ATOMIC, sk); /* Error is cleared after successful sending to at least one * registered KM */ if ((broadcast_flags & BROADCAST_REGISTERED) && err) err = err2; } rcu_read_unlock(); if (one_sk != NULL) err = pfkey_broadcast_one(skb, allocation, one_sk); kfree_skb(skb); return err; } static int pfkey_do_dump(struct pfkey_sock *pfk) { struct sadb_msg *hdr; int rc; mutex_lock(&pfk->dump_lock); if (!pfk->dump.dump) { rc = 0; goto out; } rc = pfk->dump.dump(pfk); if (rc == -ENOBUFS) { rc = 0; goto out; } if (pfk->dump.skb) { if (!pfkey_can_dump(&pfk->sk)) { rc = 0; goto out; } hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } pfkey_terminate_dump(pfk); out: mutex_unlock(&pfk->dump_lock); return rc; } static inline void pfkey_hdr_dup(struct sadb_msg *new, const struct sadb_msg *orig) { *new = *orig; } static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) { struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); struct sadb_msg *hdr; if (!skb) return -ENOBUFS; /* Woe be to the platform trying to support PFKEY yet * having normal errnos outside the 1-255 range, inclusive. */ err = -err; if (err == ERESTARTSYS || err == ERESTARTNOHAND || err == ERESTARTNOINTR) err = EINTR; if (err >= 512) err = EINVAL; BUG_ON(err <= 0 || err >= 256); hdr = skb_put(skb, sizeof(struct sadb_msg)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = (uint8_t) err; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static const u8 sadb_ext_min_len[] = { [SADB_EXT_RESERVED] = (u8) 0, [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), [SADB_X_EXT_KMADDRESS] = (u8) sizeof(struct sadb_x_kmaddress), [SADB_X_EXT_FILTER] = (u8) sizeof(struct sadb_x_filter), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ static int verify_address_len(const void *p) { const struct sadb_address *sp = p; const struct sockaddr *addr = (const struct sockaddr *)(sp + 1); const struct sockaddr_in *sin; #if IS_ENABLED(CONFIG_IPV6) const struct sockaddr_in6 *sin6; #endif int len; if (sp->sadb_address_len < DIV_ROUND_UP(sizeof(*sp) + offsetofend(typeof(*addr), sa_family), sizeof(uint64_t))) return -EINVAL; switch (addr->sa_family) { case AF_INET: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 32) return -EINVAL; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp->sadb_address_len != len || sp->sadb_address_prefixlen > 128) return -EINVAL; break; #endif default: /* It is user using kernel to keep track of security * associations for another protocol, such as * OSPF/RSVP/RIPV2/MIP. It is user's job to verify * lengths. * * XXX Actually, association/policy database is not yet * XXX able to cope with arbitrary sockaddr families. * XXX When it can, remove this -EINVAL. -DaveM */ return -EINVAL; } return 0; } static inline int sadb_key_len(const struct sadb_key *key) { int key_bytes = DIV_ROUND_UP(key->sadb_key_bits, 8); return DIV_ROUND_UP(sizeof(struct sadb_key) + key_bytes, sizeof(uint64_t)); } static int verify_key_len(const void *p) { const struct sadb_key *key = p; if (sadb_key_len(key) > key->sadb_key_len) return -EINVAL; return 0; } static inline int pfkey_sec_ctx_len(const struct sadb_x_sec_ctx *sec_ctx) { return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + sec_ctx->sadb_x_ctx_len, sizeof(uint64_t)); } static inline int verify_sec_ctx_len(const void *p) { const struct sadb_x_sec_ctx *sec_ctx = p; int len = sec_ctx->sadb_x_ctx_len; if (len > PAGE_SIZE) return -EINVAL; len = pfkey_sec_ctx_len(sec_ctx); if (sec_ctx->sadb_x_sec_len != len) return -EINVAL; return 0; } static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(const struct sadb_x_sec_ctx *sec_ctx, gfp_t gfp) { struct xfrm_user_sec_ctx *uctx = NULL; int ctx_size = sec_ctx->sadb_x_ctx_len; uctx = kmalloc((sizeof(*uctx)+ctx_size), gfp); if (!uctx) return NULL; uctx->len = pfkey_sec_ctx_len(sec_ctx); uctx->exttype = sec_ctx->sadb_x_sec_exttype; uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; uctx->ctx_len = sec_ctx->sadb_x_ctx_len; memcpy(uctx + 1, sec_ctx + 1, uctx->ctx_len); return uctx; } static int present_and_same_family(const struct sadb_address *src, const struct sadb_address *dst) { const struct sockaddr *s_addr, *d_addr; if (!src || !dst) return 0; s_addr = (const struct sockaddr *)(src + 1); d_addr = (const struct sockaddr *)(dst + 1); if (s_addr->sa_family != d_addr->sa_family) return 0; if (s_addr->sa_family != AF_INET #if IS_ENABLED(CONFIG_IPV6) && s_addr->sa_family != AF_INET6 #endif ) return 0; return 1; } static int parse_exthdrs(struct sk_buff *skb, const struct sadb_msg *hdr, void **ext_hdrs) { const char *p = (char *) hdr; int len = skb->len; len -= sizeof(*hdr); p += sizeof(*hdr); while (len > 0) { const struct sadb_ext *ehdr = (const struct sadb_ext *) p; uint16_t ext_type; int ext_len; if (len < sizeof(*ehdr)) return -EINVAL; ext_len = ehdr->sadb_ext_len; ext_len *= sizeof(uint64_t); ext_type = ehdr->sadb_ext_type; if (ext_len < sizeof(uint64_t) || ext_len > len || ext_type == SADB_EXT_RESERVED) return -EINVAL; if (ext_type <= SADB_EXT_MAX) { int min = (int) sadb_ext_min_len[ext_type]; if (ext_len < min) return -EINVAL; if (ext_hdrs[ext_type-1] != NULL) return -EINVAL; switch (ext_type) { case SADB_EXT_ADDRESS_SRC: case SADB_EXT_ADDRESS_DST: case SADB_EXT_ADDRESS_PROXY: case SADB_X_EXT_NAT_T_OA: if (verify_address_len(p)) return -EINVAL; break; case SADB_X_EXT_SEC_CTX: if (verify_sec_ctx_len(p)) return -EINVAL; break; case SADB_EXT_KEY_AUTH: case SADB_EXT_KEY_ENCRYPT: if (verify_key_len(p)) return -EINVAL; break; default: break; } ext_hdrs[ext_type-1] = (void *) p; } p += ext_len; len -= ext_len; } return 0; } static uint16_t pfkey_satype2proto(uint8_t satype) { switch (satype) { case SADB_SATYPE_UNSPEC: return IPSEC_PROTO_ANY; case SADB_SATYPE_AH: return IPPROTO_AH; case SADB_SATYPE_ESP: return IPPROTO_ESP; case SADB_X_SATYPE_IPCOMP: return IPPROTO_COMP; default: return 0; } /* NOTREACHED */ } static uint8_t pfkey_proto2satype(uint16_t proto) { switch (proto) { case IPPROTO_AH: return SADB_SATYPE_AH; case IPPROTO_ESP: return SADB_SATYPE_ESP; case IPPROTO_COMP: return SADB_X_SATYPE_IPCOMP; default: return 0; } /* NOTREACHED */ } /* BTW, this scheme means that there is no way with PFKEY2 sockets to * say specifically 'just raw sockets' as we encode them as 255. */ static uint8_t pfkey_proto_to_xfrm(uint8_t proto) { return proto == IPSEC_PROTO_ANY ? 0 : proto; } static uint8_t pfkey_proto_from_xfrm(uint8_t proto) { return proto ? proto : IPSEC_PROTO_ANY; } static inline int pfkey_sockaddr_len(sa_family_t family) { switch (family) { case AF_INET: return sizeof(struct sockaddr_in); #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: return sizeof(struct sockaddr_in6); #endif } return 0; } static int pfkey_sockaddr_extract(const struct sockaddr *sa, xfrm_address_t *xaddr) { switch (sa->sa_family) { case AF_INET: xaddr->a4 = ((struct sockaddr_in *)sa)->sin_addr.s_addr; return AF_INET; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: memcpy(xaddr->a6, &((struct sockaddr_in6 *)sa)->sin6_addr, sizeof(struct in6_addr)); return AF_INET6; #endif } return 0; } static int pfkey_sadb_addr2xfrm_addr(const struct sadb_address *addr, xfrm_address_t *xaddr) { return pfkey_sockaddr_extract((struct sockaddr *)(addr + 1), xaddr); } static struct xfrm_state *pfkey_xfrm_state_lookup(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { const struct sadb_sa *sa; const struct sadb_address *addr; uint16_t proto; unsigned short family; xfrm_address_t *xaddr; sa = ext_hdrs[SADB_EXT_SA - 1]; if (sa == NULL) return NULL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return NULL; /* sadb_address_len should be checked by caller */ addr = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; if (addr == NULL) return NULL; family = ((const struct sockaddr *)(addr + 1))->sa_family; switch (family) { case AF_INET: xaddr = (xfrm_address_t *)&((const struct sockaddr_in *)(addr + 1))->sin_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xaddr = (xfrm_address_t *)&((const struct sockaddr_in6 *)(addr + 1))->sin6_addr; break; #endif default: xaddr = NULL; } if (!xaddr) return NULL; return xfrm_state_lookup(net, DUMMY_MARK, xaddr, sa->sadb_sa_spi, proto, family); } #define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) static int pfkey_sockaddr_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family)); } static inline int pfkey_mode_from_xfrm(int mode) { switch(mode) { case XFRM_MODE_TRANSPORT: return IPSEC_MODE_TRANSPORT; case XFRM_MODE_TUNNEL: return IPSEC_MODE_TUNNEL; case XFRM_MODE_BEET: return IPSEC_MODE_BEET; default: return -1; } } static inline int pfkey_mode_to_xfrm(int mode) { switch(mode) { case IPSEC_MODE_ANY: /*XXX*/ case IPSEC_MODE_TRANSPORT: return XFRM_MODE_TRANSPORT; case IPSEC_MODE_TUNNEL: return XFRM_MODE_TUNNEL; case IPSEC_MODE_BEET: return XFRM_MODE_BEET; default: return -1; } } static unsigned int pfkey_sockaddr_fill(const xfrm_address_t *xaddr, __be16 port, struct sockaddr *sa, unsigned short family) { switch (family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sa; sin->sin_family = AF_INET; sin->sin_port = port; sin->sin_addr.s_addr = xaddr->a4; memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return 32; } #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sa; sin6->sin6_family = AF_INET6; sin6->sin6_port = port; sin6->sin6_flowinfo = 0; sin6->sin6_addr = xaddr->in6; sin6->sin6_scope_id = 0; return 128; } #endif } return 0; } static struct sk_buff *__pfkey_xfrm_state2msg(const struct xfrm_state *x, int add_keys, int hsc) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_lifetime *lifetime; struct sadb_address *addr; struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int size; int auth_key_size = 0; int encrypt_key_size = 0; int sockaddr_size; struct xfrm_encap_tmpl *natt = NULL; int mode; /* address family check */ sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return ERR_PTR(-EINVAL); /* base, SA, (lifetime (HSC),) address(SD), (address(P),) key(AE), (identity(SD),) (sensitivity)> */ size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + sizeof(struct sadb_lifetime) + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + ((hsc & 2) ? sizeof(struct sadb_lifetime) : 0) + sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } /* identity & sensitivity */ if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) size += sizeof(struct sadb_address) + sockaddr_size; if (add_keys) { if (x->aalg && x->aalg->alg_key_len) { auth_key_size = PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); size += sizeof(struct sadb_key) + auth_key_size; } if (x->ealg && x->ealg->alg_key_len) { encrypt_key_size = PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); size += sizeof(struct sadb_key) + encrypt_key_size; } } if (x->encap) natt = x->encap; if (natt && natt->encap_type) { size += sizeof(struct sadb_x_nat_t_type); size += sizeof(struct sadb_x_nat_t_port); size += sizeof(struct sadb_x_nat_t_port); } skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ hdr->sadb_msg_len = size / sizeof(uint64_t); /* sa */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = x->props.replay_window; switch (x->km.state) { case XFRM_STATE_VALID: sa->sadb_sa_state = x->km.dying ? SADB_SASTATE_DYING : SADB_SASTATE_MATURE; break; case XFRM_STATE_ACQ: sa->sadb_sa_state = SADB_SASTATE_LARVAL; break; default: sa->sadb_sa_state = SADB_SASTATE_DEAD; break; } sa->sadb_sa_auth = 0; if (x->aalg) { struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); sa->sadb_sa_auth = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_encrypt = 0; BUG_ON(x->ealg && x->calg); if (x->ealg) { struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ if (x->calg) { struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); sa->sadb_sa_encrypt = (a && a->pfkey_supported) ? a->desc.sadb_alg_id : 0; } sa->sadb_sa_flags = 0; if (x->props.flags & XFRM_STATE_NOECN) sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; if (x->props.flags & XFRM_STATE_DECAP_DSCP) sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; if (x->props.flags & XFRM_STATE_NOPMTUDISC) sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC; /* hard time */ if (hsc & 2) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; } /* soft time */ if (hsc & 1) { lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; } /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = x->curlft.packets; lifetime->sadb_lifetime_bytes = x->curlft.bytes; lifetime->sadb_lifetime_addtime = x->curlft.add_time; lifetime->sadb_lifetime_usetime = x->curlft.use_time; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; /* "if the ports are non-zero, then the sadb_address_proto field, normally zero, MUST be filled in with the transport protocol's number." - RFC2367 */ addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); BUG_ON(!addr->sadb_address_prefixlen); if (!xfrm_addr_equal(&x->sel.saddr, &x->props.saddr, x->props.family)) { addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; addr->sadb_address_proto = pfkey_proto_from_xfrm(x->sel.proto); addr->sadb_address_prefixlen = x->sel.prefixlen_s; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&x->sel.saddr, x->sel.sport, (struct sockaddr *) (addr + 1), x->props.family); } /* auth key */ if (add_keys && auth_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + auth_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_AUTH; key->sadb_key_bits = x->aalg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); } /* encrypt key */ if (add_keys && encrypt_key_size) { key = skb_put(skb, sizeof(struct sadb_key) + encrypt_key_size); key->sadb_key_len = (sizeof(struct sadb_key) + encrypt_key_size) / sizeof(uint64_t); key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; key->sadb_key_bits = x->ealg->alg_key_len; key->sadb_key_reserved = 0; memcpy(key + 1, x->ealg->alg_key, (x->ealg->alg_key_len+7)/8); } /* sa */ sa2 = skb_put(skb, sizeof(struct sadb_x_sa2)); sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; if ((mode = pfkey_mode_from_xfrm(x->props.mode)) < 0) { kfree_skb(skb); return ERR_PTR(-EINVAL); } sa2->sadb_x_sa2_mode = mode; sa2->sadb_x_sa2_reserved1 = 0; sa2->sadb_x_sa2_reserved2 = 0; sa2->sadb_x_sa2_sequence = 0; sa2->sadb_x_sa2_reqid = x->props.reqid; if (natt && natt->encap_type) { struct sadb_x_nat_t_type *n_type; struct sadb_x_nat_t_port *n_port; /* type */ n_type = skb_put(skb, sizeof(*n_type)); n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; n_type->sadb_x_nat_t_type_type = natt->encap_type; n_type->sadb_x_nat_t_type_reserved[0] = 0; n_type->sadb_x_nat_t_type_reserved[1] = 0; n_type->sadb_x_nat_t_type_reserved[2] = 0; /* source port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* dest port */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = natt->encap_dport; n_port->sadb_x_nat_t_port_reserved = 0; } /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg(const struct xfrm_state *x) { struct sk_buff *skb; skb = __pfkey_xfrm_state2msg(x, 1, 3); return skb; } static inline struct sk_buff *pfkey_xfrm_state2msg_expire(const struct xfrm_state *x, int hsc) { return __pfkey_xfrm_state2msg(x, 0, hsc); } static struct xfrm_state * pfkey_msg2xfrm_state(struct net *net, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct xfrm_state *x; const struct sadb_lifetime *lifetime; const struct sadb_sa *sa; const struct sadb_key *key; const struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; sa = ext_hdrs[SADB_EXT_SA - 1]; if (!sa || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) return ERR_PTR(-EINVAL); if (hdr->sadb_msg_satype == SADB_SATYPE_AH && !ext_hdrs[SADB_EXT_KEY_AUTH-1]) return ERR_PTR(-EINVAL); if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) return ERR_PTR(-EINVAL); proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return ERR_PTR(-EINVAL); /* default error is no buffer space */ err = -ENOBUFS; /* RFC2367: Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. Therefore, the sadb_sa_state field of all submitted SAs MUST be SADB_SASTATE_MATURE and the kernel MUST return an error if this is not true. However, KAME setkey always uses SADB_SASTATE_LARVAL. Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. */ if (sa->sadb_sa_auth > SADB_AALG_MAX || (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || sa->sadb_sa_encrypt > SADB_EALG_MAX) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (key != NULL && sa->sadb_sa_auth != SADB_X_AALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key != NULL && sa->sadb_sa_encrypt != SADB_EALG_NULL && key->sadb_key_bits == 0) return ERR_PTR(-EINVAL); x = xfrm_state_alloc(net); if (x == NULL) return ERR_PTR(-ENOBUFS); x->id.proto = proto; x->id.spi = sa->sadb_sa_spi; x->props.replay_window = min_t(unsigned int, sa->sadb_sa_replay, (sizeof(x->replay.bitmap) * 8)); if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) x->props.flags |= XFRM_STATE_NOECN; if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) x->props.flags |= XFRM_STATE_DECAP_DSCP; if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC) x->props.flags |= XFRM_STATE_NOPMTUDISC; lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1]; if (lifetime != NULL) { x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT - 1]; if (lifetime != NULL) { x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) goto out; err = security_xfrm_state_alloc(x, uctx); kfree(uctx); if (err) goto out; } err = -ENOBUFS; key = ext_hdrs[SADB_EXT_KEY_AUTH - 1]; if (sa->sadb_sa_auth) { int keysize = 0; struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } if (key) keysize = (key->sadb_key_bits + 7) / 8; x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); if (!x->aalg) { err = -ENOMEM; goto out; } strcpy(x->aalg->alg_name, a->name); x->aalg->alg_key_len = 0; if (key) { x->aalg->alg_key_len = key->sadb_key_bits; memcpy(x->aalg->alg_key, key+1, keysize); } x->aalg->alg_trunc_len = a->uinfo.auth.icv_truncbits; x->props.aalgo = sa->sadb_sa_auth; /* x->algo.flags = sa->sadb_sa_flags; */ } if (sa->sadb_sa_encrypt) { if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); if (!x->calg) { err = -ENOMEM; goto out; } strcpy(x->calg->alg_name, a->name); x->props.calgo = sa->sadb_sa_encrypt; } else { int keysize = 0; struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); if (!a || !a->pfkey_supported) { err = -ENOSYS; goto out; } key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; if (key) keysize = (key->sadb_key_bits + 7) / 8; x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); if (!x->ealg) { err = -ENOMEM; goto out; } strcpy(x->ealg->alg_name, a->name); x->ealg->alg_key_len = 0; if (key) { x->ealg->alg_key_len = key->sadb_key_bits; memcpy(x->ealg->alg_key, key+1, keysize); } x->props.ealgo = sa->sadb_sa_encrypt; x->geniv = a->uinfo.encr.geniv; } } /* x->algo.flags = sa->sadb_sa_flags; */ x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], &x->props.saddr); pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], &x->id.daddr); if (ext_hdrs[SADB_X_EXT_SA2-1]) { const struct sadb_x_sa2 *sa2 = ext_hdrs[SADB_X_EXT_SA2-1]; int mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) { err = -EINVAL; goto out; } x->props.mode = mode; x->props.reqid = sa2->sadb_x_sa2_reqid; } if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { const struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; /* Nobody uses this, but we try. */ x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); x->sel.prefixlen_s = addr->sadb_address_prefixlen; } if (!x->sel.family) x->sel.family = x->props.family; if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { const struct sadb_x_nat_t_type* n_type; struct xfrm_encap_tmpl *natt; x->encap = kzalloc(sizeof(*x->encap), GFP_KERNEL); if (!x->encap) { err = -ENOMEM; goto out; } natt = x->encap; n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; natt->encap_type = n_type->sadb_x_nat_t_type_type; if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; natt->encap_sport = n_port->sadb_x_nat_t_port_port; } if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { const struct sadb_x_nat_t_port *n_port = ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; natt->encap_dport = n_port->sadb_x_nat_t_port_port; } } err = xfrm_init_state(x); if (err) goto out; x->km.seq = hdr->sadb_msg_seq; return x; out: x->km.state = XFRM_STATE_DEAD; xfrm_state_put(x); return ERR_PTR(err); } static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -EOPNOTSUPP; } static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct sk_buff *resp_skb; struct sadb_x_sa2 *sa2; struct sadb_address *saddr, *daddr; struct sadb_msg *out_hdr; struct sadb_spirange *range; struct xfrm_state *x = NULL; int mode; int err; u32 min_spi, max_spi; u32 reqid; u8 proto; unsigned short family; xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { mode = pfkey_mode_to_xfrm(sa2->sadb_x_sa2_mode); if (mode < 0) return -EINVAL; reqid = sa2->sadb_x_sa2_reqid; } else { mode = 0; reqid = 0; } saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; family = ((struct sockaddr *)(saddr + 1))->sa_family; switch (family) { case AF_INET: xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; break; #endif } if (hdr->sadb_msg_seq) { x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); if (x && !xfrm_addr_equal(&x->id.daddr, xdaddr, family)) { xfrm_state_put(x); x = NULL; } } if (!x) x = xfrm_find_acq(net, &dummy_mark, mode, reqid, 0, proto, xdaddr, xsaddr, 1, family); if (x == NULL) return -ENOENT; min_spi = 0x100; max_spi = 0x0fffffff; range = ext_hdrs[SADB_EXT_SPIRANGE-1]; if (range) { min_spi = range->sadb_spirange_min; max_spi = range->sadb_spirange_max; } err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { xfrm_state_put(x); return PTR_ERR(resp_skb); } out_hdr = (struct sadb_msg *) resp_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GETSPI; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; xfrm_state_put(x); pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) return -EOPNOTSUPP; if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) return 0; x = xfrm_find_acq_byseq(net, DUMMY_MARK, hdr->sadb_msg_seq); if (x == NULL) return 0; spin_lock_bh(&x->lock); if (x->km.state == XFRM_STATE_ACQ) x->km.state = XFRM_STATE_ERROR; spin_unlock_bh(&x->lock); xfrm_state_put(x); return 0; } static inline int event2poltype(int event) { switch (event) { case XFRM_MSG_DELPOLICY: return SADB_X_SPDDELETE; case XFRM_MSG_NEWPOLICY: return SADB_X_SPDADD; case XFRM_MSG_UPDPOLICY: return SADB_X_SPDUPDATE; case XFRM_MSG_POLEXPIRE: // return SADB_X_SPDEXPIRE; default: pr_err("pfkey: Unknown policy event %d\n", event); break; } return 0; } static inline int event2keytype(int event) { switch (event) { case XFRM_MSG_DELSA: return SADB_DELETE; case XFRM_MSG_NEWSA: return SADB_ADD; case XFRM_MSG_UPDSA: return SADB_UPDATE; case XFRM_MSG_EXPIRE: return SADB_EXPIRE; default: pr_err("pfkey: Unknown SA event %d\n", event); break; } return 0; } /* ADD/UPD/DEL */ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = pfkey_xfrm_state2msg(x); if (IS_ERR(skb)) return PTR_ERR(skb); hdr = (struct sadb_msg *) skb->data; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = event2keytype(c->event); hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } static int pfkey_add(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; int err; struct km_event c; x = pfkey_msg2xfrm_state(net, hdr, ext_hdrs); if (IS_ERR(x)) return PTR_ERR(x); xfrm_state_hold(x); if (hdr->sadb_msg_type == SADB_ADD) err = xfrm_state_add(x); else err = xfrm_state_update(x); xfrm_audit_state_add(x, err ? 0 : 1, true); if (err < 0) { x->km.state = XFRM_STATE_DEAD; __xfrm_state_put(x); goto out; } if (hdr->sadb_msg_type == SADB_ADD) c.event = XFRM_MSG_NEWSA; else c.event = XFRM_MSG_UPDSA; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_state_notify(x, &c); out: xfrm_state_put(x); return err; } static int pfkey_delete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct xfrm_state *x; struct km_event c; int err; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; if ((err = security_xfrm_state_delete(x))) goto out; if (xfrm_state_kern(x)) { err = -EPERM; goto out; } err = xfrm_state_delete(x); if (err < 0) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); out: xfrm_audit_state_delete(x, err ? 0 : 1, true); xfrm_state_put(x); return err; } static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); __u8 proto; struct sk_buff *out_skb; struct sadb_msg *out_hdr; struct xfrm_state *x; if (!ext_hdrs[SADB_EXT_SA-1] || !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1])) return -EINVAL; x = pfkey_xfrm_state_lookup(net, hdr, ext_hdrs); if (x == NULL) return -ESRCH; out_skb = pfkey_xfrm_state2msg(x); proto = x->id.proto; xfrm_state_put(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = SADB_GET; out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } static struct sk_buff *compose_sadb_supported(const struct sadb_msg *orig, gfp_t allocation) { struct sk_buff *skb; struct sadb_msg *hdr; int len, auth_len, enc_len, i; auth_len = xfrm_count_pfkey_auth_supported(); if (auth_len) { auth_len *= sizeof(struct sadb_alg); auth_len += sizeof(struct sadb_supported); } enc_len = xfrm_count_pfkey_enc_supported(); if (enc_len) { enc_len *= sizeof(struct sadb_alg); enc_len += sizeof(struct sadb_supported); } len = enc_len + auth_len + sizeof(struct sadb_msg); skb = alloc_skb(len + 16, allocation); if (!skb) goto out_put_algs; hdr = skb_put(skb, sizeof(*hdr)); pfkey_hdr_dup(hdr, orig); hdr->sadb_msg_errno = 0; hdr->sadb_msg_len = len / sizeof(uint64_t); if (auth_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, auth_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = auth_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; for (i = 0; ; i++) { struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg->available) *ap++ = aalg->desc; } } if (enc_len) { struct sadb_supported *sp; struct sadb_alg *ap; sp = skb_put(skb, enc_len); ap = (struct sadb_alg *) (sp + 1); sp->sadb_supported_len = enc_len / sizeof(uint64_t); sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; for (i = 0; ; i++) { struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (ealg->available) *ap++ = ealg->desc; } } out_put_algs: return skb; } static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *supp_skb; if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) return -EINVAL; if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { if (pfk->registered&(1<<hdr->sadb_msg_satype)) return -EEXIST; pfk->registered |= (1<<hdr->sadb_msg_satype); } mutex_lock(&pfkey_mutex); xfrm_probe_algs(); supp_skb = compose_sadb_supported(hdr, GFP_KERNEL | __GFP_ZERO); mutex_unlock(&pfkey_mutex); if (!supp_skb) { if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) pfk->registered &= ~(1<<hdr->sadb_msg_satype); return -ENOBUFS; } pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, sock_net(sk)); return 0; } static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put_data(skb, ihdr, sizeof(struct sadb_msg)); hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) { struct sk_buff *skb; struct sadb_msg *hdr; skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb) return -ENOBUFS; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_satype = pfkey_proto2satype(c->data.proto); hdr->sadb_msg_type = SADB_FLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_flush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int proto; struct km_event c; int err, err2; proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) return -EINVAL; err = xfrm_state_flush(net, proto, true, false); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - go quietly */ err = 0; return err ? err : err2; } c.data.proto = proto; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.event = XFRM_MSG_FLUSHSA; c.net = net; km_state_notify(NULL, &c); return 0; } static int dump_sa(struct xfrm_state *x, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_state2msg(x); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_DUMP; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sa(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_state_walk(net, &pfk->dump.u.state, dump_sa, (void *) pfk); } static void pfkey_dump_sa_done(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); xfrm_state_walk_done(&pfk->dump.u.state, net); } static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { u8 proto; struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } proto = pfkey_satype2proto(hdr->sadb_msg_satype); if (proto == 0) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; if ((xfilter->sadb_x_filter_splen > (sizeof(xfrm_address_t) << 3)) || (xfilter->sadb_x_filter_dplen > (sizeof(xfrm_address_t) << 3))) { mutex_unlock(&pfk->dump_lock); return -EINVAL; } filter = kmalloc(sizeof(*filter), GFP_KERNEL); if (filter == NULL) { mutex_unlock(&pfk->dump_lock); return -ENOMEM; } memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, sizeof(xfrm_address_t)); memcpy(&filter->daddr, &xfilter->sadb_x_filter_daddr, sizeof(xfrm_address_t)); filter->family = xfilter->sadb_x_filter_family; filter->splen = xfilter->sadb_x_filter_splen; filter->dplen = xfilter->sadb_x_filter_dplen; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); int satype = hdr->sadb_msg_satype; bool reset_errno = false; if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { reset_errno = true; if (satype != 0 && satype != 1) return -EINVAL; pfk->promisc = satype; } if (reset_errno && skb_cloned(skb)) skb = skb_copy(skb, GFP_KERNEL); else skb = skb_clone(skb, GFP_KERNEL); if (reset_errno && skb) { struct sadb_msg *new_hdr = (struct sadb_msg *) skb->data; new_hdr->sadb_msg_errno = 0; } pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) { int i; u32 reqid = *(u32*)ptr; for (i=0; i<xp->xfrm_nr; i++) { if (xp->xfrm_vec[i].reqid == reqid) return -EEXIST; } return 0; } static u32 gen_reqid(struct net *net) { struct xfrm_policy_walk walk; u32 start; int rc; static u32 reqid = IPSEC_MANUAL_REQID_MAX; start = reqid; do { ++reqid; if (reqid == 0) reqid = IPSEC_MANUAL_REQID_MAX+1; xfrm_policy_walk_init(&walk, XFRM_POLICY_TYPE_MAIN); rc = xfrm_policy_walk(net, &walk, check_reqid, (void*)&reqid); xfrm_policy_walk_done(&walk, net); if (rc != -EEXIST) return reqid; } while (reqid != start); return 0; } static int parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_policy *pol, struct sadb_x_ipsecrequest *rq) { struct net *net = xp_net(xp); struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; int mode; if (xp->xfrm_nr >= XFRM_MAX_DEPTH) return -ELOOP; if (rq->sadb_x_ipsecrequest_mode == 0) return -EINVAL; if (!xfrm_id_proto_valid(rq->sadb_x_ipsecrequest_proto)) return -EINVAL; t->id.proto = rq->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; t->mode = mode; if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) { if ((mode == XFRM_MODE_TUNNEL || mode == XFRM_MODE_BEET) && pol->sadb_x_policy_dir == IPSEC_DIR_OUTBOUND) return -EINVAL; t->optional = 1; } else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { t->reqid = rq->sadb_x_ipsecrequest_reqid; if (t->reqid > IPSEC_MANUAL_REQID_MAX) t->reqid = 0; if (!t->reqid && !(t->reqid = gen_reqid(net))) return -ENOBUFS; } /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { int err; err = parse_sockaddr_pair( (struct sockaddr *)(rq + 1), rq->sadb_x_ipsecrequest_len - sizeof(*rq), &t->saddr, &t->id.daddr, &t->encap_family); if (err) return err; } else t->encap_family = xp->family; /* No way to set this via kame pfkey */ t->allalgs = 1; xp->xfrm_nr++; return 0; } static int parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) { int err; int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); struct sadb_x_ipsecrequest *rq = (void*)(pol+1); if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; while (len >= sizeof(*rq)) { if (len < rq->sadb_x_ipsecrequest_len || rq->sadb_x_ipsecrequest_len < sizeof(*rq)) return -EINVAL; if ((err = parse_ipsecrequest(xp, pol, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); } return 0; } static inline int pfkey_xfrm_policy2sec_ctx_size(const struct xfrm_policy *xp) { struct xfrm_sec_ctx *xfrm_ctx = xp->security; if (xfrm_ctx) { int len = sizeof(struct sadb_x_sec_ctx); len += xfrm_ctx->ctx_len; return PFKEY_ALIGN8(len); } return 0; } static int pfkey_xfrm_policy2msg_size(const struct xfrm_policy *xp) { const struct xfrm_tmpl *t; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = 0; int i; for (i=0; i<xp->xfrm_nr; i++) { t = xp->xfrm_vec + i; socklen += pfkey_sockaddr_len(t->encap_family); } return sizeof(struct sadb_msg) + (sizeof(struct sadb_lifetime) * 3) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * sizeof(struct sadb_x_ipsecrequest)) + (socklen * 2) + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(const struct xfrm_policy *xp) { struct sk_buff *skb; int size; size = pfkey_xfrm_policy2msg_size(xp); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return ERR_PTR(-ENOBUFS); return skb; } static int pfkey_xfrm_policy2msg(struct sk_buff *skb, const struct xfrm_policy *xp, int dir) { struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int i; int size; int sockaddr_size = pfkey_sockaddr_size(xp->family); int socklen = pfkey_sockaddr_len(xp->family); size = pfkey_xfrm_policy2msg_size(xp); /* call should fill header later */ hdr = skb_put(skb, sizeof(struct sadb_msg)); memset(hdr, 0, size); /* XXX do we need this ? */ /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_s; addr->sadb_address_reserved = 0; if (!pfkey_sockaddr_fill(&xp->selector.saddr, xp->selector.sport, (struct sockaddr *) (addr + 1), xp->family)) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); addr->sadb_address_prefixlen = xp->selector.prefixlen_d; addr->sadb_address_reserved = 0; pfkey_sockaddr_fill(&xp->selector.daddr, xp->selector.dport, (struct sockaddr *) (addr + 1), xp->family); /* hard time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; /* soft time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; /* current time */ lifetime = skb_put(skb, sizeof(struct sadb_lifetime)); lifetime->sadb_lifetime_len = sizeof(struct sadb_lifetime)/sizeof(uint64_t); lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; lifetime->sadb_lifetime_allocations = xp->curlft.packets; lifetime->sadb_lifetime_bytes = xp->curlft.bytes; lifetime->sadb_lifetime_addtime = xp->curlft.add_time; lifetime->sadb_lifetime_usetime = xp->curlft.use_time; pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; if (xp->action == XFRM_POLICY_ALLOW) { if (xp->xfrm_nr) pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; else pol->sadb_x_policy_type = IPSEC_POLICY_NONE; } pol->sadb_x_policy_dir = dir+1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; for (i=0; i<xp->xfrm_nr; i++) { const struct xfrm_tmpl *t = xp->xfrm_vec + i; struct sadb_x_ipsecrequest *rq; int req_size; int mode; req_size = sizeof(struct sadb_x_ipsecrequest); if (t->mode == XFRM_MODE_TUNNEL) { socklen = pfkey_sockaddr_len(t->encap_family); req_size += socklen * 2; } else { size -= 2*socklen; } rq = skb_put(skb, req_size); pol->sadb_x_policy_len += req_size/8; memset(rq, 0, sizeof(*rq)); rq->sadb_x_ipsecrequest_len = req_size; rq->sadb_x_ipsecrequest_proto = t->id.proto; if ((mode = pfkey_mode_from_xfrm(t->mode)) < 0) return -EINVAL; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; if (t->reqid) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; if (t->optional) rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; rq->sadb_x_ipsecrequest_reqid = t->reqid; if (t->mode == XFRM_MODE_TUNNEL) { u8 *sa = (void *)(rq + 1); pfkey_sockaddr_fill(&t->saddr, 0, (struct sockaddr *)sa, t->encap_family); pfkey_sockaddr_fill(&t->id.daddr, 0, (struct sockaddr *) (sa + socklen), t->encap_family); } } /* security context */ if ((xfrm_ctx = xp->security)) { int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); sec_ctx = skb_put(skb, ctx_size); sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = refcount_read(&xp->refcnt); return 0; } static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; if (c->data.byid && c->event == XFRM_MSG_DELPOLICY) out_hdr->sadb_msg_type = SADB_X_SPDDELETE2; else out_hdr->sadb_msg_type = event2poltype(c->event); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) return -EINVAL; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; xp = xfrm_policy_alloc(net, GFP_KERNEL); if (xp == NULL) return -ENOBUFS; xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->priority = pol->sadb_x_policy_priority; sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); xp->selector.family = xp->family; xp->selector.prefixlen_s = sa->sadb_address_prefixlen; xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.sport) xp->selector.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); xp->selector.prefixlen_d = sa->sadb_address_prefixlen; /* Amusing, we set this twice. KAME apps appear to set same value * in both addresses. */ xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (xp->selector.dport) xp->selector.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) { err = -ENOBUFS; goto out; } err = security_xfrm_policy_alloc(&xp->security, uctx, GFP_KERNEL); kfree(uctx); if (err) goto out; } xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; } if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (err = parse_ipsecrequests(xp, pol)) < 0) goto out; err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); xfrm_audit_policy_add(xp, err ? 0 : 1, true); if (err) goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; else c.event = XFRM_MSG_NEWPOLICY; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); xfrm_pol_put(xp); return 0; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return err; } static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); int err; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct xfrm_selector sel; struct km_event c; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *pol_ctx = NULL; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || !ext_hdrs[SADB_X_EXT_POLICY-1]) return -EINVAL; pol = ext_hdrs[SADB_X_EXT_POLICY-1]; if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) return -EINVAL; memset(&sel, 0, sizeof(sel)); sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); sec_ctx = ext_hdrs[SADB_X_EXT_SEC_CTX - 1]; if (sec_ctx != NULL) { struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_KERNEL); if (!uctx) return -ENOMEM; err = security_xfrm_policy_alloc(&pol_ctx, uctx, GFP_KERNEL); kfree(uctx); if (err) return err; } xp = xfrm_policy_bysel_ctx(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, pol->sadb_x_policy_dir - 1, &sel, pol_ctx, 1, &err); security_xfrm_policy_free(pol_ctx); if (xp == NULL) return -ENOENT; xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 0; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); out: xfrm_pol_put(xp); return err; } static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struct sadb_msg *hdr, int dir) { int err; struct sk_buff *out_skb; struct sadb_msg *out_hdr; err = 0; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) { err = PTR_ERR(out_skb); goto out; } err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); goto out; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = hdr->sadb_msg_version; out_hdr->sadb_msg_type = hdr->sadb_msg_type; out_hdr->sadb_msg_satype = 0; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: return err; } static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); } static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, xfrm_address_t *saddr, xfrm_address_t *daddr, u16 *family) { int af, socklen; if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); if (!af) return -EINVAL; socklen = pfkey_sockaddr_len(af); if (pfkey_sockaddr_extract((struct sockaddr *) (((u8 *)sa) + socklen), daddr) != af) return -EINVAL; *family = af; return 0; } #ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate *m) { int err; struct sadb_x_ipsecrequest *rq2; int mode; if (len < sizeof(*rq1) || len < rq1->sadb_x_ipsecrequest_len || rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) return err; rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; if (len <= sizeof(*rq2) || len < rq2->sadb_x_ipsecrequest_len || rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) return err; if (rq1->sadb_x_ipsecrequest_proto != rq2->sadb_x_ipsecrequest_proto || rq1->sadb_x_ipsecrequest_mode != rq2->sadb_x_ipsecrequest_mode || rq1->sadb_x_ipsecrequest_reqid != rq2->sadb_x_ipsecrequest_reqid) return -EINVAL; m->proto = rq1->sadb_x_ipsecrequest_proto; if ((mode = pfkey_mode_to_xfrm(rq1->sadb_x_ipsecrequest_mode)) < 0) return -EINVAL; m->mode = mode; m->reqid = rq1->sadb_x_ipsecrequest_reqid; return ((int)(rq1->sadb_x_ipsecrequest_len + rq2->sadb_x_ipsecrequest_len)); } static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { int i, len, ret, err = -EINVAL; u8 dir; struct sadb_address *sa; struct sadb_x_kmaddress *kma; struct sadb_x_policy *pol; struct sadb_x_ipsecrequest *rq; struct xfrm_selector sel; struct xfrm_migrate m[XFRM_MAX_DEPTH]; struct xfrm_kmaddress k; struct net *net = sock_net(sk); if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC - 1], ext_hdrs[SADB_EXT_ADDRESS_DST - 1]) || !ext_hdrs[SADB_X_EXT_POLICY - 1]) { err = -EINVAL; goto out; } kma = ext_hdrs[SADB_X_EXT_KMADDRESS - 1]; pol = ext_hdrs[SADB_X_EXT_POLICY - 1]; if (pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) { err = -EINVAL; goto out; } if (kma) { /* convert sadb_x_kmaddress to xfrm_kmaddress */ k.reserved = kma->sadb_x_kmaddress_reserved; ret = parse_sockaddr_pair((struct sockaddr *)(kma + 1), 8*(kma->sadb_x_kmaddress_len) - sizeof(*kma), &k.local, &k.remote, &k.family); if (ret < 0) { err = ret; goto out; } } dir = pol->sadb_x_policy_dir - 1; memset(&sel, 0, sizeof(sel)); /* set source address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_SRC - 1]; sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); sel.prefixlen_s = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.sport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.sport) sel.sport_mask = htons(0xffff); /* set destination address info of selector */ sa = ext_hdrs[SADB_EXT_ADDRESS_DST - 1]; pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); sel.prefixlen_d = sa->sadb_address_prefixlen; sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); sel.dport = ((struct sockaddr_in *)(sa + 1))->sin_port; if (sel.dport) sel.dport_mask = htons(0xffff); rq = (struct sadb_x_ipsecrequest *)(pol + 1); /* extract ipsecrequests */ i = 0; len = pol->sadb_x_policy_len * 8 - sizeof(struct sadb_x_policy); while (len > 0 && i < XFRM_MAX_DEPTH) { ret = ipsecrequests_to_migrate(rq, len, &m[i]); if (ret < 0) { err = ret; goto out; } else { rq = (struct sadb_x_ipsecrequest *)((u8 *)rq + ret); len -= ret; i++; } } if (!i || len > 0) { err = -EINVAL; goto out; } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, kma ? &k : NULL, net, NULL, 0, NULL); out: return err; } #else static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { return -ENOPROTOOPT; } #endif static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); unsigned int dir; int err = 0, delete; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) return -EINVAL; dir = xfrm_policy_id2dir(pol->sadb_x_policy_id); if (dir >= XFRM_POLICY_MAX) return -EINVAL; delete = (hdr->sadb_msg_type == SADB_X_SPDDELETE2); xp = xfrm_policy_byid(net, &dummy_mark, 0, XFRM_POLICY_TYPE_MAIN, dir, pol->sadb_x_policy_id, delete, &err); if (xp == NULL) return -ENOENT; if (delete) { xfrm_audit_policy_delete(xp, err ? 0 : 1, true); if (err) goto out; c.seq = hdr->sadb_msg_seq; c.portid = hdr->sadb_msg_pid; c.data.byid = 1; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, dir, &c); } else { err = key_pol_get_resp(sk, xp, hdr, dir); } out: xfrm_pol_put(xp); return err; } static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct pfkey_sock *pfk = ptr; struct sk_buff *out_skb; struct sadb_msg *out_hdr; int err; if (!pfkey_can_dump(&pfk->sk)) return -ENOBUFS; out_skb = pfkey_xfrm_policy2msg_prep(xp); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); err = pfkey_xfrm_policy2msg(out_skb, xp, dir); if (err < 0) { kfree_skb(out_skb); return err; } out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = pfk->dump.msg_version; out_hdr->sadb_msg_type = SADB_X_SPDDUMP; out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = count + 1; out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; return 0; } static int pfkey_dump_sp(struct pfkey_sock *pfk) { struct net *net = sock_net(&pfk->sk); return xfrm_policy_walk(net, &pfk->dump.u.policy, dump_sp, (void *) pfk); } static void pfkey_dump_sp_done(struct pfkey_sock *pfk) { struct net *net = sock_net((struct sock *)pfk); xfrm_policy_walk_done(&pfk->dump.u.policy, net); } static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct pfkey_sock *pfk = pfkey_sk(sk); mutex_lock(&pfk->dump_lock); if (pfk->dump.dump != NULL) { mutex_unlock(&pfk->dump_lock); return -EBUSY; } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sp; pfk->dump.done = pfkey_dump_sp_done; xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } static int key_notify_policy_flush(const struct km_event *c) { struct sk_buff *skb_out; struct sadb_msg *hdr; skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_ATOMIC); if (!skb_out) return -ENOBUFS; hdr = skb_put(skb_out, sizeof(struct sadb_msg)); hdr->sadb_msg_type = SADB_X_SPDFLUSH; hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs) { struct net *net = sock_net(sk); struct km_event c; int err, err2; err = xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, true); err2 = unicast_flush_resp(sk, hdr); if (err || err2) { if (err == -ESRCH) /* empty table - old silent behavior */ return 0; return err; } c.data.type = XFRM_POLICY_TYPE_MAIN; c.event = XFRM_MSG_FLUSHPOLICY; c.portid = hdr->sadb_msg_pid; c.seq = hdr->sadb_msg_seq; c.net = net; km_policy_notify(NULL, 0, &c); return 0; } typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr, void * const *ext_hdrs); static const pfkey_handler pfkey_funcs[SADB_MAX + 1] = { [SADB_RESERVED] = pfkey_reserved, [SADB_GETSPI] = pfkey_getspi, [SADB_UPDATE] = pfkey_add, [SADB_ADD] = pfkey_add, [SADB_DELETE] = pfkey_delete, [SADB_GET] = pfkey_get, [SADB_ACQUIRE] = pfkey_acquire, [SADB_REGISTER] = pfkey_register, [SADB_EXPIRE] = NULL, [SADB_FLUSH] = pfkey_flush, [SADB_DUMP] = pfkey_dump, [SADB_X_PROMISC] = pfkey_promisc, [SADB_X_PCHANGE] = NULL, [SADB_X_SPDUPDATE] = pfkey_spdadd, [SADB_X_SPDADD] = pfkey_spdadd, [SADB_X_SPDDELETE] = pfkey_spddelete, [SADB_X_SPDGET] = pfkey_spdget, [SADB_X_SPDACQUIRE] = NULL, [SADB_X_SPDDUMP] = pfkey_spddump, [SADB_X_SPDFLUSH] = pfkey_spdflush, [SADB_X_SPDSETIDX] = pfkey_spdadd, [SADB_X_SPDDELETE2] = pfkey_spdget, [SADB_X_MIGRATE] = pfkey_migrate, }; static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb_msg *hdr) { void *ext_hdrs[SADB_EXT_MAX]; int err; /* Non-zero return value of pfkey_broadcast() does not always signal * an error and even on an actual error we may still want to process * the message so rather ignore the return value. */ pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); err = parse_exthdrs(skb, hdr, ext_hdrs); if (!err) { err = -EOPNOTSUPP; if (pfkey_funcs[hdr->sadb_msg_type]) err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); } return err; } static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) { struct sadb_msg *hdr = NULL; if (skb->len < sizeof(*hdr)) { *errp = -EMSGSIZE; } else { hdr = (struct sadb_msg *) skb->data; if (hdr->sadb_msg_version != PF_KEY_V2 || hdr->sadb_msg_reserved != 0 || (hdr->sadb_msg_type <= SADB_RESERVED || hdr->sadb_msg_type > SADB_MAX)) { hdr = NULL; *errp = -EINVAL; } else if (hdr->sadb_msg_len != (skb->len / sizeof(uint64_t)) || hdr->sadb_msg_len < (sizeof(struct sadb_msg) / sizeof(uint64_t))) { hdr = NULL; *errp = -EMSGSIZE; } else { *errp = 0; } } return hdr; } static inline int aalg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->aalgos) * 8) return 0; return (t->aalgos >> id) & 1; } static inline int ealg_tmpl_set(const struct xfrm_tmpl *t, const struct xfrm_algo_desc *d) { unsigned int id = d->desc.sadb_alg_id; if (id >= sizeof(t->ealgos) * 8) return 0; return (t->ealgos >> id) & 1; } static int count_ah_combs(const struct xfrm_tmpl *t) { int i, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } return sz + sizeof(struct sadb_prop); } static int count_esp_combs(const struct xfrm_tmpl *t) { int i, k, sz = 0; for (i = 0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg))) continue; for (k = 1; ; k++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg)) sz += sizeof(struct sadb_comb); } } return sz + sizeof(struct sadb_prop); } static int dump_ah_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i = 0; ; i++) { const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (aalg_tmpl_set(t, aalg) && aalg->available) { struct sadb_comb *c; c = skb_put_zero(skb, sizeof(struct sadb_comb)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int dump_esp_combs(struct sk_buff *skb, const struct xfrm_tmpl *t) { struct sadb_prop *p; int sz = 0; int i, k; p = skb_put(skb, sizeof(struct sadb_prop)); p->sadb_prop_len = sizeof(struct sadb_prop)/8; p->sadb_prop_exttype = SADB_EXT_PROPOSAL; p->sadb_prop_replay = 32; memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); for (i=0; ; i++) { const struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); if (!ealg) break; if (!ealg->pfkey_supported) continue; if (!(ealg_tmpl_set(t, ealg) && ealg->available)) continue; for (k = 1; ; k++) { struct sadb_comb *c; const struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); if (!aalg) break; if (!aalg->pfkey_supported) continue; if (!(aalg_tmpl_set(t, aalg) && aalg->available)) continue; c = skb_put(skb, sizeof(struct sadb_comb)); memset(c, 0, sizeof(*c)); p->sadb_prop_len += sizeof(struct sadb_comb)/8; c->sadb_comb_auth = aalg->desc.sadb_alg_id; c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; c->sadb_comb_hard_addtime = 24*60*60; c->sadb_comb_soft_addtime = 20*60*60; c->sadb_comb_hard_usetime = 8*60*60; c->sadb_comb_soft_usetime = 7*60*60; sz += sizeof(*c); } } return sz + sizeof(*p); } static int key_notify_policy_expire(struct xfrm_policy *xp, const struct km_event *c) { return 0; } static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) { struct sk_buff *out_skb; struct sadb_msg *out_hdr; int hard; int hsc; hard = c->data.hard; if (hard) hsc = 2; else hsc = 1; out_skb = pfkey_xfrm_state2msg_expire(x, hsc); if (IS_ERR(out_skb)) return PTR_ERR(out_skb); out_hdr = (struct sadb_msg *) out_skb->data; out_hdr->sadb_msg_version = PF_KEY_V2; out_hdr->sadb_msg_type = SADB_EXPIRE; out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); return 0; } static int pfkey_send_notify(struct xfrm_state *x, const struct km_event *c) { struct net *net = x ? xs_net(x) : c->net; struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); if (atomic_read(&net_pfkey->socks_nr) == 0) return 0; switch (c->event) { case XFRM_MSG_EXPIRE: return key_notify_sa_expire(x, c); case XFRM_MSG_DELSA: case XFRM_MSG_NEWSA: case XFRM_MSG_UPDSA: return key_notify_sa(x, c); case XFRM_MSG_FLUSHSA: return key_notify_sa_flush(c); case XFRM_MSG_NEWAE: /* not yet supported */ break; default: pr_err("pfkey: Unknown SA event %d\n", c->event); break; } return 0; } static int pfkey_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { if (xp && xp->type != XFRM_POLICY_TYPE_MAIN) return 0; switch (c->event) { case XFRM_MSG_POLEXPIRE: return key_notify_policy_expire(xp, c); case XFRM_MSG_DELPOLICY: case XFRM_MSG_NEWPOLICY: case XFRM_MSG_UPDPOLICY: return key_notify_policy(xp, dir, c); case XFRM_MSG_FLUSHPOLICY: if (c->data.type != XFRM_POLICY_TYPE_MAIN) break; return key_notify_policy_flush(c); default: pr_err("pfkey: Unknown policy event %d\n", c->event); break; } return 0; } static u32 get_acqseq(void) { u32 res; static atomic_t acqseq; do { res = atomic_inc_return(&acqseq); } while (!res); return res; } static bool pfkey_is_alive(const struct km_event *c) { struct netns_pfkey *net_pfkey = net_generic(c->net, pfkey_net_id); struct sock *sk; bool is_alive = false; rcu_read_lock(); sk_for_each_rcu(sk, &net_pfkey->table) { if (pfkey_sk(sk)->registered) { is_alive = true; break; } } rcu_read_unlock(); return is_alive; } static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_address *addr; struct sadb_x_policy *pol; int sockaddr_size; int size; struct sadb_x_sec_ctx *sec_ctx; struct xfrm_sec_ctx *xfrm_ctx; int ctx_size = 0; int alg_size = 0; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; size = sizeof(struct sadb_msg) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + sizeof(struct sadb_x_policy); if (x->id.proto == IPPROTO_AH) alg_size = count_ah_combs(t); else if (x->id.proto == IPPROTO_ESP) alg_size = count_esp_combs(t); if ((xfrm_ctx = x->security)) { ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); size += sizeof(struct sadb_x_sec_ctx) + ctx_size; } skb = alloc_skb(size + alg_size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_ACQUIRE; hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq = get_acqseq(); hdr->sadb_msg_pid = 0; /* src address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* dst address */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->id.daddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = XFRM_POLICY_OUT + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = xp->index; pol->sadb_x_policy_priority = xp->priority; /* Set sadb_comb's. */ alg_size = 0; if (x->id.proto == IPPROTO_AH) alg_size = dump_ah_combs(skb, t); else if (x->id.proto == IPPROTO_ESP) alg_size = dump_esp_combs(skb, t); hdr->sadb_msg_len += alg_size / 8; /* security context */ if (xfrm_ctx) { sec_ctx = skb_put(skb, sizeof(struct sadb_x_sec_ctx) + ctx_size); sec_ctx->sadb_x_sec_len = (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, xfrm_ctx->ctx_len); } return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, u8 *data, int len, int *dir) { struct net *net = sock_net(sk); struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; struct sadb_x_sec_ctx *sec_ctx; switch (sk->sk_family) { case AF_INET: if (opt != IP_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: if (opt != IPV6_IPSEC_POLICY) { *dir = -EOPNOTSUPP; return NULL; } break; #endif default: *dir = -EINVAL; return NULL; } *dir = -EINVAL; if (len < sizeof(struct sadb_x_policy) || pol->sadb_x_policy_len*8 > len || pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) return NULL; xp = xfrm_policy_alloc(net, GFP_ATOMIC); if (xp == NULL) { *dir = -ENOBUFS; return NULL; } xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; xp->lft.hard_packet_limit = XFRM_INF; xp->family = sk->sk_family; xp->xfrm_nr = 0; if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; /* security context too */ if (len >= (pol->sadb_x_policy_len*8 + sizeof(struct sadb_x_sec_ctx))) { char *p = (char *)pol; struct xfrm_user_sec_ctx *uctx; p += pol->sadb_x_policy_len*8; sec_ctx = (struct sadb_x_sec_ctx *)p; if (len < pol->sadb_x_policy_len*8 + sec_ctx->sadb_x_sec_len*8) { *dir = -EINVAL; goto out; } if ((*dir = verify_sec_ctx_len(p))) goto out; uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx, GFP_ATOMIC); *dir = security_xfrm_policy_alloc(&xp->security, uctx, GFP_ATOMIC); kfree(uctx); if (*dir) goto out; } *dir = pol->sadb_x_policy_dir-1; return xp; out: xp->walk.dead = 1; xfrm_policy_destroy(xp); return NULL; } static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport) { struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_sa *sa; struct sadb_address *addr; struct sadb_x_nat_t_port *n_port; int sockaddr_size; int size; __u8 satype = (x->id.proto == IPPROTO_ESP ? SADB_SATYPE_ESP : 0); struct xfrm_encap_tmpl *natt = NULL; sockaddr_size = pfkey_sockaddr_size(x->props.family); if (!sockaddr_size) return -EINVAL; if (!satype) return -EINVAL; if (!x->encap) return -EINVAL; natt = x->encap; /* Build an SADB_X_NAT_T_NEW_MAPPING message: * * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) */ size = sizeof(struct sadb_msg) + sizeof(struct sadb_sa) + (sizeof(struct sadb_address) * 2) + (sockaddr_size * 2) + (sizeof(struct sadb_x_nat_t_port) * 2); skb = alloc_skb(size + 16, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; hdr->sadb_msg_satype = satype; hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = x->km.seq; hdr->sadb_msg_pid = 0; /* SA */ sa = skb_put(skb, sizeof(struct sadb_sa)); sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); sa->sadb_sa_exttype = SADB_EXT_SA; sa->sadb_sa_spi = x->id.spi; sa->sadb_sa_replay = 0; sa->sadb_sa_state = 0; sa->sadb_sa_auth = 0; sa->sadb_sa_encrypt = 0; sa->sadb_sa_flags = 0; /* ADDRESS_SRC (old addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(&x->props.saddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_SPORT (old port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; n_port->sadb_x_nat_t_port_port = natt->encap_sport; n_port->sadb_x_nat_t_port_reserved = 0; /* ADDRESS_DST (new addr) */ addr = skb_put(skb, sizeof(struct sadb_address) + sockaddr_size); addr->sadb_address_len = (sizeof(struct sadb_address)+sockaddr_size)/ sizeof(uint64_t); addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; addr->sadb_address_proto = 0; addr->sadb_address_reserved = 0; addr->sadb_address_prefixlen = pfkey_sockaddr_fill(ipaddr, 0, (struct sockaddr *) (addr + 1), x->props.family); if (!addr->sadb_address_prefixlen) BUG(); /* NAT_T_DPORT (new port) */ n_port = skb_put(skb, sizeof(*n_port)); n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE static int set_sadb_address(struct sk_buff *skb, int sasize, int type, const struct xfrm_selector *sel) { struct sadb_address *addr; addr = skb_put(skb, sizeof(struct sadb_address) + sasize); addr->sadb_address_len = (sizeof(struct sadb_address) + sasize)/8; addr->sadb_address_exttype = type; addr->sadb_address_proto = sel->proto; addr->sadb_address_reserved = 0; switch (type) { case SADB_EXT_ADDRESS_SRC: addr->sadb_address_prefixlen = sel->prefixlen_s; pfkey_sockaddr_fill(&sel->saddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; case SADB_EXT_ADDRESS_DST: addr->sadb_address_prefixlen = sel->prefixlen_d; pfkey_sockaddr_fill(&sel->daddr, 0, (struct sockaddr *)(addr + 1), sel->family); break; default: return -EINVAL; } return 0; } static int set_sadb_kmaddress(struct sk_buff *skb, const struct xfrm_kmaddress *k) { struct sadb_x_kmaddress *kma; u8 *sa; int family = k->family; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = (sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(family)); kma = skb_put_zero(skb, size_req); kma->sadb_x_kmaddress_len = size_req / 8; kma->sadb_x_kmaddress_exttype = SADB_X_EXT_KMADDRESS; kma->sadb_x_kmaddress_reserved = k->reserved; sa = (u8 *)(kma + 1); if (!pfkey_sockaddr_fill(&k->local, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(&k->remote, 0, (struct sockaddr *)(sa+socklen), family)) return -EINVAL; return 0; } static int set_ipsecrequest(struct sk_buff *skb, uint8_t proto, uint8_t mode, int level, uint32_t reqid, uint8_t family, const xfrm_address_t *src, const xfrm_address_t *dst) { struct sadb_x_ipsecrequest *rq; u8 *sa; int socklen = pfkey_sockaddr_len(family); int size_req; size_req = sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(family); rq = skb_put_zero(skb, size_req); rq->sadb_x_ipsecrequest_len = size_req; rq->sadb_x_ipsecrequest_proto = proto; rq->sadb_x_ipsecrequest_mode = mode; rq->sadb_x_ipsecrequest_level = level; rq->sadb_x_ipsecrequest_reqid = reqid; sa = (u8 *) (rq + 1); if (!pfkey_sockaddr_fill(src, 0, (struct sockaddr *)sa, family) || !pfkey_sockaddr_fill(dst, 0, (struct sockaddr *)(sa + socklen), family)) return -EINVAL; return 0; } #endif #ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { int i; int sasize_sel; int size = 0; int size_pol = 0; struct sk_buff *skb; struct sadb_msg *hdr; struct sadb_x_policy *pol; const struct xfrm_migrate *mp; if (type != XFRM_POLICY_TYPE_MAIN) return 0; if (num_bundles <= 0 || num_bundles > XFRM_MAX_DEPTH) return -EINVAL; if (k != NULL) { /* addresses for KM */ size += PFKEY_ALIGN8(sizeof(struct sadb_x_kmaddress) + pfkey_sockaddr_pair_size(k->family)); } /* selector */ sasize_sel = pfkey_sockaddr_size(sel->family); if (!sasize_sel) return -EINVAL; size += (sizeof(struct sadb_address) + sasize_sel) * 2; /* policy info */ size_pol += sizeof(struct sadb_x_policy); /* ipsecrequests */ for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->old_family); /* new locator pair */ size_pol += sizeof(struct sadb_x_ipsecrequest) + pfkey_sockaddr_pair_size(mp->new_family); } size += sizeof(struct sadb_msg) + size_pol; /* alloc buffer */ skb = alloc_skb(size, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; hdr = skb_put(skb, sizeof(struct sadb_msg)); hdr->sadb_msg_version = PF_KEY_V2; hdr->sadb_msg_type = SADB_X_MIGRATE; hdr->sadb_msg_satype = pfkey_proto2satype(m->proto); hdr->sadb_msg_len = size / 8; hdr->sadb_msg_errno = 0; hdr->sadb_msg_reserved = 0; hdr->sadb_msg_seq = 0; hdr->sadb_msg_pid = 0; /* Addresses to be used by KM for negotiation, if ext is available */ if (k != NULL && (set_sadb_kmaddress(skb, k) < 0)) goto err; /* selector src */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_SRC, sel); /* selector dst */ set_sadb_address(skb, sasize_sel, SADB_EXT_ADDRESS_DST, sel); /* policy information */ pol = skb_put(skb, sizeof(struct sadb_x_policy)); pol->sadb_x_policy_len = size_pol / 8; pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; pol->sadb_x_policy_dir = dir + 1; pol->sadb_x_policy_reserved = 0; pol->sadb_x_policy_id = 0; pol->sadb_x_policy_priority = 0; for (i = 0, mp = m; i < num_bundles; i++, mp++) { /* old ipsecrequest */ int mode = pfkey_mode_from_xfrm(mp->mode); if (mode < 0) goto err; if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->old_family, &mp->old_saddr, &mp->old_daddr) < 0) goto err; /* new ipsecrequest */ if (set_ipsecrequest(skb, mp->proto, mode, (mp->reqid ? IPSEC_LEVEL_UNIQUE : IPSEC_LEVEL_REQUIRE), mp->reqid, mp->new_family, &mp->new_saddr, &mp->new_daddr) < 0) goto err; } /* broadcast migrate message to sockets */ pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; err: kfree_skb(skb); return -EINVAL; } #else static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap) { return -ENOPROTOOPT; } #endif static int pfkey_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sadb_msg *hdr = NULL; int err; struct net *net = sock_net(sk); err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) goto out; err = -EMSGSIZE; if ((unsigned int)len > sk->sk_sndbuf - 32) goto out; err = -ENOBUFS; skb = alloc_skb(len, GFP_KERNEL); if (skb == NULL) goto out; err = -EFAULT; if (memcpy_from_msg(skb_put(skb,len), msg, len)) goto out; hdr = pfkey_get_base_msg(skb, &err); if (!hdr) goto out; mutex_lock(&net->xfrm.xfrm_cfg_mutex); err = pfkey_process(sk, skb, hdr); mutex_unlock(&net->xfrm.xfrm_cfg_mutex); out: if (err && hdr && pfkey_error(hdr, err, sk) == 0) err = 0; kfree_skb(skb); return err ? : len; } static int pfkey_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags) { struct sock *sk = sock->sk; struct pfkey_sock *pfk = pfkey_sk(sk); struct sk_buff *skb; int copied, err; err = -EINVAL; if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) goto out; skb = skb_recv_datagram(sk, flags, &err); if (skb == NULL) goto out; copied = skb->len; if (copied > len) { msg->msg_flags |= MSG_TRUNC; copied = len; } skb_reset_transport_header(skb); err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; sock_recv_cmsgs(msg, sk, skb); err = (flags & MSG_TRUNC) ? skb->len : copied; if (pfk->dump.dump != NULL && 3 * atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) pfkey_do_dump(pfk); out_free: skb_free_datagram(sk, skb); out: return err; } static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ .bind = sock_no_bind, .connect = sock_no_connect, .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = sock_no_getname, .ioctl = sock_no_ioctl, .listen = sock_no_listen, .shutdown = sock_no_shutdown, .mmap = sock_no_mmap, /* Now the operations that really occur. */ .release = pfkey_release, .poll = datagram_poll, .sendmsg = pfkey_sendmsg, .recvmsg = pfkey_recvmsg, }; static const struct net_proto_family pfkey_family_ops = { .family = PF_KEY, .create = pfkey_create, .owner = THIS_MODULE, }; #ifdef CONFIG_PROC_FS static int pfkey_seq_show(struct seq_file *f, void *v) { struct sock *s = sk_entry(v); if (v == SEQ_START_TOKEN) seq_printf(f ,"sk RefCnt Rmem Wmem User Inode\n"); else seq_printf(f, "%pK %-6d %-6u %-6u %-6u %-6lu\n", s, refcount_read(&s->sk_refcnt), sk_rmem_alloc_get(s), sk_wmem_alloc_get(s), from_kuid_munged(seq_user_ns(f), sock_i_uid(s)), sock_i_ino(s) ); return 0; } static void *pfkey_seq_start(struct seq_file *f, loff_t *ppos) __acquires(rcu) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); rcu_read_lock(); return seq_hlist_start_head_rcu(&net_pfkey->table, *ppos); } static void *pfkey_seq_next(struct seq_file *f, void *v, loff_t *ppos) { struct net *net = seq_file_net(f); struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); return seq_hlist_next_rcu(v, &net_pfkey->table, ppos); } static void pfkey_seq_stop(struct seq_file *f, void *v) __releases(rcu) { rcu_read_unlock(); } static const struct seq_operations pfkey_seq_ops = { .start = pfkey_seq_start, .next = pfkey_seq_next, .stop = pfkey_seq_stop, .show = pfkey_seq_show, }; static int __net_init pfkey_init_proc(struct net *net) { struct proc_dir_entry *e; e = proc_create_net("pfkey", 0, net->proc_net, &pfkey_seq_ops, sizeof(struct seq_net_private)); if (e == NULL) return -ENOMEM; return 0; } static void __net_exit pfkey_exit_proc(struct net *net) { remove_proc_entry("pfkey", net->proc_net); } #else static inline int pfkey_init_proc(struct net *net) { return 0; } static inline void pfkey_exit_proc(struct net *net) { } #endif static struct xfrm_mgr pfkeyv2_mgr = { .notify = pfkey_send_notify, .acquire = pfkey_send_acquire, .compile_policy = pfkey_compile_policy, .new_mapping = pfkey_send_new_mapping, .notify_policy = pfkey_send_policy_notify, .migrate = pfkey_send_migrate, .is_alive = pfkey_is_alive, }; static int __net_init pfkey_net_init(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); int rv; INIT_HLIST_HEAD(&net_pfkey->table); atomic_set(&net_pfkey->socks_nr, 0); rv = pfkey_init_proc(net); return rv; } static void __net_exit pfkey_net_exit(struct net *net) { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); pfkey_exit_proc(net); WARN_ON(!hlist_empty(&net_pfkey->table)); } static struct pernet_operations pfkey_net_ops = { .init = pfkey_net_init, .exit = pfkey_net_exit, .id = &pfkey_net_id, .size = sizeof(struct netns_pfkey), }; static void __exit ipsec_pfkey_exit(void) { xfrm_unregister_km(&pfkeyv2_mgr); sock_unregister(PF_KEY); unregister_pernet_subsys(&pfkey_net_ops); proto_unregister(&key_proto); } static int __init ipsec_pfkey_init(void) { int err = proto_register(&key_proto, 0); if (err != 0) goto out; err = register_pernet_subsys(&pfkey_net_ops); if (err != 0) goto out_unregister_key_proto; err = sock_register(&pfkey_family_ops); if (err != 0) goto out_unregister_pernet; xfrm_register_km(&pfkeyv2_mgr); out: return err; out_unregister_pernet: unregister_pernet_subsys(&pfkey_net_ops); out_unregister_key_proto: proto_unregister(&key_proto); goto out; } module_init(ipsec_pfkey_init); module_exit(ipsec_pfkey_exit); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_KEY);
6 5 6 3 2 6 7 6 6 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 #undef TRACE_SYSTEM #define TRACE_SYSTEM rtc #if !defined(_TRACE_RTC_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_RTC_H #include <linux/rtc.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(rtc_time_alarm_class, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err), TP_STRUCT__entry( __field(time64_t, secs) __field(int, err) ), TP_fast_assign( __entry->secs = secs; __entry->err = err; ), TP_printk("UTC (%lld) (%d)", __entry->secs, __entry->err ) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_set_time, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_read_time, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_set_alarm, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); DEFINE_EVENT(rtc_time_alarm_class, rtc_read_alarm, TP_PROTO(time64_t secs, int err), TP_ARGS(secs, err) ); TRACE_EVENT(rtc_irq_set_freq, TP_PROTO(int freq, int err), TP_ARGS(freq, err), TP_STRUCT__entry( __field(int, freq) __field(int, err) ), TP_fast_assign( __entry->freq = freq; __entry->err = err; ), TP_printk("set RTC periodic IRQ frequency:%u (%d)", __entry->freq, __entry->err ) ); TRACE_EVENT(rtc_irq_set_state, TP_PROTO(int enabled, int err), TP_ARGS(enabled, err), TP_STRUCT__entry( __field(int, enabled) __field(int, err) ), TP_fast_assign( __entry->enabled = enabled; __entry->err = err; ), TP_printk("%s RTC 2^N Hz periodic IRQs (%d)", __entry->enabled ? "enable" : "disable", __entry->err ) ); TRACE_EVENT(rtc_alarm_irq_enable, TP_PROTO(unsigned int enabled, int err), TP_ARGS(enabled, err), TP_STRUCT__entry( __field(unsigned int, enabled) __field(int, err) ), TP_fast_assign( __entry->enabled = enabled; __entry->err = err; ), TP_printk("%s RTC alarm IRQ (%d)", __entry->enabled ? "enable" : "disable", __entry->err ) ); DECLARE_EVENT_CLASS(rtc_offset_class, TP_PROTO(long offset, int err), TP_ARGS(offset, err), TP_STRUCT__entry( __field(long, offset) __field(int, err) ), TP_fast_assign( __entry->offset = offset; __entry->err = err; ), TP_printk("RTC offset: %ld (%d)", __entry->offset, __entry->err ) ); DEFINE_EVENT(rtc_offset_class, rtc_set_offset, TP_PROTO(long offset, int err), TP_ARGS(offset, err) ); DEFINE_EVENT(rtc_offset_class, rtc_read_offset, TP_PROTO(long offset, int err), TP_ARGS(offset, err) ); DECLARE_EVENT_CLASS(rtc_timer_class, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer), TP_STRUCT__entry( __field(struct rtc_timer *, timer) __field(ktime_t, expires) __field(ktime_t, period) ), TP_fast_assign( __entry->timer = timer; __entry->expires = timer->node.expires; __entry->period = timer->period; ), TP_printk("RTC timer:(%p) expires:%lld period:%lld", __entry->timer, __entry->expires, __entry->period ) ); DEFINE_EVENT(rtc_timer_class, rtc_timer_enqueue, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer) ); DEFINE_EVENT(rtc_timer_class, rtc_timer_dequeue, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer) ); DEFINE_EVENT(rtc_timer_class, rtc_timer_fired, TP_PROTO(struct rtc_timer *timer), TP_ARGS(timer) ); #endif /* _TRACE_RTC_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 2 2 2 3 3 2 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 /* * net/tipc/diag.c: TIPC socket diag * * Copyright (c) 2018, Ericsson AB * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "ASIS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "socket.h" #include <linux/sock_diag.h> #include <linux/tipc_sockets_diag.h> static u64 __tipc_diag_gen_cookie(struct sock *sk) { u32 res[2]; sock_diag_save_cookie(sk, res); return *((u64 *)res); } static int __tipc_add_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, struct tipc_sock *tsk) { struct tipc_sock_diag_req *req = nlmsg_data(cb->nlh); struct nlmsghdr *nlh; int err; nlh = nlmsg_put_answer(skb, cb, SOCK_DIAG_BY_FAMILY, 0, NLM_F_MULTI); if (!nlh) return -EMSGSIZE; err = tipc_sk_fill_sock_diag(skb, cb, tsk, req->tidiag_states, __tipc_diag_gen_cookie); if (err) return err; nlmsg_end(skb, nlh); return 0; } static int tipc_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) { return tipc_nl_sk_walk(skb, cb, __tipc_add_sock_diag); } static int tipc_sock_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) { int hdrlen = sizeof(struct tipc_sock_diag_req); struct net *net = sock_net(skb->sk); if (nlmsg_len(h) < hdrlen) return -EINVAL; if (h->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .start = tipc_dump_start, .dump = tipc_diag_dump, .done = tipc_dump_done, }; netlink_dump_start(net->diag_nlsk, skb, h, &c); return 0; } return -EOPNOTSUPP; } static const struct sock_diag_handler tipc_sock_diag_handler = { .family = AF_TIPC, .dump = tipc_sock_diag_handler_dump, }; static int __init tipc_diag_init(void) { return sock_diag_register(&tipc_sock_diag_handler); } static void __exit tipc_diag_exit(void) { sock_diag_unregister(&tipc_sock_diag_handler); } module_init(tipc_diag_init); module_exit(tipc_diag_exit); MODULE_LICENSE("Dual BSD/GPL"); MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_TIPC);
81 4196 9 2 16795 4 49 49 49 2322 6 18814 91 18819 1 851 15 1 851 11 156 2898 576 18 800 3792 4081 3900 1164 531 3 205 15 2255 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_FORTIFY_STRING_H_ #define _LINUX_FORTIFY_STRING_H_ #include <linux/bug.h> #include <linux/const.h> #include <linux/limits.h> #define __FORTIFY_INLINE extern __always_inline __gnu_inline __overloadable #define __RENAME(x) __asm__(#x) void fortify_panic(const char *name) __noreturn __cold; void __read_overflow(void) __compiletime_error("detected read beyond size of object (1st parameter)"); void __read_overflow2(void) __compiletime_error("detected read beyond size of object (2nd parameter)"); void __read_overflow2_field(size_t avail, size_t wanted) __compiletime_warning("detected read beyond size of field (2nd parameter); maybe use struct_group()?"); void __write_overflow(void) __compiletime_error("detected write beyond size of object (1st parameter)"); void __write_overflow_field(size_t avail, size_t wanted) __compiletime_warning("detected write beyond size of field (1st parameter); maybe use struct_group()?"); #define __compiletime_strlen(p) \ ({ \ char *__p = (char *)(p); \ size_t __ret = SIZE_MAX; \ const size_t __p_size = __member_size(p); \ if (__p_size != SIZE_MAX && \ __builtin_constant_p(*__p)) { \ size_t __p_len = __p_size - 1; \ if (__builtin_constant_p(__p[__p_len]) && \ __p[__p_len] == '\0') \ __ret = __builtin_strlen(__p); \ } \ __ret; \ }) #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) extern void *__underlying_memchr(const void *p, int c, __kernel_size_t size) __RENAME(memchr); extern int __underlying_memcmp(const void *p, const void *q, __kernel_size_t size) __RENAME(memcmp); extern void *__underlying_memcpy(void *p, const void *q, __kernel_size_t size) __RENAME(memcpy); extern void *__underlying_memmove(void *p, const void *q, __kernel_size_t size) __RENAME(memmove); extern void *__underlying_memset(void *p, int c, __kernel_size_t size) __RENAME(memset); extern char *__underlying_strcat(char *p, const char *q) __RENAME(strcat); extern char *__underlying_strcpy(char *p, const char *q) __RENAME(strcpy); extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else #if defined(__SANITIZE_MEMORY__) /* * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the * corresponding __msan_XXX functions. */ #include <linux/kmsan_string.h> #define __underlying_memcpy __msan_memcpy #define __underlying_memmove __msan_memmove #define __underlying_memset __msan_memset #else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset #endif #define __underlying_memchr __builtin_memchr #define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen #define __underlying_strncat __builtin_strncat #define __underlying_strncpy __builtin_strncpy #endif /** * unsafe_memcpy - memcpy implementation with no FORTIFY bounds checking * * @dst: Destination memory address to write to * @src: Source memory address to read from * @bytes: How many bytes to write to @dst from @src * @justification: Free-form text or comment describing why the use is needed * * This should be used for corner cases where the compiler cannot do the * right thing, or during transitions between APIs, etc. It should be used * very rarely, and includes a place for justification detailing where bounds * checking has happened, and why existing solutions cannot be employed. */ #define unsafe_memcpy(dst, src, bytes, justification) \ __underlying_memcpy(dst, src, bytes) /* * Clang's use of __builtin_*object_size() within inlines needs hinting via * __pass_*object_size(). The preference is to only ever use type 1 (member * size, rather than struct size), but there remain some stragglers using * type 0 that will be converted in the future. */ #if __has_builtin(__builtin_dynamic_object_size) #define POS __pass_dynamic_object_size(1) #define POS0 __pass_dynamic_object_size(0) #else #define POS __pass_object_size(1) #define POS0 __pass_object_size(0) #endif #define __compiletime_lessthan(bounds, length) ( \ __builtin_constant_p((bounds) < (length)) && \ (bounds) < (length) \ ) /** * strncpy - Copy a string to memory with non-guaranteed NUL padding * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will stop after @size bytes, * and @p will NOT be NUL-terminated * * If strlen(@q) < @size, following the copy of @q, trailing NUL bytes * will be written to @p until @size total bytes have been written. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads of @q, it cannot defend against writing unterminated * results to @p. Using strncpy() remains ambiguous and fragile. * Instead, please choose an alternative, so that the expectation * of @p's contents is unambiguous: * * +--------------------+--------------------+------------+ * | **p** needs to be: | padded to **size** | not padded | * +====================+====================+============+ * | NUL-terminated | strscpy_pad() | strscpy() | * +--------------------+--------------------+------------+ * | not NUL-terminated | strtomem_pad() | strtomem() | * +--------------------+--------------------+------------+ * * Note strscpy*()'s differing return values for detecting truncation, * and strtomem*()'s expectation that the destination is marked with * __nonstring when it is a character array. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strncpy, 1, 2, 3) char *strncpy(char * const POS p, const char *q, __kernel_size_t size) { const size_t p_size = __member_size(p); if (__compiletime_lessthan(p_size, size)) __write_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_strncpy(p, q, size); } extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); /** * strnlen - Return bounded count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * @maxlen: maximum number of characters to count. * * Returns number of characters in @p (NOT including the final NUL), or * @maxlen, if no NUL has been found up to there. * */ __FORTIFY_INLINE __kernel_size_t strnlen(const char * const POS p, __kernel_size_t maxlen) { const size_t p_size = __member_size(p); const size_t p_len = __compiletime_strlen(p); size_t ret; /* We can take compile-time actions when maxlen is const. */ if (__builtin_constant_p(maxlen) && p_len != SIZE_MAX) { /* If p is const, we can use its compile-time-known len. */ if (maxlen >= p_size) return p_len; } /* Do not check characters beyond the end of p. */ ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); if (p_size <= ret && maxlen != ret) fortify_panic(__func__); return ret; } /* * Defined after fortified strnlen to reuse it. However, it must still be * possible for strlen() to be used on compile-time strings for use in * static initializers (i.e. as a constant expression). */ /** * strlen - Return count of characters in a NUL-terminated string * * @p: pointer to NUL-terminated string to count. * * Do not use this function unless the string length is known at * compile-time. When @p is unterminated, this function may crash * or return unexpected counts that could lead to memory content * exposures. Prefer strnlen(). * * Returns number of characters in @p (NOT including the final NUL). * */ #define strlen(p) \ __builtin_choose_expr(__is_constexpr(__builtin_strlen(p)), \ __builtin_strlen(p), __fortify_strlen(p)) __FORTIFY_INLINE __diagnose_as(__builtin_strlen, 1) __kernel_size_t __fortify_strlen(const char * const POS p) { const size_t p_size = __member_size(p); __kernel_size_t ret; /* Give up if we don't know how large p is. */ if (p_size == SIZE_MAX) return __underlying_strlen(p); ret = strnlen(p, p_size); if (p_size <= ret) fortify_panic(__func__); return ret; } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); /** * strlcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * @size: maximum number of bytes to write at @p * * If strlen(@q) >= @size, the copy of @q will be truncated at * @size - 1 bytes. @p will always be NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * over-reads when calculating strlen(@q), it is still possible. * Prefer strscpy(), though note its different return values for * detecting truncation. * * Returns total number of bytes written to @p, including terminating NUL. * */ __FORTIFY_INLINE size_t strlcpy(char * const POS p, const char * const POS q, size_t size) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t q_len; /* Full count of source string length. */ size_t len; /* Count of characters going into destination. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcpy(p, q, size); q_len = strlen(q); len = (q_len >= size) ? size - 1 : q_len; if (__builtin_constant_p(size) && __builtin_constant_p(q_len) && size) { /* Write size is always larger than destination. */ if (len >= p_size) __write_overflow(); } if (size) { if (len >= p_size) fortify_panic(__func__); __underlying_memcpy(p, q, len); p[len] = '\0'; } return q_len; } /* Defined after fortified strnlen() to reuse it. */ extern ssize_t __real_strscpy(char *, const char *, size_t) __RENAME(strscpy); /** * strscpy - Copy a C-string into a sized buffer * * @p: Where to copy the string to * @q: Where to copy the string from * @size: Size of destination buffer * * Copy the source string @q, or as much of it as fits, into the destination * @p buffer. The behavior is undefined if the string buffers overlap. The * destination @p buffer is always NUL terminated, unless it's zero-sized. * * Preferred to strlcpy() since the API doesn't require reading memory * from the source @q string beyond the specified @size bytes, and since * the return value is easier to error-check than strlcpy()'s. * In addition, the implementation is robust to the string changing out * from underneath it, unlike the current strlcpy() implementation. * * Preferred to strncpy() since it always returns a valid string, and * doesn't unnecessarily force the tail of the destination buffer to be * zero padded. If padding is desired please use strscpy_pad(). * * Returns the number of characters copied in @p (not including the * trailing %NUL) or -E2BIG if @size is 0 or the copy of @q was truncated. */ __FORTIFY_INLINE ssize_t strscpy(char * const POS p, const char * const POS q, size_t size) { /* Use string size rather than possible enclosing struct size. */ const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t len; /* If we cannot get size of p and q default to call strscpy. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strscpy(p, q, size); /* * If size can be known at compile time and is greater than * p_size, generate a compile time write overflow error. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Short-circuit for compile-time known-safe lengths. */ if (__compiletime_lessthan(p_size, SIZE_MAX)) { len = __compiletime_strlen(q); if (len < SIZE_MAX && __compiletime_lessthan(len, size)) { __underlying_memcpy(p, q, len + 1); return len; } } /* * This call protects from read overflow, because len will default to q * length if it smaller than size. */ len = strnlen(q, size); /* * If len equals size, we will copy only size bytes which leads to * -E2BIG being returned. * Otherwise we will copy len + 1 because of the final '\O'. */ len = len == size ? size : len + 1; /* * Generate a runtime write overflow error if len is greater than * p_size. */ if (len > p_size) fortify_panic(__func__); /* * We can now safely call vanilla strscpy because we are protected from: * 1. Read overflow thanks to call to strnlen(). * 2. Write overflow thanks to above ifs. */ return __real_strscpy(p, q, len); } /* Defined after fortified strlen() to reuse it. */ extern size_t __real_strlcat(char *p, const char *q, size_t avail) __RENAME(strlcat); /** * strlcat - Append a string to an existing string * * @p: pointer to %NUL-terminated string to append to * @q: pointer to %NUL-terminated string to append from * @avail: Maximum bytes available in @p * * Appends %NUL-terminated string @q after the %NUL-terminated * string at @p, but will not write beyond @avail bytes total, * potentially truncating the copy from @q. @p will stay * %NUL-terminated only if a %NUL already existed within * the @avail bytes of @p. If so, the resulting number of * bytes copied from @q will be at most "@avail - strlen(@p) - 1". * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf(), seq_buf, or similar. * * Returns total bytes that _would_ have been contained by @p * regardless of truncation, similar to snprintf(). If return * value is >= @avail, the string has been truncated. * */ __FORTIFY_INLINE size_t strlcat(char * const POS p, const char * const POS q, size_t avail) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; size_t actual, wanted; /* Give up immediately if both buffer sizes are unknown. */ if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __real_strlcat(p, q, avail); p_len = strnlen(p, avail); copy_len = strlen(q); wanted = actual = p_len + copy_len; /* Cannot append any more: report truncation. */ if (avail <= p_len) return wanted; /* Give up if string is already overflowed. */ if (p_size <= p_len) fortify_panic(__func__); if (actual >= avail) { copy_len = avail - p_len - 1; actual = p_len + copy_len; } /* Give up if copy will overflow. */ if (p_size <= actual) fortify_panic(__func__); __underlying_memcpy(p + p_len, q, copy_len); p[actual] = '\0'; return wanted; } /* Defined after fortified strlcat() to reuse it. */ /** * strcat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to NUL-terminated source string to append from * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the * destination buffer size is known to the compiler. Prefer * building the string with formatting, via scnprintf() or similar. * At the very least, use strncat(). * * Returns @p. * */ __FORTIFY_INLINE __diagnose_as(__builtin_strcat, 1, 2) char *strcat(char * const POS p, const char *q) { const size_t p_size = __member_size(p); if (strlcat(p, q, p_size) >= p_size) fortify_panic(__func__); return p; } /** * strncat - Append a string to an existing string * * @p: pointer to NUL-terminated string to append to * @q: pointer to source string to append from * @count: Maximum bytes to read from @q * * Appends at most @count bytes from @q (stopping at the first * NUL byte) after the NUL-terminated string at @p. @p will be * NUL-terminated. * * Do not use this function. While FORTIFY_SOURCE tries to avoid * read and write overflows, this is only possible when the sizes * of @p and @q are known to the compiler. Prefer building the * string with formatting, via scnprintf() or similar. * * Returns @p. * */ /* Defined after fortified strlen() and strnlen() to reuse them. */ __FORTIFY_INLINE __diagnose_as(__builtin_strncat, 1, 2, 3) char *strncat(char * const POS p, const char * const POS q, __kernel_size_t count) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t p_len, copy_len; if (p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strncat(p, q, count); p_len = strlen(p); copy_len = strnlen(q, count); if (p_size < p_len + copy_len + 1) fortify_panic(__func__); __underlying_memcpy(p + p_len, q, copy_len); p[p_len + copy_len] = '\0'; return p; } __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, const size_t p_size, const size_t p_size_field) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); /* Warn when write size is larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if (p_size != SIZE_MAX && p_size < size) fortify_panic("memset"); } #define __fortify_memset_chk(p, c, size, p_size, p_size_field) ({ \ size_t __fortify_size = (size_t)(size); \ fortify_memset_chk(__fortify_size, p_size, p_size_field), \ __underlying_memset(p, c, __fortify_size); \ }) /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __struct_size(p), __member_size(p)) #endif /* * To make sure the compiler can enforce protection against buffer overflows, * memcpy(), memmove(), and memset() must not be used beyond individual * struct members. If you need to copy across multiple members, please use * struct_group() to create a named mirror of an anonymous struct union. * (e.g. see struct sk_buff.) Read overflow checking is currently only * done when a write overflow is also present, or when building with W=1. * * Mitigation coverage matrix * Bounds checking at: * +-------+-------+-------+-------+ * | Compile time | Run time | * memcpy() argument sizes: | write | read | write | read | * dest source length +-------+-------+-------+-------+ * memcpy(known, known, constant) | y | y | n/a | n/a | * memcpy(known, unknown, constant) | y | n | n/a | V | * memcpy(known, known, dynamic) | n | n | B | B | * memcpy(known, unknown, dynamic) | n | n | B | V | * memcpy(unknown, known, constant) | n | y | V | n/a | * memcpy(unknown, unknown, constant) | n | n | V | V | * memcpy(unknown, known, dynamic) | n | n | V | B | * memcpy(unknown, unknown, dynamic) | n | n | V | V | * +-------+-------+-------+-------+ * * y = perform deterministic compile-time bounds checking * n = cannot perform deterministic compile-time bounds checking * n/a = no run-time bounds checking needed since compile-time deterministic * B = can perform run-time bounds checking (currently unimplemented) * V = vulnerable to run-time overflow (will need refactoring to solve) * */ __FORTIFY_INLINE bool fortify_memcpy_chk(__kernel_size_t size, const size_t p_size, const size_t q_size, const size_t p_size_field, const size_t q_size_field, const char *func) { if (__builtin_constant_p(size)) { /* * Length argument is a constant expression, so we * can perform compile-time bounds checking where * buffer sizes are also known at compile time. */ /* Error when size is larger than enclosing struct. */ if (__compiletime_lessthan(p_size_field, p_size) && __compiletime_lessthan(p_size, size)) __write_overflow(); if (__compiletime_lessthan(q_size_field, q_size) && __compiletime_lessthan(q_size, size)) __read_overflow2(); /* Warn when write size argument larger than dest field. */ if (__compiletime_lessthan(p_size_field, size)) __write_overflow_field(p_size_field, size); /* * Warn for source field over-read when building with W=1 * or when an over-write happened, so both can be fixed at * the same time. */ if ((IS_ENABLED(KBUILD_EXTRA_WARN1) || __compiletime_lessthan(p_size_field, size)) && __compiletime_lessthan(q_size_field, size)) __read_overflow2_field(q_size_field, size); } /* * At this point, length argument may not be a constant expression, * so run-time bounds checking can be done where buffer sizes are * known. (This is not an "else" because the above checks may only * be compile-time warnings, and we want to still warn for run-time * overflows.) */ /* * Always stop accesses beyond the struct that contains the * field, when the buffer's remaining size is known. * (The SIZE_MAX test is to optimize away checks where the buffer * lengths are unknown.) */ if ((p_size != SIZE_MAX && p_size < size) || (q_size != SIZE_MAX && q_size < size)) fortify_panic(func); /* * Warn when writing beyond destination field size. * * We must ignore p_size_field == 0 for existing 0-element * fake flexible arrays, until they are all converted to * proper flexible arrays. * * The implementation of __builtin_*object_size() behaves * like sizeof() when not directly referencing a flexible * array member, which means there will be many bounds checks * that will appear at run-time, without a way for them to be * detected at compile-time (as can be done when the destination * is specifically the flexible array member). * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101832 */ if (p_size_field != 0 && p_size_field != SIZE_MAX && p_size != p_size_field && p_size_field < size) return true; return false; } #define __fortify_memcpy_chk(p, q, size, p_size, q_size, \ p_size_field, q_size_field, op) ({ \ const size_t __fortify_size = (size_t)(size); \ const size_t __p_size = (p_size); \ const size_t __q_size = (q_size); \ const size_t __p_size_field = (p_size_field); \ const size_t __q_size_field = (q_size_field); \ WARN_ONCE(fortify_memcpy_chk(__fortify_size, __p_size, \ __q_size, __p_size_field, \ __q_size_field, #op), \ #op ": detected field-spanning write (size %zu) of single %s (size %zu)\n", \ __fortify_size, \ "field \"" #p "\" at " FILE_LINE, \ __p_size_field); \ __underlying_##op(p, q, __fortify_size); \ }) /* * Notes about compile-time buffer size detection: * * With these types... * * struct middle { * u16 a; * u8 middle_buf[16]; * int b; * }; * struct end { * u16 a; * u8 end_buf[16]; * }; * struct flex { * int a; * u8 flex_buf[]; * }; * * void func(TYPE *ptr) { ... } * * Cases where destination size cannot be currently detected: * - the size of ptr's object (seemingly by design, gcc & clang fail): * __builtin_object_size(ptr, 1) == SIZE_MAX * - the size of flexible arrays in ptr's obj (by design, dynamic size): * __builtin_object_size(ptr->flex_buf, 1) == SIZE_MAX * - the size of ANY array at the end of ptr's obj (gcc and clang bug): * __builtin_object_size(ptr->end_buf, 1) == SIZE_MAX * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101836 * * Cases where destination size is currently detected: * - the size of non-array members within ptr's object: * __builtin_object_size(ptr->a, 1) == 2 * - the size of non-flexible-array in the middle of ptr's obj: * __builtin_object_size(ptr->middle_buf, 1) == 16 * */ /* * __struct_size() vs __member_size() must be captured here to avoid * evaluating argument side-effects further into the macro layers. */ #define memcpy(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memcpy) #define memmove(p, q, s) __fortify_memcpy_chk(p, q, s, \ __struct_size(p), __struct_size(q), \ __member_size(p), __member_size(q), \ memmove) extern void *__real_memscan(void *, int, __kernel_size_t) __RENAME(memscan); __FORTIFY_INLINE void *memscan(void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_memscan(p, c, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memcmp, 1, 2, 3) int memcmp(const void * const POS0 p, const void * const POS0 q, __kernel_size_t size) { const size_t p_size = __struct_size(p); const size_t q_size = __struct_size(q); if (__builtin_constant_p(size)) { if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (__compiletime_lessthan(q_size, size)) __read_overflow2(); } if (p_size < size || q_size < size) fortify_panic(__func__); return __underlying_memcmp(p, q, size); } __FORTIFY_INLINE __diagnose_as(__builtin_memchr, 1, 2, 3) void *memchr(const void * const POS0 p, int c, __kernel_size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __underlying_memchr(p, c, size); } void *__real_memchr_inv(const void *s, int c, size_t n) __RENAME(memchr_inv); __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_memchr_inv(p, c, size); } extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) __realloc_size(2); __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) { const size_t p_size = __struct_size(p); if (__compiletime_lessthan(p_size, size)) __read_overflow(); if (p_size < size) fortify_panic(__func__); return __real_kmemdup(p, size, gfp); } /** * strcpy - Copy a string into another string buffer * * @p: pointer to destination of copy * @q: pointer to NUL-terminated source string to copy * * Do not use this function. While FORTIFY_SOURCE tries to avoid * overflows, this is only possible when the sizes of @q and @p are * known to the compiler. Prefer strscpy(), though note its different * return values for detecting truncation. * * Returns @p. * */ /* Defined after fortified strlen to reuse it. */ __FORTIFY_INLINE __diagnose_as(__builtin_strcpy, 1, 2) char *strcpy(char * const POS p, const char * const POS q) { const size_t p_size = __member_size(p); const size_t q_size = __member_size(q); size_t size; /* If neither buffer size is known, immediately give up. */ if (__builtin_constant_p(p_size) && __builtin_constant_p(q_size) && p_size == SIZE_MAX && q_size == SIZE_MAX) return __underlying_strcpy(p, q); size = strlen(q) + 1; /* Compile-time check for const size overflow. */ if (__compiletime_lessthan(p_size, size)) __write_overflow(); /* Run-time check for dynamic size overflow. */ if (p_size < size) fortify_panic(__func__); __underlying_memcpy(p, q, size); return p; } /* Don't use these outside the FORITFY_SOURCE implementation */ #undef __underlying_memchr #undef __underlying_memcmp #undef __underlying_strcat #undef __underlying_strcpy #undef __underlying_strlen #undef __underlying_strncat #undef __underlying_strncpy #undef POS #undef POS0 #endif /* _LINUX_FORTIFY_STRING_H_ */
41 6 40 37 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (C) 2008 Oracle. All rights reserved. */ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H #include "messages.h" #include "ctree.h" #include "transaction.h" /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 /* * We can't use the tree log for whatever reason, force a transaction commit. * We use a negative value because there are functions through the logging code * that need to return an error (< 0 value), false (0) or true (1). Any negative * value will do, as it will cause the log to be marked for a full sync. */ #define BTRFS_LOG_FORCE_COMMIT (-(MAX_ERRNO + 1)) struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; bool logging_new_name; bool logging_new_delayed_dentries; /* Indicate if the inode being logged was logged before. */ bool logged_before; struct inode *inode; struct list_head list; /* Only used for fast fsyncs. */ struct list_head ordered_extents; struct list_head conflict_inodes; int num_conflict_inodes; bool logging_conflict_inodes; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, struct inode *inode) { ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; ctx->logging_new_name = false; ctx->logging_new_delayed_dentries = false; ctx->logged_before = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); INIT_LIST_HEAD(&ctx->ordered_extents); INIT_LIST_HEAD(&ctx->conflict_inodes); ctx->num_conflict_inodes = 0; ctx->logging_conflict_inodes = false; } static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) { struct btrfs_ordered_extent *ordered; struct btrfs_ordered_extent *tmp; ASSERT(inode_is_locked(ctx->inode)); list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { list_del_init(&ordered->log_list); btrfs_put_ordered_extent(ordered); } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) { WRITE_ONCE(trans->fs_info->last_trans_log_full_commit, trans->transid); } static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans) { return READ_ONCE(trans->fs_info->last_trans_log_full_commit) == trans->transid; } int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx); int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, bool for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct dentry *old_dentry, struct btrfs_inode *old_dir, u64 old_dir_index, struct dentry *parent); #endif
1381 3 2 2 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 // SPDX-License-Identifier: GPL-2.0 /* * Zoned block device handling * * Copyright (c) 2015, Hannes Reinecke * Copyright (c) 2015, SUSE Linux GmbH * * Copyright (c) 2016, Damien Le Moal * Copyright (c) 2016, Western Digital */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/rbtree.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/sched/mm.h> #include "blk.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { ZONE_COND_NAME(NOT_WP), ZONE_COND_NAME(EMPTY), ZONE_COND_NAME(IMP_OPEN), ZONE_COND_NAME(EXP_OPEN), ZONE_COND_NAME(CLOSED), ZONE_COND_NAME(READONLY), ZONE_COND_NAME(FULL), ZONE_COND_NAME(OFFLINE), }; #undef ZONE_COND_NAME /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. * * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX * into string format. Useful in the debugging and tracing zone conditions. For * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) { static const char *zone_cond_str = "UNKNOWN"; if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) zone_cond_str = zone_cond_name[zone_cond]; return zone_cond_str; } EXPORT_SYMBOL_GPL(blk_zone_cond_str); /* * Return true if a request is a write requests that needs zone write locking. */ bool blk_req_needs_zone_write_lock(struct request *rq) { if (!rq->q->disk->seq_zones_wlock) return false; return blk_rq_is_seq_zoned_write(rq); } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); bool blk_req_zone_write_trylock(struct request *rq) { unsigned int zno = blk_rq_zone_no(rq); if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) return false; WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; return true; } EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); void __blk_req_zone_write_lock(struct request *rq) { if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock))) return; WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; } EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); void __blk_req_zone_write_unlock(struct request *rq) { rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; if (rq->q->disk->seq_zones_wlock) WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock)); } EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); /** * bdev_nr_zones - Get number of zones * @bdev: Target device * * Return the total number of zones of a zoned block device. For a block * device without zone capabilities, the number of zones is always 0. */ unsigned int bdev_nr_zones(struct block_device *bdev) { sector_t zone_sectors = bdev_zone_sectors(bdev); if (!bdev_is_zoned(bdev)) return 0; return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> ilog2(zone_sectors); } EXPORT_SYMBOL_GPL(bdev_nr_zones); /** * blkdev_report_zones - Get zones information * @bdev: Target block device * @sector: Sector from which to report zones * @nr_zones: Maximum number of zones to report * @cb: Callback function called for each reported zone * @data: Private data for the callback * * Description: * Get zone information starting from the zone containing @sector for at most * @nr_zones, and call @cb for each zone reported by the device. * To report all zones in a device starting from @sector, the BLK_ALL_ZONES * constant can be passed to @nr_zones. * Returns the number of zones reported by the device, or a negative errno * value in case of failure. * * Note: The caller must use memalloc_noXX_save/restore() calls to control * memory allocations done within this function. */ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; if (!nr_zones || sector >= capacity) return 0; return disk->fops->report_zones(disk, sector, nr_zones, cb, data); } EXPORT_SYMBOL_GPL(blkdev_report_zones); static inline unsigned long *blk_alloc_zone_bitmap(int node, unsigned int nr_zones) { return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), GFP_NOIO, node); } static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, void *data) { /* * For an all-zones reset, ignore conventional, empty, read-only * and offline zones. */ switch (zone->cond) { case BLK_ZONE_COND_NOT_WP: case BLK_ZONE_COND_EMPTY: case BLK_ZONE_COND_READONLY: case BLK_ZONE_COND_OFFLINE: return 0; default: set_bit(idx, (unsigned long *)data); return 0; } } static int blkdev_zone_reset_all_emulated(struct block_device *bdev, gfp_t gfp_mask) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = bdev_nr_sectors(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); unsigned long *need_reset; struct bio *bio = NULL; sector_t sector = 0; int ret; need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones); if (!need_reset) return -ENOMEM; ret = disk->fops->report_zones(disk, 0, disk->nr_zones, blk_zone_need_reset_cb, need_reset); if (ret < 0) goto out_free_need_reset; ret = 0; while (sector < capacity) { if (!test_bit(disk_zone_no(disk, sector), need_reset)) { sector += zone_sectors; continue; } bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); } if (bio) { ret = submit_bio_wait(bio); bio_put(bio); } out_free_need_reset: kfree(need_reset); return ret; } static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) { struct bio bio; bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); return submit_bio_wait(&bio); } /** * blkdev_zone_mgmt - Execute a zone management operation on a range of zones * @bdev: Target block device * @op: Operation to be performed on the zones * @sector: Start sector of the first zone to operate on * @nr_sectors: Number of sectors, should be at least the length of one zone and * must be zone size aligned. * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Perform the specified operation on the range of zones specified by * @sector..@sector+@nr_sectors. Specifying the entire disk sector range * is valid, but the specified range should not contain conventional zones. * The operation to execute on each zone can be a zone reset, open, close * or finish request. */ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; int ret = 0; if (!bdev_is_zoned(bdev)) return -EOPNOTSUPP; if (bdev_read_only(bdev)) return -EPERM; if (!op_is_zone_mgmt(op)) return -EOPNOTSUPP; if (end_sector <= sector || end_sector > capacity) /* Out of range */ return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ if (!bdev_is_zone_start(bdev, sector)) return -EINVAL; if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) return -EINVAL; /* * In the case of a zone reset operation over all zones, * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this * command. For other devices, we emulate this command behavior by * identifying the zones needing a reset. */ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { if (!blk_queue_zone_resetall(q)) return blkdev_zone_reset_all_emulated(bdev, gfp_mask); return blkdev_zone_reset_all(bdev, gfp_mask); } while (sector < end_sector) { bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; /* This may take a while, so be nice to others */ cond_resched(); } ret = submit_bio_wait(bio); bio_put(bio); return ret; } EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); struct zone_report_args { struct blk_zone __user *zones; }; static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, void *data) { struct zone_report_args *args = data; if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) return -EFAULT; return 0; } /* * BLKREPORTZONE ioctl processing. * Called from blkdev_ioctl. */ int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct zone_report_args args; struct blk_zone_report rep; int ret; if (!argp) return -EINVAL; if (!bdev_is_zoned(bdev)) return -ENOTTY; if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; if (!rep.nr_zones) return -EINVAL; args.zones = argp + sizeof(struct blk_zone_report); ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, blkdev_copy_zone_to_user, &args); if (ret < 0) return ret; rep.nr_zones = ret; rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; } static int blkdev_truncate_zone_range(struct block_device *bdev, blk_mode_t mode, const struct blk_zone_range *zrange) { loff_t start, end; if (zrange->sector + zrange->nr_sectors <= zrange->sector || zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) /* Out of range */ return -EINVAL; start = zrange->sector << SECTOR_SHIFT; end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; return truncate_bdev_range(bdev, mode, start, end); } /* * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct blk_zone_range zrange; enum req_op op; int ret; if (!argp) return -EINVAL; if (!bdev_is_zoned(bdev)) return -ENOTTY; if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) return -EFAULT; switch (cmd) { case BLKRESETZONE: op = REQ_OP_ZONE_RESET; /* Invalidate the page cache, including dirty pages. */ filemap_invalidate_lock(bdev->bd_inode->i_mapping); ret = blkdev_truncate_zone_range(bdev, mode, &zrange); if (ret) goto fail; break; case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; break; case BLKCLOSEZONE: op = REQ_OP_ZONE_CLOSE; break; case BLKFINISHZONE: op = REQ_OP_ZONE_FINISH; break; default: return -ENOTTY; } ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, GFP_KERNEL); fail: if (cmd == BLKRESETZONE) filemap_invalidate_unlock(bdev->bd_inode->i_mapping); return ret; } void disk_free_zone_bitmaps(struct gendisk *disk) { kfree(disk->conv_zones_bitmap); disk->conv_zones_bitmap = NULL; kfree(disk->seq_zones_wlock); disk->seq_zones_wlock = NULL; } struct blk_revalidate_zone_args { struct gendisk *disk; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int nr_zones; sector_t sector; }; /* * Helper function to check the validity of zones of a zoned block device. */ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_revalidate_zone_args *args = data; struct gendisk *disk = args->disk; struct request_queue *q = disk->queue; sector_t capacity = get_capacity(disk); sector_t zone_sectors = q->limits.chunk_sectors; /* Check for bad zones and holes in the zone report */ if (zone->start != args->sector) { pr_warn("%s: Zone gap at sectors %llu..%llu\n", disk->disk_name, args->sector, zone->start); return -ENODEV; } if (zone->start >= capacity || !zone->len) { pr_warn("%s: Invalid zone start %llu, length %llu\n", disk->disk_name, zone->start, zone->len); return -ENODEV; } /* * All zones must have the same size, with the exception on an eventual * smaller last zone. */ if (zone->start + zone->len < capacity) { if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); return -ENODEV; } } else if (zone->len > zone_sectors) { pr_warn("%s: Invalid zoned device with larger last zone size\n", disk->disk_name); return -ENODEV; } /* Check zone type */ switch (zone->type) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!args->conv_zones_bitmap) { args->conv_zones_bitmap = blk_alloc_zone_bitmap(q->node, args->nr_zones); if (!args->conv_zones_bitmap) return -ENOMEM; } set_bit(idx, args->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: if (!args->seq_zones_wlock) { args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, args->nr_zones); if (!args->seq_zones_wlock) return -ENOMEM; } break; default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); return -ENODEV; } args->sector += zone->len; return 0; } /** * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps * @disk: Target disk * @update_driver_data: Callback to update driver data on the frozen disk * * Helper function for low-level device drivers to check and (re) allocate and * initialize a disk request queue zone bitmaps. This functions should normally * be called within the disk ->revalidate method for blk-mq based drivers. * Before calling this function, the device driver must already have set the * device zone size (chunk_sector limit) and the max zone append limit. * For BIO based drivers, this function cannot be used. BIO based device drivers * only need to set disk->nr_zones so that the sysfs exposed value is correct. * If the @update_driver_data callback function is not NULL, the callback is * executed with the device request queue frozen after all zones have been * checked. */ int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)) { struct request_queue *q = disk->queue; sector_t zone_sectors = q->limits.chunk_sectors; sector_t capacity = get_capacity(disk); struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; int ret; if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) return -EIO; if (WARN_ON_ONCE(!queue_is_mq(q))) return -EIO; if (!capacity) return -ENODEV; /* * Checks that the device driver indicated a valid zone size and that * the max zone append limit is set. */ if (!zone_sectors || !is_power_of_2(zone_sectors)) { pr_warn("%s: Invalid non power of two zone size (%llu)\n", disk->disk_name, zone_sectors); return -ENODEV; } if (!q->limits.max_zone_append_sectors) { pr_warn("%s: Invalid 0 maximum zone append limit\n", disk->disk_name); return -ENODEV; } /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ args.disk = disk; args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); if (!ret) { pr_warn("%s: No zones reported\n", disk->disk_name); ret = -ENODEV; } memalloc_noio_restore(noio_flag); /* * If zones where reported, make sure that the entire disk capacity * has been checked. */ if (ret > 0 && args.sector != capacity) { pr_warn("%s: Missing zones from sector %llu\n", disk->disk_name, args.sector); ret = -ENODEV; } /* * Install the new bitmaps and update nr_zones only once the queue is * stopped and all I/Os are completed (i.e. a scheduler is not * referencing the bitmaps). */ blk_mq_freeze_queue(q); if (ret > 0) { disk->nr_zones = args.nr_zones; swap(disk->seq_zones_wlock, args.seq_zones_wlock); swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); if (update_driver_data) update_driver_data(disk); ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); disk_free_zone_bitmaps(disk); } blk_mq_unfreeze_queue(q); kfree(args.seq_zones_wlock); kfree(args.conv_zones_bitmap); return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); void disk_clear_zone_settings(struct gendisk *disk) { struct request_queue *q = disk->queue; blk_mq_freeze_queue(q); disk_free_zone_bitmaps(disk); blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; disk->nr_zones = 0; disk->max_open_zones = 0; disk->max_active_zones = 0; q->limits.chunk_sectors = 0; q->limits.zone_write_granularity = 0; q->limits.max_zone_append_sectors = 0; blk_mq_unfreeze_queue(q); }
37 37 29 29 29 29 14 29 12 12 12 12 9 12 11 12 3 12 12 11 30 29 29 27 29 29 30 30 30 30 1 6 6 11 11 6 11 228 2 2 1 2 2 1 1 1 2 1 1 1 2 5 5 5 5 2 2 2 2 5 4 5 5 4 1 1 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 /* * \author Rickard E. (Rik) Faith <faith@valinux.com> * \author Daryll Strauss <daryll@valinux.com> * \author Gareth Hughes <gareth@valinux.com> */ /* * Created: Mon Jan 4 08:58:31 1999 by faith@valinux.com * * Copyright 1999 Precision Insight, Inc., Cedar Park, Texas. * Copyright 2000 VA Linux Systems, Inc., Sunnyvale, California. * All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice (including the next * paragraph) shall be included in all copies or substantial portions of the * Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * VA LINUX SYSTEMS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. */ #include <linux/anon_inodes.h> #include <linux/dma-fence.h> #include <linux/file.h> #include <linux/module.h> #include <linux/pci.h> #include <linux/poll.h> #include <linux/slab.h> #include <drm/drm_client.h> #include <drm/drm_drv.h> #include <drm/drm_file.h> #include <drm/drm_gem.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #include "drm_legacy.h" /* from BKL pushdown */ DEFINE_MUTEX(drm_global_mutex); bool drm_dev_needs_global_mutex(struct drm_device *dev) { /* * Legacy drivers rely on all kinds of BKL locking semantics, don't * bother. They also still need BKL locking for their ioctls, so better * safe than sorry. */ if (drm_core_check_feature(dev, DRIVER_LEGACY)) return true; /* * The deprecated ->load callback must be called after the driver is * already registered. This means such drivers rely on the BKL to make * sure an open can't proceed until the driver is actually fully set up. * Similar hilarity holds for the unload callback. */ if (dev->driver->load || dev->driver->unload) return true; /* * Drivers with the lastclose callback assume that it's synchronized * against concurrent opens, which again needs the BKL. The proper fix * is to use the drm_client infrastructure with proper locking for each * client. */ if (dev->driver->lastclose) return true; return false; } /** * DOC: file operations * * Drivers must define the file operations structure that forms the DRM * userspace API entry point, even though most of those operations are * implemented in the DRM core. The resulting &struct file_operations must be * stored in the &drm_driver.fops field. The mandatory functions are drm_open(), * drm_read(), drm_ioctl() and drm_compat_ioctl() if CONFIG_COMPAT is enabled * Note that drm_compat_ioctl will be NULL if CONFIG_COMPAT=n, so there's no * need to sprinkle #ifdef into the code. Drivers which implement private ioctls * that require 32/64 bit compatibility support must provide their own * &file_operations.compat_ioctl handler that processes private ioctls and calls * drm_compat_ioctl() for core ioctls. * * In addition drm_read() and drm_poll() provide support for DRM events. DRM * events are a generic and extensible means to send asynchronous events to * userspace through the file descriptor. They are used to send vblank event and * page flip completions by the KMS API. But drivers can also use it for their * own needs, e.g. to signal completion of rendering. * * For the driver-side event interface see drm_event_reserve_init() and * drm_send_event() as the main starting points. * * The memory mapping implementation will vary depending on how the driver * manages memory. Legacy drivers will use the deprecated drm_legacy_mmap() * function, modern drivers should use one of the provided memory-manager * specific implementations. For GEM-based drivers this is drm_gem_mmap(). * * No other file operations are supported by the DRM userspace API. Overall the * following is an example &file_operations structure:: * * static const example_drm_fops = { * .owner = THIS_MODULE, * .open = drm_open, * .release = drm_release, * .unlocked_ioctl = drm_ioctl, * .compat_ioctl = drm_compat_ioctl, // NULL if CONFIG_COMPAT=n * .poll = drm_poll, * .read = drm_read, * .llseek = no_llseek, * .mmap = drm_gem_mmap, * }; * * For plain GEM based drivers there is the DEFINE_DRM_GEM_FOPS() macro, and for * DMA based drivers there is the DEFINE_DRM_GEM_DMA_FOPS() macro to make this * simpler. * * The driver's &file_operations must be stored in &drm_driver.fops. * * For driver-private IOCTL handling see the more detailed discussion in * :ref:`IOCTL support in the userland interfaces chapter<drm_driver_ioctl>`. */ /** * drm_file_alloc - allocate file context * @minor: minor to allocate on * * This allocates a new DRM file context. It is not linked into any context and * can be used by the caller freely. Note that the context keeps a pointer to * @minor, so it must be freed before @minor is. * * RETURNS: * Pointer to newly allocated context, ERR_PTR on failure. */ struct drm_file *drm_file_alloc(struct drm_minor *minor) { static atomic64_t ident = ATOMIC_INIT(0); struct drm_device *dev = minor->dev; struct drm_file *file; int ret; file = kzalloc(sizeof(*file), GFP_KERNEL); if (!file) return ERR_PTR(-ENOMEM); /* Get a unique identifier for fdinfo: */ file->client_id = atomic64_inc_return(&ident); rcu_assign_pointer(file->pid, get_pid(task_tgid(current))); file->minor = minor; /* for compatibility root is always authenticated */ file->authenticated = capable(CAP_SYS_ADMIN); INIT_LIST_HEAD(&file->lhead); INIT_LIST_HEAD(&file->fbs); mutex_init(&file->fbs_lock); INIT_LIST_HEAD(&file->blobs); INIT_LIST_HEAD(&file->pending_event_list); INIT_LIST_HEAD(&file->event_list); init_waitqueue_head(&file->event_wait); file->event_space = 4096; /* set aside 4k for event buffer */ spin_lock_init(&file->master_lookup_lock); mutex_init(&file->event_read_lock); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_open(dev, file); if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_open(file); drm_prime_init_file_private(&file->prime); if (dev->driver->open) { ret = dev->driver->open(dev, file); if (ret < 0) goto out_prime_destroy; } return file; out_prime_destroy: drm_prime_destroy_file_private(&file->prime); if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_release(file); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_release(dev, file); put_pid(rcu_access_pointer(file->pid)); kfree(file); return ERR_PTR(ret); } static void drm_events_release(struct drm_file *file_priv) { struct drm_device *dev = file_priv->minor->dev; struct drm_pending_event *e, *et; unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); /* Unlink pending events */ list_for_each_entry_safe(e, et, &file_priv->pending_event_list, pending_link) { list_del(&e->pending_link); e->file_priv = NULL; } /* Remove unconsumed events */ list_for_each_entry_safe(e, et, &file_priv->event_list, link) { list_del(&e->link); kfree(e); } spin_unlock_irqrestore(&dev->event_lock, flags); } /** * drm_file_free - free file context * @file: context to free, or NULL * * This destroys and deallocates a DRM file context previously allocated via * drm_file_alloc(). The caller must make sure to unlink it from any contexts * before calling this. * * If NULL is passed, this is a no-op. */ void drm_file_free(struct drm_file *file) { struct drm_device *dev; if (!file) return; dev = file->minor->dev; drm_dbg_core(dev, "comm=\"%s\", pid=%d, dev=0x%lx, open_count=%d\n", current->comm, task_pid_nr(current), (long)old_encode_dev(file->minor->kdev->devt), atomic_read(&dev->open_count)); #ifdef CONFIG_DRM_LEGACY if (drm_core_check_feature(dev, DRIVER_LEGACY) && dev->driver->preclose) dev->driver->preclose(dev, file); #endif if (drm_core_check_feature(dev, DRIVER_LEGACY)) drm_legacy_lock_release(dev, file->filp); if (drm_core_check_feature(dev, DRIVER_HAVE_DMA)) drm_legacy_reclaim_buffers(dev, file); drm_events_release(file); if (drm_core_check_feature(dev, DRIVER_MODESET)) { drm_fb_release(file); drm_property_destroy_user_blobs(dev, file); } if (drm_core_check_feature(dev, DRIVER_SYNCOBJ)) drm_syncobj_release(file); if (drm_core_check_feature(dev, DRIVER_GEM)) drm_gem_release(dev, file); drm_legacy_ctxbitmap_flush(dev, file); if (drm_is_primary_client(file)) drm_master_release(file); if (dev->driver->postclose) dev->driver->postclose(dev, file); drm_prime_destroy_file_private(&file->prime); WARN_ON(!list_empty(&file->event_list)); put_pid(rcu_access_pointer(file->pid)); kfree(file); } static void drm_close_helper(struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; mutex_lock(&dev->filelist_mutex); list_del(&file_priv->lhead); mutex_unlock(&dev->filelist_mutex); drm_file_free(file_priv); } /* * Check whether DRI will run on this CPU. * * \return non-zero if the DRI will run on this CPU, or zero otherwise. */ static int drm_cpu_valid(void) { #if defined(__sparc__) && !defined(__sparc_v9__) return 0; /* No cmpxchg before v9 sparc. */ #endif return 1; } /* * Called whenever a process opens a drm node * * \param filp file pointer. * \param minor acquired minor-object. * \return zero on success or a negative number on failure. * * Creates and initializes a drm_file structure for the file private data in \p * filp and add it into the double linked list in \p dev. */ int drm_open_helper(struct file *filp, struct drm_minor *minor) { struct drm_device *dev = minor->dev; struct drm_file *priv; int ret; if (filp->f_flags & O_EXCL) return -EBUSY; /* No exclusive opens */ if (!drm_cpu_valid()) return -EINVAL; if (dev->switch_power_state != DRM_SWITCH_POWER_ON && dev->switch_power_state != DRM_SWITCH_POWER_DYNAMIC_OFF) return -EINVAL; drm_dbg_core(dev, "comm=\"%s\", pid=%d, minor=%d\n", current->comm, task_pid_nr(current), minor->index); priv = drm_file_alloc(minor); if (IS_ERR(priv)) return PTR_ERR(priv); if (drm_is_primary_client(priv)) { ret = drm_master_open(priv); if (ret) { drm_file_free(priv); return ret; } } filp->private_data = priv; filp->f_mode |= FMODE_UNSIGNED_OFFSET; priv->filp = filp; mutex_lock(&dev->filelist_mutex); list_add(&priv->lhead, &dev->filelist); mutex_unlock(&dev->filelist_mutex); #ifdef CONFIG_DRM_LEGACY #ifdef __alpha__ /* * Default the hose */ if (!dev->hose) { struct pci_dev *pci_dev; pci_dev = pci_get_class(PCI_CLASS_DISPLAY_VGA << 8, NULL); if (pci_dev) { dev->hose = pci_dev->sysdata; pci_dev_put(pci_dev); } if (!dev->hose) { struct pci_bus *b = list_entry(pci_root_buses.next, struct pci_bus, node); if (b) dev->hose = b->sysdata; } } #endif #endif return 0; } /** * drm_open - open method for DRM file * @inode: device inode * @filp: file pointer. * * This function must be used by drivers as their &file_operations.open method. * It looks up the correct DRM device and instantiates all the per-file * resources for it. It also calls the &drm_driver.open driver callback. * * RETURNS: * * 0 on success or negative errno value on failure. */ int drm_open(struct inode *inode, struct file *filp) { struct drm_device *dev; struct drm_minor *minor; int retcode; int need_setup = 0; minor = drm_minor_acquire(iminor(inode)); if (IS_ERR(minor)) return PTR_ERR(minor); dev = minor->dev; if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); if (!atomic_fetch_inc(&dev->open_count)) need_setup = 1; /* share address_space across all char-devs of a single device */ filp->f_mapping = dev->anon_inode->i_mapping; retcode = drm_open_helper(filp, minor); if (retcode) goto err_undo; if (need_setup) { retcode = drm_legacy_setup(dev); if (retcode) { drm_close_helper(filp); goto err_undo; } } if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); return 0; err_undo: atomic_dec(&dev->open_count); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); drm_minor_release(minor); return retcode; } EXPORT_SYMBOL(drm_open); void drm_lastclose(struct drm_device * dev) { drm_dbg_core(dev, "\n"); if (dev->driver->lastclose) dev->driver->lastclose(dev); drm_dbg_core(dev, "driver lastclose completed\n"); if (drm_core_check_feature(dev, DRIVER_LEGACY)) drm_legacy_dev_reinit(dev); drm_client_dev_restore(dev); } /** * drm_release - release method for DRM file * @inode: device inode * @filp: file pointer. * * This function must be used by drivers as their &file_operations.release * method. It frees any resources associated with the open file, and calls the * &drm_driver.postclose driver callback. If this is the last open file for the * DRM device also proceeds to call the &drm_driver.lastclose driver callback. * * RETURNS: * * Always succeeds and returns 0. */ int drm_release(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_minor *minor = file_priv->minor; struct drm_device *dev = minor->dev; if (drm_dev_needs_global_mutex(dev)) mutex_lock(&drm_global_mutex); drm_dbg_core(dev, "open_count = %d\n", atomic_read(&dev->open_count)); drm_close_helper(filp); if (atomic_dec_and_test(&dev->open_count)) drm_lastclose(dev); if (drm_dev_needs_global_mutex(dev)) mutex_unlock(&drm_global_mutex); drm_minor_release(minor); return 0; } EXPORT_SYMBOL(drm_release); void drm_file_update_pid(struct drm_file *filp) { struct drm_device *dev; struct pid *pid, *old; /* * Master nodes need to keep the original ownership in order for * drm_master_check_perm to keep working correctly. (See comment in * drm_auth.c.) */ if (filp->was_master) return; pid = task_tgid(current); /* * Quick unlocked check since the model is a single handover followed by * exclusive repeated use. */ if (pid == rcu_access_pointer(filp->pid)) return; dev = filp->minor->dev; mutex_lock(&dev->filelist_mutex); old = rcu_replace_pointer(filp->pid, pid, 1); mutex_unlock(&dev->filelist_mutex); if (pid != old) { get_pid(pid); synchronize_rcu(); put_pid(old); } } /** * drm_release_noglobal - release method for DRM file * @inode: device inode * @filp: file pointer. * * This function may be used by drivers as their &file_operations.release * method. It frees any resources associated with the open file prior to taking * the drm_global_mutex, which then calls the &drm_driver.postclose driver * callback. If this is the last open file for the DRM device also proceeds to * call the &drm_driver.lastclose driver callback. * * RETURNS: * * Always succeeds and returns 0. */ int drm_release_noglobal(struct inode *inode, struct file *filp) { struct drm_file *file_priv = filp->private_data; struct drm_minor *minor = file_priv->minor; struct drm_device *dev = minor->dev; drm_close_helper(filp); if (atomic_dec_and_mutex_lock(&dev->open_count, &drm_global_mutex)) { drm_lastclose(dev); mutex_unlock(&drm_global_mutex); } drm_minor_release(minor); return 0; } EXPORT_SYMBOL(drm_release_noglobal); /** * drm_read - read method for DRM file * @filp: file pointer * @buffer: userspace destination pointer for the read * @count: count in bytes to read * @offset: offset to read * * This function must be used by drivers as their &file_operations.read * method if they use DRM events for asynchronous signalling to userspace. * Since events are used by the KMS API for vblank and page flip completion this * means all modern display drivers must use it. * * @offset is ignored, DRM events are read like a pipe. Polling support is * provided by drm_poll(). * * This function will only ever read a full event. Therefore userspace must * supply a big enough buffer to fit any event to ensure forward progress. Since * the maximum event space is currently 4K it's recommended to just use that for * safety. * * RETURNS: * * Number of bytes read (always aligned to full events, and can be 0) or a * negative error code on failure. */ ssize_t drm_read(struct file *filp, char __user *buffer, size_t count, loff_t *offset) { struct drm_file *file_priv = filp->private_data; struct drm_device *dev = file_priv->minor->dev; ssize_t ret; ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; for (;;) { struct drm_pending_event *e = NULL; spin_lock_irq(&dev->event_lock); if (!list_empty(&file_priv->event_list)) { e = list_first_entry(&file_priv->event_list, struct drm_pending_event, link); file_priv->event_space += e->event->length; list_del(&e->link); } spin_unlock_irq(&dev->event_lock); if (e == NULL) { if (ret) break; if (filp->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } mutex_unlock(&file_priv->event_read_lock); ret = wait_event_interruptible(file_priv->event_wait, !list_empty(&file_priv->event_list)); if (ret >= 0) ret = mutex_lock_interruptible(&file_priv->event_read_lock); if (ret) return ret; } else { unsigned length = e->event->length; if (length > count - ret) { put_back_event: spin_lock_irq(&dev->event_lock); file_priv->event_space -= length; list_add(&e->link, &file_priv->event_list); spin_unlock_irq(&dev->event_lock); wake_up_interruptible_poll(&file_priv->event_wait, EPOLLIN | EPOLLRDNORM); break; } if (copy_to_user(buffer + ret, e->event, length)) { if (ret == 0) ret = -EFAULT; goto put_back_event; } ret += length; kfree(e); } } mutex_unlock(&file_priv->event_read_lock); return ret; } EXPORT_SYMBOL(drm_read); /** * drm_poll - poll method for DRM file * @filp: file pointer * @wait: poll waiter table * * This function must be used by drivers as their &file_operations.read method * if they use DRM events for asynchronous signalling to userspace. Since * events are used by the KMS API for vblank and page flip completion this means * all modern display drivers must use it. * * See also drm_read(). * * RETURNS: * * Mask of POLL flags indicating the current status of the file. */ __poll_t drm_poll(struct file *filp, struct poll_table_struct *wait) { struct drm_file *file_priv = filp->private_data; __poll_t mask = 0; poll_wait(filp, &file_priv->event_wait, wait); if (!list_empty(&file_priv->event_list)) mask |= EPOLLIN | EPOLLRDNORM; return mask; } EXPORT_SYMBOL(drm_poll); /** * drm_event_reserve_init_locked - init a DRM event and reserve space for it * @dev: DRM device * @file_priv: DRM file private data * @p: tracking structure for the pending event * @e: actual event data to deliver to userspace * * This function prepares the passed in event for eventual delivery. If the event * doesn't get delivered (because the IOCTL fails later on, before queuing up * anything) then the even must be cancelled and freed using * drm_event_cancel_free(). Successfully initialized events should be sent out * using drm_send_event() or drm_send_event_locked() to signal completion of the * asynchronous event to userspace. * * If callers embedded @p into a larger structure it must be allocated with * kmalloc and @p must be the first member element. * * This is the locked version of drm_event_reserve_init() for callers which * already hold &drm_device.event_lock. * * RETURNS: * * 0 on success or a negative error code on failure. */ int drm_event_reserve_init_locked(struct drm_device *dev, struct drm_file *file_priv, struct drm_pending_event *p, struct drm_event *e) { if (file_priv->event_space < e->length) return -ENOMEM; file_priv->event_space -= e->length; p->event = e; list_add(&p->pending_link, &file_priv->pending_event_list); p->file_priv = file_priv; return 0; } EXPORT_SYMBOL(drm_event_reserve_init_locked); /** * drm_event_reserve_init - init a DRM event and reserve space for it * @dev: DRM device * @file_priv: DRM file private data * @p: tracking structure for the pending event * @e: actual event data to deliver to userspace * * This function prepares the passed in event for eventual delivery. If the event * doesn't get delivered (because the IOCTL fails later on, before queuing up * anything) then the even must be cancelled and freed using * drm_event_cancel_free(). Successfully initialized events should be sent out * using drm_send_event() or drm_send_event_locked() to signal completion of the * asynchronous event to userspace. * * If callers embedded @p into a larger structure it must be allocated with * kmalloc and @p must be the first member element. * * Callers which already hold &drm_device.event_lock should use * drm_event_reserve_init_locked() instead. * * RETURNS: * * 0 on success or a negative error code on failure. */ int drm_event_reserve_init(struct drm_device *dev, struct drm_file *file_priv, struct drm_pending_event *p, struct drm_event *e) { unsigned long flags; int ret; spin_lock_irqsave(&dev->event_lock, flags); ret = drm_event_reserve_init_locked(dev, file_priv, p, e); spin_unlock_irqrestore(&dev->event_lock, flags); return ret; } EXPORT_SYMBOL(drm_event_reserve_init); /** * drm_event_cancel_free - free a DRM event and release its space * @dev: DRM device * @p: tracking structure for the pending event * * This function frees the event @p initialized with drm_event_reserve_init() * and releases any allocated space. It is used to cancel an event when the * nonblocking operation could not be submitted and needed to be aborted. */ void drm_event_cancel_free(struct drm_device *dev, struct drm_pending_event *p) { unsigned long flags; spin_lock_irqsave(&dev->event_lock, flags); if (p->file_priv) { p->file_priv->event_space += p->event->length; list_del(&p->pending_link); } spin_unlock_irqrestore(&dev->event_lock, flags); if (p->fence) dma_fence_put(p->fence); kfree(p); } EXPORT_SYMBOL(drm_event_cancel_free); static void drm_send_event_helper(struct drm_device *dev, struct drm_pending_event *e, ktime_t timestamp) { assert_spin_locked(&dev->event_lock); if (e->completion) { complete_all(e->completion); e->completion_release(e->completion); e->completion = NULL; } if (e->fence) { if (timestamp) dma_fence_signal_timestamp(e->fence, timestamp); else dma_fence_signal(e->fence); dma_fence_put(e->fence); } if (!e->file_priv) { kfree(e); return; } list_del(&e->pending_link); list_add_tail(&e->link, &e->file_priv->event_list); wake_up_interruptible_poll(&e->file_priv->event_wait, EPOLLIN | EPOLLRDNORM); } /** * drm_send_event_timestamp_locked - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * @timestamp: timestamp to set for the fence event in kernel's CLOCK_MONOTONIC * time domain * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. Callers must already hold * &drm_device.event_lock. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event_timestamp_locked(struct drm_device *dev, struct drm_pending_event *e, ktime_t timestamp) { drm_send_event_helper(dev, e, timestamp); } EXPORT_SYMBOL(drm_send_event_timestamp_locked); /** * drm_send_event_locked - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. Callers must already hold * &drm_device.event_lock, see drm_send_event() for the unlocked version. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event_locked(struct drm_device *dev, struct drm_pending_event *e) { drm_send_event_helper(dev, e, 0); } EXPORT_SYMBOL(drm_send_event_locked); /** * drm_send_event - send DRM event to file descriptor * @dev: DRM device * @e: DRM event to deliver * * This function sends the event @e, initialized with drm_event_reserve_init(), * to its associated userspace DRM file. This function acquires * &drm_device.event_lock, see drm_send_event_locked() for callers which already * hold this lock. * * Note that the core will take care of unlinking and disarming events when the * corresponding DRM file is closed. Drivers need not worry about whether the * DRM file for this event still exists and can call this function upon * completion of the asynchronous work unconditionally. */ void drm_send_event(struct drm_device *dev, struct drm_pending_event *e) { unsigned long irqflags; spin_lock_irqsave(&dev->event_lock, irqflags); drm_send_event_helper(dev, e, 0); spin_unlock_irqrestore(&dev->event_lock, irqflags); } EXPORT_SYMBOL(drm_send_event); static void print_size(struct drm_printer *p, const char *stat, const char *region, u64 sz) { const char *units[] = {"", " KiB", " MiB"}; unsigned u; for (u = 0; u < ARRAY_SIZE(units) - 1; u++) { if (sz < SZ_1K) break; sz = div_u64(sz, SZ_1K); } drm_printf(p, "drm-%s-%s:\t%llu%s\n", stat, region, sz, units[u]); } /** * drm_print_memory_stats - A helper to print memory stats * @p: The printer to print output to * @stats: The collected memory stats * @supported_status: Bitmask of optional stats which are available * @region: The memory region * */ void drm_print_memory_stats(struct drm_printer *p, const struct drm_memory_stats *stats, enum drm_gem_object_status supported_status, const char *region) { print_size(p, "total", region, stats->private + stats->shared); print_size(p, "shared", region, stats->shared); print_size(p, "active", region, stats->active); if (supported_status & DRM_GEM_OBJECT_RESIDENT) print_size(p, "resident", region, stats->resident); if (supported_status & DRM_GEM_OBJECT_PURGEABLE) print_size(p, "purgeable", region, stats->purgeable); } EXPORT_SYMBOL(drm_print_memory_stats); /** * drm_show_memory_stats - Helper to collect and show standard fdinfo memory stats * @p: the printer to print output to * @file: the DRM file * * Helper to iterate over GEM objects with a handle allocated in the specified * file. */ void drm_show_memory_stats(struct drm_printer *p, struct drm_file *file) { struct drm_gem_object *obj; struct drm_memory_stats status = {}; enum drm_gem_object_status supported_status; int id; spin_lock(&file->table_lock); idr_for_each_entry (&file->object_idr, obj, id) { enum drm_gem_object_status s = 0; size_t add_size = (obj->funcs && obj->funcs->rss) ? obj->funcs->rss(obj) : obj->size; if (obj->funcs && obj->funcs->status) { s = obj->funcs->status(obj); supported_status = DRM_GEM_OBJECT_RESIDENT | DRM_GEM_OBJECT_PURGEABLE; } if (obj->handle_count > 1) { status.shared += obj->size; } else { status.private += obj->size; } if (s & DRM_GEM_OBJECT_RESIDENT) { status.resident += add_size; } else { /* If already purged or not yet backed by pages, don't * count it as purgeable: */ s &= ~DRM_GEM_OBJECT_PURGEABLE; } if (!dma_resv_test_signaled(obj->resv, dma_resv_usage_rw(true))) { status.active += add_size; /* If still active, don't count as purgeable: */ s &= ~DRM_GEM_OBJECT_PURGEABLE; } if (s & DRM_GEM_OBJECT_PURGEABLE) status.purgeable += add_size; } spin_unlock(&file->table_lock); drm_print_memory_stats(p, &status, supported_status, "memory"); } EXPORT_SYMBOL(drm_show_memory_stats); /** * drm_show_fdinfo - helper for drm file fops * @m: output stream * @f: the device file instance * * Helper to implement fdinfo, for userspace to query usage stats, etc, of a * process using the GPU. See also &drm_driver.show_fdinfo. * * For text output format description please see Documentation/gpu/drm-usage-stats.rst */ void drm_show_fdinfo(struct seq_file *m, struct file *f) { struct drm_file *file = f->private_data; struct drm_device *dev = file->minor->dev; struct drm_printer p = drm_seq_file_printer(m); drm_printf(&p, "drm-driver:\t%s\n", dev->driver->name); drm_printf(&p, "drm-client-id:\t%llu\n", file->client_id); if (dev_is_pci(dev->dev)) { struct pci_dev *pdev = to_pci_dev(dev->dev); drm_printf(&p, "drm-pdev:\t%04x:%02x:%02x.%d\n", pci_domain_nr(pdev->bus), pdev->bus->number, PCI_SLOT(pdev->devfn), PCI_FUNC(pdev->devfn)); } if (dev->driver->show_fdinfo) dev->driver->show_fdinfo(&p, file); } EXPORT_SYMBOL(drm_show_fdinfo); /** * mock_drm_getfile - Create a new struct file for the drm device * @minor: drm minor to wrap (e.g. #drm_device.primary) * @flags: file creation mode (O_RDWR etc) * * This create a new struct file that wraps a DRM file context around a * DRM minor. This mimicks userspace opening e.g. /dev/dri/card0, but without * invoking userspace. The struct file may be operated on using its f_op * (the drm_device.driver.fops) to mimick userspace operations, or be supplied * to userspace facing functions as an internal/anonymous client. * * RETURNS: * Pointer to newly created struct file, ERR_PTR on failure. */ struct file *mock_drm_getfile(struct drm_minor *minor, unsigned int flags) { struct drm_device *dev = minor->dev; struct drm_file *priv; struct file *file; priv = drm_file_alloc(minor); if (IS_ERR(priv)) return ERR_CAST(priv); file = anon_inode_getfile("drm", dev->driver->fops, priv, flags); if (IS_ERR(file)) { drm_file_free(priv); return file; } /* Everyone shares a single global address space */ file->f_mapping = dev->anon_inode->i_mapping; drm_dev_get(dev); priv->filp = file; return file; } EXPORT_SYMBOL_FOR_TESTS_ONLY(mock_drm_getfile);
46 45 16 6 17 17 13 17 4 4 2 1 1 1 1 1 1 1 1 1 1 4 4 4 15 15 50 47 44 41 13 40 40 40 40 39 13 12 39 32 32 8 39 42 44 10 10 9 9 5 9 8 9 10 10 4 3 4 4 13 11 6 1 6 6 5 14 17 15 13 13 11 18 2 8 5 10 10 2 1 2 2 2 13 4 2 4 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 // SPDX-License-Identifier: GPL-2.0-or-later /* * OSS compatible sequencer driver * * MIDI device handlers * * Copyright (C) 1998,99 Takashi Iwai <tiwai@suse.de> */ #include <sound/asoundef.h> #include "seq_oss_midi.h" #include "seq_oss_readq.h" #include "seq_oss_timer.h" #include "seq_oss_event.h" #include <sound/seq_midi_event.h> #include "../seq_lock.h" #include <linux/init.h> #include <linux/slab.h> #include <linux/nospec.h> /* * constants */ #define SNDRV_SEQ_OSS_MAX_MIDI_NAME 30 /* * definition of midi device record */ struct seq_oss_midi { int seq_device; /* device number */ int client; /* sequencer client number */ int port; /* sequencer port number */ unsigned int flags; /* port capability */ int opened; /* flag for opening */ unsigned char name[SNDRV_SEQ_OSS_MAX_MIDI_NAME]; struct snd_midi_event *coder; /* MIDI event coder */ struct seq_oss_devinfo *devinfo; /* assigned OSSseq device */ snd_use_lock_t use_lock; struct mutex open_mutex; }; /* * midi device table */ static int max_midi_devs; static struct seq_oss_midi *midi_devs[SNDRV_SEQ_OSS_MAX_MIDI_DEVS]; static DEFINE_SPINLOCK(register_lock); /* * prototypes */ static struct seq_oss_midi *get_mdev(int dev); static struct seq_oss_midi *get_mididev(struct seq_oss_devinfo *dp, int dev); static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev); static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev); /* * look up the existing ports * this looks a very exhausting job. */ int snd_seq_oss_midi_lookup_ports(int client) { struct snd_seq_client_info *clinfo; struct snd_seq_port_info *pinfo; clinfo = kzalloc(sizeof(*clinfo), GFP_KERNEL); pinfo = kzalloc(sizeof(*pinfo), GFP_KERNEL); if (! clinfo || ! pinfo) { kfree(clinfo); kfree(pinfo); return -ENOMEM; } clinfo->client = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_CLIENT, clinfo) == 0) { if (clinfo->client == client) continue; /* ignore myself */ pinfo->addr.client = clinfo->client; pinfo->addr.port = -1; while (snd_seq_kernel_client_ctl(client, SNDRV_SEQ_IOCTL_QUERY_NEXT_PORT, pinfo) == 0) snd_seq_oss_midi_check_new_port(pinfo); } kfree(clinfo); kfree(pinfo); return 0; } /* */ static struct seq_oss_midi * get_mdev(int dev) { struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(&register_lock, flags); mdev = midi_devs[dev]; if (mdev) snd_use_lock_use(&mdev->use_lock); spin_unlock_irqrestore(&register_lock, flags); return mdev; } /* * look for the identical slot */ static struct seq_oss_midi * find_slot(int client, int port) { int i; struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(&register_lock, flags); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev && mdev->client == client && mdev->port == port) { /* found! */ snd_use_lock_use(&mdev->use_lock); spin_unlock_irqrestore(&register_lock, flags); return mdev; } } spin_unlock_irqrestore(&register_lock, flags); return NULL; } #define PERM_WRITE (SNDRV_SEQ_PORT_CAP_WRITE|SNDRV_SEQ_PORT_CAP_SUBS_WRITE) #define PERM_READ (SNDRV_SEQ_PORT_CAP_READ|SNDRV_SEQ_PORT_CAP_SUBS_READ) /* * register a new port if it doesn't exist yet */ int snd_seq_oss_midi_check_new_port(struct snd_seq_port_info *pinfo) { int i; struct seq_oss_midi *mdev; unsigned long flags; /* the port must include generic midi */ if (! (pinfo->type & SNDRV_SEQ_PORT_TYPE_MIDI_GENERIC)) return 0; /* either read or write subscribable */ if ((pinfo->capability & PERM_WRITE) != PERM_WRITE && (pinfo->capability & PERM_READ) != PERM_READ) return 0; /* * look for the identical slot */ mdev = find_slot(pinfo->addr.client, pinfo->addr.port); if (mdev) { /* already exists */ snd_use_lock_free(&mdev->use_lock); return 0; } /* * allocate midi info record */ mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); if (!mdev) return -ENOMEM; /* copy the port information */ mdev->client = pinfo->addr.client; mdev->port = pinfo->addr.port; mdev->flags = pinfo->capability; mdev->opened = 0; snd_use_lock_init(&mdev->use_lock); mutex_init(&mdev->open_mutex); /* copy and truncate the name of synth device */ strscpy(mdev->name, pinfo->name, sizeof(mdev->name)); /* create MIDI coder */ if (snd_midi_event_new(MAX_MIDI_EVENT_BUF, &mdev->coder) < 0) { pr_err("ALSA: seq_oss: can't malloc midi coder\n"); kfree(mdev); return -ENOMEM; } /* OSS sequencer adds running status to all sequences */ snd_midi_event_no_status(mdev->coder, 1); /* * look for en empty slot */ spin_lock_irqsave(&register_lock, flags); for (i = 0; i < max_midi_devs; i++) { if (midi_devs[i] == NULL) break; } if (i >= max_midi_devs) { if (max_midi_devs >= SNDRV_SEQ_OSS_MAX_MIDI_DEVS) { spin_unlock_irqrestore(&register_lock, flags); snd_midi_event_free(mdev->coder); kfree(mdev); return -ENOMEM; } max_midi_devs++; } mdev->seq_device = i; midi_devs[mdev->seq_device] = mdev; spin_unlock_irqrestore(&register_lock, flags); return 0; } /* * release the midi device if it was registered */ int snd_seq_oss_midi_check_exit_port(int client, int port) { struct seq_oss_midi *mdev; unsigned long flags; int index; mdev = find_slot(client, port); if (mdev) { spin_lock_irqsave(&register_lock, flags); midi_devs[mdev->seq_device] = NULL; spin_unlock_irqrestore(&register_lock, flags); snd_use_lock_free(&mdev->use_lock); snd_use_lock_sync(&mdev->use_lock); snd_midi_event_free(mdev->coder); kfree(mdev); } spin_lock_irqsave(&register_lock, flags); for (index = max_midi_devs - 1; index >= 0; index--) { if (midi_devs[index]) break; } max_midi_devs = index + 1; spin_unlock_irqrestore(&register_lock, flags); return 0; } /* * release the midi device if it was registered */ void snd_seq_oss_midi_clear_all(void) { int i; struct seq_oss_midi *mdev; unsigned long flags; spin_lock_irqsave(&register_lock, flags); for (i = 0; i < max_midi_devs; i++) { mdev = midi_devs[i]; if (mdev) { snd_midi_event_free(mdev->coder); kfree(mdev); midi_devs[i] = NULL; } } max_midi_devs = 0; spin_unlock_irqrestore(&register_lock, flags); } /* * set up midi tables */ void snd_seq_oss_midi_setup(struct seq_oss_devinfo *dp) { spin_lock_irq(&register_lock); dp->max_mididev = max_midi_devs; spin_unlock_irq(&register_lock); } /* * clean up midi tables */ void snd_seq_oss_midi_cleanup(struct seq_oss_devinfo *dp) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_close(dp, i); dp->max_mididev = 0; } /* * open all midi devices. ignore errors. */ void snd_seq_oss_midi_open_all(struct seq_oss_devinfo *dp, int file_mode) { int i; for (i = 0; i < dp->max_mididev; i++) snd_seq_oss_midi_open(dp, i, file_mode); } /* * get the midi device information */ static struct seq_oss_midi * get_mididev(struct seq_oss_devinfo *dp, int dev) { if (dev < 0 || dev >= dp->max_mididev) return NULL; dev = array_index_nospec(dev, dp->max_mididev); return get_mdev(dev); } /* * open the midi device if not opened yet */ int snd_seq_oss_midi_open(struct seq_oss_devinfo *dp, int dev, int fmode) { int perm; struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; int err; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; mutex_lock(&mdev->open_mutex); /* already used? */ if (mdev->opened && mdev->devinfo != dp) { err = -EBUSY; goto unlock; } perm = 0; if (is_write_mode(fmode)) perm |= PERM_WRITE; if (is_read_mode(fmode)) perm |= PERM_READ; perm &= mdev->flags; if (perm == 0) { err = -ENXIO; goto unlock; } /* already opened? */ if ((mdev->opened & perm) == perm) { err = 0; goto unlock; } perm &= ~mdev->opened; memset(&subs, 0, sizeof(subs)); if (perm & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_WRITE; } if (perm & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; subs.flags = SNDRV_SEQ_PORT_SUBS_TIMESTAMP; subs.queue = dp->queue; /* queue for timestamps */ if (snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_SUBSCRIBE_PORT, &subs) >= 0) mdev->opened |= PERM_READ; } if (! mdev->opened) { err = -ENXIO; goto unlock; } mdev->devinfo = dp; err = 0; unlock: mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return err; } /* * close the midi device if already opened */ int snd_seq_oss_midi_close(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; struct snd_seq_port_subscribe subs; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; mutex_lock(&mdev->open_mutex); if (!mdev->opened || mdev->devinfo != dp) goto unlock; memset(&subs, 0, sizeof(subs)); if (mdev->opened & PERM_WRITE) { subs.sender = dp->addr; subs.dest.client = mdev->client; subs.dest.port = mdev->port; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } if (mdev->opened & PERM_READ) { subs.sender.client = mdev->client; subs.sender.port = mdev->port; subs.dest = dp->addr; snd_seq_kernel_client_ctl(dp->cseq, SNDRV_SEQ_IOCTL_UNSUBSCRIBE_PORT, &subs); } mdev->opened = 0; mdev->devinfo = NULL; unlock: mutex_unlock(&mdev->open_mutex); snd_use_lock_free(&mdev->use_lock); return 0; } /* * change seq capability flags to file mode flags */ int snd_seq_oss_midi_filemode(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; int mode; mdev = get_mididev(dp, dev); if (!mdev) return 0; mode = 0; if (mdev->opened & PERM_WRITE) mode |= SNDRV_SEQ_OSS_FILE_WRITE; if (mdev->opened & PERM_READ) mode |= SNDRV_SEQ_OSS_FILE_READ; snd_use_lock_free(&mdev->use_lock); return mode; } /* * reset the midi device and close it: * so far, only close the device. */ void snd_seq_oss_midi_reset(struct seq_oss_devinfo *dp, int dev) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return; if (! mdev->opened) { snd_use_lock_free(&mdev->use_lock); return; } if (mdev->opened & PERM_WRITE) { struct snd_seq_event ev; int c; memset(&ev, 0, sizeof(ev)); ev.dest.client = mdev->client; ev.dest.port = mdev->port; ev.queue = dp->queue; ev.source.port = dp->port; if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_SYNTH) { ev.type = SNDRV_SEQ_EVENT_SENSING; snd_seq_oss_dispatch(dp, &ev, 0, 0); } for (c = 0; c < 16; c++) { ev.type = SNDRV_SEQ_EVENT_CONTROLLER; ev.data.control.channel = c; ev.data.control.param = MIDI_CTL_ALL_NOTES_OFF; snd_seq_oss_dispatch(dp, &ev, 0, 0); if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) { ev.data.control.param = MIDI_CTL_RESET_CONTROLLERS; snd_seq_oss_dispatch(dp, &ev, 0, 0); ev.type = SNDRV_SEQ_EVENT_PITCHBEND; ev.data.control.value = 0; snd_seq_oss_dispatch(dp, &ev, 0, 0); } } } // snd_seq_oss_midi_close(dp, dev); snd_use_lock_free(&mdev->use_lock); } /* * get client/port of the specified MIDI device */ void snd_seq_oss_midi_get_addr(struct seq_oss_devinfo *dp, int dev, struct snd_seq_addr *addr) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return; addr->client = mdev->client; addr->port = mdev->port; snd_use_lock_free(&mdev->use_lock); } /* * input callback - this can be atomic */ int snd_seq_oss_midi_input(struct snd_seq_event *ev, int direct, void *private_data) { struct seq_oss_devinfo *dp = (struct seq_oss_devinfo *)private_data; struct seq_oss_midi *mdev; int rc; if (dp->readq == NULL) return 0; mdev = find_slot(ev->source.client, ev->source.port); if (!mdev) return 0; if (! (mdev->opened & PERM_READ)) { snd_use_lock_free(&mdev->use_lock); return 0; } if (dp->seq_mode == SNDRV_SEQ_OSS_MODE_MUSIC) rc = send_synth_event(dp, ev, mdev->seq_device); else rc = send_midi_event(dp, ev, mdev); snd_use_lock_free(&mdev->use_lock); return rc; } /* * convert ALSA sequencer event to OSS synth event */ static int send_synth_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, int dev) { union evrec ossev; memset(&ossev, 0, sizeof(ossev)); switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: ossev.v.cmd = MIDI_NOTEON; break; case SNDRV_SEQ_EVENT_NOTEOFF: ossev.v.cmd = MIDI_NOTEOFF; break; case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.cmd = MIDI_KEY_PRESSURE; break; case SNDRV_SEQ_EVENT_CONTROLLER: ossev.l.cmd = MIDI_CTL_CHANGE; break; case SNDRV_SEQ_EVENT_PGMCHANGE: ossev.l.cmd = MIDI_PGM_CHANGE; break; case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.cmd = MIDI_CHN_PRESSURE; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.cmd = MIDI_PITCH_BEND; break; default: return 0; /* not supported */ } ossev.v.dev = dev; switch (ev->type) { case SNDRV_SEQ_EVENT_NOTEON: case SNDRV_SEQ_EVENT_NOTEOFF: case SNDRV_SEQ_EVENT_KEYPRESS: ossev.v.code = EV_CHN_VOICE; ossev.v.note = ev->data.note.note; ossev.v.parm = ev->data.note.velocity; ossev.v.chn = ev->data.note.channel; break; case SNDRV_SEQ_EVENT_CONTROLLER: case SNDRV_SEQ_EVENT_PGMCHANGE: case SNDRV_SEQ_EVENT_CHANPRESS: ossev.l.code = EV_CHN_COMMON; ossev.l.p1 = ev->data.control.param; ossev.l.val = ev->data.control.value; ossev.l.chn = ev->data.control.channel; break; case SNDRV_SEQ_EVENT_PITCHBEND: ossev.l.code = EV_CHN_COMMON; ossev.l.val = ev->data.control.value + 8192; ossev.l.chn = ev->data.control.channel; break; } snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); snd_seq_oss_readq_put_event(dp->readq, &ossev); return 0; } /* * decode event and send MIDI bytes to read queue */ static int send_midi_event(struct seq_oss_devinfo *dp, struct snd_seq_event *ev, struct seq_oss_midi *mdev) { char msg[32]; int len; snd_seq_oss_readq_put_timestamp(dp->readq, ev->time.tick, dp->seq_mode); if (!dp->timer->running) len = snd_seq_oss_timer_start(dp->timer); if (ev->type == SNDRV_SEQ_EVENT_SYSEX) { snd_seq_oss_readq_sysex(dp->readq, mdev->seq_device, ev); snd_midi_event_reset_decode(mdev->coder); } else { len = snd_midi_event_decode(mdev->coder, msg, sizeof(msg), ev); if (len > 0) snd_seq_oss_readq_puts(dp->readq, mdev->seq_device, msg, len); } return 0; } /* * dump midi data * return 0 : enqueued * non-zero : invalid - ignored */ int snd_seq_oss_midi_putc(struct seq_oss_devinfo *dp, int dev, unsigned char c, struct snd_seq_event *ev) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return -ENODEV; if (snd_midi_event_encode_byte(mdev->coder, c, ev)) { snd_seq_oss_fill_addr(dp, ev, mdev->client, mdev->port); snd_use_lock_free(&mdev->use_lock); return 0; } snd_use_lock_free(&mdev->use_lock); return -EINVAL; } /* * create OSS compatible midi_info record */ int snd_seq_oss_midi_make_info(struct seq_oss_devinfo *dp, int dev, struct midi_info *inf) { struct seq_oss_midi *mdev; mdev = get_mididev(dp, dev); if (!mdev) return -ENXIO; inf->device = dev; inf->dev_type = 0; /* FIXME: ?? */ inf->capabilities = 0; /* FIXME: ?? */ strscpy(inf->name, mdev->name, sizeof(inf->name)); snd_use_lock_free(&mdev->use_lock); return 0; } #ifdef CONFIG_SND_PROC_FS /* * proc interface */ static char * capmode_str(int val) { val &= PERM_READ|PERM_WRITE; if (val == (PERM_READ|PERM_WRITE)) return "read/write"; else if (val == PERM_READ) return "read"; else if (val == PERM_WRITE) return "write"; else return "none"; } void snd_seq_oss_midi_info_read(struct snd_info_buffer *buf) { int i; struct seq_oss_midi *mdev; snd_iprintf(buf, "\nNumber of MIDI devices: %d\n", max_midi_devs); for (i = 0; i < max_midi_devs; i++) { snd_iprintf(buf, "\nmidi %d: ", i); mdev = get_mdev(i); if (mdev == NULL) { snd_iprintf(buf, "*empty*\n"); continue; } snd_iprintf(buf, "[%s] ALSA port %d:%d\n", mdev->name, mdev->client, mdev->port); snd_iprintf(buf, " capability %s / opened %s\n", capmode_str(mdev->flags), capmode_str(mdev->opened)); snd_use_lock_free(&mdev->use_lock); } } #endif /* CONFIG_SND_PROC_FS */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 // SPDX-License-Identifier: GPL-2.0-or-later /* * acpi_bus.c - ACPI Bus Driver ($Revision: 80 $) * * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #define pr_fmt(fmt) "ACPI: " fmt #include <linux/module.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/sched.h> #include <linux/pm.h> #include <linux/device.h> #include <linux/proc_fs.h> #include <linux/acpi.h> #include <linux/slab.h> #include <linux/regulator/machine.h> #include <linux/workqueue.h> #include <linux/reboot.h> #include <linux/delay.h> #ifdef CONFIG_X86 #include <asm/mpspec.h> #include <linux/dmi.h> #endif #include <linux/acpi_viot.h> #include <linux/pci.h> #include <acpi/apei.h> #include <linux/suspend.h> #include <linux/prmt.h> #include "internal.h" struct acpi_device *acpi_root; struct proc_dir_entry *acpi_root_dir; EXPORT_SYMBOL(acpi_root_dir); #ifdef CONFIG_X86 #ifdef CONFIG_ACPI_CUSTOM_DSDT static inline int set_copy_dsdt(const struct dmi_system_id *id) { return 0; } #else static int set_copy_dsdt(const struct dmi_system_id *id) { pr_notice("%s detected - force copy of DSDT to local memory\n", id->ident); acpi_gbl_copy_dsdt_locally = 1; return 0; } #endif static const struct dmi_system_id dsdt_dmi_table[] __initconst = { /* * Invoke DSDT corruption work-around on all Toshiba Satellite. * https://bugzilla.kernel.org/show_bug.cgi?id=14679 */ { .callback = set_copy_dsdt, .ident = "TOSHIBA Satellite", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "TOSHIBA"), DMI_MATCH(DMI_PRODUCT_NAME, "Satellite"), }, }, {} }; #endif /* -------------------------------------------------------------------------- Device Management -------------------------------------------------------------------------- */ acpi_status acpi_bus_get_status_handle(acpi_handle handle, unsigned long long *sta) { acpi_status status; status = acpi_evaluate_integer(handle, "_STA", NULL, sta); if (ACPI_SUCCESS(status)) return AE_OK; if (status == AE_NOT_FOUND) { *sta = ACPI_STA_DEVICE_PRESENT | ACPI_STA_DEVICE_ENABLED | ACPI_STA_DEVICE_UI | ACPI_STA_DEVICE_FUNCTIONING; return AE_OK; } return status; } EXPORT_SYMBOL_GPL(acpi_bus_get_status_handle); int acpi_bus_get_status(struct acpi_device *device) { acpi_status status; unsigned long long sta; if (acpi_device_override_status(device, &sta)) { acpi_set_device_status(device, sta); return 0; } /* Battery devices must have their deps met before calling _STA */ if (acpi_device_is_battery(device) && device->dep_unmet) { acpi_set_device_status(device, 0); return 0; } status = acpi_bus_get_status_handle(device->handle, &sta); if (ACPI_FAILURE(status)) return -ENODEV; acpi_set_device_status(device, sta); if (device->status.functional && !device->status.present) { pr_debug("Device [%s] status [%08x]: functional but not present\n", device->pnp.bus_id, (u32)sta); } pr_debug("Device [%s] status [%08x]\n", device->pnp.bus_id, (u32)sta); return 0; } EXPORT_SYMBOL(acpi_bus_get_status); void acpi_bus_private_data_handler(acpi_handle handle, void *context) { return; } EXPORT_SYMBOL(acpi_bus_private_data_handler); int acpi_bus_attach_private_data(acpi_handle handle, void *data) { acpi_status status; status = acpi_attach_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "Error attaching device data\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_attach_private_data); int acpi_bus_get_private_data(acpi_handle handle, void **data) { acpi_status status; if (!data) return -EINVAL; status = acpi_get_data(handle, acpi_bus_private_data_handler, data); if (ACPI_FAILURE(status)) { acpi_handle_debug(handle, "No context for object\n"); return -ENODEV; } return 0; } EXPORT_SYMBOL_GPL(acpi_bus_get_private_data); void acpi_bus_detach_private_data(acpi_handle handle) { acpi_detach_data(handle, acpi_bus_private_data_handler); } EXPORT_SYMBOL_GPL(acpi_bus_detach_private_data); static void acpi_print_osc_error(acpi_handle handle, struct acpi_osc_context *context, char *error) { int i; acpi_handle_debug(handle, "(%s): %s\n", context->uuid_str, error); pr_debug("_OSC request data:"); for (i = 0; i < context->cap.length; i += sizeof(u32)) pr_debug(" %x", *((u32 *)(context->cap.pointer + i))); pr_debug("\n"); } acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context) { acpi_status status; struct acpi_object_list input; union acpi_object in_params[4]; union acpi_object *out_obj; guid_t guid; u32 errors; struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL}; if (!context) return AE_ERROR; if (guid_parse(context->uuid_str, &guid)) return AE_ERROR; context->ret.length = ACPI_ALLOCATE_BUFFER; context->ret.pointer = NULL; /* Setting up input parameters */ input.count = 4; input.pointer = in_params; in_params[0].type = ACPI_TYPE_BUFFER; in_params[0].buffer.length = 16; in_params[0].buffer.pointer = (u8 *)&guid; in_params[1].type = ACPI_TYPE_INTEGER; in_params[1].integer.value = context->rev; in_params[2].type = ACPI_TYPE_INTEGER; in_params[2].integer.value = context->cap.length/sizeof(u32); in_params[3].type = ACPI_TYPE_BUFFER; in_params[3].buffer.length = context->cap.length; in_params[3].buffer.pointer = context->cap.pointer; status = acpi_evaluate_object(handle, "_OSC", &input, &output); if (ACPI_FAILURE(status)) return status; if (!output.length) return AE_NULL_OBJECT; out_obj = output.pointer; if (out_obj->type != ACPI_TYPE_BUFFER || out_obj->buffer.length != context->cap.length) { acpi_print_osc_error(handle, context, "_OSC evaluation returned wrong type"); status = AE_TYPE; goto out_kfree; } /* Need to ignore the bit0 in result code */ errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); if (errors) { if (errors & OSC_REQUEST_ERROR) acpi_print_osc_error(handle, context, "_OSC request failed"); if (errors & OSC_INVALID_UUID_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid UUID"); if (errors & OSC_INVALID_REVISION_ERROR) acpi_print_osc_error(handle, context, "_OSC invalid revision"); if (errors & OSC_CAPABILITIES_MASK_ERROR) { if (((u32 *)context->cap.pointer)[OSC_QUERY_DWORD] & OSC_QUERY_ENABLE) goto out_success; status = AE_SUPPORT; goto out_kfree; } status = AE_ERROR; goto out_kfree; } out_success: context->ret.length = out_obj->buffer.length; context->ret.pointer = kmemdup(out_obj->buffer.pointer, context->ret.length, GFP_KERNEL); if (!context->ret.pointer) { status = AE_NO_MEMORY; goto out_kfree; } status = AE_OK; out_kfree: kfree(output.pointer); return status; } EXPORT_SYMBOL(acpi_run_osc); bool osc_sb_apei_support_acked; /* * ACPI 6.0 Section 8.4.4.2 Idle State Coordination * OSPM supports platform coordinated low power idle(LPI) states */ bool osc_pc_lpi_support_confirmed; EXPORT_SYMBOL_GPL(osc_pc_lpi_support_confirmed); /* * ACPI 6.2 Section 6.2.11.2 'Platform-Wide OSPM Capabilities': * Starting with ACPI Specification 6.2, all _CPC registers can be in * PCC, System Memory, System IO, or Functional Fixed Hardware address * spaces. OSPM support for this more flexible register space scheme is * indicated by the “Flexible Address Space for CPPC Registers” _OSC bit. * * Otherwise (cf ACPI 6.1, s8.4.7.1.1.X), _CPC registers must be in: * - PCC or Functional Fixed Hardware address space if defined * - SystemMemory address space (NULL register) if not defined */ bool osc_cpc_flexible_adr_space_confirmed; EXPORT_SYMBOL_GPL(osc_cpc_flexible_adr_space_confirmed); /* * ACPI 6.4 Operating System Capabilities for USB. */ bool osc_sb_native_usb4_support_confirmed; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_support_confirmed); bool osc_sb_cppc2_support_acked; static u8 sb_uuid_str[] = "0811B06E-4A27-44F9-8D60-3CBBC22E7B48"; static void acpi_bus_osc_negotiate_platform_control(void) { u32 capbuf[2], *capbuf_ret; struct acpi_osc_context context = { .uuid_str = sb_uuid_str, .rev = 1, .cap.length = 8, .cap.pointer = capbuf, }; acpi_handle handle; capbuf[OSC_QUERY_DWORD] = OSC_QUERY_ENABLE; capbuf[OSC_SUPPORT_DWORD] = OSC_SB_PR3_SUPPORT; /* _PR3 is in use */ if (IS_ENABLED(CONFIG_ACPI_PROCESSOR_AGGREGATOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PAD_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PROCESSOR)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PPC_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_PRMT)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PRM_SUPPORT; if (IS_ENABLED(CONFIG_ACPI_FFH)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_FFH_OPR_SUPPORT; #ifdef CONFIG_ARM64 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_X86 capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_GENERIC_INITIATOR_SUPPORT; #endif #ifdef CONFIG_ACPI_CPPC_LIB capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT; capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT; #endif capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_FLEXIBLE_ADR_SPACE; if (IS_ENABLED(CONFIG_SCHED_MC_PRIO)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT; if (IS_ENABLED(CONFIG_USB4)) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_NATIVE_USB4_SUPPORT; if (!ghes_disable) capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length <= OSC_SUPPORT_DWORD) { kfree(context.ret.pointer); return; } /* * Now run _OSC again with query flag clear and with the caps * supported by both the OS and the platform. */ capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_SUPPORT_DWORD] = capbuf_ret[OSC_SUPPORT_DWORD]; kfree(context.ret.pointer); if (ACPI_FAILURE(acpi_run_osc(handle, &context))) return; capbuf_ret = context.ret.pointer; if (context.ret.length > OSC_SUPPORT_DWORD) { #ifdef CONFIG_ACPI_CPPC_LIB osc_sb_cppc2_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPCV2_SUPPORT; #endif osc_sb_apei_support_acked = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_APEI_SUPPORT; osc_pc_lpi_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_PCLPI_SUPPORT; osc_sb_native_usb4_support_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_NATIVE_USB4_SUPPORT; osc_cpc_flexible_adr_space_confirmed = capbuf_ret[OSC_SUPPORT_DWORD] & OSC_SB_CPC_FLEXIBLE_ADR_SPACE; } kfree(context.ret.pointer); } /* * Native control of USB4 capabilities. If any of the tunneling bits is * set it means OS is in control and we use software based connection * manager. */ u32 osc_sb_native_usb4_control; EXPORT_SYMBOL_GPL(osc_sb_native_usb4_control); static void acpi_bus_decode_usb_osc(const char *msg, u32 bits) { pr_info("%s USB3%c DisplayPort%c PCIe%c XDomain%c\n", msg, (bits & OSC_USB_USB3_TUNNELING) ? '+' : '-', (bits & OSC_USB_DP_TUNNELING) ? '+' : '-', (bits & OSC_USB_PCIE_TUNNELING) ? '+' : '-', (bits & OSC_USB_XDOMAIN) ? '+' : '-'); } static u8 sb_usb_uuid_str[] = "23A0D13A-26AB-486C-9C5F-0FFA525A575A"; static void acpi_bus_osc_negotiate_usb_control(void) { u32 capbuf[3]; struct acpi_osc_context context = { .uuid_str = sb_usb_uuid_str, .rev = 1, .cap.length = sizeof(capbuf), .cap.pointer = capbuf, }; acpi_handle handle; acpi_status status; u32 control; if (!osc_sb_native_usb4_support_confirmed) return; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle))) return; control = OSC_USB_USB3_TUNNELING | OSC_USB_DP_TUNNELING | OSC_USB_PCIE_TUNNELING | OSC_USB_XDOMAIN; capbuf[OSC_QUERY_DWORD] = 0; capbuf[OSC_SUPPORT_DWORD] = 0; capbuf[OSC_CONTROL_DWORD] = control; status = acpi_run_osc(handle, &context); if (ACPI_FAILURE(status)) return; if (context.ret.length != sizeof(capbuf)) { pr_info("USB4 _OSC: returned invalid length buffer\n"); goto out_free; } osc_sb_native_usb4_control = control & acpi_osc_ctx_get_pci_control(&context); acpi_bus_decode_usb_osc("USB4 _OSC: OS supports", control); acpi_bus_decode_usb_osc("USB4 _OSC: OS controls", osc_sb_native_usb4_control); out_free: kfree(context.ret.pointer); } /* -------------------------------------------------------------------------- Notification Handling -------------------------------------------------------------------------- */ /** * acpi_bus_notify - Global system-level (0x00-0x7F) notifications handler * @handle: Target ACPI object. * @type: Notification type. * @data: Ignored. * * This only handles notifications related to device hotplug. */ static void acpi_bus_notify(acpi_handle handle, u32 type, void *data) { struct acpi_device *adev; switch (type) { case ACPI_NOTIFY_BUS_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_BUS_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK event\n"); break; case ACPI_NOTIFY_DEVICE_WAKE: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_WAKE event\n"); return; case ACPI_NOTIFY_EJECT_REQUEST: acpi_handle_debug(handle, "ACPI_NOTIFY_EJECT_REQUEST event\n"); break; case ACPI_NOTIFY_DEVICE_CHECK_LIGHT: acpi_handle_debug(handle, "ACPI_NOTIFY_DEVICE_CHECK_LIGHT event\n"); /* TBD: Exactly what does 'light' mean? */ return; case ACPI_NOTIFY_FREQUENCY_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a frequency mismatch\n"); return; case ACPI_NOTIFY_BUS_MODE_MISMATCH: acpi_handle_err(handle, "Device cannot be configured due " "to a bus mode mismatch\n"); return; case ACPI_NOTIFY_POWER_FAULT: acpi_handle_err(handle, "Device has suffered a power fault\n"); return; default: acpi_handle_debug(handle, "Unknown event type 0x%x\n", type); return; } adev = acpi_get_acpi_dev(handle); if (adev && ACPI_SUCCESS(acpi_hotplug_schedule(adev, type))) return; acpi_put_acpi_dev(adev); acpi_evaluate_ost(handle, type, ACPI_OST_SC_NON_SPECIFIC_FAILURE, NULL); } static void acpi_notify_device(acpi_handle handle, u32 event, void *data) { struct acpi_device *device = data; struct acpi_driver *acpi_drv = to_acpi_driver(device->dev.driver); acpi_drv->ops.notify(device, event); } static int acpi_device_install_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_status status; status = acpi_install_notify_handler(device->handle, type, acpi_notify_device, device); if (ACPI_FAILURE(status)) return -EINVAL; return 0; } static void acpi_device_remove_notify_handler(struct acpi_device *device, struct acpi_driver *acpi_drv) { u32 type = acpi_drv->flags & ACPI_DRIVER_ALL_NOTIFY_EVENTS ? ACPI_ALL_NOTIFY : ACPI_DEVICE_NOTIFY; acpi_remove_notify_handler(device->handle, type, acpi_notify_device); acpi_os_wait_events_complete(); } int acpi_dev_install_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler, void *context) { acpi_status status; status = acpi_install_notify_handler(adev->handle, handler_type, handler, context); if (ACPI_FAILURE(status)) return -ENODEV; return 0; } EXPORT_SYMBOL_GPL(acpi_dev_install_notify_handler); void acpi_dev_remove_notify_handler(struct acpi_device *adev, u32 handler_type, acpi_notify_handler handler) { acpi_remove_notify_handler(adev->handle, handler_type, handler); acpi_os_wait_events_complete(); } EXPORT_SYMBOL_GPL(acpi_dev_remove_notify_handler); /* Handle events targeting \_SB device (at present only graceful shutdown) */ #define ACPI_SB_NOTIFY_SHUTDOWN_REQUEST 0x81 #define ACPI_SB_INDICATE_INTERVAL 10000 static void sb_notify_work(struct work_struct *dummy) { acpi_handle sb_handle; orderly_poweroff(true); /* * After initiating graceful shutdown, the ACPI spec requires OSPM * to evaluate _OST method once every 10seconds to indicate that * the shutdown is in progress */ acpi_get_handle(NULL, "\\_SB", &sb_handle); while (1) { pr_info("Graceful shutdown in progress.\n"); acpi_evaluate_ost(sb_handle, ACPI_OST_EC_OSPM_SHUTDOWN, ACPI_OST_SC_OS_SHUTDOWN_IN_PROGRESS, NULL); msleep(ACPI_SB_INDICATE_INTERVAL); } } static void acpi_sb_notify(acpi_handle handle, u32 event, void *data) { static DECLARE_WORK(acpi_sb_work, sb_notify_work); if (event == ACPI_SB_NOTIFY_SHUTDOWN_REQUEST) { if (!work_busy(&acpi_sb_work)) schedule_work(&acpi_sb_work); } else { pr_warn("event %x is not supported by \\_SB device\n", event); } } static int __init acpi_setup_sb_notify_handler(void) { acpi_handle sb_handle; if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &sb_handle))) return -ENXIO; if (ACPI_FAILURE(acpi_install_notify_handler(sb_handle, ACPI_DEVICE_NOTIFY, acpi_sb_notify, NULL))) return -EINVAL; return 0; } /* -------------------------------------------------------------------------- Device Matching -------------------------------------------------------------------------- */ /** * acpi_get_first_physical_node - Get first physical node of an ACPI device * @adev: ACPI device in question * * Return: First physical node of ACPI device @adev */ struct device *acpi_get_first_physical_node(struct acpi_device *adev) { struct mutex *physical_node_lock = &adev->physical_node_lock; struct device *phys_dev; mutex_lock(physical_node_lock); if (list_empty(&adev->physical_node_list)) { phys_dev = NULL; } else { const struct acpi_device_physical_node *node; node = list_first_entry(&adev->physical_node_list, struct acpi_device_physical_node, node); phys_dev = node->dev; } mutex_unlock(physical_node_lock); return phys_dev; } EXPORT_SYMBOL_GPL(acpi_get_first_physical_node); static struct acpi_device *acpi_primary_dev_companion(struct acpi_device *adev, const struct device *dev) { const struct device *phys_dev = acpi_get_first_physical_node(adev); return phys_dev && phys_dev == dev ? adev : NULL; } /** * acpi_device_is_first_physical_node - Is given dev first physical node * @adev: ACPI companion device * @dev: Physical device to check * * Function checks if given @dev is the first physical devices attached to * the ACPI companion device. This distinction is needed in some cases * where the same companion device is shared between many physical devices. * * Note that the caller have to provide valid @adev pointer. */ bool acpi_device_is_first_physical_node(struct acpi_device *adev, const struct device *dev) { return !!acpi_primary_dev_companion(adev, dev); } /* * acpi_companion_match() - Can we match via ACPI companion device * @dev: Device in question * * Check if the given device has an ACPI companion and if that companion has * a valid list of PNP IDs, and if the device is the first (primary) physical * device associated with it. Return the companion pointer if that's the case * or NULL otherwise. * * If multiple physical devices are attached to a single ACPI companion, we need * to be careful. The usage scenario for this kind of relationship is that all * of the physical devices in question use resources provided by the ACPI * companion. A typical case is an MFD device where all the sub-devices share * the parent's ACPI companion. In such cases we can only allow the primary * (first) physical device to be matched with the help of the companion's PNP * IDs. * * Additional physical devices sharing the ACPI companion can still use * resources available from it but they will be matched normally using functions * provided by their bus types (and analogously for their modalias). */ const struct acpi_device *acpi_companion_match(const struct device *dev) { struct acpi_device *adev; adev = ACPI_COMPANION(dev); if (!adev) return NULL; if (list_empty(&adev->pnp.ids)) return NULL; return acpi_primary_dev_companion(adev, dev); } /** * acpi_of_match_device - Match device object using the "compatible" property. * @adev: ACPI device object to match. * @of_match_table: List of device IDs to match against. * @of_id: OF ID if matched * * If @dev has an ACPI companion which has ACPI_DT_NAMESPACE_HID in its list of * identifiers and a _DSD object with the "compatible" property, use that * property to match against the given list of identifiers. */ static bool acpi_of_match_device(const struct acpi_device *adev, const struct of_device_id *of_match_table, const struct of_device_id **of_id) { const union acpi_object *of_compatible, *obj; int i, nval; if (!adev) return false; of_compatible = adev->data.of_compatible; if (!of_match_table || !of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) { nval = of_compatible->package.count; obj = of_compatible->package.elements; } else { /* Must be ACPI_TYPE_STRING. */ nval = 1; obj = of_compatible; } /* Now we can look for the driver DT compatible strings */ for (i = 0; i < nval; i++, obj++) { const struct of_device_id *id; for (id = of_match_table; id->compatible[0]; id++) if (!strcasecmp(obj->string.pointer, id->compatible)) { if (of_id) *of_id = id; return true; } } return false; } static bool acpi_of_modalias(struct acpi_device *adev, char *modalias, size_t len) { const union acpi_object *of_compatible; const union acpi_object *obj; const char *str, *chr; of_compatible = adev->data.of_compatible; if (!of_compatible) return false; if (of_compatible->type == ACPI_TYPE_PACKAGE) obj = of_compatible->package.elements; else /* Must be ACPI_TYPE_STRING. */ obj = of_compatible; str = obj->string.pointer; chr = strchr(str, ','); strscpy(modalias, chr ? chr + 1 : str, len); return true; } /** * acpi_set_modalias - Set modalias using "compatible" property or supplied ID * @adev: ACPI device object to match * @default_id: ID string to use as default if no compatible string found * @modalias: Pointer to buffer that modalias value will be copied into * @len: Length of modalias buffer * * This is a counterpart of of_alias_from_compatible() for struct acpi_device * objects. If there is a compatible string for @adev, it will be copied to * @modalias with the vendor prefix stripped; otherwise, @default_id will be * used. */ void acpi_set_modalias(struct acpi_device *adev, const char *default_id, char *modalias, size_t len) { if (!acpi_of_modalias(adev, modalias, len)) strscpy(modalias, default_id, len); } EXPORT_SYMBOL_GPL(acpi_set_modalias); static bool __acpi_match_device_cls(const struct acpi_device_id *id, struct acpi_hardware_id *hwid) { int i, msk, byte_shift; char buf[3]; if (!id->cls) return false; /* Apply class-code bitmask, before checking each class-code byte */ for (i = 1; i <= 3; i++) { byte_shift = 8 * (3 - i); msk = (id->cls_msk >> byte_shift) & 0xFF; if (!msk) continue; sprintf(buf, "%02x", (id->cls >> byte_shift) & msk); if (strncmp(buf, &hwid->id[(i - 1) * 2], 2)) return false; } return true; } static bool __acpi_match_device(const struct acpi_device *device, const struct acpi_device_id *acpi_ids, const struct of_device_id *of_ids, const struct acpi_device_id **acpi_id, const struct of_device_id **of_id) { const struct acpi_device_id *id; struct acpi_hardware_id *hwid; /* * If the device is not present, it is unnecessary to load device * driver for it. */ if (!device || !device->status.present) return false; list_for_each_entry(hwid, &device->pnp.ids, list) { /* First, check the ACPI/PNP IDs provided by the caller. */ if (acpi_ids) { for (id = acpi_ids; id->id[0] || id->cls; id++) { if (id->id[0] && !strcmp((char *)id->id, hwid->id)) goto out_acpi_match; if (id->cls && __acpi_match_device_cls(id, hwid)) goto out_acpi_match; } } /* * Next, check ACPI_DT_NAMESPACE_HID and try to match the * "compatible" property if found. */ if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id)) return acpi_of_match_device(device, of_ids, of_id); } return false; out_acpi_match: if (acpi_id) *acpi_id = id; return true; } /** * acpi_match_acpi_device - Match an ACPI device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id objects to match against. * @adev: The ACPI device pointer to match. * * Match the ACPI device @adev against a given list of ACPI IDs @ids. * * Return: * a pointer to the first matching ACPI ID on success or %NULL on failure. */ const struct acpi_device_id *acpi_match_acpi_device(const struct acpi_device_id *ids, const struct acpi_device *adev) { const struct acpi_device_id *id = NULL; __acpi_match_device(adev, ids, NULL, &id, NULL); return id; } EXPORT_SYMBOL_GPL(acpi_match_acpi_device); /** * acpi_match_device - Match a struct device against a given list of ACPI IDs * @ids: Array of struct acpi_device_id object to match against. * @dev: The device structure to match. * * Check if @dev has a valid ACPI handle and if there is a struct acpi_device * object for that handle and use that object to match against a given list of * device IDs. * * Return a pointer to the first matching ID on success or %NULL on failure. */ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev) { return acpi_match_acpi_device(ids, acpi_companion_match(dev)); } EXPORT_SYMBOL_GPL(acpi_match_device); static const void *acpi_of_device_get_match_data(const struct device *dev) { struct acpi_device *adev = ACPI_COMPANION(dev); const struct of_device_id *match = NULL; if (!acpi_of_match_device(adev, dev->driver->of_match_table, &match)) return NULL; return match->data; } const void *acpi_device_get_match_data(const struct device *dev) { const struct acpi_device_id *acpi_ids = dev->driver->acpi_match_table; const struct acpi_device_id *match; if (!acpi_ids) return acpi_of_device_get_match_data(dev); match = acpi_match_device(acpi_ids, dev); if (!match) return NULL; return (const void *)match->driver_data; } EXPORT_SYMBOL_GPL(acpi_device_get_match_data); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids) { return __acpi_match_device(device, ids, NULL, NULL, NULL) ? 0 : -ENOENT; } EXPORT_SYMBOL(acpi_match_device_ids); bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv) { const struct acpi_device_id *acpi_ids = drv->acpi_match_table; const struct of_device_id *of_ids = drv->of_match_table; if (!acpi_ids) return acpi_of_match_device(ACPI_COMPANION(dev), of_ids, NULL); return __acpi_match_device(acpi_companion_match(dev), acpi_ids, of_ids, NULL, NULL); } EXPORT_SYMBOL_GPL(acpi_driver_match_device); /* -------------------------------------------------------------------------- ACPI Driver Management -------------------------------------------------------------------------- */ /** * acpi_bus_register_driver - register a driver with the ACPI bus * @driver: driver being registered * * Registers a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and binds. Returns zero for * success or a negative error status for failure. */ int acpi_bus_register_driver(struct acpi_driver *driver) { if (acpi_disabled) return -ENODEV; driver->drv.name = driver->name; driver->drv.bus = &acpi_bus_type; driver->drv.owner = driver->owner; return driver_register(&driver->drv); } EXPORT_SYMBOL(acpi_bus_register_driver); /** * acpi_bus_unregister_driver - unregisters a driver with the ACPI bus * @driver: driver to unregister * * Unregisters a driver with the ACPI bus. Searches the namespace for all * devices that match the driver's criteria and unbinds. */ void acpi_bus_unregister_driver(struct acpi_driver *driver) { driver_unregister(&driver->drv); } EXPORT_SYMBOL(acpi_bus_unregister_driver); /* -------------------------------------------------------------------------- ACPI Bus operations -------------------------------------------------------------------------- */ static int acpi_bus_match(struct device *dev, struct device_driver *drv) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(drv); return acpi_dev->flags.match_driver && !acpi_match_device_ids(acpi_dev, acpi_drv->ids); } static int acpi_device_uevent(const struct device *dev, struct kobj_uevent_env *env) { return __acpi_device_uevent_modalias(to_acpi_device(dev), env); } static int acpi_device_probe(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); int ret; if (acpi_dev->handler && !acpi_is_pnp_device(acpi_dev)) return -EINVAL; if (!acpi_drv->ops.add) return -ENOSYS; ret = acpi_drv->ops.add(acpi_dev); if (ret) { acpi_dev->driver_data = NULL; return ret; } pr_debug("Driver [%s] successfully bound to device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); if (acpi_drv->ops.notify) { ret = acpi_device_install_notify_handler(acpi_dev, acpi_drv); if (ret) { if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; return ret; } } pr_debug("Found driver [%s] for device [%s]\n", acpi_drv->name, acpi_dev->pnp.bus_id); get_device(dev); return 0; } static void acpi_device_remove(struct device *dev) { struct acpi_device *acpi_dev = to_acpi_device(dev); struct acpi_driver *acpi_drv = to_acpi_driver(dev->driver); if (acpi_drv->ops.notify) acpi_device_remove_notify_handler(acpi_dev, acpi_drv); if (acpi_drv->ops.remove) acpi_drv->ops.remove(acpi_dev); acpi_dev->driver_data = NULL; put_device(dev); } struct bus_type acpi_bus_type = { .name = "acpi", .match = acpi_bus_match, .probe = acpi_device_probe, .remove = acpi_device_remove, .uevent = acpi_device_uevent, }; int acpi_bus_for_each_dev(int (*fn)(struct device *, void *), void *data) { return bus_for_each_dev(&acpi_bus_type, NULL, data, fn); } EXPORT_SYMBOL_GPL(acpi_bus_for_each_dev); struct acpi_dev_walk_context { int (*fn)(struct acpi_device *, void *); void *data; }; static int acpi_dev_for_one_check(struct device *dev, void *context) { struct acpi_dev_walk_context *adwc = context; if (dev->bus != &acpi_bus_type) return 0; return adwc->fn(to_acpi_device(dev), adwc->data); } EXPORT_SYMBOL_GPL(acpi_dev_for_each_child); int acpi_dev_for_each_child(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child(&adev->dev, &adwc, acpi_dev_for_one_check); } int acpi_dev_for_each_child_reverse(struct acpi_device *adev, int (*fn)(struct acpi_device *, void *), void *data) { struct acpi_dev_walk_context adwc = { .fn = fn, .data = data, }; return device_for_each_child_reverse(&adev->dev, &adwc, acpi_dev_for_one_check); } /* -------------------------------------------------------------------------- Initialization/Cleanup -------------------------------------------------------------------------- */ static int __init acpi_bus_init_irq(void) { acpi_status status; char *message = NULL; /* * Let the system know what interrupt model we are using by * evaluating the \_PIC object, if exists. */ switch (acpi_irq_model) { case ACPI_IRQ_MODEL_PIC: message = "PIC"; break; case ACPI_IRQ_MODEL_IOAPIC: message = "IOAPIC"; break; case ACPI_IRQ_MODEL_IOSAPIC: message = "IOSAPIC"; break; case ACPI_IRQ_MODEL_GIC: message = "GIC"; break; case ACPI_IRQ_MODEL_PLATFORM: message = "platform specific model"; break; case ACPI_IRQ_MODEL_LPIC: message = "LPIC"; break; default: pr_info("Unknown interrupt routing model\n"); return -ENODEV; } pr_info("Using %s for interrupt routing\n", message); status = acpi_execute_simple_method(NULL, "\\_PIC", acpi_irq_model); if (ACPI_FAILURE(status) && (status != AE_NOT_FOUND)) { pr_info("_PIC evaluation failed: %s\n", acpi_format_exception(status)); return -ENODEV; } return 0; } /** * acpi_early_init - Initialize ACPICA and populate the ACPI namespace. * * The ACPI tables are accessible after this, but the handling of events has not * been initialized and the global lock is not available yet, so AML should not * be executed at this point. * * Doing this before switching the EFI runtime services to virtual mode allows * the EfiBootServices memory to be freed slightly earlier on boot. */ void __init acpi_early_init(void) { acpi_status status; if (acpi_disabled) return; pr_info("Core revision %08x\n", ACPI_CA_VERSION); /* enable workarounds, unless strict ACPI spec. compliance */ if (!acpi_strict) acpi_gbl_enable_interpreter_slack = TRUE; acpi_permanent_mmap = true; #ifdef CONFIG_X86 /* * If the machine falls into the DMI check table, * DSDT will be copied to memory. * Note that calling dmi_check_system() here on other architectures * would not be OK because only x86 initializes dmi early enough. * Thankfully only x86 systems need such quirks for now. */ dmi_check_system(dsdt_dmi_table); #endif status = acpi_reallocate_root_table(); if (ACPI_FAILURE(status)) { pr_err("Unable to reallocate ACPI tables\n"); goto error0; } status = acpi_initialize_subsystem(); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize the ACPI Interpreter\n"); goto error0; } #ifdef CONFIG_X86 if (!acpi_ioapic) { /* compatible (0) means level (3) */ if (!(acpi_sci_flags & ACPI_MADT_TRIGGER_MASK)) { acpi_sci_flags &= ~ACPI_MADT_TRIGGER_MASK; acpi_sci_flags |= ACPI_MADT_TRIGGER_LEVEL; } /* Set PIC-mode SCI trigger type */ acpi_pic_sci_set_trigger(acpi_gbl_FADT.sci_interrupt, (acpi_sci_flags & ACPI_MADT_TRIGGER_MASK) >> 2); } else { /* * now that acpi_gbl_FADT is initialized, * update it with result from INT_SRC_OVR parsing */ acpi_gbl_FADT.sci_interrupt = acpi_sci_override_gsi; } #endif return; error0: disable_acpi(); } /** * acpi_subsystem_init - Finalize the early initialization of ACPI. * * Switch over the platform to the ACPI mode (if possible). * * Doing this too early is generally unsafe, but at the same time it needs to be * done before all things that really depend on ACPI. The right spot appears to * be before finalizing the EFI initialization. */ void __init acpi_subsystem_init(void) { acpi_status status; if (acpi_disabled) return; status = acpi_enable_subsystem(~ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to enable ACPI\n"); disable_acpi(); } else { /* * If the system is using ACPI then we can be reasonably * confident that any regulators are managed by the firmware * so tell the regulator core it has everything it needs to * know. */ regulator_has_full_constraints(); } } static acpi_status acpi_bus_table_handler(u32 event, void *table, void *context) { if (event == ACPI_TABLE_EVENT_LOAD) acpi_scan_table_notify(); return acpi_sysfs_table_handler(event, table, context); } static int __init acpi_bus_init(void) { int result; acpi_status status; acpi_os_initialize1(); status = acpi_load_tables(); if (ACPI_FAILURE(status)) { pr_err("Unable to load the System Description Tables\n"); goto error1; } /* * ACPI 2.0 requires the EC driver to be loaded and work before the EC * device is found in the namespace. * * This is accomplished by looking for the ECDT table and getting the EC * parameters out of that. * * Do that before calling acpi_initialize_objects() which may trigger EC * address space accesses. */ acpi_ec_ecdt_probe(); status = acpi_enable_subsystem(ACPI_NO_ACPI_ENABLE); if (ACPI_FAILURE(status)) { pr_err("Unable to start the ACPI Interpreter\n"); goto error1; } status = acpi_initialize_objects(ACPI_FULL_INITIALIZATION); if (ACPI_FAILURE(status)) { pr_err("Unable to initialize ACPI objects\n"); goto error1; } /* * _OSC method may exist in module level code, * so it must be run after ACPI_FULL_INITIALIZATION */ acpi_bus_osc_negotiate_platform_control(); acpi_bus_osc_negotiate_usb_control(); /* * _PDC control method may load dynamic SSDT tables, * and we need to install the table handler before that. */ status = acpi_install_table_handler(acpi_bus_table_handler, NULL); acpi_sysfs_init(); acpi_early_processor_control_setup(); /* * Maybe EC region is required at bus_scan/acpi_get_devices. So it * is necessary to enable it as early as possible. */ acpi_ec_dsdt_probe(); pr_info("Interpreter enabled\n"); /* Initialize sleep structures */ acpi_sleep_init(); /* * Get the system interrupt model and evaluate \_PIC. */ result = acpi_bus_init_irq(); if (result) goto error1; /* * Register the for all standard device notifications. */ status = acpi_install_notify_handler(ACPI_ROOT_OBJECT, ACPI_SYSTEM_NOTIFY, &acpi_bus_notify, NULL); if (ACPI_FAILURE(status)) { pr_err("Unable to register for system notifications\n"); goto error1; } /* * Create the top ACPI proc directory */ acpi_root_dir = proc_mkdir(ACPI_BUS_FILE_ROOT, NULL); result = bus_register(&acpi_bus_type); if (!result) return 0; /* Mimic structured exception handling */ error1: acpi_terminate(); return -ENODEV; } struct kobject *acpi_kobj; EXPORT_SYMBOL_GPL(acpi_kobj); static int __init acpi_init(void) { int result; if (acpi_disabled) { pr_info("Interpreter disabled.\n"); return -ENODEV; } acpi_kobj = kobject_create_and_add("acpi", firmware_kobj); if (!acpi_kobj) pr_debug("%s: kset create error\n", __func__); init_prmt(); acpi_init_pcc(); result = acpi_bus_init(); if (result) { kobject_put(acpi_kobj); disable_acpi(); return result; } acpi_init_ffh(); pci_mmcfg_late_init(); acpi_viot_early_init(); acpi_hest_init(); acpi_ghes_init(); acpi_arm_init(); acpi_scan_init(); acpi_ec_init(); acpi_debugfs_init(); acpi_sleep_proc_init(); acpi_wakeup_device_init(); acpi_debugger_init(); acpi_setup_sb_notify_handler(); acpi_viot_init(); return 0; } subsys_initcall(acpi_init);
1 1 1 1 1 1 1 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 12 30 1 30 30 30 30 30 12 30 30 30 8 8 8 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 // SPDX-License-Identifier: MIT /* * Copyright 2018 Noralf Trønnes * Copyright (c) 2006-2009 Red Hat Inc. * Copyright (c) 2006-2008 Intel Corporation * Jesse Barnes <jesse.barnes@intel.com> * Copyright (c) 2007 Dave Airlie <airlied@linux.ie> */ #include "drm/drm_modeset_lock.h" #include <linux/module.h> #include <linux/mutex.h> #include <linux/slab.h> #include <linux/string_helpers.h> #include <drm/drm_atomic.h> #include <drm/drm_client.h> #include <drm/drm_connector.h> #include <drm/drm_crtc.h> #include <drm/drm_device.h> #include <drm/drm_drv.h> #include <drm/drm_edid.h> #include <drm/drm_encoder.h> #include <drm/drm_print.h> #include "drm_crtc_internal.h" #include "drm_internal.h" #define DRM_CLIENT_MAX_CLONED_CONNECTORS 8 struct drm_client_offset { int x, y; }; int drm_client_modeset_create(struct drm_client_dev *client) { struct drm_device *dev = client->dev; unsigned int num_crtc = dev->mode_config.num_crtc; unsigned int max_connector_count = 1; struct drm_mode_set *modeset; struct drm_crtc *crtc; unsigned int i = 0; /* Add terminating zero entry to enable index less iteration */ client->modesets = kcalloc(num_crtc + 1, sizeof(*client->modesets), GFP_KERNEL); if (!client->modesets) return -ENOMEM; mutex_init(&client->modeset_mutex); drm_for_each_crtc(crtc, dev) client->modesets[i++].crtc = crtc; /* Cloning is only supported in the single crtc case. */ if (num_crtc == 1) max_connector_count = DRM_CLIENT_MAX_CLONED_CONNECTORS; for (modeset = client->modesets; modeset->crtc; modeset++) { modeset->connectors = kcalloc(max_connector_count, sizeof(*modeset->connectors), GFP_KERNEL); if (!modeset->connectors) goto err_free; } return 0; err_free: drm_client_modeset_free(client); return -ENOMEM; } static void drm_client_modeset_release(struct drm_client_dev *client) { struct drm_mode_set *modeset; unsigned int i; drm_client_for_each_modeset(modeset, client) { drm_mode_destroy(client->dev, modeset->mode); modeset->mode = NULL; modeset->fb = NULL; for (i = 0; i < modeset->num_connectors; i++) { drm_connector_put(modeset->connectors[i]); modeset->connectors[i] = NULL; } modeset->num_connectors = 0; } } void drm_client_modeset_free(struct drm_client_dev *client) { struct drm_mode_set *modeset; mutex_lock(&client->modeset_mutex); drm_client_modeset_release(client); drm_client_for_each_modeset(modeset, client) kfree(modeset->connectors); mutex_unlock(&client->modeset_mutex); mutex_destroy(&client->modeset_mutex); kfree(client->modesets); } static struct drm_mode_set * drm_client_find_modeset(struct drm_client_dev *client, struct drm_crtc *crtc) { struct drm_mode_set *modeset; drm_client_for_each_modeset(modeset, client) if (modeset->crtc == crtc) return modeset; return NULL; } static struct drm_display_mode * drm_connector_get_tiled_mode(struct drm_connector *connector) { struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) return mode; } return NULL; } static struct drm_display_mode * drm_connector_fallback_non_tiled_mode(struct drm_connector *connector) { struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay == connector->tile_h_size && mode->vdisplay == connector->tile_v_size) continue; return mode; } return NULL; } static struct drm_display_mode * drm_connector_has_preferred_mode(struct drm_connector *connector, int width, int height) { struct drm_display_mode *mode; list_for_each_entry(mode, &connector->modes, head) { if (mode->hdisplay > width || mode->vdisplay > height) continue; if (mode->type & DRM_MODE_TYPE_PREFERRED) return mode; } return NULL; } static struct drm_display_mode *drm_connector_pick_cmdline_mode(struct drm_connector *connector) { struct drm_cmdline_mode *cmdline_mode; struct drm_display_mode *mode; bool prefer_non_interlace; /* * Find a user-defined mode. If the user gave us a valid * mode on the kernel command line, it will show up in this * list. */ list_for_each_entry(mode, &connector->modes, head) { if (mode->type & DRM_MODE_TYPE_USERDEF) return mode; } cmdline_mode = &connector->cmdline_mode; if (cmdline_mode->specified == false) return NULL; /* * Attempt to find a matching mode in the list of modes we * have gotten so far. */ prefer_non_interlace = !cmdline_mode->interlace; again: list_for_each_entry(mode, &connector->modes, head) { /* check width/height */ if (mode->hdisplay != cmdline_mode->xres || mode->vdisplay != cmdline_mode->yres) continue; if (cmdline_mode->refresh_specified) { if (drm_mode_vrefresh(mode) != cmdline_mode->refresh) continue; } if (cmdline_mode->interlace) { if (!(mode->flags & DRM_MODE_FLAG_INTERLACE)) continue; } else if (prefer_non_interlace) { if (mode->flags & DRM_MODE_FLAG_INTERLACE) continue; } return mode; } if (prefer_non_interlace) { prefer_non_interlace = false; goto again; } return NULL; } static bool drm_connector_enabled(struct drm_connector *connector, bool strict) { bool enable; if (connector->display_info.non_desktop) return false; if (strict) enable = connector->status == connector_status_connected; else enable = connector->status != connector_status_disconnected; return enable; } static void drm_client_connectors_enabled(struct drm_connector **connectors, unsigned int connector_count, bool *enabled) { bool any_enabled = false; struct drm_connector *connector; int i = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; enabled[i] = drm_connector_enabled(connector, true); DRM_DEBUG_KMS("connector %d enabled? %s\n", connector->base.id, connector->display_info.non_desktop ? "non desktop" : str_yes_no(enabled[i])); any_enabled |= enabled[i]; } if (any_enabled) return; for (i = 0; i < connector_count; i++) enabled[i] = drm_connector_enabled(connectors[i], false); } static bool drm_client_target_cloned(struct drm_device *dev, struct drm_connector **connectors, unsigned int connector_count, struct drm_display_mode **modes, struct drm_client_offset *offsets, bool *enabled, int width, int height) { int count, i, j; bool can_clone = false; struct drm_display_mode *dmt_mode, *mode; /* only contemplate cloning in the single crtc case */ if (dev->mode_config.num_crtc > 1) return false; count = 0; for (i = 0; i < connector_count; i++) { if (enabled[i]) count++; } /* only contemplate cloning if more than one connector is enabled */ if (count <= 1) return false; /* check the command line or if nothing common pick 1024x768 */ can_clone = true; for (i = 0; i < connector_count; i++) { if (!enabled[i]) continue; modes[i] = drm_connector_pick_cmdline_mode(connectors[i]); if (!modes[i]) { can_clone = false; break; } for (j = 0; j < i; j++) { if (!enabled[j]) continue; if (!drm_mode_match(modes[j], modes[i], DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) can_clone = false; } } if (can_clone) { DRM_DEBUG_KMS("can clone using command line\n"); return true; } /* try and find a 1024x768 mode on each connector */ can_clone = true; dmt_mode = drm_mode_find_dmt(dev, 1024, 768, 60, false); if (!dmt_mode) goto fail; for (i = 0; i < connector_count; i++) { if (!enabled[i]) continue; list_for_each_entry(mode, &connectors[i]->modes, head) { if (drm_mode_match(mode, dmt_mode, DRM_MODE_MATCH_TIMINGS | DRM_MODE_MATCH_CLOCK | DRM_MODE_MATCH_FLAGS | DRM_MODE_MATCH_3D_FLAGS)) modes[i] = mode; } if (!modes[i]) can_clone = false; } kfree(dmt_mode); if (can_clone) { DRM_DEBUG_KMS("can clone using 1024x768\n"); return true; } fail: DRM_INFO("kms: can't enable cloning when we probably wanted to.\n"); return false; } static int drm_client_get_tile_offsets(struct drm_connector **connectors, unsigned int connector_count, struct drm_display_mode **modes, struct drm_client_offset *offsets, int idx, int h_idx, int v_idx) { struct drm_connector *connector; int i; int hoffset = 0, voffset = 0; for (i = 0; i < connector_count; i++) { connector = connectors[i]; if (!connector->has_tile) continue; if (!modes[i] && (h_idx || v_idx)) { DRM_DEBUG_KMS("no modes for connector tiled %d %d\n", i, connector->base.id); continue; } if (connector->tile_h_loc < h_idx) hoffset += modes[i]->hdisplay; if (connector->tile_v_loc < v_idx) voffset += modes[i]->vdisplay; } offsets[idx].x = hoffset; offsets[idx].y = voffset; DRM_DEBUG_KMS("returned %d %d for %d %d\n", hoffset, voffset, h_idx, v_idx); return 0; } static bool drm_client_target_preferred(struct drm_connector **connectors, unsigned int connector_count, struct drm_display_mode **modes, struct drm_client_offset *offsets, bool *enabled, int width, int height) { const u64 mask = BIT_ULL(connector_count) - 1; struct drm_connector *connector; u64 conn_configured = 0; int tile_pass = 0; int num_tiled_conns = 0; int i; for (i = 0; i < connector_count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: for (i = 0; i < connector_count; i++) { connector = connectors[i]; if (conn_configured & BIT_ULL(i)) continue; if (enabled[i] == false) { conn_configured |= BIT_ULL(i); continue; } /* first pass over all the untiled connectors */ if (tile_pass == 0 && connector->has_tile) continue; if (tile_pass == 1) { if (connector->tile_h_loc != 0 || connector->tile_v_loc != 0) continue; } else { if (connector->tile_h_loc != tile_pass - 1 && connector->tile_v_loc != tile_pass - 1) /* if this tile_pass doesn't cover any of the tiles - keep going */ continue; /* * find the tile offsets for this pass - need to find * all tiles left and above */ drm_client_get_tile_offsets(connectors, connector_count, modes, offsets, i, connector->tile_h_loc, connector->tile_v_loc); } DRM_DEBUG_KMS("looking for cmdline mode on connector %d\n", connector->base.id); /* got for command line mode first */ modes[i] = drm_connector_pick_cmdline_mode(connector); if (!modes[i]) { DRM_DEBUG_KMS("looking for preferred mode on connector %d %d\n", connector->base.id, connector->tile_group ? connector->tile_group->id : 0); modes[i] = drm_connector_has_preferred_mode(connector, width, height); } /* No preferred modes, pick one off the list */ if (!modes[i] && !list_empty(&connector->modes)) { list_for_each_entry(modes[i], &connector->modes, head) break; } /* * In case of tiled mode if all tiles not present fallback to * first available non tiled mode. * After all tiles are present, try to find the tiled mode * for all and if tiled mode not present due to fbcon size * limitations, use first non tiled mode only for * tile 0,0 and set to no mode for all other tiles. */ if (connector->has_tile) { if (num_tiled_conns < connector->num_h_tile * connector->num_v_tile || (connector->tile_h_loc == 0 && connector->tile_v_loc == 0 && !drm_connector_get_tiled_mode(connector))) { DRM_DEBUG_KMS("Falling back to non tiled mode on Connector %d\n", connector->base.id); modes[i] = drm_connector_fallback_non_tiled_mode(connector); } else { modes[i] = drm_connector_get_tiled_mode(connector); } } DRM_DEBUG_KMS("found mode %s\n", modes[i] ? modes[i]->name : "none"); conn_configured |= BIT_ULL(i); } if ((conn_configured & mask) != mask) { tile_pass++; goto retry; } return true; } static bool connector_has_possible_crtc(struct drm_connector *connector, struct drm_crtc *crtc) { struct drm_encoder *encoder; drm_connector_for_each_possible_encoder(connector, encoder) { if (encoder->possible_crtcs & drm_crtc_mask(crtc)) return true; } return false; } static int drm_client_pick_crtcs(struct drm_client_dev *client, struct drm_connector **connectors, unsigned int connector_count, struct drm_crtc **best_crtcs, struct drm_display_mode **modes, int n, int width, int height) { struct drm_device *dev = client->dev; struct drm_connector *connector; int my_score, best_score, score; struct drm_crtc **crtcs, *crtc; struct drm_mode_set *modeset; int o; if (n == connector_count) return 0; connector = connectors[n]; best_crtcs[n] = NULL; best_score = drm_client_pick_crtcs(client, connectors, connector_count, best_crtcs, modes, n + 1, width, height); if (modes[n] == NULL) return best_score; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); if (!crtcs) return best_score; my_score = 1; if (connector->status == connector_status_connected) my_score++; if (connector->cmdline_mode.specified) my_score++; if (drm_connector_has_preferred_mode(connector, width, height)) my_score++; /* * select a crtc for this connector and then attempt to configure * remaining connectors */ drm_client_for_each_modeset(modeset, client) { crtc = modeset->crtc; if (!connector_has_possible_crtc(connector, crtc)) continue; for (o = 0; o < n; o++) if (best_crtcs[o] == crtc) break; if (o < n) { /* ignore cloning unless only a single crtc */ if (dev->mode_config.num_crtc > 1) continue; if (!drm_mode_equal(modes[o], modes[n])) continue; } crtcs[n] = crtc; memcpy(crtcs, best_crtcs, n * sizeof(*crtcs)); score = my_score + drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, n + 1, width, height); if (score > best_score) { best_score = score; memcpy(best_crtcs, crtcs, connector_count * sizeof(*crtcs)); } } kfree(crtcs); return best_score; } /* Try to read the BIOS display configuration and use it for the initial config */ static bool drm_client_firmware_config(struct drm_client_dev *client, struct drm_connector **connectors, unsigned int connector_count, struct drm_crtc **crtcs, struct drm_display_mode **modes, struct drm_client_offset *offsets, bool *enabled, int width, int height) { const int count = min_t(unsigned int, connector_count, BITS_PER_LONG); unsigned long conn_configured, conn_seq, mask; struct drm_device *dev = client->dev; int i, j; bool *save_enabled; bool fallback = true, ret = true; int num_connectors_enabled = 0; int num_connectors_detected = 0; int num_tiled_conns = 0; struct drm_modeset_acquire_ctx ctx; if (!drm_drv_uses_atomic_modeset(dev)) return false; if (WARN_ON(count <= 0)) return false; save_enabled = kcalloc(count, sizeof(bool), GFP_KERNEL); if (!save_enabled) return false; drm_modeset_acquire_init(&ctx, 0); while (drm_modeset_lock_all_ctx(dev, &ctx) != 0) drm_modeset_backoff(&ctx); memcpy(save_enabled, enabled, count); mask = GENMASK(count - 1, 0); conn_configured = 0; for (i = 0; i < count; i++) { if (connectors[i]->has_tile && connectors[i]->status == connector_status_connected) num_tiled_conns++; } retry: conn_seq = conn_configured; for (i = 0; i < count; i++) { struct drm_connector *connector; struct drm_encoder *encoder; struct drm_crtc *new_crtc; connector = connectors[i]; if (conn_configured & BIT(i)) continue; if (conn_seq == 0 && !connector->has_tile) continue; if (connector->status == connector_status_connected) num_connectors_detected++; if (!enabled[i]) { DRM_DEBUG_KMS("connector %s not enabled, skipping\n", connector->name); conn_configured |= BIT(i); continue; } if (connector->force == DRM_FORCE_OFF) { DRM_DEBUG_KMS("connector %s is disabled by user, skipping\n", connector->name); enabled[i] = false; continue; } encoder = connector->state->best_encoder; if (!encoder || WARN_ON(!connector->state->crtc)) { if (connector->force > DRM_FORCE_OFF) goto bail; DRM_DEBUG_KMS("connector %s has no encoder or crtc, skipping\n", connector->name); enabled[i] = false; conn_configured |= BIT(i); continue; } num_connectors_enabled++; new_crtc = connector->state->crtc; /* * Make sure we're not trying to drive multiple connectors * with a single CRTC, since our cloning support may not * match the BIOS. */ for (j = 0; j < count; j++) { if (crtcs[j] == new_crtc) { DRM_DEBUG_KMS("fallback: cloned configuration\n"); goto bail; } } DRM_DEBUG_KMS("looking for cmdline mode on connector %s\n", connector->name); /* go for command line mode first */ modes[i] = drm_connector_pick_cmdline_mode(connector); /* try for preferred next */ if (!modes[i]) { DRM_DEBUG_KMS("looking for preferred mode on connector %s %d\n", connector->name, connector->has_tile); modes[i] = drm_connector_has_preferred_mode(connector, width, height); } /* No preferred mode marked by the EDID? Are there any modes? */ if (!modes[i] && !list_empty(&connector->modes)) { DRM_DEBUG_KMS("using first mode listed on connector %s\n", connector->name); modes[i] = list_first_entry(&connector->modes, struct drm_display_mode, head); } /* last resort: use current mode */ if (!modes[i]) { /* * IMPORTANT: We want to use the adjusted mode (i.e. * after the panel fitter upscaling) as the initial * config, not the input mode, which is what crtc->mode * usually contains. But since our current * code puts a mode derived from the post-pfit timings * into crtc->mode this works out correctly. * * This is crtc->mode and not crtc->state->mode for the * fastboot check to work correctly. */ DRM_DEBUG_KMS("looking for current mode on connector %s\n", connector->name); modes[i] = &connector->state->crtc->mode; } /* * In case of tiled modes, if all tiles are not present * then fallback to a non tiled mode. */ if (connector->has_tile && num_tiled_conns < connector->num_h_tile * connector->num_v_tile) { DRM_DEBUG_KMS("Falling back to non tiled mode on Connector %d\n", connector->base.id); modes[i] = drm_connector_fallback_non_tiled_mode(connector); } crtcs[i] = new_crtc; DRM_DEBUG_KMS("connector %s on [CRTC:%d:%s]: %dx%d%s\n", connector->name, connector->state->crtc->base.id, connector->state->crtc->name, modes[i]->hdisplay, modes[i]->vdisplay, modes[i]->flags & DRM_MODE_FLAG_INTERLACE ? "i" : ""); fallback = false; conn_configured |= BIT(i); } if ((conn_configured & mask) != mask && conn_configured != conn_seq) goto retry; /* * If the BIOS didn't enable everything it could, fall back to have the * same user experiencing of lighting up as much as possible like the * fbdev helper library. */ if (num_connectors_enabled != num_connectors_detected && num_connectors_enabled < dev->mode_config.num_crtc) { DRM_DEBUG_KMS("fallback: Not all outputs enabled\n"); DRM_DEBUG_KMS("Enabled: %i, detected: %i\n", num_connectors_enabled, num_connectors_detected); fallback = true; } if (fallback) { bail: DRM_DEBUG_KMS("Not using firmware configuration\n"); memcpy(enabled, save_enabled, count); ret = false; } drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); kfree(save_enabled); return ret; } /** * drm_client_modeset_probe() - Probe for displays * @client: DRM client * @width: Maximum display mode width (optional) * @height: Maximum display mode height (optional) * * This function sets up display pipelines for enabled connectors and stores the * config in the client's modeset array. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_probe(struct drm_client_dev *client, unsigned int width, unsigned int height) { struct drm_connector *connector, **connectors = NULL; struct drm_connector_list_iter conn_iter; struct drm_device *dev = client->dev; unsigned int total_modes_count = 0; struct drm_client_offset *offsets; unsigned int connector_count = 0; struct drm_display_mode **modes; struct drm_crtc **crtcs; int i, ret = 0; bool *enabled; DRM_DEBUG_KMS("\n"); if (!width) width = dev->mode_config.max_width; if (!height) height = dev->mode_config.max_height; drm_connector_list_iter_begin(dev, &conn_iter); drm_client_for_each_connector_iter(connector, &conn_iter) { struct drm_connector **tmp; tmp = krealloc(connectors, (connector_count + 1) * sizeof(*connectors), GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto free_connectors; } connectors = tmp; drm_connector_get(connector); connectors[connector_count++] = connector; } drm_connector_list_iter_end(&conn_iter); if (!connector_count) return 0; crtcs = kcalloc(connector_count, sizeof(*crtcs), GFP_KERNEL); modes = kcalloc(connector_count, sizeof(*modes), GFP_KERNEL); offsets = kcalloc(connector_count, sizeof(*offsets), GFP_KERNEL); enabled = kcalloc(connector_count, sizeof(bool), GFP_KERNEL); if (!crtcs || !modes || !enabled || !offsets) { DRM_ERROR("Memory allocation failed\n"); ret = -ENOMEM; goto out; } mutex_lock(&client->modeset_mutex); mutex_lock(&dev->mode_config.mutex); for (i = 0; i < connector_count; i++) total_modes_count += connectors[i]->funcs->fill_modes(connectors[i], width, height); if (!total_modes_count) DRM_DEBUG_KMS("No connectors reported connected with modes\n"); drm_client_connectors_enabled(connectors, connector_count, enabled); if (!drm_client_firmware_config(client, connectors, connector_count, crtcs, modes, offsets, enabled, width, height)) { memset(modes, 0, connector_count * sizeof(*modes)); memset(crtcs, 0, connector_count * sizeof(*crtcs)); memset(offsets, 0, connector_count * sizeof(*offsets)); if (!drm_client_target_cloned(dev, connectors, connector_count, modes, offsets, enabled, width, height) && !drm_client_target_preferred(connectors, connector_count, modes, offsets, enabled, width, height)) DRM_ERROR("Unable to find initial modes\n"); DRM_DEBUG_KMS("picking CRTCs for %dx%d config\n", width, height); drm_client_pick_crtcs(client, connectors, connector_count, crtcs, modes, 0, width, height); } mutex_unlock(&dev->mode_config.mutex); drm_client_modeset_release(client); for (i = 0; i < connector_count; i++) { struct drm_display_mode *mode = modes[i]; struct drm_crtc *crtc = crtcs[i]; struct drm_client_offset *offset = &offsets[i]; if (mode && crtc) { struct drm_mode_set *modeset = drm_client_find_modeset(client, crtc); struct drm_connector *connector = connectors[i]; DRM_DEBUG_KMS("desired mode %s set on crtc %d (%d,%d)\n", mode->name, crtc->base.id, offset->x, offset->y); if (WARN_ON_ONCE(modeset->num_connectors == DRM_CLIENT_MAX_CLONED_CONNECTORS || (dev->mode_config.num_crtc > 1 && modeset->num_connectors == 1))) { ret = -EINVAL; break; } kfree(modeset->mode); modeset->mode = drm_mode_duplicate(dev, mode); drm_connector_get(connector); modeset->connectors[modeset->num_connectors++] = connector; modeset->x = offset->x; modeset->y = offset->y; } } mutex_unlock(&client->modeset_mutex); out: kfree(crtcs); kfree(modes); kfree(offsets); kfree(enabled); free_connectors: for (i = 0; i < connector_count; i++) drm_connector_put(connectors[i]); kfree(connectors); return ret; } EXPORT_SYMBOL(drm_client_modeset_probe); /** * drm_client_rotation() - Check the initial rotation value * @modeset: DRM modeset * @rotation: Returned rotation value * * This function checks if the primary plane in @modeset can hw rotate * to match the rotation needed on its connector. * * Note: Currently only 0 and 180 degrees are supported. * * Return: * True if the plane can do the rotation, false otherwise. */ bool drm_client_rotation(struct drm_mode_set *modeset, unsigned int *rotation) { struct drm_connector *connector = modeset->connectors[0]; struct drm_plane *plane = modeset->crtc->primary; struct drm_cmdline_mode *cmdline; u64 valid_mask = 0; unsigned int i; if (!modeset->num_connectors) return false; switch (connector->display_info.panel_orientation) { case DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP: *rotation = DRM_MODE_ROTATE_180; break; case DRM_MODE_PANEL_ORIENTATION_LEFT_UP: *rotation = DRM_MODE_ROTATE_90; break; case DRM_MODE_PANEL_ORIENTATION_RIGHT_UP: *rotation = DRM_MODE_ROTATE_270; break; default: *rotation = DRM_MODE_ROTATE_0; } /** * The panel already defined the default rotation * through its orientation. Whatever has been provided * on the command line needs to be added to that. * * Unfortunately, the rotations are at different bit * indices, so the math to add them up are not as * trivial as they could. * * Reflections on the other hand are pretty trivial to deal with, a * simple XOR between the two handle the addition nicely. */ cmdline = &connector->cmdline_mode; if (cmdline->specified && cmdline->rotation_reflection) { unsigned int cmdline_rest, panel_rest; unsigned int cmdline_rot, panel_rot; unsigned int sum_rot, sum_rest; panel_rot = ilog2(*rotation & DRM_MODE_ROTATE_MASK); cmdline_rot = ilog2(cmdline->rotation_reflection & DRM_MODE_ROTATE_MASK); sum_rot = (panel_rot + cmdline_rot) % 4; panel_rest = *rotation & ~DRM_MODE_ROTATE_MASK; cmdline_rest = cmdline->rotation_reflection & ~DRM_MODE_ROTATE_MASK; sum_rest = panel_rest ^ cmdline_rest; *rotation = (1 << sum_rot) | sum_rest; } /* * TODO: support 90 / 270 degree hardware rotation, * depending on the hardware this may require the framebuffer * to be in a specific tiling format. */ if (((*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_0 && (*rotation & DRM_MODE_ROTATE_MASK) != DRM_MODE_ROTATE_180) || !plane->rotation_property) return false; for (i = 0; i < plane->rotation_property->num_values; i++) valid_mask |= (1ULL << plane->rotation_property->values[i]); if (!(*rotation & valid_mask)) return false; return true; } EXPORT_SYMBOL(drm_client_rotation); static int drm_client_modeset_commit_atomic(struct drm_client_dev *client, bool active, bool check) { struct drm_device *dev = client->dev; struct drm_plane *plane; struct drm_atomic_state *state; struct drm_modeset_acquire_ctx ctx; struct drm_mode_set *mode_set; int ret; drm_modeset_acquire_init(&ctx, 0); state = drm_atomic_state_alloc(dev); if (!state) { ret = -ENOMEM; goto out_ctx; } state->acquire_ctx = &ctx; retry: drm_for_each_plane(plane, dev) { struct drm_plane_state *plane_state; plane_state = drm_atomic_get_plane_state(state, plane); if (IS_ERR(plane_state)) { ret = PTR_ERR(plane_state); goto out_state; } plane_state->rotation = DRM_MODE_ROTATE_0; /* disable non-primary: */ if (plane->type == DRM_PLANE_TYPE_PRIMARY) continue; ret = __drm_atomic_helper_disable_plane(plane, plane_state); if (ret != 0) goto out_state; } drm_client_for_each_modeset(mode_set, client) { struct drm_plane *primary = mode_set->crtc->primary; unsigned int rotation; if (drm_client_rotation(mode_set, &rotation)) { struct drm_plane_state *plane_state; /* Cannot fail as we've already gotten the plane state above */ plane_state = drm_atomic_get_new_plane_state(state, primary); plane_state->rotation = rotation; } ret = __drm_atomic_helper_set_config(mode_set, state); if (ret != 0) goto out_state; /* * __drm_atomic_helper_set_config() sets active when a * mode is set, unconditionally clear it if we force DPMS off */ if (!active) { struct drm_crtc *crtc = mode_set->crtc; struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, crtc); crtc_state->active = false; } } if (check) ret = drm_atomic_check_only(state); else ret = drm_atomic_commit(state); out_state: if (ret == -EDEADLK) goto backoff; drm_atomic_state_put(state); out_ctx: drm_modeset_drop_locks(&ctx); drm_modeset_acquire_fini(&ctx); return ret; backoff: drm_atomic_state_clear(state); drm_modeset_backoff(&ctx); goto retry; } static int drm_client_modeset_commit_legacy(struct drm_client_dev *client) { struct drm_device *dev = client->dev; struct drm_mode_set *mode_set; struct drm_plane *plane; int ret = 0; drm_modeset_lock_all(dev); drm_for_each_plane(plane, dev) { if (plane->type != DRM_PLANE_TYPE_PRIMARY) drm_plane_force_disable(plane); if (plane->rotation_property) drm_mode_plane_set_obj_prop(plane, plane->rotation_property, DRM_MODE_ROTATE_0); } drm_client_for_each_modeset(mode_set, client) { struct drm_crtc *crtc = mode_set->crtc; if (crtc->funcs->cursor_set2) { ret = crtc->funcs->cursor_set2(crtc, NULL, 0, 0, 0, 0, 0); if (ret) goto out; } else if (crtc->funcs->cursor_set) { ret = crtc->funcs->cursor_set(crtc, NULL, 0, 0, 0); if (ret) goto out; } ret = drm_mode_set_config_internal(mode_set); if (ret) goto out; } out: drm_modeset_unlock_all(dev); return ret; } /** * drm_client_modeset_check() - Check modeset configuration * @client: DRM client * * Check modeset configuration. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_check(struct drm_client_dev *client) { int ret; if (!drm_drv_uses_atomic_modeset(client->dev)) return 0; mutex_lock(&client->modeset_mutex); ret = drm_client_modeset_commit_atomic(client, true, true); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_check); /** * drm_client_modeset_commit_locked() - Force commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs without checking if there is a DRM * master. The assumption is that the caller already holds an internal DRM * master reference acquired with drm_master_internal_acquire(). * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit_locked(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, true, false); else ret = drm_client_modeset_commit_legacy(client); mutex_unlock(&client->modeset_mutex); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit_locked); /** * drm_client_modeset_commit() - Commit CRTC configuration * @client: DRM client * * Commit modeset configuration to crtcs. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_commit(struct drm_client_dev *client) { struct drm_device *dev = client->dev; int ret; if (!drm_master_internal_acquire(dev)) return -EBUSY; ret = drm_client_modeset_commit_locked(client); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_commit); static void drm_client_modeset_dpms_legacy(struct drm_client_dev *client, int dpms_mode) { struct drm_device *dev = client->dev; struct drm_connector *connector; struct drm_mode_set *modeset; struct drm_modeset_acquire_ctx ctx; int j; int ret; DRM_MODESET_LOCK_ALL_BEGIN(dev, ctx, 0, ret); drm_client_for_each_modeset(modeset, client) { if (!modeset->crtc->enabled) continue; for (j = 0; j < modeset->num_connectors; j++) { connector = modeset->connectors[j]; connector->funcs->dpms(connector, dpms_mode); drm_object_property_set_value(&connector->base, dev->mode_config.dpms_property, dpms_mode); } } DRM_MODESET_LOCK_ALL_END(dev, ctx, ret); } /** * drm_client_modeset_dpms() - Set DPMS mode * @client: DRM client * @mode: DPMS mode * * Note: For atomic drivers @mode is reduced to on/off. * * Returns: * Zero on success or negative error code on failure. */ int drm_client_modeset_dpms(struct drm_client_dev *client, int mode) { struct drm_device *dev = client->dev; int ret = 0; if (!drm_master_internal_acquire(dev)) return -EBUSY; mutex_lock(&client->modeset_mutex); if (drm_drv_uses_atomic_modeset(dev)) ret = drm_client_modeset_commit_atomic(client, mode == DRM_MODE_DPMS_ON, false); else drm_client_modeset_dpms_legacy(client, mode); mutex_unlock(&client->modeset_mutex); drm_master_internal_release(dev); return ret; } EXPORT_SYMBOL(drm_client_modeset_dpms); #ifdef CONFIG_DRM_KUNIT_TEST #include "tests/drm_client_modeset_test.c" #endif
19 19 19 19 19 83 47 35 73 76 75 18 7 83 22 76 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux network device link state notification * * Author: * Stefan Rompf <sux@loplof.de> */ #include <linux/module.h> #include <linux/netdevice.h> #include <linux/if.h> #include <net/sock.h> #include <net/pkt_sched.h> #include <linux/rtnetlink.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/types.h> #include "dev.h" enum lw_bits { LW_URGENT = 0, }; static unsigned long linkwatch_flags; static unsigned long linkwatch_nextevent; static void linkwatch_event(struct work_struct *dummy); static DECLARE_DELAYED_WORK(linkwatch_work, linkwatch_event); static LIST_HEAD(lweventlist); static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { if (netif_testing(dev)) return IF_OPER_TESTING; /* Some uppers (DSA) have additional sources for being down, so * first check whether lower is indeed the source of its down state. */ if (!netif_carrier_ok(dev)) { int iflink = dev_get_iflink(dev); struct net_device *peer; if (iflink == dev->ifindex) return IF_OPER_DOWN; peer = __dev_get_by_index(dev_net(dev), iflink); if (!peer) return IF_OPER_DOWN; return netif_carrier_ok(peer) ? IF_OPER_DOWN : IF_OPER_LOWERLAYERDOWN; } if (netif_dormant(dev)) return IF_OPER_DORMANT; return IF_OPER_UP; } static void rfc2863_policy(struct net_device *dev) { unsigned char operstate = default_operstate(dev); if (operstate == dev->operstate) return; write_lock(&dev_base_lock); switch(dev->link_mode) { case IF_LINK_MODE_TESTING: if (operstate == IF_OPER_UP) operstate = IF_OPER_TESTING; break; case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; case IF_LINK_MODE_DEFAULT: default: break; } dev->operstate = operstate; write_unlock(&dev_base_lock); } void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ if (!netif_carrier_ok(dev) || netif_dormant(dev) || netif_testing(dev)) rfc2863_policy(dev); } static bool linkwatch_urgent_event(struct net_device *dev) { if (!netif_running(dev)) return false; if (dev->ifindex != dev_get_iflink(dev)) return true; if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); } static void linkwatch_add_event(struct net_device *dev) { unsigned long flags; spin_lock_irqsave(&lweventlist_lock, flags); if (list_empty(&dev->link_watch_list)) { list_add_tail(&dev->link_watch_list, &lweventlist); netdev_hold(dev, &dev->linkwatch_dev_tracker, GFP_ATOMIC); } spin_unlock_irqrestore(&lweventlist_lock, flags); } static void linkwatch_schedule_work(int urgent) { unsigned long delay = linkwatch_nextevent - jiffies; if (test_bit(LW_URGENT, &linkwatch_flags)) return; /* Minimise down-time: drop delay for up event. */ if (urgent) { if (test_and_set_bit(LW_URGENT, &linkwatch_flags)) return; delay = 0; } /* If we wrap around we'll delay it by at most HZ. */ if (delay > HZ) delay = 0; /* * If urgent, schedule immediate execution; otherwise, don't * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) mod_delayed_work(system_wq, &linkwatch_work, 0); else schedule_delayed_work(&linkwatch_work, delay); } static void linkwatch_do_dev(struct net_device *dev) { /* * Make sure the above read is complete since it can be * rewritten as soon as we clear the bit below. */ smp_mb__before_atomic(); /* We are about to handle this device, * so new events can be accepted */ clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) dev_activate(dev); else dev_deactivate(dev); netdev_state_change(dev); } /* Note: our callers are responsible for calling netdev_tracker_free(). * This is the reason we use __dev_put() instead of dev_put(). */ __dev_put(dev); } static void __linkwatch_run_queue(int urgent_only) { #define MAX_DO_DEV_PER_LOOP 100 int do_dev = MAX_DO_DEV_PER_LOOP; struct net_device *dev; LIST_HEAD(wrk); /* Give urgent case more budget */ if (urgent_only) do_dev += MAX_DO_DEV_PER_LOOP; /* * Limit the number of linkwatch events to one * per second so that a runaway driver does not * cause a storm of messages on the netlink * socket. This limit does not apply to up events * while the device qdisc is down. */ if (!urgent_only) linkwatch_nextevent = jiffies + HZ; /* Limit wrap-around effect on delay. */ else if (time_after(linkwatch_nextevent, jiffies + HZ)) linkwatch_nextevent = jiffies; clear_bit(LW_URGENT, &linkwatch_flags); spin_lock_irq(&lweventlist_lock); list_splice_init(&lweventlist, &wrk); while (!list_empty(&wrk) && do_dev > 0) { dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); if (!netif_device_present(dev) || (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } /* We must free netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); spin_unlock_irq(&lweventlist_lock); linkwatch_do_dev(dev); do_dev--; spin_lock_irq(&lweventlist_lock); } /* Add the remaining work back to lweventlist */ list_splice_init(&wrk, &lweventlist); if (!list_empty(&lweventlist)) linkwatch_schedule_work(0); spin_unlock_irq(&lweventlist_lock); } void linkwatch_forget_dev(struct net_device *dev) { unsigned long flags; int clean = 0; spin_lock_irqsave(&lweventlist_lock, flags); if (!list_empty(&dev->link_watch_list)) { list_del_init(&dev->link_watch_list); clean = 1; /* We must release netdev tracker under * the spinlock protection. */ netdev_tracker_free(dev, &dev->linkwatch_dev_tracker); } spin_unlock_irqrestore(&lweventlist_lock, flags); if (clean) linkwatch_do_dev(dev); } /* Must be called with the rtnl semaphore held */ void linkwatch_run_queue(void) { __linkwatch_run_queue(0); } static void linkwatch_event(struct work_struct *dummy) { rtnl_lock(); __linkwatch_run_queue(time_after(linkwatch_nextevent, jiffies)); rtnl_unlock(); } void linkwatch_fire_event(struct net_device *dev) { bool urgent = linkwatch_urgent_event(dev); if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { linkwatch_add_event(dev); } else if (!urgent) return; linkwatch_schedule_work(urgent); } EXPORT_SYMBOL(linkwatch_fire_event);
539 536 395 395 395 157 203 203 203 73 74 177 177 177 291 263 291 290 285 131 3 7 24 24 12 24 2 24 24 23 23 23 23 24 11 10 1 9 24 26 11 14 2 25 25 25 25 4 1 3 3 2 21 21 21 21 21 20 21 24 26 2 1 1 152 152 152 56 1 1 2 152 152 91 2 91 27 16 82 82 132 35 1 1 5 29 55 48 48 20 31 48 21 30 1 7 8 1 7 161 161 161 41 41 161 160 161 134 135 61 9 135 150 9 150 76 81 80 9 80 3 1 24 1 24 1 1 74 7 10 76 65 65 90 143 7 7 7 5 7 7 7 7 7 142 140 3 134 140 136 88 139 139 138 71 3 1 1 3 2 3 3 3 3 3 2 2 291 290 291 241 152 10 11 4 152 151 6 2 2 1 1 4 5 4 1 3 141 141 8 134 25 2 1 2 1 1 1 4 150 26 26 5 2 1439 1441 1442 924 1442 1439 1441 1440 1440 1443 1437 1440 1441 2 1440 461 461 1441 1440 1440 1437 42 42 42 41 42 42 29 20 26 10 42 42 3 3 2 1 42 98 97 98 98 98 98 122 12 122 122 122 122 122 122 122 122 122 121 122 63 122 96 97 122 122 1 1 122 122 58 97 97 122 107 122 113 113 113 100 100 41 60 122 107 107 158 158 132 132 7 158 158 1 1 1 1 157 136 136 136 135 158 131 122 158 157 82 158 158 160 159 160 160 158 157 158 1 80 538 540 540 539 539 5 539 540 538 539 540 540 539 540 539 1 539 539 537 539 538 539 5 5 540 536 536 537 532 535 507 170 170 495 509 509 495 495 135 495 3 3 494 511 510 509 511 495 170 5 5 4 4 155 1 156 155 156 156 4 4 4 4 4 4 4 5 3 4 3 5 5 3 3 5 5 5 5 4 5 5 4 4 3 5 5 5 4 4 4 4 2 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 // SPDX-License-Identifier: GPL-2.0-or-later /* * Linux INET6 implementation * Forwarding Information Database * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * * Changes: * Yuji SEKIYA @USAGI: Support default route on router node; * remove ip6_null_entry from the top of * routing table. * Ville Nuorvala: Fixed routing subtrees. */ #define pr_fmt(fmt) "IPv6: " fmt #include <linux/bpf.h> #include <linux/errno.h> #include <linux/types.h> #include <linux/net.h> #include <linux/route.h> #include <linux/netdevice.h> #include <linux/in6.h> #include <linux/init.h> #include <linux/list.h> #include <linux/slab.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ndisc.h> #include <net/addrconf.h> #include <net/lwtunnel.h> #include <net/fib_notifier.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/ip6_route.h> static struct kmem_cache *fib6_node_kmem __read_mostly; struct fib6_cleaner { struct fib6_walker w; struct net *net; int (*func)(struct fib6_info *, void *arg); int sernum; void *arg; bool skip_notify; }; #ifdef CONFIG_IPV6_SUBTREES #define FWS_INIT FWS_S #else #define FWS_INIT FWS_L #endif static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn); static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn); static int fib6_walk(struct net *net, struct fib6_walker *w); static int fib6_walk_continue(struct fib6_walker *w); /* * A routing update causes an increase of the serial number on the * affected subtree. This allows for cached routes to be asynchronously * tested when modifications are made to the destination cache as a * result of redirects, path MTU changes, etc. */ static void fib6_gc_timer_cb(struct timer_list *t); #define FOR_WALKERS(net, w) \ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh) static void fib6_walker_link(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_add(&w->lh, &net->ipv6.fib6_walkers); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static void fib6_walker_unlink(struct net *net, struct fib6_walker *w) { write_lock_bh(&net->ipv6.fib6_walker_lock); list_del(&w->lh); write_unlock_bh(&net->ipv6.fib6_walker_lock); } static int fib6_new_sernum(struct net *net) { int new, old = atomic_read(&net->ipv6.fib6_sernum); do { new = old < INT_MAX ? old + 1 : 1; } while (!atomic_try_cmpxchg(&net->ipv6.fib6_sernum, &old, new)); return new; } enum { FIB6_NO_SERNUM_CHANGE = 0, }; void fib6_update_sernum(struct net *net, struct fib6_info *f6i) { struct fib6_node *fn; fn = rcu_dereference_protected(f6i->fib6_node, lockdep_is_held(&f6i->fib6_table->tb6_lock)); if (fn) WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); } /* * Auxiliary address test functions for the radix tree. * * These assume a 32bit processor (although it will work on * 64bit processors) */ /* * test bit */ #if defined(__LITTLE_ENDIAN) # define BITOP_BE32_SWIZZLE (0x1F & ~7) #else # define BITOP_BE32_SWIZZLE 0 #endif static __be32 addr_bit_set(const void *token, int fn_bit) { const __be32 *addr = token; /* * Here, * 1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f) * is optimized version of * htonl(1 << ((~fn_bit)&0x1F)) * See include/asm-generic/bitops/le.h. */ return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) & addr[fn_bit >> 5]; } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh) { struct fib6_info *f6i; size_t sz = sizeof(*f6i); if (with_fib6_nh) sz += sizeof(struct fib6_nh); f6i = kzalloc(sz, gfp_flags); if (!f6i) return NULL; /* fib6_siblings is a union with nh_list, so this initializes both */ INIT_LIST_HEAD(&f6i->fib6_siblings); refcount_set(&f6i->fib6_ref, 1); INIT_HLIST_NODE(&f6i->gc_link); return f6i; } void fib6_info_destroy_rcu(struct rcu_head *head) { struct fib6_info *f6i = container_of(head, struct fib6_info, rcu); WARN_ON(f6i->fib6_node); if (f6i->nh) nexthop_put(f6i->nh); else fib6_nh_release(f6i->fib6_nh); ip_fib_metrics_put(f6i->fib6_metrics); kfree(f6i); } EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu); static struct fib6_node *node_alloc(struct net *net) { struct fib6_node *fn; fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC); if (fn) net->ipv6.rt6_stats->fib_nodes++; return fn; } static void node_free_immediate(struct net *net, struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); net->ipv6.rt6_stats->fib_nodes--; } static void node_free_rcu(struct rcu_head *head) { struct fib6_node *fn = container_of(head, struct fib6_node, rcu); kmem_cache_free(fib6_node_kmem, fn); } static void node_free(struct net *net, struct fib6_node *fn) { call_rcu(&fn->rcu, node_free_rcu); net->ipv6.rt6_stats->fib_nodes--; } static void fib6_free_table(struct fib6_table *table) { inetpeer_invalidate_tree(&table->tb6_peers); kfree(table); } static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; /* * Initialize table lock at a single place to give lockdep a key, * tables aren't visible prior to being linked to the list. */ spin_lock_init(&tb->tb6_lock); h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1); /* * No protection necessary, this is the only list mutatation * operation, tables never disappear once they exist. */ hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static struct fib6_table *fib6_alloc_table(struct net *net, u32 id) { struct fib6_table *table; table = kzalloc(sizeof(*table), GFP_ATOMIC); if (table) { table->tb6_id = id; rcu_assign_pointer(table->tb6_root.leaf, net->ipv6.fib6_null_entry); table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&table->tb6_peers); INIT_HLIST_HEAD(&table->tb6_gc_hlist); } return table; } struct fib6_table *fib6_new_table(struct net *net, u32 id) { struct fib6_table *tb; if (id == 0) id = RT6_TABLE_MAIN; tb = fib6_get_table(net, id); if (tb) return tb; tb = fib6_alloc_table(net, id); if (tb) fib6_link_table(net, tb); return tb; } EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { struct fib6_table *tb; struct hlist_head *head; unsigned int h; if (id == 0) id = RT6_TABLE_MAIN; h = id & (FIB6_TABLE_HASHSZ - 1); rcu_read_lock(); head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (tb->tb6_id == id) { rcu_read_unlock(); return tb; } } rcu_read_unlock(); return NULL; } EXPORT_SYMBOL_GPL(fib6_get_table); static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); fib6_link_table(net, net->ipv6.fib6_local_tbl); } #else struct fib6_table *fib6_new_table(struct net *net, u32 id) { return fib6_get_table(net, id); } struct fib6_table *fib6_get_table(struct net *net, u32 id) { return net->ipv6.fib6_main_tbl; } struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup) { struct rt6_info *rt; rt = pol_lookup_func(lookup, net, net->ipv6.fib6_main_tbl, fl6, skb, flags); if (rt->dst.error == -EAGAIN) { ip6_rt_put_flags(rt, flags); rt = net->ipv6.ip6_null_entry; if (!(flags & RT6_LOOKUP_F_DST_NOREF)) dst_hold(&rt->dst); } return &rt->dst; } /* called with rcu lock held; no reference taken on fib6_info */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags) { return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6, res, flags); } static void __net_init fib6_tables_init(struct net *net) { fib6_link_table(net, net->ipv6.fib6_main_tbl); } #endif unsigned int fib6_tables_seq_read(struct net *net) { unsigned int h, fib_seq = 0; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) fib_seq += tb->fib_seq; } rcu_read_unlock(); return fib_seq; } static int call_fib6_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; return call_fib6_notifier(nb, event_type, &info.info); } static int call_fib6_multipath_entry_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; return call_fib6_notifier(nb, event_type, &info.info); } int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack) { struct fib6_entry_notifier_info info = { .info.extack = extack, .rt = rt, .nsiblings = nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, event_type, &info.info); } int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt) { struct fib6_entry_notifier_info info = { .rt = rt, .nsiblings = rt->fib6_nsiblings, }; rt->fib6_table->fib_seq++; return call_fib6_notifiers(net, FIB_EVENT_ENTRY_REPLACE, &info.info); } struct fib6_dump_arg { struct net *net; struct notifier_block *nb; struct netlink_ext_ack *extack; }; static int fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg) { enum fib_event_type fib_event = FIB_EVENT_ENTRY_REPLACE; int err; if (!rt || rt == arg->net->ipv6.fib6_null_entry) return 0; if (rt->fib6_nsiblings) err = call_fib6_multipath_entry_notifier(arg->nb, fib_event, rt, rt->fib6_nsiblings, arg->extack); else err = call_fib6_entry_notifier(arg->nb, fib_event, rt, arg->extack); return err; } static int fib6_node_dump(struct fib6_walker *w) { int err; err = fib6_rt_dump(w->leaf, w->args); w->leaf = NULL; return err; } static int fib6_table_dump(struct net *net, struct fib6_table *tb, struct fib6_walker *w) { int err; w->root = &tb->tb6_root; spin_lock_bh(&tb->tb6_lock); err = fib6_walk(net, w); spin_unlock_bh(&tb->tb6_lock); return err; } /* Called with rcu_read_lock() */ int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { struct fib6_dump_arg arg; struct fib6_walker *w; unsigned int h; int err = 0; w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_node_dump; arg.net = net; arg.nb = nb; arg.extack = extack; w->args = &arg; for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { struct hlist_head *head = &net->ipv6.fib_table_hash[h]; struct fib6_table *tb; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { err = fib6_table_dump(net, tb, w); if (err) goto out; } } out: kfree(w); /* The tree traversal function should never return a positive value. */ return err > 0 ? -EINVAL : err; } static int fib6_dump_node(struct fib6_walker *w) { int res; struct fib6_info *rt; for_each_fib6_walker_rt(w) { res = rt6_dump_route(rt, w->args, w->skip_in_node); if (res >= 0) { /* Frame is full, suspend walking */ w->leaf = rt; /* We'll restart from this node, so if some routes were * already dumped, skip them next time. */ w->skip_in_node += res; return 1; } w->skip_in_node = 0; /* Multipath routes are dumped in one route with the * RTA_MULTIPATH attribute. Jump 'rt' to point to the * last sibling of this route (no need to dump the * sibling routes again) */ if (rt->fib6_nsiblings) rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); } w->leaf = NULL; return 0; } static void fib6_dump_end(struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct fib6_walker *w = (void *)cb->args[2]; if (w) { if (cb->args[4]) { cb->args[4] = 0; fib6_walker_unlink(net, w); } cb->args[2] = 0; kfree(w); } cb->done = (void *)cb->args[3]; cb->args[1] = 3; } static int fib6_dump_done(struct netlink_callback *cb) { fib6_dump_end(cb); return cb->done ? cb->done(cb) : 0; } static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, struct netlink_callback *cb) { struct net *net = sock_net(skb->sk); struct fib6_walker *w; int res; w = (void *)cb->args[2]; w->root = &table->tb6_root; if (cb->args[4] == 0) { w->count = 0; w->skip = 0; w->skip_in_node = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk(net, w); spin_unlock_bh(&table->tb6_lock); if (res > 0) { cb->args[4] = 1; cb->args[5] = READ_ONCE(w->root->fn_sernum); } } else { int sernum = READ_ONCE(w->root->fn_sernum); if (cb->args[5] != sernum) { /* Begin at the root if the tree changed */ cb->args[5] = sernum; w->state = FWS_INIT; w->node = w->root; w->skip = w->count; w->skip_in_node = 0; } else w->skip = 0; spin_lock_bh(&table->tb6_lock); res = fib6_walk_continue(w); spin_unlock_bh(&table->tb6_lock); if (res <= 0) { fib6_walker_unlink(net, w); cb->args[4] = 0; } } return res; } static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) { struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true, .filter.dump_routes = true }; const struct nlmsghdr *nlh = cb->nlh; struct net *net = sock_net(skb->sk); unsigned int h, s_h; unsigned int e = 0, s_e; struct fib6_walker *w; struct fib6_table *tb; struct hlist_head *head; int res = 0; if (cb->strict_check) { int err; err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb); if (err < 0) return err; } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) { struct rtmsg *rtm = nlmsg_data(nlh); if (rtm->rtm_flags & RTM_F_PREFIX) arg.filter.flags = RTM_F_PREFIX; } w = (void *)cb->args[2]; if (!w) { /* New dump: * * 1. hook callback destructor. */ cb->args[3] = (long)cb->done; cb->done = fib6_dump_done; /* * 2. allocate and initialize walker. */ w = kzalloc(sizeof(*w), GFP_ATOMIC); if (!w) return -ENOMEM; w->func = fib6_dump_node; cb->args[2] = (long)w; } arg.skb = skb; arg.cb = cb; arg.net = net; w->args = &arg; if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { if (rtnl_msg_family(cb->nlh) != PF_INET6) goto out; NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; } if (!cb->args[0]) { res = fib6_dump_table(tb, skb, cb); if (!res) cb->args[0] = 1; } goto out; } s_h = cb->args[0]; s_e = cb->args[1]; rcu_read_lock(); for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) { e = 0; head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(tb, head, tb6_hlist) { if (e < s_e) goto next; res = fib6_dump_table(tb, skb, cb); if (res != 0) goto out_unlock; next: e++; } } out_unlock: rcu_read_unlock(); cb->args[1] = e; cb->args[0] = h; out: res = res < 0 ? res : skb->len; if (res <= 0) fib6_dump_end(cb); return res; } void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val) { if (!f6i) return; if (f6i->fib6_metrics == &dst_default_metrics) { struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC); if (!p) return; refcount_set(&p->refcnt, 1); f6i->fib6_metrics = p; } f6i->fib6_metrics->metrics[metric - 1] = val; } /* * Routing Table * * return the appropriate node for a routing tree "add" operation * by either creating and inserting or by returning an existing * node. */ static struct fib6_node *fib6_add_1(struct net *net, struct fib6_table *table, struct fib6_node *root, struct in6_addr *addr, int plen, int offset, int allow_create, int replace_required, struct netlink_ext_ack *extack) { struct fib6_node *fn, *in, *ln; struct fib6_node *pn = NULL; struct rt6key *key; int bit; __be32 dir = 0; RT6_TRACE("fib6_add_1\n"); /* insert node in tree */ fn = root; do { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) { if (!allow_create) { if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } goto insert_above; } /* * Exact match ? */ if (plen == fn->fn_bit) { /* clean up an intermediate node */ if (!(fn->fn_flags & RTN_RTINFO)) { RCU_INIT_POINTER(fn->leaf, NULL); fib6_info_release(leaf); /* remove null_entry in the root node */ } else if (fn->fn_flags & RTN_TL_ROOT && rcu_access_pointer(fn->leaf) == net->ipv6.fib6_null_entry) { RCU_INIT_POINTER(fn->leaf, NULL); } return fn; } /* * We have more bits to go */ /* Try to walk down on tree. */ dir = addr_bit_set(addr, fn->fn_bit); pn = fn; fn = dir ? rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)) : rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); } while (fn); if (!allow_create) { /* We should not create new node because * NLM_F_REPLACE was specified without NLM_F_CREATE * I assume it is safe to require NLM_F_CREATE when * REPLACE flag is used! Later we may want to remove the * check for replace_required, because according * to netlink specification, NLM_F_CREATE * MUST be specified if new route is created. * That would keep IPv6 consistent with IPv4 */ if (replace_required) { NL_SET_ERR_MSG(extack, "Can not replace route - no match found"); pr_warn("Can't replace route, no match found\n"); return ERR_PTR(-ENOENT); } pr_warn("NLM_F_CREATE should be set when creating new route\n"); } /* * We walked to the bottom of tree. * Create new leaf node without children. */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); return ln; insert_above: /* * split since we don't have a common prefix anymore or * we have a less significant route. * we've to insert an intermediate node on the list * this new node will point to the one we need to create * and the current */ pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); /* find 1st bit in difference between the 2 addrs. See comment in __ipv6_addr_diff: bit may be an invalid value, but if it is >= plen, the value is ignored in any case. */ bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr)); /* * (intermediate)[in] * / \ * (new leaf node)[ln] (old node)[fn] */ if (plen > bit) { in = node_alloc(net); ln = node_alloc(net); if (!in || !ln) { if (in) node_free_immediate(net, in); if (ln) node_free_immediate(net, ln); return ERR_PTR(-ENOMEM); } /* * new intermediate node. * RTN_RTINFO will * be off since that an address that chooses one of * the branches would not match less specific routes * in the other branch */ in->fn_bit = bit; RCU_INIT_POINTER(in->parent, pn); in->leaf = fn->leaf; fib6_info_hold(rcu_dereference_protected(in->leaf, lockdep_is_held(&table->tb6_lock))); /* update parent pointer */ if (dir) rcu_assign_pointer(pn->right, in); else rcu_assign_pointer(pn->left, in); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, in); rcu_assign_pointer(fn->parent, in); if (addr_bit_set(addr, bit)) { rcu_assign_pointer(in->right, ln); rcu_assign_pointer(in->left, fn); } else { rcu_assign_pointer(in->left, ln); rcu_assign_pointer(in->right, fn); } } else { /* plen <= bit */ /* * (new leaf node)[ln] * / \ * (old node)[fn] NULL */ ln = node_alloc(net); if (!ln) return ERR_PTR(-ENOMEM); ln->fn_bit = plen; RCU_INIT_POINTER(ln->parent, pn); if (addr_bit_set(&key->addr, plen)) RCU_INIT_POINTER(ln->right, fn); else RCU_INIT_POINTER(ln->left, fn); rcu_assign_pointer(fn->parent, ln); if (dir) rcu_assign_pointer(pn->right, ln); else rcu_assign_pointer(pn->left, ln); } return ln; } static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh, const struct fib6_info *match, const struct fib6_table *table) { int cpu; if (!fib6_nh->rt6i_pcpu) return; /* release the reference to this fib entry from * all of its cached pcpu routes */ for_each_possible_cpu(cpu) { struct rt6_info **ppcpu_rt; struct rt6_info *pcpu_rt; ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu); pcpu_rt = *ppcpu_rt; /* only dropping the 'from' reference if the cached route * is using 'match'. The cached pcpu_rt->from only changes * from a fib6_info to NULL (ip6_dst_destroy); it can never * change from one fib6_info reference to another */ if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) { struct fib6_info *from; from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL); fib6_info_release(from); } } } struct fib6_nh_pcpu_arg { struct fib6_info *from; const struct fib6_table *table; }; static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg) { struct fib6_nh_pcpu_arg *arg = _arg; __fib6_drop_pcpu_from(nh, arg->from, arg->table); return 0; } static void fib6_drop_pcpu_from(struct fib6_info *f6i, const struct fib6_table *table) { /* Make sure rt6_make_pcpu_route() wont add other percpu routes * while we are cleaning them here. */ f6i->fib6_destroying = 1; mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */ if (f6i->nh) { struct fib6_nh_pcpu_arg arg = { .from = f6i, .table = table }; nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); } else { struct fib6_nh *fib6_nh; fib6_nh = f6i->fib6_nh; __fib6_drop_pcpu_from(fib6_nh, f6i, table); } } static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn, struct net *net) { struct fib6_table *table = rt->fib6_table; /* Flush all cached dst in exception table */ rt6_flush_exceptions(rt); fib6_drop_pcpu_from(rt, table); if (rt->nh && !list_empty(&rt->nh_list)) list_del_init(&rt->nh_list); if (refcount_read(&rt->fib6_ref) != 1) { /* This route is used as dummy address holder in some split * nodes. It is not leaked, but it still holds other resources, * which must be released in time. So, scan ascendant nodes * and replace dummy references to this route with references * to still alive ones. */ while (fn) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_leaf; if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) { new_leaf = fib6_find_prefix(net, table, fn); fib6_info_hold(new_leaf); rcu_assign_pointer(fn->leaf, new_leaf); fib6_info_release(rt); } fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); } } fib6_clean_expires_locked(rt); } /* * Insert routing information in a node. */ static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_info *leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&rt->fib6_table->tb6_lock)); struct fib6_info *iter = NULL; struct fib6_info __rcu **ins; struct fib6_info __rcu **fallback_ins = NULL; int replace = (info->nlh && (info->nlh->nlmsg_flags & NLM_F_REPLACE)); int add = (!info->nlh || (info->nlh->nlmsg_flags & NLM_F_CREATE)); int found = 0; bool rt_can_ecmp = rt6_qualify_for_ecmp(rt); bool notify_sibling_rt = false; u16 nlflags = NLM_F_EXCL; int err; if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND)) nlflags |= NLM_F_APPEND; ins = &fn->leaf; for (iter = leaf; iter; iter = rcu_dereference_protected(iter->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock))) { /* * Search for duplicates */ if (iter->fib6_metric == rt->fib6_metric) { /* * Same priority level */ if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_EXCL)) return -EEXIST; nlflags &= ~NLM_F_EXCL; if (replace) { if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) { found++; break; } fallback_ins = fallback_ins ?: ins; goto next_iter; } if (rt6_duplicate_nexthop(iter, rt)) { if (rt->fib6_nsiblings) rt->fib6_nsiblings = 0; if (!(iter->fib6_flags & RTF_EXPIRES)) return -EEXIST; if (!(rt->fib6_flags & RTF_EXPIRES)) fib6_clean_expires_locked(iter); else fib6_set_expires_locked(iter, rt->expires); if (rt->fib6_pmtu) fib6_metric_set(iter, RTAX_MTU, rt->fib6_pmtu); return -EEXIST; } /* If we have the same destination and the same metric, * but not the same gateway, then the route we try to * add is sibling to this route, increment our counter * of siblings, and later we will add our route to the * list. * Only static routes (which don't have flag * RTF_EXPIRES) are used for ECMPv6. * * To avoid long list, we only had siblings if the * route have a gateway. */ if (rt_can_ecmp && rt6_qualify_for_ecmp(iter)) rt->fib6_nsiblings++; } if (iter->fib6_metric > rt->fib6_metric) break; next_iter: ins = &iter->fib6_next; } if (fallback_ins && !found) { /* No matching route with same ecmp-able-ness found, replace * first matching route */ ins = fallback_ins; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); found++; } /* Reset round-robin state, if necessary */ if (ins == &fn->leaf) fn->rr_ptr = NULL; /* Link this route to others same route. */ if (rt->fib6_nsiblings) { unsigned int fib6_nsiblings; struct fib6_info *sibling, *temp_sibling; /* Find the first route that have the same metric */ sibling = leaf; notify_sibling_rt = true; while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { list_add_tail(&rt->fib6_siblings, &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, lockdep_is_held(&rt->fib6_table->tb6_lock)); notify_sibling_rt = false; } /* For each sibling in the list, increment the counter of * siblings. BUG() if counters does not match, list of siblings * is broken! */ fib6_nsiblings = 0; list_for_each_entry_safe(sibling, temp_sibling, &rt->fib6_siblings, fib6_siblings) { sibling->fib6_nsiblings++; BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings); fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); rt6_multipath_rebalance(temp_sibling); } /* * insert node */ if (!replace) { if (!add) pr_warn("NLM_F_CREATE should be set when creating new route\n"); add: nlflags |= NLM_F_CREATE; /* The route should only be notified if it is the first * route in the node or if it is added as a sibling * route to the first route in the node. */ if (!info->skip_notify_kernel && (notify_sibling_rt || ins == &fn->leaf)) { enum fib_event_type fib_event; if (notify_sibling_rt) fib_event = FIB_EVENT_ENTRY_APPEND; else fib_event = FIB_EVENT_ENTRY_REPLACE; err = call_fib6_entry_notifiers(info->nl_net, fib_event, rt, extack); if (err) { struct fib6_info *sibling, *next_sibling; /* If the route has siblings, then it first * needs to be unlinked from them. */ if (!rt->fib6_nsiblings) return err; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } } rcu_assign_pointer(rt->fib6_next, iter); fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } } else { int nsiblings; if (!found) { if (add) goto add; pr_warn("NLM_F_REPLACE set, but no existing node found!\n"); return -ENOENT; } if (!info->skip_notify_kernel && ins == &fn->leaf) { err = call_fib6_entry_notifiers(info->nl_net, FIB_EVENT_ENTRY_REPLACE, rt, extack); if (err) return err; } fib6_info_hold(rt); rcu_assign_pointer(rt->fib6_node, fn); rt->fib6_next = iter->fib6_next; rcu_assign_pointer(*ins, rt); if (!info->skip_notify) inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); if (!(fn->fn_flags & RTN_RTINFO)) { info->nl_net->ipv6.rt6_stats->fib_route_nodes++; fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->fib6_nsiblings; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); if (nsiblings) { /* Replacing an ECMP route, remove all siblings */ ins = &rt->fib6_next; iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); while (iter) { if (iter->fib6_metric > rt->fib6_metric) break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->fib6_next; iter->fib6_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (rcu_access_pointer(fn->rr_ptr) == iter) fn->rr_ptr = NULL; fib6_info_release(iter); nsiblings--; info->nl_net->ipv6.rt6_stats->fib_rt_entries--; } else { ins = &iter->fib6_next; } iter = rcu_dereference_protected(*ins, lockdep_is_held(&rt->fib6_table->tb6_lock)); } WARN_ON(nsiblings != 0); } } return 0; } static void fib6_start_gc(struct net *net, struct fib6_info *rt) { if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, int sernum) { struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&rt->fib6_table->tb6_lock)); /* paired with smp_rmb() in fib6_get_cookie_safe() */ smp_wmb(); while (fn) { WRITE_ONCE(fn->fn_sernum, sernum); fn = rcu_dereference_protected(fn->parent, lockdep_is_held(&rt->fib6_table->tb6_lock)); } } void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt) { __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net)); } /* allow ipv4 to update sernum via ipv6_stub */ void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i) { spin_lock_bh(&f6i->fib6_table->tb6_lock); fib6_update_sernum_upto_root(net, f6i); spin_unlock_bh(&f6i->fib6_table->tb6_lock); } /* * Add routing information to the routing tree. * <destination addr>/<source addr> * with source addr info in sub-trees * Need to own table->tb6_lock */ int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack) { struct fib6_table *table = rt->fib6_table; struct fib6_node *fn, *pn = NULL; int err = -ENOMEM; int allow_create = 1; int replace_required = 0; if (info->nlh) { if (!(info->nlh->nlmsg_flags & NLM_F_CREATE)) allow_create = 0; if (info->nlh->nlmsg_flags & NLM_F_REPLACE) replace_required = 1; } if (!allow_create && !replace_required) pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n"); fn = fib6_add_1(info->nl_net, table, root, &rt->fib6_dst.addr, rt->fib6_dst.plen, offsetof(struct fib6_info, fib6_dst), allow_create, replace_required, extack); if (IS_ERR(fn)) { err = PTR_ERR(fn); fn = NULL; goto out; } pn = fn; #ifdef CONFIG_IPV6_SUBTREES if (rt->fib6_src.plen) { struct fib6_node *sn; if (!rcu_access_pointer(fn->subtree)) { struct fib6_node *sfn; /* * Create subtree. * * fn[main tree] * | * sfn[subtree root] * \ * sn[new leaf node] */ /* Create subtree root node */ sfn = node_alloc(info->nl_net); if (!sfn) goto failure; fib6_info_hold(info->nl_net->ipv6.fib6_null_entry); rcu_assign_pointer(sfn->leaf, info->nl_net->ipv6.fib6_null_entry); sfn->fn_flags = RTN_ROOT; /* Now add the first leaf node to new subtree */ sn = fib6_add_1(info->nl_net, table, sfn, &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { /* If it is failed, discard just allocated root, and then (in failure) stale node in main tree. */ node_free_immediate(info->nl_net, sfn); err = PTR_ERR(sn); goto failure; } /* Now link new subtree to main tree */ rcu_assign_pointer(sfn->parent, fn); rcu_assign_pointer(fn->subtree, sfn); } else { sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn), &rt->fib6_src.addr, rt->fib6_src.plen, offsetof(struct fib6_info, fib6_src), allow_create, replace_required, extack); if (IS_ERR(sn)) { err = PTR_ERR(sn); goto failure; } } if (!rcu_access_pointer(fn->leaf)) { if (fn->fn_flags & RTN_TL_ROOT) { /* put back null_entry for root node */ rcu_assign_pointer(fn->leaf, info->nl_net->ipv6.fib6_null_entry); } else { fib6_info_hold(rt); rcu_assign_pointer(fn->leaf, rt); } } fn = sn; } #endif err = fib6_add_rt2node(fn, rt, info, extack); if (!err) { if (rt->nh) list_add(&rt->nh_list, &rt->nh->f6i_list); __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net)); if (fib6_has_expires(rt)) hlist_add_head(&rt->gc_link, &table->tb6_gc_hlist); fib6_start_gc(info->nl_net, rt); } out: if (err) { #ifdef CONFIG_IPV6_SUBTREES /* * If fib6_add_1 has cleared the old leaf pointer in the * super-tree leaf node we have to find a new one for it. */ if (pn != fn) { struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); if (pn_leaf == rt) { pn_leaf = NULL; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(rt); } if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) { pn_leaf = fib6_find_prefix(info->nl_net, table, pn); #if RT6_DEBUG >= 2 if (!pn_leaf) { WARN_ON(!pn_leaf); pn_leaf = info->nl_net->ipv6.fib6_null_entry; } #endif fib6_info_hold(pn_leaf); rcu_assign_pointer(pn->leaf, pn_leaf); } } #endif goto failure; } else if (fib6_requires_src(rt)) { fib6_routes_require_src_inc(info->nl_net); } return err; failure: /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if: * 1. fn is an intermediate node and we failed to add the new * route to it in both subtree creation failure and fib6_add_rt2node() * failure case. * 2. fn is the root node in the table and we fail to add the first * default route to it. */ if (fn && (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) || (fn->fn_flags & RTN_TL_ROOT && !rcu_access_pointer(fn->leaf)))) fib6_repair_tree(info->nl_net, table, fn); return err; } /* * Routing tree lookup * */ struct lookup_args { int offset; /* key offset on fib6_info */ const struct in6_addr *addr; /* search key */ }; static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root, struct lookup_args *args) { struct fib6_node *fn; __be32 dir; if (unlikely(args->offset == 0)) return NULL; /* * Descend on a tree */ fn = root; for (;;) { struct fib6_node *next; dir = addr_bit_set(args->addr, fn->fn_bit); next = dir ? rcu_dereference(fn->right) : rcu_dereference(fn->left); if (next) { fn = next; continue; } break; } while (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree || fn->fn_flags & RTN_RTINFO) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; if (!leaf) goto backtrack; key = (struct rt6key *) ((u8 *)leaf + args->offset); if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) { #ifdef CONFIG_IPV6_SUBTREES if (subtree) { struct fib6_node *sfn; sfn = fib6_node_lookup_1(subtree, args + 1); if (!sfn) goto backtrack; fn = sfn; } #endif if (fn->fn_flags & RTN_RTINFO) return fn; } } backtrack: if (fn->fn_flags & RTN_ROOT) break; fn = rcu_dereference(fn->parent); } return NULL; } /* called with rcu_read_lock() held */ struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct fib6_node *fn; struct lookup_args args[] = { { .offset = offsetof(struct fib6_info, fib6_dst), .addr = daddr, }, #ifdef CONFIG_IPV6_SUBTREES { .offset = offsetof(struct fib6_info, fib6_src), .addr = saddr, }, #endif { .offset = 0, /* sentinel */ } }; fn = fib6_node_lookup_1(root, daddr ? args : args + 1); if (!fn || fn->fn_flags & RTN_TL_ROOT) fn = root; return fn; } /* * Get node with specified destination prefix (and source prefix, * if subtrees are used) * exact_match == true means we try to find fn with exact match of * the passed in prefix addr * exact_match == false means we try to find fn with longest prefix * match of the passed in prefix addr. This is useful for finding fn * for cached route as it will be stored in the exception table under * the node with longest prefix length. */ static struct fib6_node *fib6_locate_1(struct fib6_node *root, const struct in6_addr *addr, int plen, int offset, bool exact_match) { struct fib6_node *fn, *prev = NULL; for (fn = root; fn ; ) { struct fib6_info *leaf = rcu_dereference(fn->leaf); struct rt6key *key; /* This node is being deleted */ if (!leaf) { if (plen <= fn->fn_bit) goto out; else goto next; } key = (struct rt6key *)((u8 *)leaf + offset); /* * Prefix match */ if (plen < fn->fn_bit || !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) goto out; if (plen == fn->fn_bit) return fn; if (fn->fn_flags & RTN_RTINFO) prev = fn; next: /* * We have more bits to go */ if (addr_bit_set(addr, fn->fn_bit)) fn = rcu_dereference(fn->right); else fn = rcu_dereference(fn->left); } out: if (exact_match) return NULL; else return prev; } struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match) { struct fib6_node *fn; fn = fib6_locate_1(root, daddr, dst_len, offsetof(struct fib6_info, fib6_dst), exact_match); #ifdef CONFIG_IPV6_SUBTREES if (src_len) { WARN_ON(saddr == NULL); if (fn) { struct fib6_node *subtree = FIB6_SUBTREE(fn); if (subtree) { fn = fib6_locate_1(subtree, saddr, src_len, offsetof(struct fib6_info, fib6_src), exact_match); } } } #endif if (fn && fn->fn_flags & RTN_RTINFO) return fn; return NULL; } /* * Deletion * */ static struct fib6_info *fib6_find_prefix(struct net *net, struct fib6_table *table, struct fib6_node *fn) { struct fib6_node *child_left, *child_right; if (fn->fn_flags & RTN_ROOT) return net->ipv6.fib6_null_entry; while (fn) { child_left = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); child_right = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); if (child_left) return rcu_dereference_protected(child_left->leaf, lockdep_is_held(&table->tb6_lock)); if (child_right) return rcu_dereference_protected(child_right->leaf, lockdep_is_held(&table->tb6_lock)); fn = FIB6_SUBTREE(fn); } return NULL; } /* * Called to trim the tree of intermediate nodes when possible. "fn" * is the node we want to try and remove. * Need to own table->tb6_lock */ static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_table *table, struct fib6_node *fn) { int children; int nstate; struct fib6_node *child; struct fib6_walker *w; int iter = 0; /* Set fn->leaf to null_entry for root node. */ if (fn->fn_flags & RTN_TL_ROOT) { rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry); return fn; } for (;;) { struct fib6_node *fn_r = rcu_dereference_protected(fn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *fn_l = rcu_dereference_protected(fn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn = rcu_dereference_protected(fn->parent, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_r = rcu_dereference_protected(pn->right, lockdep_is_held(&table->tb6_lock)); struct fib6_node *pn_l = rcu_dereference_protected(pn->left, lockdep_is_held(&table->tb6_lock)); struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf, lockdep_is_held(&table->tb6_lock)); struct fib6_info *new_fn_leaf; RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); iter++; WARN_ON(fn->fn_flags & RTN_RTINFO); WARN_ON(fn->fn_flags & RTN_TL_ROOT); WARN_ON(fn_leaf); children = 0; child = NULL; if (fn_r) { child = fn_r; children |= 1; } if (fn_l) { child = fn_l; children |= 2; } if (children == 3 || FIB6_SUBTREE(fn) #ifdef CONFIG_IPV6_SUBTREES /* Subtree root (i.e. fn) may have one child */ || (children && fn->fn_flags & RTN_ROOT) #endif ) { new_fn_leaf = fib6_find_prefix(net, table, fn); #if RT6_DEBUG >= 2 if (!new_fn_leaf) { WARN_ON(!new_fn_leaf); new_fn_leaf = net->ipv6.fib6_null_entry; } #endif fib6_info_hold(new_fn_leaf); rcu_assign_pointer(fn->leaf, new_fn_leaf); return pn; } #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); RCU_INIT_POINTER(pn->subtree, NULL); nstate = FWS_L; } else { WARN_ON(fn->fn_flags & RTN_ROOT); #endif if (pn_r == fn) rcu_assign_pointer(pn->right, child); else if (pn_l == fn) rcu_assign_pointer(pn->left, child); #if RT6_DEBUG >= 2 else WARN_ON(1); #endif if (child) rcu_assign_pointer(child->parent, pn); nstate = FWS_R; #ifdef CONFIG_IPV6_SUBTREES } #endif read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (!child) { if (w->node == fn) { RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); w->node = pn; w->state = nstate; } } else { if (w->node == fn) { w->node = child; if (children&2) { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_R ? FWS_U : FWS_INIT; } else { RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); w->state = w->state >= FWS_C ? FWS_U : FWS_INIT; } } } } read_unlock(&net->ipv6.fib6_walker_lock); node_free(net, fn); if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn)) return pn; RCU_INIT_POINTER(pn->leaf, NULL); fib6_info_release(pn_leaf); fn = pn; } } static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, struct fib6_info __rcu **rtp, struct nl_info *info) { struct fib6_info *leaf, *replace_rt = NULL; struct fib6_walker *w; struct fib6_info *rt = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); struct net *net = info->nl_net; bool notify_del = false; RT6_TRACE("fib6_del_route\n"); /* If the deleted route is the first in the node and it is not part of * a multipath route, then we need to replace it with the next route * in the node, if exists. */ leaf = rcu_dereference_protected(fn->leaf, lockdep_is_held(&table->tb6_lock)); if (leaf == rt && !rt->fib6_nsiblings) { if (rcu_access_pointer(rt->fib6_next)) replace_rt = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); else notify_del = true; } /* Unlink it */ *rtp = rt->fib6_next; rt->fib6_node = NULL; net->ipv6.rt6_stats->fib_rt_entries--; net->ipv6.rt6_stats->fib_discarded_routes++; /* Reset round-robin state, if necessary */ if (rcu_access_pointer(fn->rr_ptr) == rt) fn->rr_ptr = NULL; /* Remove this entry from other siblings */ if (rt->fib6_nsiblings) { struct fib6_info *sibling, *next_sibling; /* The route is deleted from a multipath route. If this * multipath route is the first route in the node, then we need * to emit a delete notification. Otherwise, we need to skip * the notification. */ if (rt->fib6_metric == leaf->fib6_metric && rt6_qualify_for_ecmp(leaf)) notify_del = true; list_for_each_entry_safe(sibling, next_sibling, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_init(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } /* Adjust walkers */ read_lock(&net->ipv6.fib6_walker_lock); FOR_WALKERS(net, w) { if (w->state == FWS_C && w->leaf == rt) { RT6_TRACE("walker %p adjusted by delroute\n", w); w->leaf = rcu_dereference_protected(rt->fib6_next, lockdep_is_held(&table->tb6_lock)); if (!w->leaf) w->state = FWS_U; } } read_unlock(&net->ipv6.fib6_walker_lock); /* If it was last route, call fib6_repair_tree() to: * 1. For root node, put back null_entry as how the table was created. * 2. For other nodes, expunge its radix tree node. */ if (!rcu_access_pointer(fn->leaf)) { if (!(fn->fn_flags & RTN_TL_ROOT)) { fn->fn_flags &= ~RTN_RTINFO; net->ipv6.rt6_stats->fib_route_nodes--; } fn = fib6_repair_tree(net, table, fn); } fib6_purge_rt(rt, fn, net); if (!info->skip_notify_kernel) { if (notify_del) call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL); else if (replace_rt) call_fib6_entry_notifiers_replace(net, replace_rt); } if (!info->skip_notify) inet6_rt_notify(RTM_DELROUTE, rt, info, 0); fib6_info_release(rt); } /* Need to own table->tb6_lock */ int fib6_del(struct fib6_info *rt, struct nl_info *info) { struct net *net = info->nl_net; struct fib6_info __rcu **rtp; struct fib6_info __rcu **rtp_next; struct fib6_table *table; struct fib6_node *fn; if (rt == net->ipv6.fib6_null_entry) return -ENOENT; table = rt->fib6_table; fn = rcu_dereference_protected(rt->fib6_node, lockdep_is_held(&table->tb6_lock)); if (!fn) return -ENOENT; WARN_ON(!(fn->fn_flags & RTN_RTINFO)); /* * Walk the leaf entries looking for ourself */ for (rtp = &fn->leaf; *rtp; rtp = rtp_next) { struct fib6_info *cur = rcu_dereference_protected(*rtp, lockdep_is_held(&table->tb6_lock)); if (rt == cur) { if (fib6_requires_src(cur)) fib6_routes_require_src_dec(info->nl_net); fib6_del_route(table, fn, rtp, info); return 0; } rtp_next = &cur->fib6_next; } return -ENOENT; } /* * Tree traversal function. * * Certainly, it is not interrupt safe. * However, it is internally reenterable wrt itself and fib6_add/fib6_del. * It means, that we can modify tree during walking * and use this function for garbage collection, clone pruning, * cleaning tree when a device goes down etc. etc. * * It guarantees that every node will be traversed, * and that it will be traversed only once. * * Callback function w->func may return: * 0 -> continue walking. * positive value -> walking is suspended (used by tree dumps, * and probably by gc, if it will be split to several slices) * negative value -> terminate walking. * * The function itself returns: * 0 -> walk is complete. * >0 -> walk is incomplete (i.e. suspended) * <0 -> walk is terminated by an error. * * This function is called with tb6_lock held. */ static int fib6_walk_continue(struct fib6_walker *w) { struct fib6_node *fn, *pn, *left, *right; /* w->root should always be table->tb6_root */ WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT)); for (;;) { fn = w->node; if (!fn) return 0; switch (w->state) { #ifdef CONFIG_IPV6_SUBTREES case FWS_S: if (FIB6_SUBTREE(fn)) { w->node = FIB6_SUBTREE(fn); continue; } w->state = FWS_L; fallthrough; #endif case FWS_L: left = rcu_dereference_protected(fn->left, 1); if (left) { w->node = left; w->state = FWS_INIT; continue; } w->state = FWS_R; fallthrough; case FWS_R: right = rcu_dereference_protected(fn->right, 1); if (right) { w->node = right; w->state = FWS_INIT; continue; } w->state = FWS_C; w->leaf = rcu_dereference_protected(fn->leaf, 1); fallthrough; case FWS_C: if (w->leaf && fn->fn_flags & RTN_RTINFO) { int err; if (w->skip) { w->skip--; goto skip; } err = w->func(w); if (err) return err; w->count++; continue; } skip: w->state = FWS_U; fallthrough; case FWS_U: if (fn == w->root) return 0; pn = rcu_dereference_protected(fn->parent, 1); left = rcu_dereference_protected(pn->left, 1); right = rcu_dereference_protected(pn->right, 1); w->node = pn; #ifdef CONFIG_IPV6_SUBTREES if (FIB6_SUBTREE(pn) == fn) { WARN_ON(!(fn->fn_flags & RTN_ROOT)); w->state = FWS_L; continue; } #endif if (left == fn) { w->state = FWS_R; continue; } if (right == fn) { w->state = FWS_C; w->leaf = rcu_dereference_protected(w->node->leaf, 1); continue; } #if RT6_DEBUG >= 2 WARN_ON(1); #endif } } } static int fib6_walk(struct net *net, struct fib6_walker *w) { int res; w->state = FWS_INIT; w->node = w->root; fib6_walker_link(net, w); res = fib6_walk_continue(w); if (res <= 0) fib6_walker_unlink(net, w); return res; } static int fib6_clean_node(struct fib6_walker *w) { int res; struct fib6_info *rt; struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w); struct nl_info info = { .nl_net = c->net, .skip_notify = c->skip_notify, }; if (c->sernum != FIB6_NO_SERNUM_CHANGE && READ_ONCE(w->node->fn_sernum) != c->sernum) WRITE_ONCE(w->node->fn_sernum, c->sernum); if (!c->func) { WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); w->leaf = NULL; return 0; } for_each_fib6_walker_rt(w) { res = c->func(rt, c->arg); if (res == -1) { w->leaf = rt; res = fib6_del(rt, &info); if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", __func__, rt, rcu_access_pointer(rt->fib6_node), res); #endif continue; } return 0; } else if (res == -2) { if (WARN_ON(!rt->fib6_nsiblings)) continue; rt = list_last_entry(&rt->fib6_siblings, struct fib6_info, fib6_siblings); continue; } WARN_ON(res != 0); } w->leaf = rt; return 0; } /* * Convenient frontend to tree walker. * * func is called on each route. * It may return -2 -> skip multipath route. * -1 -> delete this route. * 0 -> continue walking */ static void fib6_clean_tree(struct net *net, struct fib6_node *root, int (*func)(struct fib6_info *, void *arg), int sernum, void *arg, bool skip_notify) { struct fib6_cleaner c; c.w.root = root; c.w.func = fib6_clean_node; c.w.count = 0; c.w.skip = 0; c.w.skip_in_node = 0; c.func = func; c.sernum = sernum; c.arg = arg; c.net = net; c.skip_notify = skip_notify; fib6_walk(net, &c.w); } static void __fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), int sernum, void *arg, bool skip_notify) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_clean_tree(net, &table->tb6_root, func, sernum, arg, skip_notify); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false); } void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *), void *arg) { __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true); } static void fib6_flush_trees(struct net *net) { int new_sernum = fib6_new_sernum(net); __fib6_clean_all(net, NULL, new_sernum, NULL, false); } /* * Garbage collection */ static int fib6_age(struct fib6_info *rt, struct fib6_gc_args *gc_args) { unsigned long now = jiffies; /* * check addrconf expiration here. * Routes are expired even if they are in use. */ if (fib6_has_expires(rt) && rt->expires) { if (time_after(now, rt->expires)) { RT6_TRACE("expiring %p\n", rt); return -1; } gc_args->more++; } /* Also age clones in the exception table. * Note, that clones are aged out * only if they are not in use now. */ rt6_age_exceptions(rt, gc_args, now); return 0; } static void fib6_gc_table(struct net *net, struct fib6_table *tb6, struct fib6_gc_args *gc_args) { struct fib6_info *rt; struct hlist_node *n; struct nl_info info = { .nl_net = net, .skip_notify = false, }; hlist_for_each_entry_safe(rt, n, &tb6->tb6_gc_hlist, gc_link) if (fib6_age(rt, gc_args) == -1) fib6_del(rt, &info); } static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) { struct fib6_table *table; struct hlist_head *head; unsigned int h; rcu_read_lock(); for (h = 0; h < FIB6_TABLE_HASHSZ; h++) { head = &net->ipv6.fib_table_hash[h]; hlist_for_each_entry_rcu(table, head, tb6_hlist) { spin_lock_bh(&table->tb6_lock); fib6_gc_table(net, table, gc_args); spin_unlock_bh(&table->tb6_lock); } } rcu_read_unlock(); } void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; unsigned long now; if (force) { spin_lock_bh(&net->ipv6.fib6_gc_lock); } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) { mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } gc_args.timeout = expires ? (int)expires : net->ipv6.sysctl.ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); now = jiffies; net->ipv6.ip6_rt_last_gc = now; if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, round_jiffies(now + net->ipv6.sysctl.ip6_rt_gc_interval)); else del_timer(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); } static void fib6_gc_timer_cb(struct timer_list *t) { struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer); fib6_run_gc(0, arg, true); } static int __net_init fib6_net_init(struct net *net) { size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; int err; err = fib6_notifier_init(net); if (err) return err; /* Default to 3-tuple */ net->ipv6.sysctl.multipath_hash_fields = FIB_MULTIPATH_HASH_FIELD_DEFAULT_MASK; spin_lock_init(&net->ipv6.fib6_gc_lock); rwlock_init(&net->ipv6.fib6_walker_lock); INIT_LIST_HEAD(&net->ipv6.fib6_walkers); timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_notifier; /* Avoid false sharing : Use at least a full cache line */ size = max_t(size_t, size, L1_CACHE_BYTES); net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl), GFP_KERNEL); if (!net->ipv6.fib6_main_tbl) goto out_fib_table_hash; net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN; rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_main_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers); #ifdef CONFIG_IPV6_MULTIPLE_TABLES net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl), GFP_KERNEL); if (!net->ipv6.fib6_local_tbl) goto out_fib6_main_tbl; net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL; rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf, net->ipv6.fib6_null_entry); net->ipv6.fib6_local_tbl->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO; inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers); #endif fib6_tables_init(net); return 0; #ifdef CONFIG_IPV6_MULTIPLE_TABLES out_fib6_main_tbl: kfree(net->ipv6.fib6_main_tbl); #endif out_fib_table_hash: kfree(net->ipv6.fib_table_hash); out_rt6_stats: kfree(net->ipv6.rt6_stats); out_notifier: fib6_notifier_exit(net); return -ENOMEM; } static void fib6_net_exit(struct net *net) { unsigned int i; del_timer_sync(&net->ipv6.ip6_fib_timer); for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { hlist_del(&tb->tb6_hlist); fib6_free_table(tb); } } kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); fib6_notifier_exit(net); } static struct pernet_operations fib6_net_ops = { .init = fib6_net_init, .exit = fib6_net_exit, }; int __init fib6_init(void) { int ret = -ENOMEM; fib6_node_kmem = kmem_cache_create("fib6_nodes", sizeof(struct fib6_node), 0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL); if (!fib6_node_kmem) goto out; ret = register_pernet_subsys(&fib6_net_ops); if (ret) goto out_kmem_cache_create; ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL, inet6_dump_fib, 0); if (ret) goto out_unregister_subsys; __fib6_flush_trees = fib6_flush_trees; out: return ret; out_unregister_subsys: unregister_pernet_subsys(&fib6_net_ops); out_kmem_cache_create: kmem_cache_destroy(fib6_node_kmem); goto out; } void fib6_gc_cleanup(void) { unregister_pernet_subsys(&fib6_net_ops); kmem_cache_destroy(fib6_node_kmem); } #ifdef CONFIG_PROC_FS static int ipv6_route_native_seq_show(struct seq_file *seq, void *v) { struct fib6_info *rt = v; struct ipv6_route_iter *iter = seq->private; struct fib6_nh *fib6_nh = rt->fib6_nh; unsigned int flags = rt->fib6_flags; const struct net_device *dev; if (rt->nh) fib6_nh = nexthop_fib6_nh(rt->nh); seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen); #ifdef CONFIG_IPV6_SUBTREES seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen); #else seq_puts(seq, "00000000000000000000000000000000 00 "); #endif if (fib6_nh->fib_nh_gw_family) { flags |= RTF_GATEWAY; seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6); } else { seq_puts(seq, "00000000000000000000000000000000"); } dev = fib6_nh->fib_nh_dev; seq_printf(seq, " %08x %08x %08x %08x %8s\n", rt->fib6_metric, refcount_read(&rt->fib6_ref), 0, flags, dev ? dev->name : ""); iter->w.leaf = NULL; return 0; } static int ipv6_route_yield(struct fib6_walker *w) { struct ipv6_route_iter *iter = w->args; if (!iter->skip) return 1; do { iter->w.leaf = rcu_dereference_protected( iter->w.leaf->fib6_next, lockdep_is_held(&iter->tbl->tb6_lock)); iter->skip--; if (!iter->skip && iter->w.leaf) return 1; } while (iter->w.leaf); return 0; } static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, struct net *net) { memset(&iter->w, 0, sizeof(iter->w)); iter->w.func = ipv6_route_yield; iter->w.root = &iter->tbl->tb6_root; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; iter->w.args = iter; iter->sernum = READ_ONCE(iter->w.root->fn_sernum); INIT_LIST_HEAD(&iter->w.lh); fib6_walker_link(net, &iter->w); } static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, struct net *net) { unsigned int h; struct hlist_node *node; if (tbl) { h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1; node = rcu_dereference(hlist_next_rcu(&tbl->tb6_hlist)); } else { h = 0; node = NULL; } while (!node && h < FIB6_TABLE_HASHSZ) { node = rcu_dereference( hlist_first_rcu(&net->ipv6.fib_table_hash[h++])); } return hlist_entry_safe(node, struct fib6_table, tb6_hlist); } static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) { int sernum = READ_ONCE(iter->w.root->fn_sernum); if (iter->sernum != sernum) { iter->sernum = sernum; iter->w.state = FWS_INIT; iter->w.node = iter->w.root; WARN_ON(iter->w.skip); iter->w.skip = iter->w.count; } } static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int r; struct fib6_info *n; struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; ++(*pos); if (!v) goto iter_table; n = rcu_dereference(((struct fib6_info *)v)->fib6_next); if (n) return n; iter_table: ipv6_route_check_sernum(iter); spin_lock_bh(&iter->tbl->tb6_lock); r = fib6_walk_continue(&iter->w); spin_unlock_bh(&iter->tbl->tb6_lock); if (r > 0) { return iter->w.leaf; } else if (r < 0) { fib6_walker_unlink(net, &iter->w); return NULL; } fib6_walker_unlink(net, &iter->w); iter->tbl = ipv6_route_seq_next_table(iter->tbl, net); if (!iter->tbl) return NULL; ipv6_route_seq_setup_walk(iter, net); goto iter_table; } static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) __acquires(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; rcu_read_lock(); iter->tbl = ipv6_route_seq_next_table(NULL, net); iter->skip = *pos; if (iter->tbl) { loff_t p = 0; ipv6_route_seq_setup_walk(iter, net); return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } } static bool ipv6_route_iter_active(struct ipv6_route_iter *iter) { struct fib6_walker *w = &iter->w; return w->node && !(w->state == FWS_U && w->node == w->root); } static void ipv6_route_native_seq_stop(struct seq_file *seq, void *v) __releases(RCU) { struct net *net = seq_file_net(seq); struct ipv6_route_iter *iter = seq->private; if (ipv6_route_iter_active(iter)) fib6_walker_unlink(net, &iter->w); rcu_read_unlock(); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) static int ipv6_route_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, void *v) { struct bpf_iter__ipv6_route ctx; ctx.meta = meta; ctx.rt = v; return bpf_iter_run_prog(prog, &ctx); } static int ipv6_route_seq_show(struct seq_file *seq, void *v) { struct ipv6_route_iter *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; int ret; meta.seq = seq; prog = bpf_iter_get_info(&meta, false); if (!prog) return ipv6_route_native_seq_show(seq, v); ret = ipv6_route_prog_seq_show(prog, &meta, v); iter->w.leaf = NULL; return ret; } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; if (!v) { meta.seq = seq; prog = bpf_iter_get_info(&meta, true); if (prog) (void)ipv6_route_prog_seq_show(prog, &meta, v); } ipv6_route_native_seq_stop(seq, v); } #else static int ipv6_route_seq_show(struct seq_file *seq, void *v) { return ipv6_route_native_seq_show(seq, v); } static void ipv6_route_seq_stop(struct seq_file *seq, void *v) { ipv6_route_native_seq_stop(seq, v); } #endif const struct seq_operations ipv6_route_seq_ops = { .start = ipv6_route_seq_start, .next = ipv6_route_seq_next, .stop = ipv6_route_seq_stop, .show = ipv6_route_seq_show }; #endif /* CONFIG_PROC_FS */
3 35 35 27 51 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 /* SPDX-License-Identifier: GPL-2.0-only */ /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #ifndef __IOMMUFD_PRIVATE_H #define __IOMMUFD_PRIVATE_H #include <linux/rwsem.h> #include <linux/xarray.h> #include <linux/refcount.h> #include <linux/uaccess.h> #include <linux/iommu.h> #include <linux/iova_bitmap.h> #include <uapi/linux/iommufd.h> struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; struct iommufd_ctx { struct file *file; struct xarray objects; struct xarray groups; u8 account_mode; /* Compatibility with VFIO no iommu */ u8 no_iommu_mode; struct iommufd_ioas *vfio_ioas; }; /* * The IOVA to PFN map. The map automatically copies the PFNs into multiple * domains and permits sharing of PFNs between io_pagetable instances. This * supports both a design where IOAS's are 1:1 with a domain (eg because the * domain is HW customized), or where the IOAS is 1:N with multiple generic * domains. The io_pagetable holds an interval tree of iopt_areas which point * to shared iopt_pages which hold the pfns mapped to the page table. * * The locking order is domains_rwsem -> iova_rwsem -> pages::mutex */ struct io_pagetable { struct rw_semaphore domains_rwsem; struct xarray domains; struct xarray access_list; unsigned int next_domain_id; struct rw_semaphore iova_rwsem; struct rb_root_cached area_itree; /* IOVA that cannot become reserved, struct iopt_allowed */ struct rb_root_cached allowed_itree; /* IOVA that cannot be allocated, struct iopt_reserved */ struct rb_root_cached reserved_itree; u8 disable_large_pages; unsigned long iova_alignment; }; void iopt_init_table(struct io_pagetable *iopt); void iopt_destroy_table(struct io_pagetable *iopt); int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova, unsigned long length, struct list_head *pages_list); void iopt_free_pages_list(struct list_head *pages_list); enum { IOPT_ALLOC_IOVA = 1 << 0, }; int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt, unsigned long *iova, void __user *uptr, unsigned long length, int iommu_prot, unsigned int flags); int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list, unsigned long length, unsigned long *dst_iova, int iommu_prot, unsigned int flags); int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova, unsigned long length, unsigned long *unmapped); int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped); int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt, struct iommu_domain *domain, unsigned long flags, struct iommu_hwpt_get_dirty_bitmap *bitmap); int iopt_set_dirty_tracking(struct io_pagetable *iopt, struct iommu_domain *domain, bool enable); void iommufd_access_notify_unmap(struct io_pagetable *iopt, unsigned long iova, unsigned long length); int iopt_table_add_domain(struct io_pagetable *iopt, struct iommu_domain *domain); void iopt_table_remove_domain(struct io_pagetable *iopt, struct iommu_domain *domain); int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt, struct device *dev, phys_addr_t *sw_msi_start); int iopt_set_allow_iova(struct io_pagetable *iopt, struct rb_root_cached *allowed_iova); int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start, unsigned long last, void *owner); void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner); int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas, size_t num_iovas); void iopt_enable_large_pages(struct io_pagetable *iopt); int iopt_disable_large_pages(struct io_pagetable *iopt); struct iommufd_ucmd { struct iommufd_ctx *ictx; void __user *ubuffer; u32 user_size; void *cmd; }; int iommufd_vfio_ioctl(struct iommufd_ctx *ictx, unsigned int cmd, unsigned long arg); /* Copy the response in ucmd->cmd back to userspace. */ static inline int iommufd_ucmd_respond(struct iommufd_ucmd *ucmd, size_t cmd_len) { if (copy_to_user(ucmd->ubuffer, ucmd->cmd, min_t(size_t, ucmd->user_size, cmd_len))) return -EFAULT; return 0; } enum iommufd_object_type { IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_ANY = IOMMUFD_OBJ_NONE, IOMMUFD_OBJ_DEVICE, IOMMUFD_OBJ_HWPT_PAGING, IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif IOMMUFD_OBJ_MAX, }; /* Base struct for all objects with a userspace ID handle. */ struct iommufd_object { struct rw_semaphore destroy_rwsem; refcount_t users; enum iommufd_object_type type; unsigned int id; }; static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!down_read_trylock(&obj->destroy_rwsem)) return false; if (!refcount_inc_not_zero(&obj->users)) { up_read(&obj->destroy_rwsem); return false; } return true; } struct iommufd_object *iommufd_get_object(struct iommufd_ctx *ictx, u32 id, enum iommufd_object_type type); static inline void iommufd_put_object(struct iommufd_object *obj) { refcount_dec(&obj->users); up_read(&obj->destroy_rwsem); } void iommufd_object_abort(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_abort_and_destroy(struct iommufd_ctx *ictx, struct iommufd_object *obj); void iommufd_object_finalize(struct iommufd_ctx *ictx, struct iommufd_object *obj); void __iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj, bool allow_fail); static inline void iommufd_object_destroy_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { __iommufd_object_destroy_user(ictx, obj, false); } static inline void iommufd_object_deref_user(struct iommufd_ctx *ictx, struct iommufd_object *obj) { __iommufd_object_destroy_user(ictx, obj, true); } struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx, size_t size, enum iommufd_object_type type); #define __iommufd_object_alloc(ictx, ptr, type, obj) \ container_of(_iommufd_object_alloc( \ ictx, \ sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \ offsetof(typeof(*(ptr)), \ obj) != 0), \ type), \ typeof(*(ptr)), obj) #define iommufd_object_alloc(ictx, ptr, type) \ __iommufd_object_alloc(ictx, ptr, type, obj) /* * The IO Address Space (IOAS) pagetable is a virtual page table backed by the * io_pagetable object. It is a user controlled mapping of IOVA -> PFNs. The * mapping is copied into all of the associated domains and made available to * in-kernel users. * * Every iommu_domain that is created is wrapped in a iommufd_hw_pagetable * object. When we go to attach a device to an IOAS we need to get an * iommu_domain and wrapping iommufd_hw_pagetable for it. * * An iommu_domain & iommfd_hw_pagetable will be automatically selected * for a device based on the hwpt_list. If no suitable iommu_domain * is found a new iommu_domain will be created. */ struct iommufd_ioas { struct iommufd_object obj; struct io_pagetable iopt; struct mutex mutex; struct list_head hwpt_list; }; static inline struct iommufd_ioas *iommufd_get_ioas(struct iommufd_ctx *ictx, u32 id) { return container_of(iommufd_get_object(ictx, id, IOMMUFD_OBJ_IOAS), struct iommufd_ioas, obj); } struct iommufd_ioas *iommufd_ioas_alloc(struct iommufd_ctx *ictx); int iommufd_ioas_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_ioas_destroy(struct iommufd_object *obj); int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd); int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd); int iommufd_ioas_map(struct iommufd_ucmd *ucmd); int iommufd_ioas_copy(struct iommufd_ucmd *ucmd); int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd); int iommufd_ioas_option(struct iommufd_ucmd *ucmd); int iommufd_option_rlimit_mode(struct iommu_option *cmd, struct iommufd_ctx *ictx); int iommufd_vfio_ioas(struct iommufd_ucmd *ucmd); int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommu_hwpt_get_dirty_bitmap *bitmap); /* * A HW pagetable is called an iommu_domain inside the kernel. This user object * allows directly creating and inspecting the domains. Domains that have kernel * owned page tables will be associated with an iommufd_ioas that provides the * IOVA to PFN map. */ struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; }; struct iommufd_hwpt_paging { struct iommufd_hw_pagetable common; struct iommufd_ioas *ioas; bool auto_domain : 1; bool enforce_cache_coherency : 1; bool msi_cookie : 1; bool nest_parent : 1; /* Head at iommufd_ioas::hwpt_list */ struct list_head hwpt_item; }; struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) { return hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING; } static inline struct iommufd_hwpt_paging * to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) { return container_of(hwpt, struct iommufd_hwpt_paging, common); } static inline struct iommufd_hwpt_paging * iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_HWPT_PAGING), struct iommufd_hwpt_paging, common.obj); } int iommufd_hwpt_set_dirty_tracking(struct iommufd_ucmd *ucmd); int iommufd_hwpt_get_dirty_bitmap(struct iommufd_ucmd *ucmd); struct iommufd_hwpt_paging * iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, struct iommufd_device *idev, u32 flags, bool immediate_attach, const struct iommu_user_data *user_data); int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev); void iommufd_hwpt_paging_destroy(struct iommufd_object *obj); void iommufd_hwpt_paging_abort(struct iommufd_object *obj); void iommufd_hwpt_nested_destroy(struct iommufd_object *obj); void iommufd_hwpt_nested_abort(struct iommufd_object *obj); int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd); static inline void iommufd_hw_pagetable_put(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt) { if (hwpt->obj.type == IOMMUFD_OBJ_HWPT_PAGING) { struct iommufd_hwpt_paging *hwpt_paging = to_hwpt_paging(hwpt); lockdep_assert_not_held(&hwpt_paging->ioas->mutex); if (hwpt_paging->auto_domain) { iommufd_object_deref_user(ictx, &hwpt->obj); return; } } refcount_dec(&hwpt->obj.users); } struct iommufd_group { struct kref ref; struct mutex lock; struct iommufd_ctx *ictx; struct iommu_group *group; struct iommufd_hw_pagetable *hwpt; struct list_head device_list; phys_addr_t sw_msi_start; }; /* * A iommufd_device object represents the binding relationship between a * consuming driver and the iommufd. These objects are created/destroyed by * external drivers, not by userspace. */ struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_group *igroup; struct list_head group_item; /* always the physical device */ struct device *dev; bool enforce_cache_coherency; }; static inline struct iommufd_device * iommufd_get_device(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, IOMMUFD_OBJ_DEVICE), struct iommufd_device, obj); } void iommufd_device_destroy(struct iommufd_object *obj); int iommufd_get_hw_info(struct iommufd_ucmd *ucmd); struct iommufd_access { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_ioas *ioas; struct iommufd_ioas *ioas_unpin; struct mutex ioas_lock; const struct iommufd_access_ops *ops; void *data; unsigned long iova_alignment; u32 iopt_access_list_id; }; int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access); void iopt_remove_access(struct io_pagetable *iopt, struct iommufd_access *access, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); extern size_t iommufd_test_memory_limit; void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags); bool iommufd_should_fail(void); int __init iommufd_test_init(void); void iommufd_test_exit(void); bool iommufd_selftest_is_mock_dev(struct device *dev); #else static inline void iommufd_test_syz_conv_iova_id(struct iommufd_ucmd *ucmd, unsigned int ioas_id, u64 *iova, u32 *flags) { } static inline bool iommufd_should_fail(void) { return false; } static inline int __init iommufd_test_init(void) { return 0; } static inline void iommufd_test_exit(void) { } static inline bool iommufd_selftest_is_mock_dev(struct device *dev) { return false; } #endif #endif
22 22 11 11 33 1036 1035 1037 1085 4 1030 541 540 492 493 3 1032 1028 1030 4 4 1083 1 1 1 1 1 1 1 1031 1031 1028 1085 1085 1082 2 2 2 1084 1085 7 7 3 1084 1032 1084 1085 1030 1085 1085 1084 1073 1728 1731 1723 725 1730 1475 1731 1731 1730 1351 1073 1071 1073 1072 1072 1071 460 838 837 1730 88 6 6 7 7 8 8 6 3 2 3 1 2 12 12 11 12 12 12 12 12 10 6 6 4 6 6 6 3 3 3 3 3 6 6 6 6 6 3 3 3 1 3 3 3 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 // SPDX-License-Identifier: GPL-2.0-only /* * (C) 1999-2001 Paul `Rusty' Russell * (C) 2002-2006 Netfilter Core Team <coreteam@netfilter.org> * (C) 2011 Patrick McHardy <kaber@trash.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <linux/types.h> #include <linux/timer.h> #include <linux/skbuff.h> #include <linux/gfp.h> #include <net/xfrm.h> #include <linux/siphash.h> #include <linux/rtnetlink.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> #include <net/netfilter/nf_conntrack_zones.h> #include <net/netfilter/nf_nat.h> #include <net/netfilter/nf_nat_helper.h> #include <uapi/linux/netfilter/nf_nat.h> #include "nf_internals.h" #define NF_NAT_MAX_ATTEMPTS 128 #define NF_NAT_HARDER_THRESH (NF_NAT_MAX_ATTEMPTS / 4) static spinlock_t nf_nat_locks[CONNTRACK_LOCKS]; static DEFINE_MUTEX(nf_nat_proto_mutex); static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; static siphash_aligned_key_t nf_nat_hash_rnd; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; struct rcu_head rcu_head; }; struct nf_nat_hooks_net { struct nf_hook_ops *nat_hook_ops; unsigned int users; }; struct nat_net { struct nf_nat_hooks_net nat_proto_net[NFPROTO_NUMPROTO]; }; #ifdef CONFIG_XFRM static void nf_nat_ipv4_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi4 *fl4 = &fl->u.ip4; if (ct->status & statusbit) { fl4->daddr = t->dst.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl4->saddr = t->src.u3.ip; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl4->fl4_sport = t->src.u.all; } } static void nf_nat_ipv6_decode_session(struct sk_buff *skb, const struct nf_conn *ct, enum ip_conntrack_dir dir, unsigned long statusbit, struct flowi *fl) { #if IS_ENABLED(CONFIG_IPV6) const struct nf_conntrack_tuple *t = &ct->tuplehash[dir].tuple; struct flowi6 *fl6 = &fl->u.ip6; if (ct->status & statusbit) { fl6->daddr = t->dst.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_dport = t->dst.u.all; } statusbit ^= IPS_NAT_MASK; if (ct->status & statusbit) { fl6->saddr = t->src.u3.in6; if (t->dst.protonum == IPPROTO_TCP || t->dst.protonum == IPPROTO_UDP || t->dst.protonum == IPPROTO_UDPLITE || t->dst.protonum == IPPROTO_DCCP || t->dst.protonum == IPPROTO_SCTP) fl6->fl6_sport = t->src.u.all; } #endif } static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) { const struct nf_conn *ct; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; unsigned long statusbit; u8 family; ct = nf_ct_get(skb, &ctinfo); if (ct == NULL) return; family = nf_ct_l3num(ct); dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) statusbit = IPS_DST_NAT; else statusbit = IPS_SRC_NAT; switch (family) { case NFPROTO_IPV4: nf_nat_ipv4_decode_session(skb, ct, dir, statusbit, fl); return; case NFPROTO_IPV6: nf_nat_ipv6_decode_session(skb, ct, dir, statusbit, fl); return; } } #endif /* CONFIG_XFRM */ /* We keep an extra hash for each conntrack, for fast searching. */ static unsigned int hash_by_src(const struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple) { unsigned int hash; struct { struct nf_conntrack_man src; u32 net_mix; u32 protonum; u32 zone; } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); memset(&combined, 0, sizeof(combined)); /* Original src, to ensure we map it consistently if poss. */ combined.src = tuple->src; combined.net_mix = net_hash_mix(net); combined.protonum = tuple->dst.protonum; /* Zone ID can be used provided its valid for both directions */ if (zone->dir == NF_CT_DEFAULT_ZONE_DIR) combined.zone = zone->id; hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } /* Is this tuple already taken? (not by us) */ static int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack) { /* Conntrack tracking doesn't keep track of outgoing tuples; only * incoming ones. NAT means they don't have a fixed mapping, * so we invert the tuple and look for the incoming reply. * * We could keep a separate hash if this proves too slow. */ struct nf_conntrack_tuple reply; nf_ct_invert_tuple(&reply, tuple); return nf_conntrack_tuple_taken(&reply, ignored_conntrack); } static bool nf_nat_may_kill(struct nf_conn *ct, unsigned long flags) { static const unsigned long flags_refuse = IPS_FIXED_TIMEOUT | IPS_DYING; static const unsigned long flags_needed = IPS_SRC_NAT; enum tcp_conntrack old_state; old_state = READ_ONCE(ct->proto.tcp.state); if (old_state < TCP_CONNTRACK_TIME_WAIT) return false; if (flags & flags_refuse) return false; return (flags & flags_needed) == flags_needed; } /* reverse direction will send packets to new source, so * make sure such packets are invalid. */ static bool nf_seq_has_advanced(const struct nf_conn *old, const struct nf_conn *new) { return (__s32)(new->proto.tcp.seen[0].td_end - old->proto.tcp.seen[0].td_end) > 0; } static int nf_nat_used_tuple_harder(const struct nf_conntrack_tuple *tuple, const struct nf_conn *ignored_conntrack, unsigned int attempts_left) { static const unsigned long flags_offload = IPS_OFFLOAD | IPS_HW_OFFLOAD; struct nf_conntrack_tuple_hash *thash; const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple reply; unsigned long flags; struct nf_conn *ct; bool taken = true; struct net *net; nf_ct_invert_tuple(&reply, tuple); if (attempts_left > NF_NAT_HARDER_THRESH || tuple->dst.protonum != IPPROTO_TCP || ignored_conntrack->proto.tcp.state != TCP_CONNTRACK_SYN_SENT) return nf_conntrack_tuple_taken(&reply, ignored_conntrack); /* :ast few attempts to find a free tcp port. Destructive * action: evict colliding if its in timewait state and the * tcp sequence number has advanced past the one used by the * old entry. */ net = nf_ct_net(ignored_conntrack); zone = nf_ct_zone(ignored_conntrack); thash = nf_conntrack_find_get(net, zone, &reply); if (!thash) return false; ct = nf_ct_tuplehash_to_ctrack(thash); if (thash->tuple.dst.dir == IP_CT_DIR_ORIGINAL) goto out; if (WARN_ON_ONCE(ct == ignored_conntrack)) goto out; flags = READ_ONCE(ct->status); if (!nf_nat_may_kill(ct, flags)) goto out; if (!nf_seq_has_advanced(ct, ignored_conntrack)) goto out; /* Even if we can evict do not reuse if entry is offloaded. */ if (nf_ct_kill(ct)) taken = flags & flags_offload; out: nf_ct_put(ct); return taken; } static bool nf_nat_inet_in_range(const struct nf_conntrack_tuple *t, const struct nf_nat_range2 *range) { if (t->src.l3num == NFPROTO_IPV4) return ntohl(t->src.u3.ip) >= ntohl(range->min_addr.ip) && ntohl(t->src.u3.ip) <= ntohl(range->max_addr.ip); return ipv6_addr_cmp(&t->src.u3.in6, &range->min_addr.in6) >= 0 && ipv6_addr_cmp(&t->src.u3.in6, &range->max_addr.in6) <= 0; } /* Is the manipable part of the tuple between min and max incl? */ static bool l4proto_in_range(const struct nf_conntrack_tuple *tuple, enum nf_nat_manip_type maniptype, const union nf_conntrack_man_proto *min, const union nf_conntrack_man_proto *max) { __be16 port; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: return ntohs(tuple->src.u.icmp.id) >= ntohs(min->icmp.id) && ntohs(tuple->src.u.icmp.id) <= ntohs(max->icmp.id); case IPPROTO_GRE: /* all fall though */ case IPPROTO_TCP: case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_DCCP: case IPPROTO_SCTP: if (maniptype == NF_NAT_MANIP_SRC) port = tuple->src.u.all; else port = tuple->dst.u.all; return ntohs(port) >= ntohs(min->all) && ntohs(port) <= ntohs(max->all); default: return true; } } /* If we source map this tuple so reply looks like reply_tuple, will * that meet the constraints of range. */ static int nf_in_range(const struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range) { /* If we are supposed to map IPs, then we must be in the * range specified, otherwise let this drag us onto a new src IP. */ if (range->flags & NF_NAT_RANGE_MAP_IPS && !nf_nat_inet_in_range(tuple, range)) return 0; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) return 1; return l4proto_in_range(tuple, NF_NAT_MANIP_SRC, &range->min_proto, &range->max_proto); } static inline int same_src(const struct nf_conn *ct, const struct nf_conntrack_tuple *tuple) { const struct nf_conntrack_tuple *t; t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; return (t->dst.protonum == tuple->dst.protonum && nf_inet_addr_cmp(&t->src.u3, &tuple->src.u3) && t->src.u.all == tuple->src.u.all); } /* Only called for SRC manip */ static int find_appropriate_src(struct net *net, const struct nf_conntrack_zone *zone, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_tuple *result, const struct nf_nat_range2 *range) { unsigned int h = hash_by_src(net, zone, tuple); const struct nf_conn *ct; hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) { if (same_src(ct, tuple) && net_eq(net, nf_ct_net(ct)) && nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) { /* Copy source part from reply tuple. */ nf_ct_invert_tuple(result, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); result->dst = tuple->dst; if (nf_in_range(result, range)) return 1; } } return 0; } /* For [FUTURE] fragmentation handling, we want the least-used * src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus * if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports * 1-65535, we don't do pro-rata allocation based on ports; we choose * the ip with the lowest src-ip/dst-ip/proto usage. */ static void find_best_ips_proto(const struct nf_conntrack_zone *zone, struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, const struct nf_conn *ct, enum nf_nat_manip_type maniptype) { union nf_inet_addr *var_ipp; unsigned int i, max; /* Host order */ u32 minip, maxip, j, dist; bool full_range; /* No IP mapping? Do nothing. */ if (!(range->flags & NF_NAT_RANGE_MAP_IPS)) return; if (maniptype == NF_NAT_MANIP_SRC) var_ipp = &tuple->src.u3; else var_ipp = &tuple->dst.u3; /* Fast path: only one choice. */ if (nf_inet_addr_cmp(&range->min_addr, &range->max_addr)) { *var_ipp = range->min_addr; return; } if (nf_ct_l3num(ct) == NFPROTO_IPV4) max = sizeof(var_ipp->ip) / sizeof(u32) - 1; else max = sizeof(var_ipp->ip6) / sizeof(u32) - 1; /* Hashing source and destination IPs gives a fairly even * spread in practice (if there are a small number of IPs * involved, there usually aren't that many connections * anyway). The consistency means that servers see the same * client coming from the same IP (some Internet Banking sites * like this), even across reboots. */ j = jhash2((u32 *)&tuple->src.u3, sizeof(tuple->src.u3) / sizeof(u32), range->flags & NF_NAT_RANGE_PERSISTENT ? 0 : (__force u32)tuple->dst.u3.all[max] ^ zone->id); full_range = false; for (i = 0; i <= max; i++) { /* If first bytes of the address are at the maximum, use the * distance. Otherwise use the full range. */ if (!full_range) { minip = ntohl((__force __be32)range->min_addr.all[i]); maxip = ntohl((__force __be32)range->max_addr.all[i]); dist = maxip - minip + 1; } else { minip = 0; dist = ~0; } var_ipp->all[i] = (__force __u32) htonl(minip + reciprocal_scale(j, dist)); if (var_ipp->all[i] != range->max_addr.all[i]) full_range = true; if (!(range->flags & NF_NAT_RANGE_PERSISTENT)) j ^= (__force u32)tuple->dst.u3.all[i]; } } /* Alter the per-proto part of the tuple (depending on maniptype), to * give a unique tuple in the given range if possible. * * Per-protocol part of tuple is initialized to the incoming packet. */ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype, const struct nf_conn *ct) { unsigned int range_size, min, max, i, attempts; __be16 *keyptr; u16 off; switch (tuple->dst.protonum) { case IPPROTO_ICMP: case IPPROTO_ICMPV6: /* id is same for either direction... */ keyptr = &tuple->src.u.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 0; range_size = 65536; } else { min = ntohs(range->min_proto.icmp.id); range_size = ntohs(range->max_proto.icmp.id) - ntohs(range->min_proto.icmp.id) + 1; } goto find_free_id; #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) case IPPROTO_GRE: /* If there is no master conntrack we are not PPTP, do not change tuples */ if (!ct->master) return; if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.gre.key; else keyptr = &tuple->dst.u.gre.key; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { min = 1; range_size = 65535; } else { min = ntohs(range->min_proto.gre.key); range_size = ntohs(range->max_proto.gre.key) - min + 1; } goto find_free_id; #endif case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (maniptype == NF_NAT_MANIP_SRC) keyptr = &tuple->src.u.all; else keyptr = &tuple->dst.u.all; break; default: return; } /* If no range specified... */ if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) { /* If it's dst rewrite, can't change port */ if (maniptype == NF_NAT_MANIP_DST) return; if (ntohs(*keyptr) < 1024) { /* Loose convention: >> 512 is credential passing */ if (ntohs(*keyptr) < 512) { min = 1; range_size = 511 - min + 1; } else { min = 600; range_size = 1023 - min + 1; } } else { min = 1024; range_size = 65535 - 1024 + 1; } } else { min = ntohs(range->min_proto.all); max = ntohs(range->max_proto.all); if (unlikely(max < min)) swap(max, min); range_size = max - min + 1; } find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else off = get_random_u16(); attempts = range_size; if (attempts > NF_NAT_MAX_ATTEMPTS) attempts = NF_NAT_MAX_ATTEMPTS; /* We are in softirq; doing a search of the entire range risks * soft lockup when all tuples are already used. * * If we can't find any free port from first offset, pick a new * one and try again, with ever smaller search window. */ another_round: for (i = 0; i < attempts; i++, off++) { *keyptr = htons(min + off % range_size); if (!nf_nat_used_tuple_harder(tuple, ct, attempts - i)) return; } if (attempts >= range_size || attempts < 16) return; attempts /= 2; off = get_random_u16(); goto another_round; } /* Manipulate the tuple into the range given. For NF_INET_POST_ROUTING, * we change the source to map into the range. For NF_INET_PRE_ROUTING * and NF_INET_LOCAL_OUT, we change the destination to map into the * range. It might not be possible to get a unique tuple, but we try. * At worst (or if we race), we will end up with a final duplicate in * __nf_conntrack_confirm and drop the packet. */ static void get_unique_tuple(struct nf_conntrack_tuple *tuple, const struct nf_conntrack_tuple *orig_tuple, const struct nf_nat_range2 *range, struct nf_conn *ct, enum nf_nat_manip_type maniptype) { const struct nf_conntrack_zone *zone; struct net *net = nf_ct_net(ct); zone = nf_ct_zone(ct); /* 1) If this srcip/proto/src-proto-part is currently mapped, * and that same mapping gives a unique tuple within the given * range, use that. * * This is only required for source (ie. NAT/masq) mappings. * So far, we don't do local source mappings, so multiple * manips not an issue. */ if (maniptype == NF_NAT_MANIP_SRC && !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { /* try the original tuple first */ if (nf_in_range(orig_tuple, range)) { if (!nf_nat_used_tuple(orig_tuple, ct)) { *tuple = *orig_tuple; return; } } else if (find_appropriate_src(net, zone, orig_tuple, tuple, range)) { pr_debug("get_unique_tuple: Found current src map\n"); if (!nf_nat_used_tuple(tuple, ct)) return; } } /* 2) Select the least-used IP/proto combination in the given range */ *tuple = *orig_tuple; find_best_ips_proto(zone, tuple, range, ct, maniptype); /* 3) The per-protocol part of the manip is made to map into * the range to make a unique tuple. */ /* Only bother mapping if it's not already in range and unique */ if (!(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) { if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) { if (!(range->flags & NF_NAT_RANGE_PROTO_OFFSET) && l4proto_in_range(tuple, maniptype, &range->min_proto, &range->max_proto) && (range->min_proto.all == range->max_proto.all || !nf_nat_used_tuple(tuple, ct))) return; } else if (!nf_nat_used_tuple(tuple, ct)) { return; } } /* Last chance: get protocol to try to obtain unique tuple. */ nf_nat_l4proto_unique_tuple(tuple, range, maniptype, ct); } struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct) { struct nf_conn_nat *nat = nfct_nat(ct); if (nat) return nat; if (!nf_ct_is_confirmed(ct)) nat = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC); return nat; } EXPORT_SYMBOL_GPL(nf_ct_nat_ext_add); unsigned int nf_nat_setup_info(struct nf_conn *ct, const struct nf_nat_range2 *range, enum nf_nat_manip_type maniptype) { struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple curr_tuple, new_tuple; /* Can't setup nat info for confirmed ct. */ if (nf_ct_is_confirmed(ct)) return NF_ACCEPT; WARN_ON(maniptype != NF_NAT_MANIP_SRC && maniptype != NF_NAT_MANIP_DST); if (WARN_ON(nf_nat_initialized(ct, maniptype))) return NF_DROP; /* What we've got will look like inverse of reply. Normally * this is what is in the conntrack, except for prior * manipulations (future optimization: if num_manips == 0, * orig_tp = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ nf_ct_invert_tuple(&curr_tuple, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); get_unique_tuple(&new_tuple, &curr_tuple, range, ct, maniptype); if (!nf_ct_tuple_equal(&new_tuple, &curr_tuple)) { struct nf_conntrack_tuple reply; /* Alter conntrack table so will recognize replies. */ nf_ct_invert_tuple(&reply, &new_tuple); nf_conntrack_alter_reply(ct, &reply); /* Non-atomic: we own this at the moment. */ if (maniptype == NF_NAT_MANIP_SRC) ct->status |= IPS_SRC_NAT; else ct->status |= IPS_DST_NAT; if (nfct_help(ct) && !nfct_seqadj(ct)) if (!nfct_seqadj_ext_add(ct)) return NF_DROP; } if (maniptype == NF_NAT_MANIP_SRC) { unsigned int srchash; spinlock_t *lock; srchash = hash_by_src(net, nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); lock = &nf_nat_locks[srchash % CONNTRACK_LOCKS]; spin_lock_bh(lock); hlist_add_head_rcu(&ct->nat_bysource, &nf_nat_bysource[srchash]); spin_unlock_bh(lock); } /* It's done. */ if (maniptype == NF_NAT_MANIP_DST) ct->status |= IPS_DST_NAT_DONE; else ct->status |= IPS_SRC_NAT_DONE; return NF_ACCEPT; } EXPORT_SYMBOL(nf_nat_setup_info); static unsigned int __nf_nat_alloc_null_binding(struct nf_conn *ct, enum nf_nat_manip_type manip) { /* Force range to this IP; let proto decide mapping for * per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). * Use reply in case it's already been mangled (eg local packet). */ union nf_inet_addr ip = (manip == NF_NAT_MANIP_SRC ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3 : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3); struct nf_nat_range2 range = { .flags = NF_NAT_RANGE_MAP_IPS, .min_addr = ip, .max_addr = ip, }; return nf_nat_setup_info(ct, &range, manip); } unsigned int nf_nat_alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { return __nf_nat_alloc_null_binding(ct, HOOK2MANIP(hooknum)); } EXPORT_SYMBOL_GPL(nf_nat_alloc_null_binding); /* Do packet manipulations according to nf_nat_setup_info. */ unsigned int nf_nat_packet(struct nf_conn *ct, enum ip_conntrack_info ctinfo, unsigned int hooknum, struct sk_buff *skb) { enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum); enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); unsigned int verdict = NF_ACCEPT; unsigned long statusbit; if (mtype == NF_NAT_MANIP_SRC) statusbit = IPS_SRC_NAT; else statusbit = IPS_DST_NAT; /* Invert if this is reply dir. */ if (dir == IP_CT_DIR_REPLY) statusbit ^= IPS_NAT_MASK; /* Non-atomic: these bits don't change. */ if (ct->status & statusbit) verdict = nf_nat_manip_pkt(skb, ct, mtype, dir); return verdict; } EXPORT_SYMBOL_GPL(nf_nat_packet); static bool in_vrf_postrouting(const struct nf_hook_state *state) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (state->hook == NF_INET_POST_ROUTING && netif_is_l3_master(state->out)) return true; #endif return false; } unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { struct nf_conn *ct; enum ip_conntrack_info ctinfo; struct nf_conn_nat *nat; /* maniptype == SRC for postrouting. */ enum nf_nat_manip_type maniptype = HOOK2MANIP(state->hook); ct = nf_ct_get(skb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would * have dropped it. Hence it's the user's responsibilty to * packet filter it out, or implement conntrack/NAT for that * protocol. 8) --RR */ if (!ct || in_vrf_postrouting(state)) return NF_ACCEPT; nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: case IP_CT_RELATED_REPLY: /* Only ICMPs can be IP_CT_IS_REPLY. Fallthrough */ case IP_CT_NEW: /* Seen it before? This can happen for loopback, retrans, * or local packets. */ if (!nf_nat_initialized(ct, maniptype)) { struct nf_nat_lookup_hook_priv *lpriv = priv; struct nf_hook_entries *e = rcu_dereference(lpriv->entries); unsigned int ret; int i; if (!e) goto null_bind; for (i = 0; i < e->num_hook_entries; i++) { ret = e->hooks[i].hook(e->hooks[i].priv, skb, state); if (ret != NF_ACCEPT) return ret; if (nf_nat_initialized(ct, maniptype)) goto do_nat; } null_bind: ret = nf_nat_alloc_null_binding(ct, state->hook); if (ret != NF_ACCEPT) return ret; } else { pr_debug("Already setup manip %s for ct %p (status bits 0x%lx)\n", maniptype == NF_NAT_MANIP_SRC ? "SRC" : "DST", ct, ct->status); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } break; default: /* ESTABLISHED */ WARN_ON(ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY); if (nf_nat_oif_changed(state->hook, ctinfo, nat, state->out)) goto oif_changed; } do_nat: return nf_nat_packet(ct, ctinfo, state->hook, skb); oif_changed: nf_ct_kill_acct(ct, ctinfo, skb); return NF_DROP; } EXPORT_SYMBOL_GPL(nf_nat_inet_fn); struct nf_nat_proto_clean { u8 l3proto; u8 l4proto; }; /* kill conntracks with affected NAT section */ static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) return 0; return i->status & IPS_NAT_MASK ? 1 : 0; } static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { unsigned int h; h = hash_by_src(nf_ct_net(ct), nf_ct_zone(ct), &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); hlist_del_rcu(&ct->nat_bysource); spin_unlock_bh(&nf_nat_locks[h % CONNTRACK_LOCKS]); } static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { if (nf_nat_proto_remove(ct, data)) return 1; /* This module is being removed and conntrack has nat null binding. * Remove it from bysource hash, as the table will be freed soon. * * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack() * will delete entry from already-freed table. */ if (test_and_clear_bit(IPS_SRC_NAT_DONE_BIT, &ct->status)) nf_nat_cleanup_conntrack(ct); /* don't delete conntrack. Although that would make things a lot * simpler, we'd end up flushing all conntracks on nat rmmod. */ return 0; } #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nfnetlink_conntrack.h> static const struct nla_policy protonat_nla_policy[CTA_PROTONAT_MAX+1] = { [CTA_PROTONAT_PORT_MIN] = { .type = NLA_U16 }, [CTA_PROTONAT_PORT_MAX] = { .type = NLA_U16 }, }; static int nf_nat_l4proto_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_PROTONAT_PORT_MIN]) { range->min_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MIN]); range->max_proto.all = range->min_proto.all; range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } if (tb[CTA_PROTONAT_PORT_MAX]) { range->max_proto.all = nla_get_be16(tb[CTA_PROTONAT_PORT_MAX]); range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; } return 0; } static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_PROTONAT_MAX+1]; int err; err = nla_parse_nested_deprecated(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy, NULL); if (err < 0) return err; return nf_nat_l4proto_nlattr_to_range(tb, range); } static const struct nla_policy nat_nla_policy[CTA_NAT_MAX+1] = { [CTA_NAT_V4_MINIP] = { .type = NLA_U32 }, [CTA_NAT_V4_MAXIP] = { .type = NLA_U32 }, [CTA_NAT_V6_MINIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_V6_MAXIP] = { .len = sizeof(struct in6_addr) }, [CTA_NAT_PROTO] = { .type = NLA_NESTED }, }; static int nf_nat_ipv4_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V4_MINIP]) { range->min_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MINIP]); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V4_MAXIP]) range->max_addr.ip = nla_get_be32(tb[CTA_NAT_V4_MAXIP]); else range->max_addr.ip = range->min_addr.ip; return 0; } static int nf_nat_ipv6_nlattr_to_range(struct nlattr *tb[], struct nf_nat_range2 *range) { if (tb[CTA_NAT_V6_MINIP]) { nla_memcpy(&range->min_addr.ip6, tb[CTA_NAT_V6_MINIP], sizeof(struct in6_addr)); range->flags |= NF_NAT_RANGE_MAP_IPS; } if (tb[CTA_NAT_V6_MAXIP]) nla_memcpy(&range->max_addr.ip6, tb[CTA_NAT_V6_MAXIP], sizeof(struct in6_addr)); else range->max_addr = range->min_addr; return 0; } static int nfnetlink_parse_nat(const struct nlattr *nat, const struct nf_conn *ct, struct nf_nat_range2 *range) { struct nlattr *tb[CTA_NAT_MAX+1]; int err; memset(range, 0, sizeof(*range)); err = nla_parse_nested_deprecated(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; switch (nf_ct_l3num(ct)) { case NFPROTO_IPV4: err = nf_nat_ipv4_nlattr_to_range(tb, range); break; case NFPROTO_IPV6: err = nf_nat_ipv6_nlattr_to_range(tb, range); break; default: err = -EPROTONOSUPPORT; break; } if (err) return err; if (!tb[CTA_NAT_PROTO]) return 0; return nfnetlink_parse_nat_proto(tb[CTA_NAT_PROTO], ct, range); } /* This function is called under rcu_read_lock() */ static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { struct nf_nat_range2 range; int err; /* Should not happen, restricted to creating new conntracks * via ctnetlink. */ if (WARN_ON_ONCE(nf_nat_initialized(ct, manip))) return -EEXIST; /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range); if (err < 0) return err; return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; } #else static int nfnetlink_parse_nat_setup(struct nf_conn *ct, enum nf_nat_manip_type manip, const struct nlattr *attr) { return -EOPNOTSUPP; } #endif static struct nf_ct_helper_expectfn follow_master_nat = { .name = "nat-follow-master", .expectfn = nf_nat_follow_master, }; int nf_nat_register_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, const struct nf_hook_ops *orig_nat_ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; unsigned int hooknum = ops->hooknum; struct nf_hook_ops *nat_ops; int i, ret; if (WARN_ON_ONCE(pf >= ARRAY_SIZE(nat_net->nat_proto_net))) return -EINVAL; nat_proto_net = &nat_net->nat_proto_net[pf]; for (i = 0; i < ops_count; i++) { if (orig_nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) return -EINVAL; mutex_lock(&nf_nat_proto_mutex); if (!nat_proto_net->nat_hook_ops) { WARN_ON(nat_proto_net->users != 0); nat_ops = kmemdup(orig_nat_ops, sizeof(*orig_nat_ops) * ops_count, GFP_KERNEL); if (!nat_ops) { mutex_unlock(&nf_nat_proto_mutex); return -ENOMEM; } for (i = 0; i < ops_count; i++) { priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (priv) { nat_ops[i].priv = priv; continue; } mutex_unlock(&nf_nat_proto_mutex); while (i) kfree(nat_ops[--i].priv); kfree(nat_ops); return -ENOMEM; } ret = nf_register_net_hooks(net, nat_ops, ops_count); if (ret < 0) { mutex_unlock(&nf_nat_proto_mutex); for (i = 0; i < ops_count; i++) kfree(nat_ops[i].priv); kfree(nat_ops); return ret; } nat_proto_net->nat_hook_ops = nat_ops; } nat_ops = nat_proto_net->nat_hook_ops; priv = nat_ops[hooknum].priv; if (WARN_ON_ONCE(!priv)) { mutex_unlock(&nf_nat_proto_mutex); return -EOPNOTSUPP; } ret = nf_hook_entries_insert_raw(&priv->entries, ops); if (ret == 0) nat_proto_net->users++; mutex_unlock(&nf_nat_proto_mutex); return ret; } void nf_nat_unregister_fn(struct net *net, u8 pf, const struct nf_hook_ops *ops, unsigned int ops_count) { struct nat_net *nat_net = net_generic(net, nat_net_id); struct nf_nat_hooks_net *nat_proto_net; struct nf_nat_lookup_hook_priv *priv; struct nf_hook_ops *nat_ops; int hooknum = ops->hooknum; int i; if (pf >= ARRAY_SIZE(nat_net->nat_proto_net)) return; nat_proto_net = &nat_net->nat_proto_net[pf]; mutex_lock(&nf_nat_proto_mutex); if (WARN_ON(nat_proto_net->users == 0)) goto unlock; nat_proto_net->users--; nat_ops = nat_proto_net->nat_hook_ops; for (i = 0; i < ops_count; i++) { if (nat_ops[i].hooknum == hooknum) { hooknum = i; break; } } if (WARN_ON_ONCE(i == ops_count)) goto unlock; priv = nat_ops[hooknum].priv; nf_hook_entries_delete_raw(&priv->entries, ops); if (nat_proto_net->users == 0) { nf_unregister_net_hooks(net, nat_ops, ops_count); for (i = 0; i < ops_count; i++) { priv = nat_ops[i].priv; kfree_rcu(priv, rcu_head); } nat_proto_net->nat_hook_ops = NULL; kfree(nat_ops); } unlock: mutex_unlock(&nf_nat_proto_mutex); } static struct pernet_operations nat_net_ops = { .id = &nat_net_id, .size = sizeof(struct nat_net), }; static const struct nf_nat_hook nat_hook = { .parse_nat_setup = nfnetlink_parse_nat_setup, #ifdef CONFIG_XFRM .decode_session = __nf_nat_decode_session, #endif .manip_pkt = nf_nat_manip_pkt, .remove_nat_bysrc = nf_nat_cleanup_conntrack, }; static int __init nf_nat_init(void) { int ret, i; /* Leave them the same for the moment. */ nf_nat_htable_size = nf_conntrack_htable_size; if (nf_nat_htable_size < CONNTRACK_LOCKS) nf_nat_htable_size = CONNTRACK_LOCKS; nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0); if (!nf_nat_bysource) return -ENOMEM; for (i = 0; i < CONNTRACK_LOCKS; i++) spin_lock_init(&nf_nat_locks[i]); ret = register_pernet_subsys(&nat_net_ops); if (ret < 0) { kvfree(nf_nat_bysource); return ret; } nf_ct_helper_expectfn_register(&follow_master_nat); WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); ret = register_nf_nat_bpf(); if (ret < 0) { RCU_INIT_POINTER(nf_nat_hook, NULL); nf_ct_helper_expectfn_unregister(&follow_master_nat); synchronize_net(); unregister_pernet_subsys(&nat_net_ops); kvfree(nf_nat_bysource); } return ret; } static void __exit nf_nat_cleanup(void) { struct nf_nat_proto_clean clean = {}; nf_ct_iterate_destroy(nf_nat_proto_clean, &clean); nf_ct_helper_expectfn_unregister(&follow_master_nat); RCU_INIT_POINTER(nf_nat_hook, NULL); synchronize_net(); kvfree(nf_nat_bysource); unregister_pernet_subsys(&nat_net_ops); } MODULE_LICENSE("GPL"); module_init(nf_nat_init); module_exit(nf_nat_cleanup);
19 18 5 5 5 18 18 5 5 5 19 18 19 19 19 5 19 18 18 5 5 5 5 5 12 13 123 123 4 4 4 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 // SPDX-License-Identifier: GPL-2.0 #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/ktime.h> #include <linux/list.h> #include <linux/math64.h> #include <linux/sizes.h> #include <linux/workqueue.h> #include "ctree.h" #include "block-group.h" #include "discard.h" #include "free-space-cache.h" #include "fs.h" /* * This contains the logic to handle async discard. * * Async discard manages trimming of free space outside of transaction commit. * Discarding is done by managing the block_groups on a LRU list based on free * space recency. Two passes are used to first prioritize discarding extents * and then allow for trimming in the bitmap the best opportunity to coalesce. * The block_groups are maintained on multiple lists to allow for multiple * passes with different discard filter requirements. A delayed work item is * used to manage discarding with timeout determined by a max of the delay * incurred by the iops rate limit, the byte rate limit, and the max delay of * BTRFS_DISCARD_MAX_DELAY. * * Note, this only keeps track of block_groups that are explicitly for data. * Mixed block_groups are not supported. * * The first list is special to manage discarding of fully free block groups. * This is necessary because we issue a final trim for a full free block group * after forgetting it. When a block group becomes unused, instead of directly * being added to the unused_bgs list, we add it to this first list. Then * from there, if it becomes fully discarded, we place it onto the unused_bgs * list. * * The in-memory free space cache serves as the backing state for discard. * Consequently this means there is no persistence. We opt to load all the * block groups in as not discarded, so the mount case degenerates to the * crashing case. * * As the free space cache uses bitmaps, there exists a tradeoff between * ease/efficiency for find_free_extent() and the accuracy of discard state. * Here we opt to let untrimmed regions merge with everything while only letting * trimmed regions merge with other trimmed regions. This can cause * overtrimming, but the coalescing benefit seems to be worth it. Additionally, * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, * this resets the state and we will retry trimming the whole bitmap. This is a * tradeoff between discard state accuracy and the cost of accounting. */ /* This is an initial delay to give some chance for block reuse */ #define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) #define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (1000U) /* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, BTRFS_ASYNC_DISCARD_MIN_FILTER }; static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { return &discard_ctl->discard_list[block_group->discard_index]; } /* * Determine if async discard should be running. * * @discard_ctl: discard control * * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. */ static bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) { struct btrfs_fs_info *fs_info = container_of(discard_ctl, struct btrfs_fs_info, discard_ctl); return (!(fs_info->sb->s_flags & SB_RDONLY) && test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); if (!btrfs_run_discard_work(discard_ctl)) return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) block_group->discard_index = BTRFS_DISCARD_INDEX_START; block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; } if (list_empty(&block_group->discard_list)) btrfs_get_block_group(block_group); list_move_tail(&block_group->discard_list, get_discard_list(discard_ctl, block_group)); } static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { if (!btrfs_is_block_group_data_only(block_group)) return; spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); } static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool queued; spin_lock(&discard_ctl->lock); queued = !list_empty(&block_group->discard_list); if (!btrfs_run_discard_work(discard_ctl)) { spin_unlock(&discard_ctl->lock); return; } list_del_init(&block_group->discard_list); block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; block_group->discard_eligible_time = (ktime_get_ns() + BTRFS_DISCARD_UNUSED_DELAY); block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; if (!queued) btrfs_get_block_group(block_group); list_add_tail(&block_group->discard_list, &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); spin_unlock(&discard_ctl->lock); } static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { bool running = false; bool queued = false; spin_lock(&discard_ctl->lock); if (block_group == discard_ctl->block_group) { running = true; discard_ctl->block_group = NULL; } block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); /* * If the block group is currently running in the discard workfn, we * don't want to deref it, since it's still being used by the workfn. * The workfn will notice this case and deref the block group when it is * finished. */ if (queued && !running) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); return running; } /* * Find block_group that's up next for discarding. * * @discard_ctl: discard control * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. */ static struct btrfs_block_group *find_next_block_group( struct btrfs_discard_ctl *discard_ctl, u64 now) { struct btrfs_block_group *ret_block_group = NULL, *block_group; int i; for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { struct list_head *discard_list = &discard_ctl->discard_list[i]; if (!list_empty(discard_list)) { block_group = list_first_entry(discard_list, struct btrfs_block_group, discard_list); if (!ret_block_group) ret_block_group = block_group; if (ret_block_group->discard_eligible_time < now) break; if (ret_block_group->discard_eligible_time > block_group->discard_eligible_time) ret_block_group = block_group; } } return ret_block_group; } /* * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * * Wrap find_next_block_group() and set the block_group to be in use. * @discard_state's control flow is managed here. Variables related to * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ static struct btrfs_block_group *peek_discard_list( struct btrfs_discard_ctl *discard_ctl, enum btrfs_discard_state *discard_state, int *discard_index, u64 now) { struct btrfs_block_group *block_group; spin_lock(&discard_ctl->lock); again: block_group = find_next_block_group(discard_ctl, now); if (block_group && now >= block_group->discard_eligible_time) { if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); } goto again; } if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } discard_ctl->block_group = block_group; } if (block_group) { *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } spin_unlock(&discard_ctl->lock); return block_group; } /* * Update a block group's filters. * * @block_group: block group of interest * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches * a larger filter be returned to the free_space_cache, prioritize that discard * by moving @block_group to the proper filter. */ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes) { struct btrfs_discard_ctl *discard_ctl; if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; discard_ctl = &block_group->fs_info->discard_ctl; if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && bytes >= discard_minlen[block_group->discard_index - 1]) { int i; remove_from_discard_list(discard_ctl, block_group); for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; i++) { if (bytes >= discard_minlen[i]) { block_group->discard_index = i; add_to_discard_list(discard_ctl, block_group); break; } } } } /* * Move a block group along the discard lists. * * @discard_ctl: discard control * @block_group: block_group of interest * * Increment @block_group's discard_index. If it falls of the list, let it be. * Otherwise add it back to the appropriate list. */ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { block_group->discard_index++; if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { block_group->discard_index = 1; return; } add_to_discard_list(discard_ctl, block_group); } /* * Remove a block_group from the discard lists. * * @discard_ctl: discard control * @block_group: block_group of interest * * Remove @block_group from the discard lists. If necessary, wait on the * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { if (remove_from_discard_list(discard_ctl, block_group)) { cancel_delayed_work_sync(&discard_ctl->work); btrfs_discard_schedule_work(discard_ctl, true); } } /* * Handles queuing the block_groups. * * @discard_ctl: discard control * @block_group: block_group of interest * * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) return; if (block_group->used == 0) add_to_discard_unused_list(discard_ctl, block_group); else add_to_discard_list(discard_ctl, block_group); if (!delayed_work_pending(&discard_ctl->work)) btrfs_discard_schedule_work(discard_ctl, false); } static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, u64 now, bool override) { struct btrfs_block_group *block_group; if (!btrfs_run_discard_work(discard_ctl)) return; if (!override && delayed_work_pending(&discard_ctl->work)) return; block_group = find_next_block_group(discard_ctl, now); if (block_group) { u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC; u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); /* * A single delayed workqueue item is responsible for * discarding, so we can manage the bytes rate limit by keeping * track of the previous discard. */ if (kbps_limit && discard_ctl->prev_discard) { u64 bps_limit = ((u64)kbps_limit) * SZ_1K; u64 bps_delay = div64_u64(discard_ctl->prev_discard * NSEC_PER_SEC, bps_limit); delay = max(delay, bps_delay); } /* * This timeout is to hopefully prevent immediate discarding * in a recently allocated block group. */ if (now < block_group->discard_eligible_time) { u64 bg_timeout = block_group->discard_eligible_time - now; delay = max(delay, bg_timeout); } if (override && discard_ctl->prev_discard) { u64 elapsed = now - discard_ctl->prev_discard_time; if (delay > elapsed) delay -= elapsed; else delay = 0; } mod_delayed_work(discard_ctl->discard_workers, &discard_ctl->work, nsecs_to_jiffies(delay)); } } /* * Responsible for scheduling the discard work. * * @discard_ctl: discard control * @override: override the current timer * * Discards are issued by a delayed workqueue item. @override is used to * update the current delay as the baseline delay interval is reevaluated on * transaction commit. This is also maxed with any other rate limit. */ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, bool override) { const u64 now = ktime_get_ns(); spin_lock(&discard_ctl->lock); __btrfs_discard_schedule_work(discard_ctl, now, override); spin_unlock(&discard_ctl->lock); } /* * Determine next step of a block_group. * * @discard_ctl: discard control * @block_group: block_group of interest * * Determine the next step for a block group after it's finished going through * a pass on a discard list. If it is unused and fully trimmed, we can mark it * unused and send it to the unused_bgs path. Otherwise, pass it onto the * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { remove_from_discard_list(discard_ctl, block_group); if (block_group->used == 0) { if (btrfs_is_free_space_trimmed(block_group)) btrfs_mark_bg_unused(block_group); else add_to_discard_unused_list(discard_ctl, block_group); } else { btrfs_update_discard_index(discard_ctl, block_group); } } /* * Discard work queue callback * * @work: work * * Find the next block_group to start discarding and then discard a single * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) { struct btrfs_discard_ctl *discard_ctl; struct btrfs_block_group *block_group; enum btrfs_discard_state discard_state; int discard_index = 0; u64 trimmed = 0; u64 minlen = 0; u64 now = ktime_get_ns(); discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); if (!block_group || !btrfs_run_discard_work(discard_ctl)) return; if (now < block_group->discard_eligible_time) { btrfs_discard_schedule_work(discard_ctl, false); return; } /* Perform discarding */ minlen = discard_minlen[discard_index]; if (discard_state == BTRFS_DISCARD_BITMAPS) { u64 maxlen = 0; /* * Use the previous levels minimum discard length as the max * length filter. In the case something is added to make a * region go beyond the max filter, the entire bitmap is set * back to BTRFS_TRIM_STATE_UNTRIMMED. */ if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) maxlen = discard_minlen[discard_index - 1]; btrfs_trim_block_group_bitmaps(block_group, &trimmed, block_group->discard_cursor, btrfs_block_group_end(block_group), minlen, maxlen, true); discard_ctl->discard_bitmap_bytes += trimmed; } else { btrfs_trim_block_group_extents(block_group, &trimmed, block_group->discard_cursor, btrfs_block_group_end(block_group), minlen, true); discard_ctl->discard_extent_bytes += trimmed; } /* Determine next steps for a block_group */ if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { if (discard_state == BTRFS_DISCARD_BITMAPS) { btrfs_finish_discard_pass(discard_ctl, block_group); } else { block_group->discard_cursor = block_group->start; spin_lock(&discard_ctl->lock); if (block_group->discard_state != BTRFS_DISCARD_RESET_CURSOR) block_group->discard_state = BTRFS_DISCARD_BITMAPS; spin_unlock(&discard_ctl->lock); } } now = ktime_get_ns(); spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; /* * If the block group was removed from the discard list while it was * running in this workfn, then we didn't deref it, since this function * still owned that reference. But we set the discard_ctl->block_group * back to NULL, so we can use that condition to know that now we need * to deref the block_group. */ if (discard_ctl->block_group == NULL) btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); } /* * Recalculate the base delay. * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). */ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) { s32 discardable_extents; s64 discardable_bytes; u32 iops_limit; unsigned long min_delay = BTRFS_DISCARD_MIN_DELAY_MSEC; unsigned long delay; discardable_extents = atomic_read(&discard_ctl->discardable_extents); if (!discardable_extents) return; spin_lock(&discard_ctl->lock); /* * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called by from * btrfs_finish_extent_commit() which is synchronized, we can correct * here. */ if (discardable_extents < 0) atomic_add(-discardable_extents, &discard_ctl->discardable_extents); discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); if (discardable_bytes < 0) atomic64_add(-discardable_bytes, &discard_ctl->discardable_bytes); if (discardable_extents <= 0) { spin_unlock(&discard_ctl->lock); return; } iops_limit = READ_ONCE(discard_ctl->iops_limit); if (iops_limit) { delay = MSEC_PER_SEC / iops_limit; } else { /* * Unset iops_limit means go as fast as possible, so allow a * delay of 0. */ delay = 0; min_delay = 0; } delay = clamp(delay, min_delay, BTRFS_DISCARD_MAX_DELAY_MSEC); discard_ctl->delay_ms = delay; spin_unlock(&discard_ctl->lock); } /* * Propagate discard counters. * * @block_group: block_group of interest * * Propagate deltas of counters up to the discard_ctl. It maintains a current * counter and a previous counter passing the delta up to the global stat. * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { struct btrfs_free_space_ctl *ctl; struct btrfs_discard_ctl *discard_ctl; s32 extents_delta; s64 bytes_delta; if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || !btrfs_is_block_group_data_only(block_group)) return; ctl = block_group->free_space_ctl; discard_ctl = &block_group->fs_info->discard_ctl; lockdep_assert_held(&ctl->tree_lock); extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - ctl->discardable_extents[BTRFS_STAT_PREV]; if (extents_delta) { atomic_add(extents_delta, &discard_ctl->discardable_extents); ctl->discardable_extents[BTRFS_STAT_PREV] = ctl->discardable_extents[BTRFS_STAT_CURR]; } bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - ctl->discardable_bytes[BTRFS_STAT_PREV]; if (bytes_delta) { atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); ctl->discardable_bytes[BTRFS_STAT_PREV] = ctl->discardable_bytes[BTRFS_STAT_CURR]; } } /* * Punt unused_bgs list to discard lists. * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the * order of operations is changed. In the normal synchronous discard path, the * block groups are trimmed via a single large trim in transaction commit. This * is ultimately what we are trying to avoid with asynchronous discard. Thus, * it must be done before going down the unused_bgs path. */ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) { struct btrfs_block_group *block_group, *next; spin_lock(&fs_info->unused_bgs_lock); /* We enabled async discard, so punt all to the queue */ list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, bg_list) { list_del_init(&block_group->bg_list); btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); /* * This put is for the get done by btrfs_mark_bg_unused. * Queueing discard incremented it for discard's reference. */ btrfs_put_block_group(block_group); } spin_unlock(&fs_info->unused_bgs_lock); } /* * Purge discard lists. * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that * are completely free and ready for the unused_bgs path. As discarding will * now happen in transaction commit or not at all, we can safely mark the * corresponding block groups as unused and they will be sent on their merry * way to the unused_bgs list. */ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) { struct btrfs_block_group *block_group, *next; int i; spin_lock(&discard_ctl->lock); for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { list_for_each_entry_safe(block_group, next, &discard_ctl->discard_list[i], discard_list) { list_del_init(&block_group->discard_list); spin_unlock(&discard_ctl->lock); if (block_group->used == 0) btrfs_mark_bg_unused(block_group); spin_lock(&discard_ctl->lock); btrfs_put_block_group(block_group); } } spin_unlock(&discard_ctl->lock); } void btrfs_discard_resume(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_cleanup(fs_info); return; } btrfs_discard_punt_unused_bgs_list(fs_info); set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); } void btrfs_discard_stop(struct btrfs_fs_info *fs_info) { clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); } void btrfs_discard_init(struct btrfs_fs_info *fs_info) { struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; int i; spin_lock_init(&discard_ctl->lock); INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) INIT_LIST_HEAD(&discard_ctl->discard_list[i]); discard_ctl->prev_discard = 0; discard_ctl->prev_discard_time = 0; atomic_set(&discard_ctl->discardable_extents, 0); atomic64_set(&discard_ctl->discardable_bytes, 0); discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC; discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; discard_ctl->kbps_limit = 0; discard_ctl->discard_extent_bytes = 0; discard_ctl->discard_bitmap_bytes = 0; atomic64_set(&discard_ctl->discard_bytes_saved, 0); } void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) { btrfs_discard_stop(fs_info); cancel_delayed_work_sync(&fs_info->discard_ctl.work); btrfs_discard_purge_list(&fs_info->discard_ctl); }
2 17 16 16 16 16 23 19 19 23 23 23 23 23 23 25 25 13 13 25 25 2 2 2 2 2 2 2 2 2 2 2 24 25 25 25 25 16 1 1 15 12 25 1 1 25 10 25 1 16 16 16 16 16 16 16 16 16 16 16 16 16 16 2 2 2 2 2 4 4 4 4 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 // SPDX-License-Identifier: GPL-2.0+ /* * NILFS directory entry operations * * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation. * * Modified for NILFS by Amagai Yoshiji. */ /* * linux/fs/ext2/dir.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/dir.c * * Copyright (C) 1991, 1992 Linus Torvalds * * ext2 directory handling functions * * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 * * All code that works with directory layout had been switched to pagecache * and moved here. AV */ #include <linux/pagemap.h> #include "nilfs.h" #include "page.h" static inline unsigned int nilfs_rec_len_from_disk(__le16 dlen) { unsigned int len = le16_to_cpu(dlen); #if (PAGE_SIZE >= 65536) if (len == NILFS_MAX_REC_LEN) return 1 << 16; #endif return len; } static inline __le16 nilfs_rec_len_to_disk(unsigned int len) { #if (PAGE_SIZE >= 65536) if (len == (1 << 16)) return cpu_to_le16(NILFS_MAX_REC_LEN); BUG_ON(len > (1 << 16)); #endif return cpu_to_le16(len); } /* * nilfs uses block-sized chunks. Arguably, sector-sized ones would be * more robust, but we have what we have */ static inline unsigned int nilfs_chunk_size(struct inode *inode) { return inode->i_sb->s_blocksize; } static inline void nilfs_put_page(struct page *page) { kunmap(page); put_page(page); } /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. */ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr) { unsigned int last_byte = inode->i_size; last_byte -= page_nr << PAGE_SHIFT; if (last_byte > PAGE_SIZE) last_byte = PAGE_SIZE; return last_byte; } static int nilfs_prepare_chunk(struct page *page, unsigned int from, unsigned int to) { loff_t pos = page_offset(page) + from; return __block_write_begin(page, pos, to - from, nilfs_get_block); } static void nilfs_commit_chunk(struct page *page, struct address_space *mapping, unsigned int from, unsigned int to) { struct inode *dir = mapping->host; loff_t pos = page_offset(page) + from; unsigned int len = to - from; unsigned int nr_dirty, copied; int err; nr_dirty = nilfs_page_count_clean_buffers(page, from, to); copied = block_write_end(NULL, mapping, pos, len, len, page, NULL); if (pos + copied > dir->i_size) i_size_write(dir, pos + copied); if (IS_DIRSYNC(dir)) nilfs_set_transaction_flag(NILFS_TI_SYNC); err = nilfs_set_file_dirty(dir, nr_dirty); WARN_ON(err); /* do not happen */ unlock_page(page); } static bool nilfs_check_page(struct page *page) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; unsigned int chunk_size = nilfs_chunk_size(dir); char *kaddr = page_address(page); unsigned int offs, rec_len; unsigned int limit = PAGE_SIZE; struct nilfs_dir_entry *p; char *error; if ((dir->i_size >> PAGE_SHIFT) == page->index) { limit = dir->i_size & ~PAGE_MASK; if (limit & (chunk_size - 1)) goto Ebadsize; if (!limit) goto out; } for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { p = (struct nilfs_dir_entry *)(kaddr + offs); rec_len = nilfs_rec_len_from_disk(p->rec_len); if (rec_len < NILFS_DIR_REC_LEN(1)) goto Eshort; if (rec_len & 3) goto Ealign; if (rec_len < NILFS_DIR_REC_LEN(p->name_len)) goto Enamelen; if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1)) goto Espan; } if (offs != limit) goto Eend; out: SetPageChecked(page); return true; /* Too bad, we had an error */ Ebadsize: nilfs_error(sb, "size of directory #%lu is not a multiple of chunk size", dir->i_ino); goto fail; Eshort: error = "rec_len is smaller than minimal"; goto bad_entry; Ealign: error = "unaligned directory entry"; goto bad_entry; Enamelen: error = "rec_len is too small for name_len"; goto bad_entry; Espan: error = "directory entry across blocks"; bad_entry: nilfs_error(sb, "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error, (page->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode), rec_len, p->name_len); goto fail; Eend: p = (struct nilfs_dir_entry *)(kaddr + offs); nilfs_error(sb, "entry in directory #%lu spans the page boundary offset=%lu, inode=%lu", dir->i_ino, (page->index << PAGE_SHIFT) + offs, (unsigned long)le64_to_cpu(p->inode)); fail: SetPageError(page); return false; } static struct page *nilfs_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { kmap(page); if (unlikely(!PageChecked(page))) { if (!nilfs_check_page(page)) goto fail; } } return page; fail: nilfs_put_page(page); return ERR_PTR(-EIO); } /* * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure. * * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller. */ static int nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) { if (len != de->name_len) return 0; if (!de->inode) return 0; return !memcmp(name, de->name, len); } /* * p is at least 6 bytes before the end of page */ static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) { return (struct nilfs_dir_entry *)((char *)p + nilfs_rec_len_from_disk(p->rec_len)); } static unsigned char nilfs_filetype_table[NILFS_FT_MAX] = { [NILFS_FT_UNKNOWN] = DT_UNKNOWN, [NILFS_FT_REG_FILE] = DT_REG, [NILFS_FT_DIR] = DT_DIR, [NILFS_FT_CHRDEV] = DT_CHR, [NILFS_FT_BLKDEV] = DT_BLK, [NILFS_FT_FIFO] = DT_FIFO, [NILFS_FT_SOCK] = DT_SOCK, [NILFS_FT_SYMLINK] = DT_LNK, }; #define S_SHIFT 12 static unsigned char nilfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR, [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV, [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV, [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO, [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK, [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK, }; static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode) { umode_t mode = inode->i_mode; de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT]; } static int nilfs_readdir(struct file *file, struct dir_context *ctx) { loff_t pos = ctx->pos; struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_MASK; unsigned long n = pos >> PAGE_SHIFT; unsigned long npages = dir_pages(inode); if (pos > inode->i_size - NILFS_DIR_REC_LEN(1)) return 0; for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; struct nilfs_dir_entry *de; struct page *page = nilfs_get_page(inode, n); if (IS_ERR(page)) { nilfs_error(sb, "bad page in #%lu", inode->i_ino); ctx->pos += PAGE_SIZE - offset; return -EIO; } kaddr = page_address(page); de = (struct nilfs_dir_entry *)(kaddr + offset); limit = kaddr + nilfs_last_byte(inode, n) - NILFS_DIR_REC_LEN(1); for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) { if (de->rec_len == 0) { nilfs_error(sb, "zero-length directory entry"); nilfs_put_page(page); return -EIO; } if (de->inode) { unsigned char t; if (de->file_type < NILFS_FT_MAX) t = nilfs_filetype_table[de->file_type]; else t = DT_UNKNOWN; if (!dir_emit(ctx, de->name, de->name_len, le64_to_cpu(de->inode), t)) { nilfs_put_page(page); return 0; } } ctx->pos += nilfs_rec_len_from_disk(de->rec_len); } nilfs_put_page(page); } return 0; } /* * nilfs_find_entry() * * finds an entry in the specified directory with the wanted name. It * returns the page in which the entry was found, and the entry itself * (as a parameter - res_dir). Page is returned mapped and unlocked. * Entry is guaranteed to be valid. */ struct nilfs_dir_entry * nilfs_find_entry(struct inode *dir, const struct qstr *qstr, struct page **res_page) { const unsigned char *name = qstr->name; int namelen = qstr->len; unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned long start, n; unsigned long npages = dir_pages(dir); struct page *page = NULL; struct nilfs_inode_info *ei = NILFS_I(dir); struct nilfs_dir_entry *de; if (npages == 0) goto out; /* OFFSET_CACHE */ *res_page = NULL; start = ei->i_dir_start_lookup; if (start >= npages) start = 0; n = start; do { char *kaddr; page = nilfs_get_page(dir, n); if (!IS_ERR(page)) { kaddr = page_address(page); de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { nilfs_error(dir->i_sb, "zero-length directory entry"); nilfs_put_page(page); goto out; } if (nilfs_match(namelen, name, de)) goto found; de = nilfs_next_entry(de); } nilfs_put_page(page); } if (++n >= npages) n = 0; /* next page is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) { nilfs_error(dir->i_sb, "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); goto out; } } while (n != start); out: return NULL; found: *res_page = page; ei->i_dir_start_lookup = n; return de; } struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p) { struct page *page = nilfs_get_page(dir, 0); struct nilfs_dir_entry *de = NULL; if (!IS_ERR(page)) { de = nilfs_next_entry( (struct nilfs_dir_entry *)page_address(page)); *p = page; } return de; } ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; struct nilfs_dir_entry *de; struct page *page; de = nilfs_find_entry(dir, qstr, &page); if (de) { res = le64_to_cpu(de->inode); kunmap(page); put_page(page); } return res; } /* Releases the page */ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { unsigned int from = (char *)de - (char *)page_address(page); unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; lock_page(page); err = nilfs_prepare_chunk(page, from, to); BUG_ON(err); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, mapping, from, to); nilfs_put_page(page); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } /* * Parent is locked. */ int nilfs_add_link(struct dentry *dentry, struct inode *inode) { struct inode *dir = d_inode(dentry->d_parent); const unsigned char *name = dentry->d_name.name; int namelen = dentry->d_name.len; unsigned int chunk_size = nilfs_chunk_size(dir); unsigned int reclen = NILFS_DIR_REC_LEN(namelen); unsigned short rec_len, name_len; struct page *page = NULL; struct nilfs_dir_entry *de; unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; unsigned int from, to; int err; /* * We take care of directory expansion in the same loop. * This code plays outside i_size, so it locks the page * to protect that region. */ for (n = 0; n <= npages; n++) { char *dir_end; page = nilfs_get_page(dir, n); err = PTR_ERR(page); if (IS_ERR(page)) goto out; lock_page(page); kaddr = page_address(page); dir_end = kaddr + nilfs_last_byte(dir, n); de = (struct nilfs_dir_entry *)kaddr; kaddr += PAGE_SIZE - reclen; while ((char *)de <= kaddr) { if ((char *)de == dir_end) { /* We hit i_size */ name_len = 0; rec_len = chunk_size; de->rec_len = nilfs_rec_len_to_disk(chunk_size); de->inode = 0; goto got_it; } if (de->rec_len == 0) { nilfs_error(dir->i_sb, "zero-length directory entry"); err = -EIO; goto out_unlock; } err = -EEXIST; if (nilfs_match(namelen, name, de)) goto out_unlock; name_len = NILFS_DIR_REC_LEN(de->name_len); rec_len = nilfs_rec_len_from_disk(de->rec_len); if (!de->inode && rec_len >= reclen) goto got_it; if (rec_len >= name_len + reclen) goto got_it; de = (struct nilfs_dir_entry *)((char *)de + rec_len); } unlock_page(page); nilfs_put_page(page); } BUG(); return -EINVAL; got_it: from = (char *)de - (char *)page_address(page); to = from + rec_len; err = nilfs_prepare_chunk(page, from, to); if (err) goto out_unlock; if (de->inode) { struct nilfs_dir_entry *de1; de1 = (struct nilfs_dir_entry *)((char *)de + name_len); de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); de->rec_len = nilfs_rec_len_to_disk(name_len); de = de1; } de->name_len = namelen; memcpy(de->name, name, namelen); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); nilfs_commit_chunk(page, page->mapping, from, to); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); nilfs_mark_inode_dirty(dir); /* OFFSET_CACHE */ out_put: nilfs_put_page(page); out: return err; out_unlock: unlock_page(page); goto out_put; } /* * nilfs_delete_entry deletes a directory entry by merging it with the * previous entry. Page is up-to-date. Releases the page. */ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) { struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned int from, to; struct nilfs_dir_entry *de, *pde = NULL; int err; from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); to = ((char *)dir - kaddr) + nilfs_rec_len_from_disk(dir->rec_len); de = (struct nilfs_dir_entry *)(kaddr + from); while ((char *)de < (char *)dir) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, "zero-length directory entry"); err = -EIO; goto out; } pde = de; de = nilfs_next_entry(de); } if (pde) from = (char *)pde - (char *)page_address(page); lock_page(page); err = nilfs_prepare_chunk(page, from, to); BUG_ON(err); if (pde) pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); out: nilfs_put_page(page); return err; } /* * Set the first fragment of directory. */ int nilfs_make_empty(struct inode *inode, struct inode *parent) { struct address_space *mapping = inode->i_mapping; struct page *page = grab_cache_page(mapping, 0); unsigned int chunk_size = nilfs_chunk_size(inode); struct nilfs_dir_entry *de; int err; void *kaddr; if (!page) return -ENOMEM; err = nilfs_prepare_chunk(page, 0, chunk_size); if (unlikely(err)) { unlock_page(page); goto fail; } kaddr = kmap_atomic(page); memset(kaddr, 0, chunk_size); de = (struct nilfs_dir_entry *)kaddr; de->name_len = 1; de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); memcpy(de->name, ".\0\0", 4); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); de->name_len = 2; de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); de->inode = cpu_to_le64(parent->i_ino); memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); kunmap_atomic(kaddr); nilfs_commit_chunk(page, mapping, 0, chunk_size); fail: put_page(page); return err; } /* * routine to check that the specified directory is empty (for rmdir) */ int nilfs_empty_dir(struct inode *inode) { struct page *page = NULL; unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { char *kaddr; struct nilfs_dir_entry *de; page = nilfs_get_page(inode, i); if (IS_ERR(page)) continue; kaddr = page_address(page); de = (struct nilfs_dir_entry *)kaddr; kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); while ((char *)de <= kaddr) { if (de->rec_len == 0) { nilfs_error(inode->i_sb, "zero-length directory entry (kaddr=%p, de=%p)", kaddr, de); goto not_empty; } if (de->inode != 0) { /* check for . and .. */ if (de->name[0] != '.') goto not_empty; if (de->name_len > 2) goto not_empty; if (de->name_len < 2) { if (de->inode != cpu_to_le64(inode->i_ino)) goto not_empty; } else if (de->name[1] != '.') goto not_empty; } de = nilfs_next_entry(de); } nilfs_put_page(page); } return 1; not_empty: nilfs_put_page(page); return 0; } const struct file_operations nilfs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = nilfs_readdir, .unlocked_ioctl = nilfs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = nilfs_compat_ioctl, #endif /* CONFIG_COMPAT */ .fsync = nilfs_sync_file, };
66 26 8 1781 1232 12 11 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 /* SPDX-License-Identifier: GPL-2.0-only */ /* * include/linux/idr.h * * 2002-10-18 written by Jim Houston jim.houston@ccur.com * Copyright (C) 2002 by Concurrent Computer Corporation * * Small id to pointer translation service avoiding fixed sized * tables. */ #ifndef __IDR_H__ #define __IDR_H__ #include <linux/radix-tree.h> #include <linux/gfp.h> #include <linux/percpu.h> struct idr { struct radix_tree_root idr_rt; unsigned int idr_base; unsigned int idr_next; }; /* * The IDR API does not expose the tagging functionality of the radix tree * to users. Use tag 0 to track whether a node has free space below it. */ #define IDR_FREE 0 /* Set the IDR flag and the IDR_FREE tag */ #define IDR_RT_MARKER (ROOT_IS_IDR | (__force gfp_t) \ (1 << (ROOT_TAG_SHIFT + IDR_FREE))) #define IDR_INIT_BASE(name, base) { \ .idr_rt = RADIX_TREE_INIT(name, IDR_RT_MARKER), \ .idr_base = (base), \ .idr_next = 0, \ } /** * IDR_INIT() - Initialise an IDR. * @name: Name of IDR. * * A freshly-initialised IDR contains no IDs. */ #define IDR_INIT(name) IDR_INIT_BASE(name, 0) /** * DEFINE_IDR() - Define a statically-allocated IDR. * @name: Name of IDR. * * An IDR defined using this macro is ready for use with no additional * initialisation required. It contains no IDs. */ #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) /** * idr_get_cursor - Return the current position of the cyclic allocator * @idr: idr handle * * The value returned is the value that will be next returned from * idr_alloc_cyclic() if it is free (otherwise the search will start from * this position). */ static inline unsigned int idr_get_cursor(const struct idr *idr) { return READ_ONCE(idr->idr_next); } /** * idr_set_cursor - Set the current position of the cyclic allocator * @idr: idr handle * @val: new position * * The next call to idr_alloc_cyclic() will return @val if it is free * (otherwise the search will start from this position). */ static inline void idr_set_cursor(struct idr *idr, unsigned int val) { WRITE_ONCE(idr->idr_next, val); } /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) * * idr_find() is able to be called locklessly, using RCU. The caller must * ensure calls to this function are made within rcu_read_lock() regions. * Other readers (lock-free or otherwise) and modifications may be running * concurrently. * * It is still required that the caller manage the synchronization and * lifetimes of the items. So if RCU lock-free lookups are used, typically * this would mean that the items have their own locks, or are amenable to * lock-free access; and that the items are freed by RCU (or only freed after * having been deleted from the idr tree *and* a synchronize_rcu() grace * period). */ #define idr_lock(idr) xa_lock(&(idr)->idr_rt) #define idr_unlock(idr) xa_unlock(&(idr)->idr_rt) #define idr_lock_bh(idr) xa_lock_bh(&(idr)->idr_rt) #define idr_unlock_bh(idr) xa_unlock_bh(&(idr)->idr_rt) #define idr_lock_irq(idr) xa_lock_irq(&(idr)->idr_rt) #define idr_unlock_irq(idr) xa_unlock_irq(&(idr)->idr_rt) #define idr_lock_irqsave(idr, flags) \ xa_lock_irqsave(&(idr)->idr_rt, flags) #define idr_unlock_irqrestore(idr, flags) \ xa_unlock_irqrestore(&(idr)->idr_rt, flags) void idr_preload(gfp_t gfp_mask); int idr_alloc(struct idr *, void *ptr, int start, int end, gfp_t); int __must_check idr_alloc_u32(struct idr *, void *ptr, u32 *id, unsigned long max, gfp_t); int idr_alloc_cyclic(struct idr *, void *ptr, int start, int end, gfp_t); void *idr_remove(struct idr *, unsigned long id); void *idr_find(const struct idr *, unsigned long id); int idr_for_each(const struct idr *, int (*fn)(int id, void *p, void *data), void *data); void *idr_get_next(struct idr *, int *nextid); void *idr_get_next_ul(struct idr *, unsigned long *nextid); void *idr_replace(struct idr *, void *, unsigned long id); void idr_destroy(struct idr *); /** * idr_init_base() - Initialise an IDR. * @idr: IDR handle. * @base: The base value for the IDR. * * This variation of idr_init() creates an IDR which will allocate IDs * starting at %base. */ static inline void idr_init_base(struct idr *idr, int base) { INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); idr->idr_base = base; idr->idr_next = 0; } /** * idr_init() - Initialise an IDR. * @idr: IDR handle. * * Initialise a dynamically allocated IDR. To initialise a * statically allocated IDR, use DEFINE_IDR(). */ static inline void idr_init(struct idr *idr) { idr_init_base(idr, 0); } /** * idr_is_empty() - Are there any IDs allocated? * @idr: IDR handle. * * Return: %true if any IDs have been allocated from this IDR. */ static inline bool idr_is_empty(const struct idr *idr) { return radix_tree_empty(&idr->idr_rt) && radix_tree_tagged(&idr->idr_rt, IDR_FREE); } /** * idr_preload_end - end preload section started with idr_preload() * * Each idr_preload() should be matched with an invocation of this * function. See idr_preload() for details. */ static inline void idr_preload_end(void) { local_unlock(&radix_tree_preloads.lock); } /** * idr_for_each_entry() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry(idr, entry, id) \ for (id = 0; ((entry) = idr_get_next(idr, &(id))) != NULL; id += 1U) /** * idr_for_each_entry_ul() - Iterate over an IDR's elements of a given type. * @idr: IDR handle. * @entry: The type * to use as cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * @entry and @id do not need to be initialized before the loop, and * after normal termination @entry is left with the value NULL. This * is convenient for a "not found" value. */ #define idr_for_each_entry_ul(idr, entry, tmp, id) \ for (tmp = 0, id = 0; \ tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ tmp = id, ++id) /** * idr_for_each_entry_continue() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue(idr, entry, id) \ for ((entry) = idr_get_next((idr), &(id)); \ entry; \ ++id, (entry) = idr_get_next((idr), &(id))) /** * idr_for_each_entry_continue_ul() - Continue iteration over an IDR's elements of a given type * @idr: IDR handle. * @entry: The type * to use as a cursor. * @tmp: A temporary placeholder for ID. * @id: Entry ID. * * Continue to iterate over entries, continuing after the current position. */ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ for (tmp = id; \ tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ tmp = id, ++id) /* * IDA - ID Allocator, use when translation from id to pointer isn't necessary. */ #define IDA_CHUNK_SIZE 128 /* 128 bytes per chunk */ #define IDA_BITMAP_LONGS (IDA_CHUNK_SIZE / sizeof(long)) #define IDA_BITMAP_BITS (IDA_BITMAP_LONGS * sizeof(long) * 8) struct ida_bitmap { unsigned long bitmap[IDA_BITMAP_LONGS]; }; struct ida { struct xarray xa; }; #define IDA_INIT_FLAGS (XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC) #define IDA_INIT(name) { \ .xa = XARRAY_INIT(name, IDA_INIT_FLAGS) \ } #define DEFINE_IDA(name) struct ida name = IDA_INIT(name) int ida_alloc_range(struct ida *, unsigned int min, unsigned int max, gfp_t); void ida_free(struct ida *, unsigned int id); void ida_destroy(struct ida *ida); /** * ida_alloc() - Allocate an unused ID. * @ida: IDA handle. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc(struct ida *ida, gfp_t gfp) { return ida_alloc_range(ida, 0, ~0, gfp); } /** * ida_alloc_min() - Allocate an unused ID. * @ida: IDA handle. * @min: Lowest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between @min and %INT_MAX, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_min(struct ida *ida, unsigned int min, gfp_t gfp) { return ida_alloc_range(ida, min, ~0, gfp); } /** * ida_alloc_max() - Allocate an unused ID. * @ida: IDA handle. * @max: Highest ID to allocate. * @gfp: Memory allocation flags. * * Allocate an ID between 0 and @max, inclusive. * * Context: Any context. It is safe to call this function without * locking in your code. * Return: The allocated ID, or %-ENOMEM if memory could not be allocated, * or %-ENOSPC if there are no free IDs. */ static inline int ida_alloc_max(struct ida *ida, unsigned int max, gfp_t gfp) { return ida_alloc_range(ida, 0, max, gfp); } static inline void ida_init(struct ida *ida) { xa_init_flags(&ida->xa, IDA_INIT_FLAGS); } /* * ida_simple_get() and ida_simple_remove() are deprecated. Use * ida_alloc() and ida_free() instead respectively. */ #define ida_simple_get(ida, start, end, gfp) \ ida_alloc_range(ida, start, (end) - 1, gfp) #define ida_simple_remove(ida, id) ida_free(ida, id) static inline bool ida_is_empty(const struct ida *ida) { return xa_empty(&ida->xa); } #endif /* __IDR_H__ */
4111 4103 4104 4110 13 9 7 7 4291 4108 4101 4102 306 151 302 297 16 302 291 15 363 363 17 362 56 308 300 300 294 300 300 300 7 300 300 97 299 2 1 1 1 298 298 298 292 298 65 362 1270 715 715 715 710 6 715 4 3551 3344 3551 3544 3 2 2 2 3666 2 3666 3665 3667 3147 1870 1870 3664 1856 1854 1776 84 1856 237 238 26 26 237 1697 3210 3119 3664 348 348 2179 2103 84 76 2145 2146 2144 810 2145 987 2145 98 2143 2142 2142 1213 392 392 392 2145 2145 186 186 186 183 183 183 149 37 33 122 8 8 2717 2713 2717 856 2716 2717 12 3 2712 2713 767 767 767 88 767 2715 2434 2433 108 2549 2549 2546 2545 115 2546 4 4 4 4 2549 1 1 1 1027 1026 1026 1027 949 1027 1027 1024 1027 1027 989 990 85 942 5 4 3 1025 3 1 1026 7 1026 77 76 43 43 77 52 56 56 56 56 9 47 56 4 4 2 2 2 4 3 52 22 52 52 52 10 51 39 18 1 52 73 52 947 947 947 3 947 48 968 48 48 48 814 813 755 13 13 755 755 755 755 755 755 2 13 755 14 14 14 947 23 21 20 20 949 980 648 1 43 13 701 659 32 688 949 1 1 1 1 3 980 980 980 40 979 770 770 770 531 530 770 770 770 250 744 744 742 211 211 211 210 792 791 792 755 755 789 771 251 249 792 576 722 722 1 1 722 722 721 721 722 722 722 722 722 721 718 740 740 730 740 722 680 677 722 740 740 22 22 21 6 16 722 718 719 655 650 655 655 754 815 815 815 813 367 795 93 93 795 795 39 793 18 793 792 792 792 792 352 256 814 817 817 815 9 9 816 742 816 816 816 816 816 757 816 814 813 753 754 753 753 751 716 740 6 736 13 6 732 796 262 794 109 2 795 817 817 789 804 3 1172 1172 1038 983 1037 71 1037 988 17 17 985 3 3 3 3 984 896 7 896 984 984 37 1036 159 159 1036 1036 68 987 10 984 984 197 83 10 6 8 68 17 17 51 458 452 170 303 303 303 24 888 858 14 13 898 13 73 873 898 485 898 898 898 898 25 898 153 153 714 714 714 240 512 135 19 494 512 504 512 512 512 512 511 5 880 878 878 880 880 164 139 413 850 874 38 38 38 832 832 19 10 10 1 10 26 26 26 1 25 25 8 25 25 10 295 1737 2 1735 1715 20 1735 83 83 83 56 83 5 65 11 34 82 83 13 83 50 50 65 65 19 63 6 13 58 65 19 614 7 2 71 71 52 2 693 693 693 52 54 38 54 54 54 15 54 54 53 53 33 49 49 49 48 33 16 47 48 47 48 48 48 50 1883 1883 1516 1516 614 614 8 5 609 50 576 608 607 607 575 35 603 603 475 603 603 604 4290 4288 3 4292 4291 4280 1121 4292 19 19 4292 4292 4292 4292 4289 4291 191 4278 4292 4289 4208 4204 4209 4291 153 4291 4209 4204 4292 4495 4495 2 4493 233 26 26 16 26 23 16 18 219 212 65 212 212 218 4492 4496 319 319 4359 4359 2508 2508 1738 2569 2569 23 2569 13 2569 8 2569 10 2568 2568 36 2569 8 2569 3 2569 107 3 174 138 139 121 277 487 487 28 465 487 506 295 293 294 294 507 507 507 506 505 231 284 287 284 146 146 138 284 15 13 284 9 9 8 241 12 12 1 1 282 282 107 116 1 179 281 281 281 9 1 9 1 9 281 145 7 279 143 279 279 278 277 142 136 279 17 1 263 10 2 202 53 255 275 56 230 229 2 2 2 2 2 8 8 8 8 275 6 269 1 268 6 21 254 37 37 37 37 254 254 2276 4291 4292 35 4291 2 4291 254 4292 4103 48 50 50 4 4 45 46 46 44 44 135 875 874 873 846 643 27 27 1 26 869 617 30 617 617 128 135 617 617 290 274 128 17 168 135 135 10 168 168 290 290 168 617 458 588 588 841 94 93 841 198 902 896 1424 1 1 1424 1 1 1 1 1424 1 1424 2 1424 5 1423 2 1424 3 1424 1424 1368 6 1368 1368 101 1741 1809 1810 179 1810 1809 748 74 748 703 4292 4290 4291 4315 4315 4315 4314 22 22 22 22 15 11 20 15 14 1 22 4259 4258 4257 22 4257 4 4259 3540 3539 20 18 19 18 19 18 18 18 191 191 190 190 190 187 185 1 5 3 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316 5317 5318 5319 5320 5321 5322 5323 5324 5325 5326 5327 5328 5329 5330 5331 5332 5333 5334 5335 5336 5337 5338 5339 5340 5341 5342 5343 5344 5345 5346 5347 5348 5349 5350 5351 5352 5353 5354 5355 5356 5357 5358 5359 5360 5361 5362 5363 5364 5365 5366 5367 5368 5369 5370 5371 5372 5373 5374 5375 5376 5377 5378 5379 5380 5381 5382 5383 5384 5385 5386 5387 5388 5389 5390 5391 5392 5393 5394 5395 5396 5397 5398 5399 5400 5401 5402 5403 5404 5405 5406 5407 5408 5409 5410 5411 5412 5413 5414 5415 5416 5417 5418 5419 5420 5421 5422 5423 5424 5425 5426 5427 5428 5429 5430 5431 5432 5433 5434 5435 5436 5437 5438 5439 5440 5441 5442 5443 5444 5445 5446 5447 5448 5449 5450 5451 5452 5453 5454 5455 5456 5457 5458 5459 5460 5461 5462 5463 5464 5465 5466 5467 5468 5469 5470 5471 5472 5473 5474 5475 5476 5477 5478 5479 5480 5481 5482 5483 5484 5485 5486 5487 5488 5489 5490 5491 5492 5493 5494 5495 5496 5497 5498 5499 5500 5501 5502 5503 5504 5505 5506 5507 5508 5509 5510 5511 5512 5513 5514 5515 5516 5517 5518 5519 5520 5521 5522 5523 5524 5525 5526 5527 5528 5529 5530 5531 5532 5533 5534 5535 5536 5537 5538 5539 5540 5541 5542 5543 5544 5545 5546 5547 5548 5549 5550 5551 5552 5553 5554 5555 5556 5557 5558 5559 5560 5561 5562 5563 5564 5565 5566 5567 5568 5569 5570 5571 5572 5573 5574 5575 5576 5577 5578 5579 5580 5581 5582 5583 5584 5585 5586 5587 5588 5589 5590 5591 5592 5593 5594 5595 5596 5597 5598 5599 5600 5601 5602 5603 5604 5605 5606 5607 5608 5609 5610 5611 5612 5613 5614 5615 5616 5617 5618 5619 5620 5621 5622 5623 5624 5625 5626 5627 5628 5629 5630 5631 5632 5633 5634 5635 5636 5637 5638 5639 5640 5641 5642 5643 5644 5645 5646 5647 5648 5649 5650 5651 5652 5653 5654 5655 5656 5657 5658 5659 5660 5661 5662 5663 5664 5665 5666 5667 5668 5669 5670 5671 5672 5673 5674 5675 5676 5677 5678 5679 5680 5681 5682 5683 5684 5685 5686 5687 5688 5689 5690 5691 5692 5693 5694 5695 5696 5697 5698 5699 5700 5701 5702 5703 5704 5705 5706 5707 5708 5709 5710 5711 5712 5713 5714 5715 5716 5717 5718 5719 5720 5721 5722 5723 5724 5725 5726 5727 5728 5729 5730 5731 5732 5733 5734 5735 5736 5737 5738 5739 5740 5741 5742 5743 5744 5745 5746 5747 5748 5749 5750 5751 5752 5753 5754 5755 5756 5757 5758 5759 5760 5761 5762 5763 5764 5765 5766 5767 5768 5769 5770 5771 5772 5773 5774 5775 5776 5777 5778 5779 5780 5781 5782 5783 5784 5785 5786 5787 5788 5789 5790 5791 5792 5793 5794 5795 5796 5797 5798 5799 5800 5801 5802 5803 5804 5805 5806 5807 5808 5809 5810 5811 5812 5813 5814 5815 5816 5817 5818 5819 5820 5821 5822 5823 5824 5825 5826 5827 5828 5829 5830 5831 5832 5833 5834 5835 5836 5837 5838 5839 5840 5841 5842 5843 5844 5845 5846 5847 5848 5849 5850 5851 5852 5853 5854 5855 5856 5857 5858 5859 5860 5861 5862 5863 5864 5865 5866 5867 5868 5869 5870 5871 5872 5873 5874 5875 5876 5877 5878 5879 5880 5881 5882 5883 5884 5885 5886 5887 5888 5889 5890 5891 5892 5893 5894 5895 5896 5897 5898 5899 5900 5901 5902 5903 5904 5905 5906 5907 5908 5909 5910 5911 5912 5913 5914 5915 5916 5917 5918 5919 5920 5921 5922 5923 5924 5925 5926 5927 5928 5929 5930 5931 5932 5933 5934 5935 5936 5937 5938 5939 5940 5941 5942 5943 5944 5945 5946 5947 5948 5949 5950 5951 5952 5953 5954 5955 5956 5957 5958 5959 5960 5961 5962 5963 5964 5965 5966 5967 5968 5969 5970 5971 5972 5973 5974 5975 5976 5977 5978 5979 5980 5981 5982 5983 5984 5985 5986 5987 5988 5989 5990 5991 5992 5993 5994 5995 5996 5997 5998 5999 6000 6001 6002 6003 6004 6005 6006 6007 6008 6009 6010 6011 6012 6013 6014 6015 6016 6017 6018 6019 6020 6021 6022 6023 6024 6025 6026 6027 6028 6029 6030 6031 6032 6033 6034 6035 6036 6037 6038 6039 6040 6041 6042 6043 6044 6045 6046 6047 6048 6049 6050 6051 6052 6053 6054 6055 6056 6057 6058 6059 6060 6061 6062 6063 6064 6065 6066 6067 6068 6069 6070 6071 6072 6073 6074 6075 6076 6077 6078 6079 6080 6081 6082 6083 6084 6085 6086 6087 6088 6089 6090 6091 6092 6093 6094 6095 6096 6097 6098 6099 6100 6101 6102 6103 6104 6105 6106 6107 6108 6109 6110 6111 6112 6113 6114 6115 6116 6117 6118 6119 6120 6121 6122 6123 6124 6125 6126 6127 6128 6129 6130 6131 6132 6133 6134 6135 6136 6137 6138 6139 6140 6141 6142 6143 6144 6145 6146 6147 6148 6149 6150 6151 6152 6153 6154 6155 6156 6157 6158 6159 6160 6161 6162 6163 6164 6165 6166 6167 6168 6169 6170 6171 6172 6173 6174 6175 6176 6177 6178 6179 6180 6181 6182 6183 6184 6185 6186 6187 6188 6189 6190 6191 6192 6193 6194 6195 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/inode.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * from * * linux/fs/minix/inode.c * * Copyright (C) 1991, 1992 Linus Torvalds * * 64-bit file support on 64-bit platforms by Jakub Jelinek * (jj@sunsite.ms.mff.cuni.cz) * * Assorted race fixes, rewrite of ext4_get_block() by Al Viro, 2000 */ #include <linux/fs.h> #include <linux/mount.h> #include <linux/time.h> #include <linux/highuid.h> #include <linux/pagemap.h> #include <linux/dax.h> #include <linux/quotaops.h> #include <linux/string.h> #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> #include <linux/workqueue.h> #include <linux/kernel.h> #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> #include <linux/iomap.h> #include <linux/iversion.h> #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "truncate.h" #include <trace/events/ext4.h> static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); csum = ext4_chksum(sbi, csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; } static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 provided, calculated; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); calculated = ext4_inode_csum(inode, raw, ei); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16; else calculated &= 0xFFFF; return provided == calculated; } void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { __u32 csum; if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) raw->i_checksum_hi = cpu_to_le16(csum >> 16); } static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { trace_ext4_begin_ordered_truncate(inode, new_size); /* * If jinode is zero, then we never opened the file for * writing, so there's no need to call * jbd2_journal_begin_ordered_truncate() since there's no * outstanding writes we need to flush. */ if (!EXT4_I(inode)->jinode) return 0; return jbd2_journal_begin_ordered_truncate(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode, new_size); } static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents); /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. */ int ext4_inode_is_fast_symlink(struct inode *inode) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { int ea_blocks = EXT4_I(inode)->i_file_acl ? EXT4_CLUSTER_SIZE(inode->i_sb) >> 9 : 0; if (ext4_has_inline_data(inode)) return 0; return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0); } return S_ISLNK(inode->i_mode) && inode->i_size && (inode->i_size < EXT4_N_BLOCKS * 4); } /* * Called at the last iput() if i_nlink is zero. */ void ext4_evict_inode(struct inode *inode) { handle_t *handle; int err; /* * Credits for final inode cleanup and freeing: * sb + inode (ext4_orphan_del()), block bitmap, group descriptor * (xattr block freeing), bitmap, group descriptor (inode freeing) */ int extra_credits = 6; struct ext4_xattr_inode_array *ea_inode_array = NULL; bool freeze_protected = false; trace_ext4_evict_inode(inode); if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { truncate_inode_pages_final(&inode->i_data); goto no_delete; } if (is_bad_inode(inode)) goto no_delete; dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); /* * For inodes with journalled data, transaction commit could have * dirtied the inode. And for inodes with dioread_nolock, unwritten * extents converting worker could merge extents and also have dirtied * the inode. Flush worker is ignoring it because of I_FREEING flag but * we still need to remove the inode from the writeback lists. */ if (!list_empty_careful(&inode->i_io_list)) inode_io_list_del(inode); /* * Protect us against freezing - iput() caller didn't have to have any * protection against it. When we are in a running transaction though, * we are already protected against freezing and we cannot grab further * protection due to lock ordering constraints. */ if (!ext4_journal_current_handle()) { sb_start_intwrite(inode->i_sb); freeze_protected = true; } if (!IS_NOQUOTA(inode)) extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb); /* * Block bitmap, group descriptor, and inode are accounted in both * ext4_blocks_for_truncate() and extra_credits. So subtract 3. */ handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, ext4_blocks_for_truncate(inode) + extra_credits - 3); if (IS_ERR(handle)) { ext4_std_error(inode->i_sb, PTR_ERR(handle)); /* * If we're going to skip the normal cleanup, we still need to * make sure that the in-core orphan linked list is properly * cleaned up. */ ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); goto no_delete; } if (IS_SYNC(inode)) ext4_handle_sync(handle); /* * Set inode->i_size to 0 before calling ext4_truncate(). We need * special handling of symlinks here because i_size is used to * determine whether ext4_inode_info->i_data contains symlink data or * block mappings. Setting i_size to 0 will remove its fast symlink * status. Erase i_data so that it becomes a valid empty block map. */ if (ext4_inode_is_fast_symlink(inode)) memset(EXT4_I(inode)->i_data, 0, sizeof(EXT4_I(inode)->i_data)); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_warning(inode->i_sb, "couldn't mark inode dirty (err %d)", err); goto stop_handle; } if (inode->i_blocks) { err = ext4_truncate(inode); if (err) { ext4_error_err(inode->i_sb, -err, "couldn't truncate inode %lu (err %d)", inode->i_ino, err); goto stop_handle; } } /* Remove xattr references. */ err = ext4_xattr_delete_inode(handle, inode, &ea_inode_array, extra_credits); if (err) { ext4_warning(inode->i_sb, "xattr delete (err %d)", err); stop_handle: ext4_journal_stop(handle); ext4_orphan_del(NULL, inode); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); goto no_delete; } /* * Kill off the orphan record which ext4_truncate created. * AKPM: I think this can be inside the above `if'. * Note that ext4_orphan_del() has to be able to cope with the * deletion of a non-existent orphan - this is because we don't * know if ext4_truncate() actually created an orphan record. * (Well, we could do this if we need to, but heck - it works) */ ext4_orphan_del(handle, inode); EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds(); /* * One subtle ordering requirement: if anything has gone wrong * (transaction abort, IO errors, whatever), then we can still * do these next steps (the fs will already have been marked as * having errors), but we can't free the inode if the mark_dirty * fails. */ if (ext4_mark_inode_dirty(handle, inode)) /* If that failed, just do the required in-core inode clear. */ ext4_clear_inode(inode); else ext4_free_inode(handle, inode); ext4_journal_stop(handle); if (freeze_protected) sb_end_intwrite(inode->i_sb); ext4_xattr_inode_array_free(ea_inode_array); return; no_delete: /* * Check out some where else accidentally dirty the evicting inode, * which may probably cause inode use-after-free issues later. */ WARN_ON_ONCE(!list_empty_careful(&inode->i_io_list)); if (!list_empty(&EXT4_I(inode)->i_fc_list)) ext4_fc_mark_ineligible(inode->i_sb, EXT4_FC_REASON_NOMEM, NULL); ext4_clear_inode(inode); /* We must guarantee clearing of inode... */ } #ifdef CONFIG_QUOTA qsize_t *ext4_get_reserved_space(struct inode *inode) { return &EXT4_I(inode)->i_reserved_quota; } #endif /* * Called with i_data_sem down, which is important since we can call * ext4_discard_preallocations() from here. */ void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); spin_lock(&ei->i_block_reservation_lock); trace_ext4_da_update_reserve_space(inode, used, quota_claim); if (unlikely(used > ei->i_reserved_data_blocks)) { ext4_warning(inode->i_sb, "%s: ino %lu, used %d " "with only %d reserved data blocks", __func__, inode->i_ino, used, ei->i_reserved_data_blocks); WARN_ON(1); used = ei->i_reserved_data_blocks; } /* Update per-inode reservations */ ei->i_reserved_data_blocks -= used; percpu_counter_sub(&sbi->s_dirtyclusters_counter, used); spin_unlock(&ei->i_block_reservation_lock); /* Update quota subsystem for data blocks */ if (quota_claim) dquot_claim_block(inode, EXT4_C2B(sbi, used)); else { /* * We did fallocate with an offset that is already delayed * allocated. So on delayed allocated writeback we should * not re-claim the quota for fallocated blocks. */ dquot_release_reservation_block(inode, EXT4_C2B(sbi, used)); } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ if ((ei->i_reserved_data_blocks == 0) && !inode_is_open_for_write(inode)) ext4_discard_preallocations(inode, 0); } static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { if (ext4_has_feature_journal(inode->i_sb) && (inode->i_ino == le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) return 0; if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " "(length %d)", (unsigned long) map->m_lblk, map->m_pblk, map->m_len); return -EFSCORRUPTED; } return 0; } int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len) { int ret; if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode)) return fscrypt_zeroout_range(inode, lblk, pblk, len); ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); if (ret > 0) ret = 0; return ret; } #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) #ifdef ES_AGGRESSIVE_TEST static void ext4_map_blocks_es_recheck(handle_t *handle, struct inode *inode, struct ext4_map_blocks *es_map, struct ext4_map_blocks *map, int flags) { int retval; map->m_flags = 0; /* * There is a race window that the result is not the same. * e.g. xfstests #223 when dioread_nolock enables. The reason * is that we lookup a block mapping in extent status tree with * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status * tree. So the m_len might not equal. */ if (es_map->m_lblk != map->m_lblk || es_map->m_flags != map->m_flags || es_map->m_pblk != map->m_pblk) { printk("ES cache assertion failed for inode: %lu " "es_cached ex [%d/%d/%llu/%x] != " "found ex [%d/%d/%llu/%x] retval %d flags %x\n", inode->i_ino, es_map->m_lblk, es_map->m_len, es_map->m_pblk, es_map->m_flags, map->m_lblk, map->m_len, map->m_pblk, map->m_flags, retval, flags); } } #endif /* ES_AGGRESSIVE_TEST */ /* * The ext4_map_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks * and store the allocated blocks in the result buffer head and mark it * mapped. * * If file type is extents based, it will call ext4_ext_map_blocks(), * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocated. if * create==0 and the blocks are pre-allocated and unwritten, the resulting @map * is marked as unwritten. If the create == 1, it will mark @map as mapped. * * It returns 0 if plain look up failed (blocks have not been allocated), in * that case, @map is returned as unmapped but we still do fill map->m_len to * indicate the length of a hole starting at map->m_lblk. * * It returns the error in case of allocation failure. */ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { struct extent_status es; int retval; int ret = 0; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif map->m_flags = 0; ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int */ if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; /* We can handle the block number less than EXT_MAX_BLOCKS */ if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; map->m_flags |= ext4_es_is_written(&es) ? EXT4_MAP_MAPPED : EXT4_MAP_UNWRITTEN; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; } else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) { map->m_pblk = 0; retval = es.es_len - (map->m_lblk - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; retval = 0; } else { BUG(); } if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return retval; #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif goto found; } /* * In the query cache no-wait mode, nothing we can do more if we * cannot find extent in the cache. */ if (flags & EXT4_GET_BLOCKS_CACHED_NOWAIT) return 0; /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { retval = ext4_ind_map_blocks(handle, inode, map, 0); } if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; } /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* * Returns if the blocks have already allocated * * Note that if blocks have been preallocated * ext4_ext_get_block() returns the create = 0 * with buffer head unmapped. */ if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) /* * If we need to convert extent to unwritten * we continue and do the actual work in * ext4_ext_map_blocks() */ if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; /* * Here we clear m_flags because after allocating an new extent, * it will be set again. */ map->m_flags &= ~EXT4_MAP_FLAGS; /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); /* * We need to check for EXT4 here because migrate * could have changed the inode type in between */ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags); } else { retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval > 0 && map->m_flags & EXT4_MAP_NEW) { /* * We allocated new blocks which will result in * i_data's format changing. Force the migrate * to fail by clearing migrate flags */ ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE); } } if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } /* * We have to zeroout blocks before inserting them into extent * status tree. Otherwise someone could look them up there and * use them before they are really zeroed. We also have to * unmap metadata before zeroing as otherwise writeback can * overwrite zeros with stale data from block device. */ if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { retval = ret; goto out_sem; } } /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { if (ext4_es_is_written(&es)) goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; if (!(flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) && !(status & EXTENT_STATUS_WRITTEN) && ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk, map->m_lblk + map->m_len - 1)) status |= EXTENT_STATUS_DELAYED; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); if (ret != 0) return ret; /* * Inodes with freshly allocated blocks where contents will be * visible after transaction commit must be on transaction's * ordered data list. */ if (map->m_flags & EXT4_MAP_NEW && !(map->m_flags & EXT4_MAP_UNWRITTEN) && !(flags & EXT4_GET_BLOCKS_ZERO) && !ext4_is_quota_file(inode) && ext4_should_order_data(inode)) { loff_t start_byte = (loff_t)map->m_lblk << inode->i_blkbits; loff_t length = (loff_t)map->m_len << inode->i_blkbits; if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) ret = ext4_jbd2_inode_add_wait(handle, inode, start_byte, length); else ret = ext4_jbd2_inode_add_write(handle, inode, start_byte, length); if (ret) return ret; } } if (retval > 0 && (map->m_flags & EXT4_MAP_UNWRITTEN || map->m_flags & EXT4_MAP_MAPPED)) ext4_fc_track_range(handle, inode, map->m_lblk, map->m_lblk + map->m_len - 1); if (retval < 0) ext_debug(inode, "failed with err %d\n", retval); return retval; } /* * Update EXT4_MAP_FLAGS in bh->b_state. For buffer heads attached to pages * we have to be careful as someone else may be manipulating b_state as well. */ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) { unsigned long old_state; unsigned long new_state; flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. */ if (!bh->b_page) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } /* * Someone else may be modifying b_state. Be careful! This is ugly but * once we get rid of using bh as a container for mapping information * to pass to / from get_block functions, this can go away. */ old_state = READ_ONCE(bh->b_state); do { new_state = (old_state & ~EXT4_MAP_FLAGS) | flags; } while (unlikely(!try_cmpxchg(&bh->b_state, &old_state, new_state))); } static int _ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int flags) { struct ext4_map_blocks map; int ret = 0; if (ext4_has_inline_data(inode)) return -ERANGE; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; ret = ext4_map_blocks(ext4_journal_current_handle(), inode, &map, flags); if (ret > 0) { map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; } else if (ret == 0) { /* hole case, need to fill in bh->b_size */ bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { return _ext4_get_block(inode, iblock, bh, create ? EXT4_GET_BLOCKS_CREATE : 0); } /* * Get block function used when preparing for buffered write if we require * creating an unwritten extent if blocks haven't been allocated. The extent * will be converted to written after the IO is complete. */ int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; ext4_debug("ext4_get_block_unwritten: inode %lu, create flag %d\n", inode->i_ino, create); ret = _ext4_get_block(inode, iblock, bh_result, EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); /* * If the buffer is marked unwritten, mark it as new to make sure it is * zeroed out correctly in case of partial writes. Otherwise, there is * a chance of stale data getting exposed. */ if (ret == 0 && buffer_unwritten(bh_result)) set_buffer_new(bh_result); return ret; } /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 /* * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; int create = map_flags & EXT4_GET_BLOCKS_CREATE; bool nowait = map_flags & EXT4_GET_BLOCKS_CACHED_NOWAIT; int err; ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || handle != NULL || create == 0); ASSERT(create == 0 || !nowait); map.m_lblk = block; map.m_len = 1; err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) return ERR_PTR(err); if (nowait) return sb_find_get_block(inode->i_sb, map.m_pblk); bh = sb_getblk(inode->i_sb, map.m_pblk); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { ASSERT(create != 0); ASSERT((EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) || (handle != NULL)); /* * Now that we do not always journal data, we should * keep in mind whether this should always journal the * new buffer as metadata. For now, regular file * writes use ext4_get_block instead, so it's not a * problem. */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); err = ext4_journal_get_create_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (unlikely(err)) { unlock_buffer(bh); goto errout; } if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); if (unlikely(err)) goto errout; } else BUFFER_TRACE(bh, "not a new buffer"); return bh; errout: brelse(bh); return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int map_flags) { struct buffer_head *bh; int ret; bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || ext4_buffer_uptodate(bh)) return bh; ret = ext4_read_bh_lock(bh, REQ_META | REQ_PRIO, true); if (ret) { put_bh(bh); return ERR_PTR(ret); } return bh; } /* Read a contiguous batch of blocks. */ int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, bool wait, struct buffer_head **bhs) { int i, err; for (i = 0; i < bh_count; i++) { bhs[i] = ext4_getblk(NULL, inode, block + i, 0 /* map_flags */); if (IS_ERR(bhs[i])) { err = PTR_ERR(bhs[i]); bh_count = i; goto out_brelse; } } for (i = 0; i < bh_count; i++) /* Note that NULL bhs[i] is valid because of holes. */ if (bhs[i] && !ext4_buffer_uptodate(bhs[i])) ext4_read_bh_lock(bhs[i], REQ_META | REQ_PRIO, false); if (!wait) return 0; for (i = 0; i < bh_count; i++) if (bhs[i]) wait_on_buffer(bhs[i]); for (i = 0; i < bh_count; i++) { if (bhs[i] && !buffer_uptodate(bhs[i])) { err = -EIO; goto out_brelse; } } return 0; out_brelse: for (i = 0; i < bh_count; i++) { brelse(bhs[i]); bhs[i] = NULL; } return err; } int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, struct buffer_head *head, unsigned from, unsigned to, int *partial, int (*fn)(handle_t *handle, struct inode *inode, struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; unsigned blocksize = head->b_size; int err, ret = 0; struct buffer_head *next; for (bh = head, block_start = 0; ret == 0 && (bh != head || !block_start); block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (partial && !buffer_uptodate(bh)) *partial = 1; continue; } err = (*fn)(handle, inode, bh); if (!ret) ret = err; } return ret; } /* * Helper for handling dirtying of journalled data. We also mark the folio as * dirty so that writeback code knows about this page (and inode) contains * dirty data. ext4_writepages() then commits appropriate transaction to * make data stable. */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { folio_mark_dirty(bh->b_folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int dirty = buffer_dirty(bh); int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; /* * __block_write_begin() could have dirtied some buffers. Clean * the dirty bit as jbd2_journal_get_write_access() could complain * otherwise about fs integrity issues. Setting of the dirty bit * by __block_write_begin() isn't a real problem here as we clear * the bit before releasing a page lock and thus writeback cannot * ever write the buffer. */ if (dirty) clear_buffer_dirty(bh); BUFFER_TRACE(bh, "get write access"); ret = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (!ret && dirty) ret = ext4_dirty_journalled_data(handle, bh); return ret; } #ifdef CONFIG_FS_ENCRYPTION static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { unsigned from = pos & (PAGE_SIZE - 1); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; sector_t block; int err = 0; unsigned blocksize = inode->i_sb->s_blocksize; unsigned bbits; struct buffer_head *bh, *head, *wait[2]; int nr_wait = 0; int i; BUG_ON(!folio_test_locked(folio)); BUG_ON(from > PAGE_SIZE); BUG_ON(to > PAGE_SIZE); BUG_ON(from > to); head = folio_buffers(folio); if (!head) head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); for (bh = head, block_start = 0; bh != head || !block_start; block++, block_start = block_end, bh = bh->b_this_page) { block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } continue; } if (buffer_new(bh)) clear_buffer_new(bh); if (!buffer_mapped(bh)) { WARN_ON(bh->b_size != blocksize); err = get_block(inode, block, bh, 1); if (err) break; if (buffer_new(bh)) { if (folio_test_uptodate(folio)) { clear_buffer_new(bh); set_buffer_uptodate(bh); mark_buffer_dirty(bh); continue; } if (block_end > to || block_start < from) folio_zero_segments(folio, to, block_end, block_start, from); continue; } } if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh) && (block_start < from || block_end > to)) { ext4_read_bh_lock(bh, 0, false); wait[nr_wait++] = bh; } } /* * If we issued read requests, let them complete. */ for (i = 0; i < nr_wait; i++) { wait_on_buffer(wait[i]); if (!buffer_uptodate(wait[i])) err = -EIO; } if (unlikely(err)) { folio_zero_new_buffers(folio, from, to); } else if (fscrypt_inode_uses_fs_layer_crypto(inode)) { for (i = 0; i < nr_wait; i++) { int err2; err2 = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(wait[i])); if (err2) { clear_buffer_uptodate(wait[i]); err = err2; } } } return err; } #endif /* * To preserve ordering, it is essential that the hole instantiation and * the data write be encapsulated in a single transaction. We cannot * close off a transaction and start a new one between the ext4_get_block() * and the ext4_write_end(). So doing the jbd2_journal_start at the start of * ext4_write_begin() is the right place. */ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; int ret, needed_blocks; handle_t *handle; int retries = 0; struct folio *folio; pgoff_t index; unsigned from, to; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); /* * Reserve one block more for addition to orphan list in case * we allocate blocks but write fails for some reason */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; from = pos & (PAGE_SIZE - 1); to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, pagep); if (ret < 0) return ret; if (ret == 1) return 0; } /* * __filemap_get_folio() can take a long time if the * system is thrashing due to memory pressure, or if the folio * is being written back. So grab it first before we start * the transaction handle. This also allows us to allocate * the folio (if needed) without using GFP_NOFS. */ retry_grab: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); /* * The same as page allocation, we prealloc buffer heads before * starting the handle. */ if (!folio_buffers(folio)) create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); retry_journal: handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks); if (IS_ERR(handle)) { folio_put(folio); return PTR_ERR(handle); } folio_lock(folio); if (folio->mapping != mapping) { /* The folio got truncated from under us */ folio_unlock(folio); folio_put(folio); ext4_journal_stop(handle); goto retry_grab; } /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION if (ext4_should_dioread_nolock(inode)) ret = ext4_block_write_begin(folio, pos, len, ext4_get_block_unwritten); else ret = ext4_block_write_begin(folio, pos, len, ext4_get_block); #else if (ext4_should_dioread_nolock(inode)) ret = __block_write_begin(&folio->page, pos, len, ext4_get_block_unwritten); else ret = __block_write_begin(&folio->page, pos, len, ext4_get_block); #endif if (!ret && ext4_should_journal_data(inode)) { ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, to, NULL, do_journal_get_write_access); } if (ret) { bool extended = (pos + len > inode->i_size) && !ext4_verity_in_progress(inode); folio_unlock(folio); /* * __block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold i_rwsem. * * Add inode to orphan list in case we crash before * truncate finishes */ if (extended && ext4_can_truncate(inode)) ext4_orphan_add(handle, inode); ext4_journal_stop(handle); if (extended) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might * still be on the orphan list; we need to * make sure the inode is removed from the * orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry_journal; folio_put(folio); return ret; } *pagep = &folio->page; return ret; } /* For write_end() in data=journal mode */ static int write_end_fn(handle_t *handle, struct inode *inode, struct buffer_head *bh) { int ret; if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); ret = ext4_dirty_journalled_data(handle, bh); clear_buffer_meta(bh); clear_buffer_prio(bh); return ret; } /* * We need to pick up the new inode size which generic_commit_write gave us * `file' can be NULL - eg, when called from page_symlink(). * * ext4 never places buffers on inode->i_mapping->private_list. metadata * buffers are managed internally. */ static int ext4_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_write_end(inode, pos, len, copied); if (ext4_has_inline_data(inode) && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); /* * it's important to update i_size while still holding folio lock: * page writeout could otherwise come in and zero beyond i_size. * * If FS_IOC_ENABLE_VERITY is running on this inode, then Merkle tree * blocks are being written past EOF, so skip the i_size update. */ if (!verity) i_size_changed = ext4_update_inode_size(inode, pos + copied); folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under folio lock. First, it unnecessarily * makes the holding time of folio lock longer. Second, it forces lock * ordering of folio lock and transaction start for journaling * filesystems. */ if (i_size_changed) ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * This is a private version of folio_zero_new_buffers() which doesn't * set the buffer to be dirty, since in data=journalled mode we need * to call ext4_dirty_journalled_data() instead. */ static void ext4_journalled_zero_new_buffers(handle_t *handle, struct inode *inode, struct folio *folio, unsigned from, unsigned to) { unsigned int block_start = 0, block_end; struct buffer_head *head, *bh; bh = head = folio_buffers(folio); do { block_end = block_start + bh->b_size; if (buffer_new(bh)) { if (block_end > from && block_start < to) { if (!folio_test_uptodate(folio)) { unsigned start, size; start = max(from, block_start); size = min(to, block_end) - start; folio_zero_range(folio, start, size); write_end_fn(handle, inode, bh); } clear_buffer_new(bh); } } block_start = block_end; bh = bh->b_this_page; } while (bh != head); } static int ext4_journalled_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct folio *folio = page_folio(page); handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; int size_changed = 0; bool verity = ext4_verity_in_progress(inode); trace_ext4_journalled_write_end(inode, pos, len, copied); from = pos & (PAGE_SIZE - 1); to = from + len; BUG_ON(!ext4_handle_valid(handle)); if (ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !folio_test_uptodate(folio)) { copied = 0; ext4_journalled_zero_new_buffers(handle, inode, folio, from, to); } else { if (unlikely(copied < len)) ext4_journalled_zero_new_buffers(handle, inode, folio, from + copied, to); ret = ext4_walk_page_buffers(handle, inode, folio_buffers(folio), from, from + copied, &partial, write_end_fn); if (!partial) folio_mark_uptodate(folio); } if (!verity) size_changed = ext4_update_inode_size(inode, pos + copied); EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; folio_unlock(folio); folio_put(folio); if (old_size < pos && !verity) pagecache_isize_extended(inode, old_size, pos); if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; } if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied * less. We will have blocks allocated outside * inode->i_size. So truncate them */ ext4_orphan_add(handle, inode); ret2 = ext4_journal_stop(handle); if (!ret) ret = ret2; if (pos + len > inode->i_size && !verity) { ext4_truncate_failed_write(inode); /* * If truncate failed early the inode might still be * on the orphan list; we need to make sure the inode * is removed from the orphan list in that case. */ if (inode->i_nlink) ext4_orphan_del(NULL, inode); } return ret ? ret : copied; } /* * Reserve space for a single cluster */ static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); int ret; /* * We will charge metadata quota at writeout time; this saves * us from metadata over-estimation, though we may go over by * a small amount in the end. Here we just reserve for data. */ ret = dquot_reserve_block(inode, EXT4_C2B(sbi, 1)); if (ret) return ret; spin_lock(&ei->i_block_reservation_lock); if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ } void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); trace_ext4_da_release_space(inode, to_free); if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* * if there aren't enough reserved blocks, then the * counter is messed up somewhere. Since this * function is called from invalidate page, it's * harmless to return without any action. */ ext4_warning(inode->i_sb, "ext4_da_release_space: " "ino %lu, to_free %d with only %d reserved " "data blocks", inode->i_ino, to_free, ei->i_reserved_data_blocks); WARN_ON(1); to_free = ei->i_reserved_data_blocks; } ei->i_reserved_data_blocks -= to_free; /* update fs dirty data blocks counter */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, to_free); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, to_free)); } /* * Delayed allocation stuff */ struct mpage_da_data { /* These are input fields for ext4_do_writepages() */ struct inode *inode; struct writeback_control *wbc; unsigned int can_map:1; /* Can writepages call map blocks? */ /* These are internal state of ext4_do_writepages() */ pgoff_t first_page; /* The first page to write */ pgoff_t next_page; /* Current page to examine */ pgoff_t last_page; /* Last page to examine */ /* * Extent to map - this can be after first_page because that can be * fully mapped. We somewhat abuse m_flags to store whether the extent * is delalloc or unwritten. */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; unsigned int scanned_until_end:1; unsigned int journalled_more_data:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, bool invalidate) { unsigned nr, i; pgoff_t index, end; struct folio_batch fbatch; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; /* This is necessary when next_page == 0. */ if (mpd->first_page >= mpd->next_page) return; mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { ext4_lblk_t start, last; start = index << (PAGE_SHIFT - inode->i_blkbits); last = end << (PAGE_SHIFT - inode->i_blkbits); /* * avoid racing with extent status tree scans made by * ext4_insert_delayed_block() */ down_write(&EXT4_I(inode)->i_data_sem); ext4_es_remove_extent(inode, start, last - start + 1); up_write(&EXT4_I(inode)->i_data_sem); } folio_batch_init(&fbatch); while (index <= end) { nr = filemap_get_folios(mapping, &index, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; if (folio->index < mpd->first_page) continue; if (folio_next_index(folio) - 1 > end) continue; BUG_ON(!folio_test_locked(folio)); BUG_ON(folio_test_writeback(folio)); if (invalidate) { if (folio_mapped(folio)) folio_clear_dirty_for_io(folio); block_invalidate_folio(folio, 0, folio_size(folio)); folio_clear_uptodate(folio); } folio_unlock(folio); } folio_batch_release(&fbatch); } } static void ext4_print_free_blocks(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct super_block *sb = inode->i_sb; struct ext4_inode_info *ei = EXT4_I(inode); ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld", EXT4_C2B(EXT4_SB(inode->i_sb), ext4_count_free_clusters(sb))); ext4_msg(sb, KERN_CRIT, "Free/Dirty block details"); ext4_msg(sb, KERN_CRIT, "free_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_freeclusters_counter))); ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld", (long long) EXT4_C2B(EXT4_SB(sb), percpu_counter_sum(&sbi->s_dirtyclusters_counter))); ext4_msg(sb, KERN_CRIT, "Block reservation details"); ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u", ei->i_reserved_data_blocks); return; } /* * ext4_insert_delayed_block - adds a delayed block to the extents status * tree, incrementing the reserved cluster/block * count or making a pending reservation * where needed * * @inode - file containing the newly added block * @lblk - logical block to be added * * Returns 0 on success, negative error code on failure. */ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int ret; bool allocated = false; /* * If the cluster containing lblk is shared with a delayed, * written, or unwritten extent in a bigalloc file system, it's * already been accounted for and does not need to be reserved. * A pending reservation must be made for the cluster if it's * shared with a written or unwritten extent and doesn't already * have one. Written and unwritten extents can be purged from the * extents status tree if the system is under memory pressure, so * it's necessary to examine the extent tree if a search of the * extents status tree doesn't get a match. */ if (sbi->s_cluster_ratio == 1) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ return ret; } else { /* bigalloc */ if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) { if (!ext4_es_scan_clu(inode, &ext4_es_is_mapped, lblk)) { ret = ext4_clu_mapped(inode, EXT4_B2C(sbi, lblk)); if (ret < 0) return ret; if (ret == 0) { ret = ext4_da_reserve_space(inode); if (ret != 0) /* ENOSPC */ return ret; } else { allocated = true; } } else { allocated = true; } } } ext4_es_insert_delayed_block(inode, lblk, allocated); return 0; } /* * This function is grabs code from the very beginning of * ext4_map_blocks, but assumes that the caller is from delayed write * time. This function looks up the requested blocks and sets the * buffer delay bit under the protection of i_data_sem. */ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, struct ext4_map_blocks *map, struct buffer_head *bh) { struct extent_status es; int retval; sector_t invalid_block = ~((sector_t) 0xffff); #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; memcpy(&orig_map, map, sizeof(*map)); #endif if (invalid_block < ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es)) invalid_block = ~0; map->m_flags = 0; ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) { if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); goto add_delayed; } /* * Delayed extent could be allocated by fallocate. * So we need to check it. */ if (ext4_es_is_delayed(&es) && !ext4_es_is_unwritten(&es)) { map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); return 0; } map->m_pblk = ext4_es_pblock(&es) + iblock - es.es_lblk; retval = es.es_len - (iblock - es.es_lblk); if (retval > map->m_len) retval = map->m_len; map->m_len = retval; if (ext4_es_is_written(&es)) map->m_flags |= EXT4_MAP_MAPPED; else if (ext4_es_is_unwritten(&es)) map->m_flags |= EXT4_MAP_UNWRITTEN; else BUG(); #ifdef ES_AGGRESSIVE_TEST ext4_map_blocks_es_recheck(NULL, inode, map, &orig_map, 0); #endif return retval; } /* * Try to see if we can get the block without requesting a new * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_has_inline_data(inode)) retval = 0; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) retval = ext4_ext_map_blocks(NULL, inode, map, 0); else retval = ext4_ind_map_blocks(NULL, inode, map, 0); add_delayed: if (retval == 0) { int ret; /* * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ ret = ext4_insert_delayed_block(inode, map->m_lblk); if (ret != 0) { retval = ret; goto out_unlock; } map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); } else if (retval > 0) { unsigned int status; if (unlikely(retval != map->m_len)) { ext4_warning(inode->i_sb, "ES len assertion failed for inode " "%lu: retval %d != map->m_len %d", inode->i_ino, retval, map->m_len); WARN_ON(1); } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); } out_unlock: up_read((&EXT4_I(inode)->i_data_sem)); return retval; } /* * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly * * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. */ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { struct ext4_map_blocks map; int ret = 0; BUG_ON(create == 0); BUG_ON(bh->b_size != inode->i_sb->s_blocksize); map.m_lblk = iblock; map.m_len = 1; /* * first, we need to know whether the block is allocated already * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ ret = ext4_da_map_blocks(inode, iblock, &map, bh); if (ret <= 0) return ret; map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); if (buffer_unwritten(bh)) { /* A delayed write to unwritten bh should be marked * new and mapped. Mapped ensures that we don't do * get_block multiple times when we write to the same * offset and new ensures that we do proper zero out * for partial write. */ set_buffer_new(bh); set_buffer_mapped(bh); } return 0; } static void mpage_folio_done(struct mpage_da_data *mpd, struct folio *folio) { mpd->first_page += folio_nr_pages(folio); folio_unlock(folio); } static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) { size_t len; loff_t size; int err; BUG_ON(folio->index != mpd->first_page); folio_clear_dirty_for_io(folio); /* * We have to be very careful here! Nothing protects writeback path * against i_size changes and the page can be writeably mapped into * page tables. So an application can be growing i_size and writing * data through mmap while writeback runs. folio_clear_dirty_for_io() * write-protects our page in page tables and the page cannot get * written to again until we release folio lock. So only after * folio_clear_dirty_for_io() we are safe to sample i_size for * ext4_bio_write_folio() to zero-out tail of the written page. We rely * on the barrier provided by folio_test_clear_dirty() in * folio_clear_dirty_for_io() to make sure i_size is really sampled only * after page tables are updated. */ size = i_size_read(mpd->inode); len = folio_size(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(mpd->inode)) len = size & ~PAGE_MASK; err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) mpd->wbc->nr_to_write--; return err; } #define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... * XXX: That seems to be only a limitation of ext4_mb_normalize_request(). * The rest of mballoc seems to handle chunks up to full group size. */ #define MAX_WRITEPAGES_EXTENT_LEN 2048 /* * mpage_add_bh_to_extent - try to add bh to extent of blocks to map * * @mpd - extent of blocks * @lblk - logical number of the block in the file * @bh - buffer head we want to add to the extent * * The function is used to collect contig. blocks in the same state. If the * buffer doesn't require mapping for writeback and we haven't started the * extent of buffers to map yet, the function returns 'true' immediately - the * caller can write the buffer right away. Otherwise the function returns true * if the block has been added to the extent, false if the block couldn't be * added. */ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, struct buffer_head *bh) { struct ext4_map_blocks *map = &mpd->map; /* Buffer that doesn't need mapping for writeback? */ if (!buffer_dirty(bh) || !buffer_mapped(bh) || (!buffer_delay(bh) && !buffer_unwritten(bh))) { /* So far no extent to map => we write the buffer right away */ if (map->m_len == 0) return true; return false; } /* First block in the extent? */ if (map->m_len == 0) { /* We cannot map unless handle is started... */ if (!mpd->do_map) return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; return true; } /* Don't go larger than mballoc is willing to allocate */ if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN) return false; /* Can we merge the block to our big extent? */ if (lblk == map->m_lblk + map->m_len && (bh->b_state & BH_FLAGS) == map->m_flags) { map->m_len++; return true; } return false; } /* * mpage_process_page_bufs - submit page buffers for IO or add them to extent * * @mpd - extent of blocks for mapping * @head - the first buffer in the page * @bh - buffer we should start processing from * @lblk - logical number of the block in the file corresponding to @bh * * Walk through page buffers from @bh upto @head (exclusive) and either submit * the page for IO if all buffers in this page were mapped and there's no * accumulated extent of buffers to map or add buffers in the page to the * extent of buffers to map. The function returns 1 if the caller can continue * by processing the next page, 0 if it should stop adding buffers to the * extent to map because we cannot extend it anymore. It can also return value * < 0 in case of error during IO submission. */ static int mpage_process_page_bufs(struct mpage_da_data *mpd, struct buffer_head *head, struct buffer_head *bh, ext4_lblk_t lblk) { struct inode *inode = mpd->inode; int err; ext4_lblk_t blocks = (i_size_read(inode) + i_blocksize(inode) - 1) >> inode->i_blkbits; if (ext4_verity_in_progress(inode)) blocks = EXT_MAX_BLOCKS; do { BUG_ON(buffer_locked(bh)); if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) { /* Found extent to map? */ if (mpd->map.m_len) return 0; /* Buffer needs mapping and handle is not started? */ if (!mpd->do_map) return 0; /* Everything mapped so far and we hit EOF */ break; } } while (lblk++, (bh = bh->b_this_page) != head); /* So far everything mapped? Submit the page for IO. */ if (mpd->map.m_len == 0) { err = mpage_submit_folio(mpd, head->b_folio); if (err < 0) return err; mpage_folio_done(mpd, head->b_folio); } if (lblk >= blocks) { mpd->scanned_until_end = 1; return 0; } return 1; } /* * mpage_process_folio - update folio buffers corresponding to changed extent * and may submit fully mapped page for IO * @mpd: description of extent to map, on return next extent to map * @folio: Contains these buffers. * @m_lblk: logical block mapping. * @m_pblk: corresponding physical mapping. * @map_bh: determines on return whether this page requires any further * mapping or not. * * Scan given folio buffers corresponding to changed extent and update buffer * state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits. * If the given folio is not fully mapped, we update @mpd to the next extent in * the given folio that needs mapping & return @map_bh as true. */ static int mpage_process_folio(struct mpage_da_data *mpd, struct folio *folio, ext4_lblk_t *m_lblk, ext4_fsblk_t *m_pblk, bool *map_bh) { struct buffer_head *head, *bh; ext4_io_end_t *io_end = mpd->io_submit.io_end; ext4_lblk_t lblk = *m_lblk; ext4_fsblk_t pblock = *m_pblk; int err = 0; int blkbits = mpd->inode->i_blkbits; ssize_t io_end_size = 0; struct ext4_io_end_vec *io_end_vec = ext4_last_io_end_vec(io_end); bh = head = folio_buffers(folio); do { if (lblk < mpd->map.m_lblk) continue; if (lblk >= mpd->map.m_lblk + mpd->map.m_len) { /* * Buffer after end of mapped extent. * Find next buffer in the folio to map. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; io_end_vec->size += io_end_size; err = mpage_process_page_bufs(mpd, head, bh, lblk); if (err > 0) err = 0; if (!err && mpd->map.m_len && mpd->map.m_lblk > lblk) { io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) { err = PTR_ERR(io_end_vec); goto out; } io_end_vec->offset = (loff_t)mpd->map.m_lblk << blkbits; } *map_bh = true; goto out; } if (buffer_delay(bh)) { clear_buffer_delay(bh); bh->b_blocknr = pblock++; } clear_buffer_unwritten(bh); io_end_size += (1 << blkbits); } while (lblk++, (bh = bh->b_this_page) != head); io_end_vec->size += io_end_size; *map_bh = false; out: *m_lblk = lblk; *m_pblk = pblock; return err; } /* * mpage_map_buffers - update buffers corresponding to changed extent and * submit fully mapped pages for IO * * @mpd - description of extent to map, on return next extent to map * * Scan buffers corresponding to changed extent (we expect corresponding pages * to be already locked) and update buffer state according to new extent state. * We map delalloc buffers to their physical location, clear unwritten bits, * and mark buffers as uninit when we perform writes to unwritten extents * and do extent conversion after IO is finished. If the last page is not fully * mapped, we update @map to the next extent in the last page that needs * mapping. Otherwise we submit the page for IO. */ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) { struct folio_batch fbatch; unsigned nr, i; struct inode *inode = mpd->inode; int bpp_bits = PAGE_SHIFT - inode->i_blkbits; pgoff_t start, end; ext4_lblk_t lblk; ext4_fsblk_t pblock; int err; bool map_bh = false; start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); while (start <= end) { nr = filemap_get_folios(inode->i_mapping, &start, end, &fbatch); if (nr == 0) break; for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* * If map_bh is true, means page may require further bh * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; mpage_folio_done(mpd, folio); } folio_batch_release(&fbatch); } /* Extent fully mapped and matches with page boundary. We are done. */ mpd->map.m_len = 0; mpd->map.m_flags = 0; return 0; out: folio_batch_release(&fbatch); return err; } static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int get_blocks_flags; int err, dioread_nolock; trace_ext4_da_write_pages_extent(inode, map); /* * Call ext4_map_blocks() to allocate any delayed allocation blocks, or * to convert an unwritten extent to be initialized (in the case * where we have written into one or more preallocated blocks). It is * possible that we're going to need more metadata blocks than * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if * possible. * * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if * the blocks in question are delalloc blocks. This indicates * that the blocks and quotas has already been checked when * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; if (map->m_flags & BIT(BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); if (err < 0) return err; if (dioread_nolock && (map->m_flags & EXT4_MAP_UNWRITTEN)) { if (!mpd->io_submit.io_end->handle && ext4_handle_valid(handle)) { mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); return 0; } /* * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length * mpd->len and submit pages underlying it for IO * * @handle - handle for journal operations * @mpd - extent to map * @give_up_on_write - we set this to true iff there is a fatal error and there * is no hope of writing the data. The caller should discard * dirty pages to avoid infinite loops. * * The function maps extent starting at mpd->lblk of length mpd->len. If it is * delayed, blocks are allocated, if it is unwritten, we may need to convert * them to initialized or split the described range from larger unwritten * extent. Note that we need not map all the described range since allocation * can return less blocks or the range is covered by more unwritten extents. We * cannot map more because we are limited by reserved transaction credits. On * the other hand we always make sure that the last touched page is fully * mapped so that it can be written out (and thus forward progress is * guaranteed). After mapping we submit all mapped pages for IO. */ static int mpage_map_and_submit_extent(handle_t *handle, struct mpage_da_data *mpd, bool *give_up_on_write) { struct inode *inode = mpd->inode; struct ext4_map_blocks *map = &mpd->map; int err; loff_t disksize; int progress = 0; ext4_io_end_t *io_end = mpd->io_submit.io_end; struct ext4_io_end_vec *io_end_vec; io_end_vec = ext4_alloc_io_end_vec(io_end); if (IS_ERR(io_end_vec)) return PTR_ERR(io_end_vec); io_end_vec->offset = ((loff_t)map->m_lblk) << inode->i_blkbits; do { err = mpage_map_one_extent(handle, mpd); if (err < 0) { struct super_block *sb = inode->i_sb; if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. * In the case of ENOSPC, if ext4_count_free_blocks() * is non-zero, a commit should free up blocks. */ if ((err == -ENOMEM) || (err == -ENOSPC && ext4_count_free_clusters(sb))) { if (progress) goto update_disksize; return err; } ext4_msg(sb, KERN_CRIT, "Delayed block allocation failed for " "inode %lu at logical offset %llu with" " max blocks %u with error %d", inode->i_ino, (unsigned long long)map->m_lblk, (unsigned)map->m_len, -err); ext4_msg(sb, KERN_CRIT, "This should not happen!! Data will " "be lost\n"); if (err == -ENOSPC) ext4_print_free_blocks(inode); invalidate_dirty_pages: *give_up_on_write = true; return err; } progress = 1; /* * Update buffer state, submit mapped pages, and get us new * extent to map */ err = mpage_map_and_submit_buffers(mpd); if (err < 0) goto update_disksize; } while (map->m_len); update_disksize: /* * Update on-disk size after IO is submitted. Races with * truncate are avoided by checking i_size under i_data_sem. */ disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT; if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) { int err2; loff_t i_size; down_write(&EXT4_I(inode)->i_data_sem); i_size = i_size_read(inode); if (disksize > i_size) disksize = i_size; if (disksize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = disksize; up_write(&EXT4_I(inode)->i_data_sem); err2 = ext4_mark_inode_dirty(handle, inode); if (err2) { ext4_error_err(inode->i_sb, -err2, "Failed to mark inode %lu dirty", inode->i_ino); } if (!err) err = err2; } return err; } /* * Calculate the total number of credits to reserve for one writepages * iteration. This is called from ext4_writepages(). We map an extent of * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN + * bpp - 1 blocks in bpp different extents. */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); } static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio, size_t len) { struct buffer_head *page_bufs = folio_buffers(folio); struct inode *inode = folio->mapping->host; int ret, err; ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, do_journal_get_write_access); err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, NULL, write_end_fn); if (ret == 0) ret = err; err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len); if (ret == 0) ret = err; EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; return ret; } static int mpage_journal_page_buffers(handle_t *handle, struct mpage_da_data *mpd, struct folio *folio) { struct inode *inode = mpd->inode; loff_t size = i_size_read(inode); size_t len = folio_size(folio); folio_clear_checked(folio); mpd->wbc->nr_to_write--; if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) len = size - folio_pos(folio); return ext4_journal_folio_buffers(handle, folio, len); } /* * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages * needing mapping, submit mapped pages * * @mpd - where to look for pages * * Walk dirty pages in the mapping. If they are fully mapped, submit them for * IO immediately. If we cannot map blocks, we submit just already mapped * buffers in the page for IO and keep page dirty. When we can map blocks and * we find a page which isn't mapped we start accumulating extent of buffers * underlying these pages that needs mapping (formed by either delayed or * unwritten buffers). We also lock the pages containing these buffers. The * extent found is returned in @mpd structure (starting at mpd->lblk with * length mpd->len blocks). * * Note that this function can attach bios to one io_end structure which are * neither logically nor physically contiguous. Although it may seem as an * unnecessary complication, it is actually inevitable in blocksize < pagesize * case as we need to track IO to all buffers underlying a page in one io_end. */ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; struct folio_batch fbatch; unsigned int nr_folios; pgoff_t index = mpd->first_page; pgoff_t end = mpd->last_page; xa_mark_t tag; int i, err = 0; int blkbits = mpd->inode->i_blkbits; ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; int bpp = ext4_journal_blocks_per_page(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; else tag = PAGECACHE_TAG_DIRTY; mpd->map.m_len = 0; mpd->next_page = index; if (ext4_should_journal_data(mpd->inode)) { handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, bpp); if (IS_ERR(handle)) return PTR_ERR(handle); } folio_batch_init(&fbatch); while (index <= end) { nr_folios = filemap_get_folios_tag(mapping, &index, end, tag, &fbatch); if (nr_folios == 0) break; for (i = 0; i < nr_folios; i++) { struct folio *folio = fbatch.folios[i]; /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to * keep going because someone may be concurrently * dirtying pages, and we might have synced a lot of * newly appeared dirty pages, but have not synced all * of the old dirty pages. */ if (mpd->wbc->sync_mode == WB_SYNC_NONE && mpd->wbc->nr_to_write <= mpd->map.m_len >> (PAGE_SHIFT - blkbits)) goto out; /* If we can't merge this page, we are done. */ if (mpd->map.m_len > 0 && mpd->next_page != folio->index) goto out; if (handle) { err = ext4_journal_ensure_credits(handle, bpp, 0); if (err < 0) goto out; } folio_lock(folio); /* * If the page is no longer dirty, or its mapping no * longer corresponds to inode we are writing (which * means it has been truncated or invalidated), or the * page is already under writeback and we are not doing * a data integrity writeback, skip the page */ if (!folio_test_dirty(folio) || (folio_test_writeback(folio) && (mpd->wbc->sync_mode == WB_SYNC_NONE)) || unlikely(folio->mapping != mapping)) { folio_unlock(folio); continue; } folio_wait_writeback(folio); BUG_ON(folio_test_writeback(folio)); /* * Should never happen but for buggy code in * other subsystems that call * set_page_dirty() without properly warning * the file system first. See [1] for more * information. * * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz */ if (!folio_buffers(folio)) { ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", folio->index); folio_clear_dirty(folio); folio_unlock(folio); continue; } if (mpd->map.m_len == 0) mpd->first_page = folio->index; mpd->next_page = folio_next_index(folio); /* * Writeout when we cannot modify metadata is simple. * Just submit the page. For data=journal mode we * first handle writeout of the page for checkpoint and * only after that handle delayed page dirtying. This * makes sure current data is checkpointed to the final * location before possibly journalling it again which * is desirable when the page is frequently dirtied * through a pin. */ if (!mpd->can_map) { err = mpage_submit_folio(mpd, folio); if (err < 0) goto out; /* Pending dirtying of journalled data? */ if (folio_test_checked(folio)) { err = mpage_journal_page_buffers(handle, mpd, folio); if (err < 0) goto out; mpd->journalled_more_data = 1; } mpage_folio_done(mpd, folio); } else { /* Add all dirty buffers to mpd */ lblk = ((ext4_lblk_t)folio->index) << (PAGE_SHIFT - blkbits); head = folio_buffers(folio); err = mpage_process_page_bufs(mpd, head, head, lblk); if (err <= 0) goto out; err = 0; } } folio_batch_release(&fbatch); cond_resched(); } mpd->scanned_until_end = 1; if (handle) ext4_journal_stop(handle); return 0; out: folio_batch_release(&fbatch); if (handle) ext4_journal_stop(handle); return err; } static int ext4_do_writepages(struct mpage_da_data *mpd) { struct writeback_control *wbc = mpd->wbc; pgoff_t writeback_index = 0; long nr_to_write = wbc->nr_to_write; int range_whole = 0; int cycled = 1; handle_t *handle = NULL; struct inode *inode = mpd->inode; struct address_space *mapping = inode->i_mapping; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); struct blk_plug plug; bool give_up_on_write = false; trace_ext4_writepages(inode, wbc); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_writepages; /* * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } /* * If we have inline data and arrive here, it means that * we will soon create the block for the 1st page, so * we'd better clear the inline data here. */ if (ext4_has_inline_data(inode)) { /* Just inode will be modified... */ handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out_writepages; } BUG_ON(ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)); ext4_destroy_inline_data(handle, inode); ext4_journal_stop(handle); } /* * data=journal mode does not do delalloc so we just need to writeout / * journal already mapped buffers. On the other hand we need to commit * transaction to make data stable. We expect all the data to be * already in the journal (the only exception are DMA pinned pages * dirtied behind our back) so we commit transaction here and run the * writeback loop to checkpoint them. The checkpointing is not actually * necessary to make data persistent *but* quite a few places (extent * shifting operations, fsverity, ...) depend on being able to drop * pagecache pages after calling filemap_write_and_wait() and for that * checkpointing needs to happen. */ if (ext4_should_journal_data(inode)) { mpd->can_map = 0; if (wbc->sync_mode == WB_SYNC_ALL) ext4_fc_commit(sbi->s_journal, EXT4_I(inode)->i_datasync_tid); } mpd->journalled_more_data = 0; if (ext4_should_dioread_nolock(inode)) { /* * We may need to convert up to one extent per block in * the page and we may dirty the inode. */ rsv_blocks = 1 + ext4_chunk_trans_blocks(inode, PAGE_SIZE >> inode->i_blkbits); } if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; if (writeback_index) cycled = 0; mpd->first_page = writeback_index; mpd->last_page = -1; } else { mpd->first_page = wbc->range_start >> PAGE_SHIFT; mpd->last_page = wbc->range_end >> PAGE_SHIFT; } ext4_io_submit_init(&mpd->io_submit, wbc); retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd->first_page, mpd->last_page); blk_start_plug(&plug); /* * First writeback pages that don't need mapping - we can avoid * starting a transaction unnecessarily and also avoid being blocked * in the block layer on device congestion while having transaction * started. */ mpd->do_map = 0; mpd->scanned_until_end = 0; mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; goto unplug; } ret = mpage_prepare_extent_to_map(mpd); /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, false); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); ext4_put_io_end_defer(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret < 0) goto unplug; while (!mpd->scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd->io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd->io_submit.io_end) { ret = -ENOMEM; break; } WARN_ON_ONCE(!mpd->can_map); /* * We have two constraints: We find one extent to map and we * must always write out whole page (makes a difference when * blocksize < pagesize) so that we don't block on IO when we * try to write out the rest of the page. Journalled mode is * not supported by delalloc. */ BUG_ON(ext4_should_journal_data(inode)); needed_blocks = ext4_da_writepages_trans_blocks(inode); /* start a new transaction */ handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " "%ld pages, ino %lu; err %d", __func__, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; break; } mpd->do_map = 1; trace_ext4_da_write_pages(inode, mpd->first_page, wbc); ret = mpage_prepare_extent_to_map(mpd); if (!ret && mpd->map.m_len) ret = mpage_map_and_submit_extent(handle, mpd, &give_up_on_write); /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit * to finish which may depend on writeback of pages to * complete or on page lock to be released. In that * case, we have to wait until after we have * submitted all the IO, released page locks we hold, * and dropped io_end reference (for extent conversion * to be able to complete) before stopping the handle. */ if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; mpd->do_map = 0; } /* Unlock pages we didn't use */ mpage_release_unused_pages(mpd, give_up_on_write); /* Submit prepared bio */ ext4_io_submit(&mpd->io_submit); /* * Drop our io_end reference we got from init. We have * to be careful and use deferred io_end finishing if * we are still holding the transaction as we can * release the last reference to io_end which may end * up doing unwritten extent conversion. */ if (handle) { ext4_put_io_end_defer(mpd->io_submit.io_end); ext4_journal_stop(handle); } else ext4_put_io_end(mpd->io_submit.io_end); mpd->io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* * Commit the transaction which would * free blocks released in the transaction * and try again */ jbd2_journal_force_commit_nested(sbi->s_journal); ret = 0; continue; } /* Fatal error - ENOMEM, EIO... */ if (ret) break; } unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; mpd->last_page = writeback_index - 1; mpd->first_page = 0; goto retry; } /* Update index */ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * Set the writeback_index so that range_cyclic * mode will write it back later */ mapping->writeback_index = mpd->first_page; out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); return ret; } static int ext4_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct super_block *sb = mapping->host->i_sb; struct mpage_da_data mpd = { .inode = mapping->host, .wbc = wbc, .can_map = 1, }; int ret; int alloc_ctx; if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); /* * For data=journal writeback we could have come across pages marked * for delayed dirtying (PageChecked) which were just added to the * running transaction. Try once more to get them to stable storage. */ if (!ret && mpd.journalled_more_data) ret = ext4_do_writepages(&mpd); ext4_writepages_up_read(sb, alloc_ctx); return ret; } int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode) { struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; struct mpage_da_data mpd = { .inode = jinode->i_vfs_inode, .wbc = &wbc, .can_map = 0, }; return ext4_do_writepages(&mpd); } static int ext4_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) { int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; int alloc_ctx; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); ret = dax_writeback_mapping_range(mapping, EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); return ret; } static int ext4_nonda_switch(struct super_block *sb) { s64 free_clusters, dirty_clusters; struct ext4_sb_info *sbi = EXT4_SB(sb); /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. */ free_clusters = percpu_counter_read_positive(&sbi->s_freeclusters_counter); dirty_clusters = percpu_counter_read_positive(&sbi->s_dirtyclusters_counter); /* * Start pushing delalloc when 1/2 of free blocks are dirty. */ if (dirty_clusters && (free_clusters < 2 * dirty_clusters)) try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE); if (2 * free_clusters < 3 * dirty_clusters || free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) { /* * free block count is less than 150% of dirty blocks * or free blocks is less than watermark */ return 1; } return 0; } static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret, retries = 0; struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; if (ext4_nonda_switch(inode->i_sb) || ext4_verity_in_progress(inode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, pagep, fsdata); } *fsdata = (void *)0; trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, pagep, fsdata); if (ret < 0) return ret; if (ret == 1) return 0; } retry: folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); /* In case writeback began while the folio was unlocked */ folio_wait_stable(folio); #ifdef CONFIG_FS_ENCRYPTION ret = ext4_block_write_begin(folio, pos, len, ext4_da_get_block_prep); #else ret = __block_write_begin(&folio->page, pos, len, ext4_da_get_block_prep); #endif if (ret < 0) { folio_unlock(folio); folio_put(folio); /* * block_write_begin may have instantiated a few blocks * outside i_size. Trim these off again. Don't need * i_size_read because we hold inode lock. */ if (pos + len > inode->i_size) ext4_truncate_failed_write(inode); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } *pagep = &folio->page; return ret; } /* * Check if we should update i_disksize * when write to the end of file but not require block allocation */ static int ext4_da_should_update_i_disksize(struct folio *folio, unsigned long offset) { struct buffer_head *bh; struct inode *inode = folio->mapping->host; unsigned int idx; int i; bh = folio_buffers(folio); idx = offset >> inode->i_blkbits; for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } static int ext4_da_do_write_end(struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page) { struct inode *inode = mapping->host; loff_t old_size = inode->i_size; bool disksize_changed = false; loff_t new_i_size; /* * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES * flag, which all that's needed to trigger page writeback. */ copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); new_i_size = pos + copied; /* * It's important to update i_size while still holding page lock, * because page writeout could otherwise come in and zero beyond * i_size. * * Since we are holding inode lock, we are sure i_disksize <= * i_size. We also know that if i_disksize < i_size, there are * delalloc writes pending in the range up to i_size. If the end of * the current write is <= i_size, there's no need to touch * i_disksize since writeback will push i_disksize up to i_size * eventually. If the end of the current write is > i_size and * inside an allocated block which ext4_da_should_update_i_disksize() * checked, we need to update i_disksize here as certain * ext4_writepages() paths not allocating blocks and update i_disksize. */ if (new_i_size > inode->i_size) { unsigned long end; i_size_write(inode, new_i_size); end = (new_i_size - 1) & (PAGE_SIZE - 1); if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; } } unlock_page(page); put_page(page); if (old_size < pos) pagecache_isize_extended(inode, old_size, pos); if (disksize_changed) { handle_t *handle; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } return copied; } static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); if (write_mode == FALL_BACK_TO_NONDELALLOC) return ext4_write_end(file, mapping, pos, len, copied, &folio->page, fsdata); trace_ext4_da_write_end(inode, pos, len, copied); if (write_mode != CONVERT_INLINE_DATA && ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) && ext4_has_inline_data(inode)) return ext4_write_inline_data_end(inode, pos, len, copied, folio); if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* * Force all delayed allocation blocks to be allocated for a given inode. */ int ext4_alloc_da_blocks(struct inode *inode) { trace_ext4_alloc_da_blocks(inode); if (!EXT4_I(inode)->i_reserved_data_blocks) return 0; /* * We do something simple for now. The filemap_flush() will * also start triggering a write of the data blocks, which is * not strictly speaking necessary (and for users of * laptop_mode, not even desirable). However, to do otherwise * would require replicating code paths in: * * ext4_writepages() -> * write_cache_pages() ---> (via passed in callback function) * __mpage_da_writepage() --> * mpage_add_bh_to_extent() * mpage_da_map_blocks() * * The problem is that write_cache_pages(), located in * mm/page-writeback.c, marks pages clean in preparation for * doing I/O, which is not desirable if we're not planning on * doing I/O at all. * * We could call write_cache_pages(), and then redirty all of * the pages by calling redirty_page_for_writepage() but that * would be ugly in the extreme. So instead we would need to * replicate parts of the code in the above functions, * simplifying them because we wouldn't actually intend to * write out the pages, but rather only collect contiguous * logical block extents, call the multi-block allocator, and * then update the buffer heads with the block allocations. * * For now, though, we'll cheat by calling filemap_flush(), * which will map the blocks, and start the I/O, but not * actually wait for the I/O to complete. */ return filemap_flush(inode->i_mapping); } /* * bmap() is special. It gets used by applications such as lilo and by * the swapper to find the on-disk block of a specific piece of data. * * Naturally, this is dangerous if the block concerned is still in the * journal. If somebody makes a swapfile on an ext4 data-journaling * filesystem and enables swap, then they may get a nasty shock when the * data getting swapped to that swapfile suddenly gets overwritten by * the original zero's written out previously to the journal and * awaiting writeback in the kernel's buffer cache. * * So, if we see any bmap calls here on a modified, data-journaled file, * take extra steps to flush any blocks which might be in the cache. */ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) { struct inode *inode = mapping->host; sector_t ret = 0; inode_lock_shared(inode); /* * We can get here for an inline file via the FIBMAP ioctl */ if (ext4_has_inline_data(inode)) goto out; if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && (test_opt(inode->i_sb, DELALLOC) || ext4_should_journal_data(inode))) { /* * With delalloc or journalled data we want to sync the file so * that we can make sure we allocate blocks for file and data * is in place for the user to see it */ filemap_write_and_wait(mapping); } ret = iomap_bmap(mapping, block, &ext4_iomap_ops); out: inode_unlock_shared(inode); return ret; } static int ext4_read_folio(struct file *file, struct folio *folio) { int ret = -EAGAIN; struct inode *inode = folio->mapping->host; trace_ext4_read_folio(inode, folio); if (ext4_has_inline_data(inode)) ret = ext4_readpage_inline(inode, folio); if (ret == -EAGAIN) return ext4_mpage_readpages(inode, NULL, folio); return ret; } static void ext4_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; /* If the file has inline data, no need to do readahead. */ if (ext4_has_inline_data(inode)) return; ext4_mpage_readpages(inode, rac, NULL); } static void ext4_invalidate_folio(struct folio *folio, size_t offset, size_t length) { trace_ext4_invalidate_folio(folio, offset, length); /* No journalling happens on data buffers when this function is used */ WARN_ON(folio_buffers(folio) && buffer_jbd(folio_buffers(folio))); block_invalidate_folio(folio, offset, length); } static int __ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { journal_t *journal = EXT4_JOURNAL(folio->mapping->host); trace_ext4_journalled_invalidate_folio(folio, offset, length); /* * If it's a full truncate we just forget about the pending dirtying */ if (offset == 0 && length == folio_size(folio)) folio_clear_checked(folio); return jbd2_journal_invalidate_folio(journal, folio, offset, length); } /* Wrapper for aops... */ static void ext4_journalled_invalidate_folio(struct folio *folio, size_t offset, size_t length) { WARN_ON(__ext4_journalled_invalidate_folio(folio, offset, length) < 0); } static bool ext4_release_folio(struct folio *folio, gfp_t wait) { struct inode *inode = folio->mapping->host; journal_t *journal = EXT4_JOURNAL(inode); trace_ext4_release_folio(inode, folio); /* Page has dirty journalled data -> cannot release */ if (folio_test_checked(folio)) return false; if (journal) return jbd2_journal_try_to_free_buffers(journal, folio); else return try_to_free_buffers(folio); } static bool ext4_inode_datasync_dirty(struct inode *inode) { journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; if (journal) { if (jbd2_transaction_committed(journal, EXT4_I(inode)->i_datasync_tid)) return false; if (test_opt2(inode->i_sb, JOURNAL_FAST_COMMIT)) return !list_empty(&EXT4_I(inode)->i_fc_list); return true; } /* Any metadata buffers to write? */ if (!list_empty(&inode->i_mapping->private_list)) return true; return inode->i_state & I_DIRTY_DATASYNC; } static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, struct ext4_map_blocks *map, loff_t offset, loff_t length, unsigned int flags) { u8 blkbits = inode->i_blkbits; /* * Writes that span EOF might trigger an I/O size update on completion, * so consider them to be dirty for the purpose of O_DSYNC, even if * there is no other metadata changes being made or are pending. */ iomap->flags = 0; if (ext4_inode_datasync_dirty(inode) || offset + length > i_size_read(inode)) iomap->flags |= IOMAP_F_DIRTY; if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else iomap->bdev = inode->i_sb->s_bdev; iomap->offset = (u64) map->m_lblk << blkbits; iomap->length = (u64) map->m_len << blkbits; if ((map->m_flags & EXT4_MAP_MAPPED) && !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) iomap->flags |= IOMAP_F_MERGED; /* * Flags passed to ext4_map_blocks() for direct I/O writes can result * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits * set. In order for any allocated unwritten extents to be converted * into written extents correctly within the ->end_io() handler, we * need to ensure that the iomap->type is set appropriately. Hence, the * reason why we need to check whether the EXT4_MAP_UNWRITTEN bit has * been set first. */ if (map->m_flags & EXT4_MAP_UNWRITTEN) { iomap->type = IOMAP_UNWRITTEN; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else if (map->m_flags & EXT4_MAP_MAPPED) { iomap->type = IOMAP_MAPPED; iomap->addr = (u64) map->m_pblk << blkbits; if (flags & IOMAP_DAX) iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off; } else { iomap->type = IOMAP_HOLE; iomap->addr = IOMAP_NULL_ADDR; } } static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; /* * Trim the mapping request to the maximum value that we can map at * once for direct I/O. */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); retry: /* * Either we allocate blocks and then don't get an unwritten extent, so * in that case we have reserved enough credits. Or, the blocks are * already allocated and unwritten. In that case, the extent conversion * fits into the credits as well. */ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, dio_credits); if (IS_ERR(handle)) return PTR_ERR(handle); /* * DAX and direct I/O are the only two operations that are currently * supported with IOMAP_WRITE. */ WARN_ON(!(flags & (IOMAP_DAX | IOMAP_DIRECT))); if (flags & IOMAP_DAX) m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; /* * We use i_size instead of i_disksize here because delalloc writeback * can complete at any point during the I/O and subsequently push the * i_disksize out to i_size. This could be beyond where direct I/O is * happening and thus expose allocated blocks to direct I/O reads. */ else if (((loff_t)map->m_lblk << blkbits) >= i_size_read(inode)) m_flags = EXT4_GET_BLOCKS_CREATE; else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could * expose stale data in the case of a crash. Use the magic error code * to fallback to buffered I/O. */ if (!m_flags && !ret) ret = -ENOTBLK; ext4_journal_stop(handle); if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return ret; } static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (WARN_ON_ONCE(ext4_has_inline_data(inode))) return -ERANGE; /* * Calculate the first and last logical blocks respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; if (flags & IOMAP_WRITE) { /* * We check here if the blocks are already allocated, then we * don't need to start a journal txn and we can directly return * the mapping information. This could boost performance * especially in multi-threaded overwrite requests. */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) goto out; } ret = ext4_iomap_alloc(inode, &map, flags); } else { ret = ext4_map_blocks(NULL, inode, &map, 0); } if (ret < 0) return ret; out: /* * When inline encryption is enabled, sometimes I/O to an encrypted file * has to be broken up to guarantee DUN contiguity. Handle this by * limiting the length of the mapping returned. */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; } static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap, struct iomap *srcmap) { int ret; /* * Even for writes we don't need to allocate blocks, so just pretend * we are reading to save overhead of starting a transaction. */ flags &= ~IOMAP_WRITE; ret = ext4_iomap_begin(inode, offset, length, flags, iomap, srcmap); WARN_ON_ONCE(!ret && iomap->type != IOMAP_MAPPED); return ret; } static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { /* * Check to see whether an error occurred while writing out the data to * the allocated blocks. If so, return the magic error code so that we * fallback to buffered I/O and attempt to complete the remainder of * the I/O. Any blocks that may have been allocated in preparation for * the direct I/O will be reused during buffered I/O. */ if (flags & (IOMAP_WRITE | IOMAP_DIRECT) && written == 0) return -ENOTBLK; return 0; } const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, .iomap_end = ext4_iomap_end, }; static bool ext4_iomap_is_delalloc(struct inode *inode, struct ext4_map_blocks *map) { struct extent_status es; ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1; ext4_es_find_extent_range(inode, &ext4_es_is_delayed, map->m_lblk, end, &es); if (!es.es_len || es.es_lblk > end) return false; if (es.es_lblk > map->m_lblk) { map->m_len = es.es_lblk - map->m_lblk; return false; } offset = map->m_lblk - es.es_lblk; map->m_len = es.es_len - offset; return true; } static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { int ret; bool delalloc = false; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; if (ext4_has_inline_data(inode)) { ret = ext4_inline_data_iomap(inode, iomap); if (ret != -EAGAIN) { if (ret == 0 && offset >= iomap->length) ret = -ENOENT; return ret; } } /* * Calculate the first and last logical block respectively. */ map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; /* * Fiemap callers may call for offset beyond s_bitmap_maxbytes. * So handle it here itself instead of querying ext4_map_blocks(). * Since ext4_map_blocks() will warn about it and will return * -EIO error. */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (offset >= sbi->s_bitmap_maxbytes) { map.m_flags = 0; goto set_iomap; } } ret = ext4_map_blocks(NULL, inode, &map, 0); if (ret < 0) return ret; if (ret == 0) delalloc = ext4_iomap_is_delalloc(inode, &map); set_iomap: ext4_set_iomap(inode, iomap, &map, offset, length, flags); if (delalloc && iomap->type == IOMAP_HOLE) iomap->type = IOMAP_DELALLOC; return 0; } const struct iomap_ops ext4_iomap_report_ops = { .iomap_begin = ext4_iomap_begin_report, }; /* * For data=journal mode, folio should be marked dirty only when it was * writeably mapped. When that happens, it was already attached to the * transaction and marked as jbddirty (we take care of this in * ext4_page_mkwrite()). On transaction commit, we writeprotect page mappings * so we should have nothing to do here, except for the case when someone * had the page pinned and dirtied the page through this pin (e.g. by doing * direct IO to it). In that case we'd need to attach buffers here to the * transaction but we cannot due to lock ordering. We cannot just dirty the * folio and leave attached buffers clean, because the buffers' dirty state is * "definitive". We cannot just set the buffers dirty or jbddirty because all * the journalling code will explode. So what we do is to mark the folio * "pending dirty" and next time ext4_writepages() is called, attach buffers * to the transaction appropriately. */ static bool ext4_journalled_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_buffers(folio)); if (folio_maybe_dma_pinned(folio)) folio_set_checked(folio); return filemap_dirty_folio(mapping, folio); } static bool ext4_dirty_folio(struct address_space *mapping, struct folio *folio) { WARN_ON_ONCE(!folio_test_locked(folio) && !folio_test_dirty(folio)); WARN_ON_ONCE(!folio_buffers(folio)); return block_dirty_folio(mapping, folio); } static int ext4_iomap_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { return iomap_swapfile_activate(sis, file, span, &ext4_iomap_report_ops); } static const struct address_space_operations ext4_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_journalled_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, .dirty_folio = ext4_journalled_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_journalled_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio_norefs, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_da_aops = { .read_folio = ext4_read_folio, .readahead = ext4_readahead, .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, .dirty_folio = ext4_dirty_folio, .bmap = ext4_bmap, .invalidate_folio = ext4_invalidate_folio, .release_folio = ext4_release_folio, .direct_IO = noop_direct_IO, .migrate_folio = buffer_migrate_folio, .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, .swap_activate = ext4_iomap_swap_activate, }; static const struct address_space_operations ext4_dax_aops = { .writepages = ext4_dax_writepages, .direct_IO = noop_direct_IO, .dirty_folio = noop_dirty_folio, .bmap = ext4_bmap, .swap_activate = ext4_iomap_swap_activate, }; void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: case EXT4_INODE_WRITEBACK_DATA_MODE: break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; return; default: BUG(); } if (IS_DAX(inode)) inode->i_mapping->a_ops = &ext4_dax_aops; else if (test_opt(inode->i_sb, DELALLOC)) inode->i_mapping->a_ops = &ext4_da_aops; else inode->i_mapping->a_ops = &ext4_aops; } static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; struct folio *folio; int err = 0; folio = __filemap_get_folio(mapping, from >> PAGE_SHIFT, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mapping_gfp_constraint(mapping, ~__GFP_FS)); if (IS_ERR(folio)) return PTR_ERR(folio); blocksize = inode->i_sb->s_blocksize; iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; iblock++; pos += blocksize; } if (buffer_freed(bh)) { BUFFER_TRACE(bh, "freed: skip"); goto unlock; } if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "unmapped"); ext4_get_block(inode, iblock, bh, 0); /* unmapped? It's a hole - nothing to do */ if (!buffer_mapped(bh)) { BUFFER_TRACE(bh, "still unmapped"); goto unlock; } } /* Ok, it's mapped. Make sure it's up-to-date */ if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = ext4_read_bh_lock(bh, 0, true); if (err) goto unlock; if (fscrypt_inode_uses_fs_layer_crypto(inode)) { /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); err = fscrypt_decrypt_pagecache_blocks(folio, blocksize, bh_offset(bh)); if (err) { clear_buffer_uptodate(bh); goto unlock; } } } if (ext4_should_journal_data(inode)) { BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, inode->i_sb, bh, EXT4_JTR_NONE); if (err) goto unlock; } folio_zero_range(folio, offset, length); BUFFER_TRACE(bh, "zeroed end of block"); if (ext4_should_journal_data(inode)) { err = ext4_dirty_journalled_data(handle, bh); } else { err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) err = ext4_jbd2_inode_add_write(handle, inode, from, length); } unlock: folio_unlock(folio); folio_put(folio); return err; } /* * ext4_block_zero_page_range() zeros out a mapping of length 'length' * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { struct inode *inode = mapping->host; unsigned offset = from & (PAGE_SIZE-1); unsigned blocksize = inode->i_sb->s_blocksize; unsigned max = blocksize - (offset & (blocksize - 1)); /* * correct length if it does not fall between * 'from' and the end of the block */ if (length > max || length < 0) length = max; if (IS_DAX(inode)) { return dax_zero_range(inode, from, length, NULL, &ext4_iomap_ops); } return __ext4_block_zero_page_range(handle, mapping, from, length); } /* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end * of that block so it doesn't yield old data if the file is later grown. */ static int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from) { unsigned offset = from & (PAGE_SIZE-1); unsigned length; unsigned blocksize; struct inode *inode = mapping->host; /* If we are processing an encrypted inode during orphan list handling */ if (IS_ENCRYPTED(inode) && !fscrypt_has_encryption_key(inode)) return 0; blocksize = inode->i_sb->s_blocksize; length = blocksize - (offset & (blocksize - 1)); return ext4_block_zero_page_range(handle, mapping, from, length); } int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t length) { struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; unsigned partial_start, partial_end; ext4_fsblk_t start, end; loff_t byte_end = (lstart + length - 1); int err = 0; partial_start = lstart & (sb->s_blocksize - 1); partial_end = byte_end & (sb->s_blocksize - 1); start = lstart >> sb->s_blocksize_bits; end = byte_end >> sb->s_blocksize_bits; /* Handle partial zero within the single block */ if (start == end && (partial_start || (partial_end != sb->s_blocksize - 1))) { err = ext4_block_zero_page_range(handle, mapping, lstart, length); return err; } /* Handle partial zero out on the start of the range */ if (partial_start) { err = ext4_block_zero_page_range(handle, mapping, lstart, sb->s_blocksize); if (err) return err; } /* Handle partial zero out on the end of the range */ if (partial_end != sb->s_blocksize - 1) err = ext4_block_zero_page_range(handle, mapping, byte_end - partial_end, partial_end + 1); return err; } int ext4_can_truncate(struct inode *inode) { if (S_ISREG(inode->i_mode)) return 1; if (S_ISDIR(inode->i_mode)) return 1; if (S_ISLNK(inode->i_mode)) return !ext4_inode_is_fast_symlink(inode); return 0; } /* * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but * that will never happen after we truncate page cache. */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; int ret; loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); if (offset > size || offset + len < size) return 0; if (EXT4_I(inode)->i_disksize >= size) return 0; handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); return ret; } static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); schedule(); filemap_invalidate_lock(inode->i_mapping); } int ext4_break_layouts(struct inode *inode) { struct page *page; int error; if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; do { page = dax_layout_busy_page(inode->i_mapping); if (!page) return 0; error = ___wait_var_event(&page->_refcount, atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, 0, 0, ext4_wait_dax_page(inode)); } while (error == 0); return error; } /* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * * @inode: File inode * @offset: The offset where the hole will begin * @len: The length of the hole * * Returns: 0 on success or negative on failure */ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t first_block, stop_block; struct address_space *mapping = inode->i_mapping; loff_t first_block_offset, last_block_offset, max_length; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); handle_t *handle; unsigned int credits; int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); /* * Write out all dirty pages to avoid race conditions * Then release them. */ if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { ret = filemap_write_and_wait_range(mapping, offset, offset + length - 1); if (ret) return ret; } inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; /* * If the hole extends beyond i_size, set the hole * to end after the page that contains i_size */ if (offset + length > inode->i_size) { length = inode->i_size + PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - offset; } /* * For punch hole the length + offset needs to be within one block * before last range. Adjust the length if it goes beyond that limit. */ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; if (offset + length > max_length) length = max_length - offset; if (offset & (sb->s_blocksize - 1) || (offset + length) & (sb->s_blocksize - 1)) { /* * Attach jinode to inode for jbd2 if we do any zeroing of * partial block */ ret = ext4_inode_attach_jinode(inode); if (ret < 0) goto out_mutex; } /* Wait all existing dio workers, newcomers will block on i_rwsem */ inode_dio_wait(inode); ret = file_modified(file); if (ret) goto out_mutex; /* * Prevent page faults from reinstantiating pages we have released from * page cache. */ filemap_invalidate_lock(mapping); ret = ext4_break_layouts(inode); if (ret) goto out_dio; first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ if (last_block_offset > first_block_offset) { ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); goto out_dio; } ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) goto out_stop; first_block = (offset + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); /* If there are blocks to remove, do it */ if (stop_block > first_block) { down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); ext4_es_remove_extent(inode, first_block, stop_block - first_block); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_remove_space(inode, first_block, stop_block - 1); else ret = ext4_ind_remove_space(handle, inode, first_block, stop_block); up_write(&EXT4_I(inode)->i_data_sem); } ext4_fc_track_range(handle, inode, first_block, stop_block); if (IS_SYNC(inode)) ext4_handle_sync(handle); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret2)) ret = ret2; if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: ext4_journal_stop(handle); out_dio: filemap_invalidate_unlock(mapping); out_mutex: inode_unlock(inode); return ret; } int ext4_inode_attach_jinode(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct jbd2_inode *jinode; if (ei->jinode || !EXT4_SB(inode->i_sb)->s_journal) return 0; jinode = jbd2_alloc_inode(GFP_KERNEL); spin_lock(&inode->i_lock); if (!ei->jinode) { if (!jinode) { spin_unlock(&inode->i_lock); return -ENOMEM; } ei->jinode = jinode; jbd2_journal_init_jbd_inode(ei->jinode, inode); jinode = NULL; } spin_unlock(&inode->i_lock); if (unlikely(jinode != NULL)) jbd2_free_inode(jinode); return 0; } /* * ext4_truncate() * * We block out ext4_get_block() block instantiations across the entire * transaction, and VFS/VM ensures that ext4_truncate() cannot run * simultaneously on behalf of the same inode. * * As we work through the truncate and commit bits of it to the journal there * is one core, guiding principle: the file's tree must always be consistent on * disk. We must be able to restart the truncate after a crash. * * The file's tree may be transiently inconsistent in memory (although it * probably isn't), but whenever we close off and commit a journal transaction, * the contents of (the filesystem + the journal) must be consistent and * restartable. It's pretty simple, really: bottom up, right to left (although * left-to-right works OK too). * * Note that at recovery time, journal replay occurs *before* the restart of * truncate against the orphan inode list. * * The committed inode has the new, desired i_size (which is the same as * i_disksize in this case). After a crash, ext4_orphan_cleanup() will see * that this inode's truncate did not complete and it will again call * ext4_truncate() to have another go. So there will be instantiated blocks * to the right of the truncation point in a crashed ext4 filesystem. But * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. */ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; /* * There is a possibility that we're either freeing the inode * or it's a completely new inode. In those cases we might not * have i_rwsem locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) goto out_trace; if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); if (ext4_has_inline_data(inode)) { int has_inline = 1; err = ext4_inline_data_truncate(inode, &has_inline); if (err || has_inline) goto out_trace; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { err = ext4_inode_attach_jinode(inode); if (err) goto out_trace; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); else credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { err = PTR_ERR(handle); goto out_trace; } if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); /* * We add the inode to the orphan list, so that if this * truncate spans multiple transactions, and we crash, we will * resume the truncate when the filesystem recovers. It also * marks the inode dirty, to catch the new size. * * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ err = ext4_orphan_add(handle, inode); if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode, 0); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); if (err) goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) ext4_orphan_del(handle, inode); inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); err2 = ext4_mark_inode_dirty(handle, inode); if (unlikely(err2 && !err)) err = err2; ext4_journal_stop(handle); out_trace: trace_ext4_truncate_exit(inode); return err; } static inline u64 ext4_inode_peek_iversion(const struct inode *inode) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return inode_peek_iversion_raw(inode); else return inode_peek_iversion(inode); } static int ext4_inode_blocks_set(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { struct inode *inode = &(ei->vfs_inode); u64 i_blocks = READ_ONCE(inode->i_blocks); struct super_block *sb = inode->i_sb; if (i_blocks <= ~0U) { /* * i_blocks can be represented in a 32 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); return 0; } /* * This should never happen since sb->s_maxbytes should not have * allowed this, sb->s_maxbytes was set according to the huge_file * feature in ext4_fill_super(). */ if (!ext4_has_feature_huge_file(sb)) return -EFSCORRUPTED; if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE); } else { ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE); /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } return 0; } static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode) { struct ext4_inode_info *ei = EXT4_I(inode); uid_t i_uid; gid_t i_gid; projid_t i_projid; int block; int err; err = ext4_inode_blocks_set(raw_inode, ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); /* * Fix up interoperability with old kernels. Otherwise, * old inodes get re-used with the upper 16 bits of the * uid/gid intact. */ if (ei->i_dtime && list_empty(&ei->i_orphan)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(i_uid)); raw_inode->i_gid_high = cpu_to_le16(high_16_bits(i_gid)); } } else { raw_inode->i_uid_low = cpu_to_le16(fs_high2lowuid(i_uid)); raw_inode->i_gid_low = cpu_to_le16(fs_high2lowgid(i_gid)); raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } raw_inode->i_links_count = cpu_to_le16(inode->i_nlink); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); raw_inode->i_flags = cpu_to_le32(ei->i_flags & 0xFFFFFFFF); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) raw_inode->i_file_acl_high = cpu_to_le16(ei->i_file_acl >> 32); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); ext4_isize_set(raw_inode, ei->i_disksize); raw_inode->i_generation = cpu_to_le32(inode->i_generation); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { raw_inode->i_block[0] = cpu_to_le32(old_encode_dev(inode->i_rdev)); raw_inode->i_block[1] = 0; } else { raw_inode->i_block[0] = 0; raw_inode->i_block[1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); raw_inode->i_block[2] = 0; } } else if (!ext4_has_inline_data(inode)) { for (block = 0; block < EXT4_N_BLOCKS; block++) raw_inode->i_block[block] = ei->i_data[block]; } if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = ext4_inode_peek_iversion(inode); raw_inode->i_disk_version = cpu_to_le32(ivers); if (ei->i_extra_isize) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) raw_inode->i_version_hi = cpu_to_le32(ivers >> 32); raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } } if (i_projid != EXT4_DEF_PROJID && !ext4_has_feature_project(inode->i_sb)) err = err ?: -EFSCORRUPTED; if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) raw_inode->i_projid = cpu_to_le32(i_projid); ext4_inode_csum_set(inode, raw_inode, ei); return err; } /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If we pass 'inode' and it does not * have in-inode xattr, we have all inode data in memory that is needed * to recreate the on-disk version of this inode. */ static int __ext4_get_inode_loc(struct super_block *sb, unsigned long ino, struct inode *inode, struct ext4_iloc *iloc, ext4_fsblk_t *ret_block) { struct ext4_group_desc *gdp; struct buffer_head *bh; ext4_fsblk_t block; struct blk_plug plug; int inodes_per_block, inode_offset; iloc->bh = NULL; if (ino < EXT4_ROOT_INO || ino > le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)) return -EFSCORRUPTED; iloc->block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); if (!gdp) return -EIO; /* * Figure out the offset within the block group inode table */ inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; inode_offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)); iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); block = ext4_inode_table(sb, gdp); if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) || (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) { ext4_error(sb, "Invalid inode table block %llu in " "block_group %u", block, iloc->block_group); return -EFSCORRUPTED; } block += (inode_offset / inodes_per_block); bh = sb_getblk(sb, block); if (unlikely(!bh)) return -ENOMEM; if (ext4_buffer_uptodate(bh)) goto has_buffer; lock_buffer(bh); if (ext4_buffer_uptodate(bh)) { /* Someone brought it uptodate while we waited */ unlock_buffer(bh); goto has_buffer; } /* * If we have all information of the inode in memory and this * is the only valid inode in the block, we need not read the * block. */ if (inode && !ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { struct buffer_head *bitmap_bh; int i, start; start = inode_offset & ~(inodes_per_block - 1); /* Is the inode bitmap in cache? */ bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (unlikely(!bitmap_bh)) goto make_io; /* * If the inode bitmap isn't in cache then the * optimisation may end up performing two reads instead * of one, so skip it. */ if (!buffer_uptodate(bitmap_bh)) { brelse(bitmap_bh); goto make_io; } for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); if (i == start + inodes_per_block) { struct ext4_inode *raw_inode = (struct ext4_inode *) (bh->b_data + iloc->offset); /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); if (!ext4_test_inode_state(inode, EXT4_STATE_NEW)) ext4_fill_raw_inode(inode, raw_inode); set_buffer_uptodate(bh); unlock_buffer(bh); goto has_buffer; } } make_io: /* * If we need to do any I/O, try to pre-readahead extra * blocks from the inode table. */ blk_start_plug(&plug); if (EXT4_SB(sb)->s_inode_readahead_blks) { ext4_fsblk_t b, end, table; unsigned num; __u32 ra_blks = EXT4_SB(sb)->s_inode_readahead_blks; table = ext4_inode_table(sb, gdp); /* s_inode_readahead_blks is always a power of 2 */ b = block & ~((ext4_fsblk_t) ra_blks - 1); if (table > b) b = table; end = b + ra_blks; num = EXT4_INODES_PER_GROUP(sb); if (ext4_has_group_desc_csum(sb)) num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; while (b <= end) ext4_sb_breadahead_unmovable(sb, b++); } /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. */ trace_ext4_load_inode(sb, ino); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO, NULL); blk_finish_plug(&plug); wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_INODE_EIO); if (!buffer_uptodate(bh)) { if (ret_block) *ret_block = block; brelse(bh); return -EIO; } has_buffer: iloc->bh = bh; return 0; } static int __ext4_get_inode_loc_noinmem(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, NULL, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc) { ext4_fsblk_t err_blk = 0; int ret; ret = __ext4_get_inode_loc(inode->i_sb, inode->i_ino, inode, iloc, &err_blk); if (ret == -EIO) ext4_error_inode_block(inode, err_blk, EIO, "unable to read itable block"); return ret; } int ext4_get_fc_inode_loc(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { return __ext4_get_inode_loc(sb, ino, NULL, iloc, NULL); } static bool ext4_should_enable_dax(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (test_opt2(inode->i_sb, DAX_NEVER)) return false; if (!S_ISREG(inode->i_mode)) return false; if (ext4_should_journal_data(inode)) return false; if (ext4_has_inline_data(inode)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT)) return false; if (ext4_test_inode_flag(inode, EXT4_INODE_VERITY)) return false; if (!test_bit(EXT4_FLAGS_BDEV_IS_DAX, &sbi->s_ext4_flags)) return false; if (test_opt(inode->i_sb, DAX_ALWAYS)) return true; return ext4_test_inode_flag(inode, EXT4_INODE_DAX); } void ext4_set_inode_flags(struct inode *inode, bool init) { unsigned int flags = EXT4_I(inode)->i_flags; unsigned int new_fl = 0; WARN_ON_ONCE(IS_DAX(inode) && init); if (flags & EXT4_SYNC_FL) new_fl |= S_SYNC; if (flags & EXT4_APPEND_FL) new_fl |= S_APPEND; if (flags & EXT4_IMMUTABLE_FL) new_fl |= S_IMMUTABLE; if (flags & EXT4_NOATIME_FL) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; /* Because of the way inode_set_flags() works we must preserve S_DAX * here if already set. */ new_fl |= (inode->i_flags & S_DAX); if (init && ext4_should_enable_dax(inode)) new_fl |= S_DAX; if (flags & EXT4_ENCRYPT_FL) new_fl |= S_ENCRYPTED; if (flags & EXT4_CASEFOLD_FL) new_fl |= S_CASEFOLD; if (flags & EXT4_VERITY_FL) new_fl |= S_VERITY; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX| S_ENCRYPTED|S_CASEFOLD|S_VERITY); } static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { blkcnt_t i_blocks ; struct inode *inode = &(ei->vfs_inode); struct super_block *sb = inode->i_sb; if (ext4_has_feature_huge_file(sb)) { /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { return i_blocks; } } else { return le32_to_cpu(raw_inode->i_blocks_lo); } } static inline int ext4_iget_extra_inode(struct inode *inode, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; if (EXT4_INODE_HAS_XATTR_SPACE(inode) && *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); return err; } else EXT4_I(inode)->i_inline_off = 0; return 0; } int ext4_get_projid(struct inode *inode, kprojid_t *projid) { if (!ext4_has_feature_project(inode->i_sb)) return -EOPNOTSUPP; *projid = EXT4_I(inode)->i_projid; return 0; } /* * ext4 has self-managed i_version for ea inodes, it stores the lower 32bit of * refcount in i_version, so use raw values if inode has EXT4_EA_INODE_FL flag * set. */ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) { if (unlikely(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) inode_set_iversion_raw(inode, val); else inode_set_iversion_queried(inode, val); } static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) { if (flags & EXT4_IGET_EA_INODE) { if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "missing EA_INODE flag"; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || EXT4_I(inode)->i_file_acl) return "ea_inode with extended attributes"; } else { if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) return "unexpected EA_INODE flag"; } if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) return "unexpected bad inode w/o EXT4_IGET_BAD"; return NULL; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) { struct ext4_iloc iloc; struct ext4_inode *raw_inode; struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; int block; uid_t i_uid; gid_t i_gid; projid_t i_projid; if ((!(flags & EXT4_IGET_SPECIAL) && ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || ino == le32_to_cpu(es->s_usr_quota_inum) || ino == le32_to_cpu(es->s_grp_quota_inum) || ino == le32_to_cpu(es->s_prj_quota_inum) || ino == le32_to_cpu(es->s_orphan_file_inum))) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) return ERR_PTR(-ESTALE); __ext4_error(sb, function, line, false, EFSCORRUPTED, 0, "inode #%lu: comm %s: iget: illegal inode #", ino, current->comm); return ERR_PTR(-EFSCORRUPTED); } inode = iget_locked(sb, ino); if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); iput(inode); return ERR_PTR(-EFSCORRUPTED); } return inode; } ei = EXT4_I(inode); iloc.bh = NULL; ret = __ext4_get_inode_loc_noinmem(inode, &iloc); if (ret < 0) goto bad_inode; raw_inode = ext4_raw_inode(&iloc); if ((flags & EXT4_IGET_HANDLE) && (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { ret = -ESTALE; goto bad_inode; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb) || (ei->i_extra_isize & 3)) { ext4_error_inode(inode, function, line, 0, "iget: bad extra_isize %u " "(inode size %u)", ei->i_extra_isize, EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } } else ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || ext4_simulate_fail(sb, EXT4_SIM_INODE_CRC)) && (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY))) { ext4_error_inode_err(inode, function, line, 0, EFSBADCRC, "iget: checksum invalid"); ret = -EFSBADCRC; goto bad_inode; } inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); if (ext4_has_feature_project(sb) && EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); else i_projid = EXT4_DEF_PROJID; if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ ei->i_inline_off = 0; ei->i_dir_start_lookup = 0; ei->i_dtime = le32_to_cpu(raw_inode->i_dtime); /* We now have enough fields to check if the inode was active or not. * This is needed because nfsd might try to access dead inodes * the test is that same one that e2fsck uses * NeilBrown 1999oct15 */ if (inode->i_nlink == 0) { if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && ino != EXT4_BOOT_LOADER_INO) { /* this inode is deleted or unallocated */ if (flags & EXT4_IGET_SPECIAL) { ext4_error_inode(inode, function, line, 0, "iget: special inode unallocated"); ret = -EFSCORRUPTED; } else ret = -ESTALE; goto bad_inode; } /* The only unlinked inodes we let through here have * valid i_mode and are being read by the orphan * recovery code: that's fine, we're about to complete * the process of deleting those. * OR it is the EXT4_BOOT_LOADER_INO which is * not initialized on a new filesystem. */ } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); if ((size = i_size_read(inode)) < 0) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; goto bad_inode; } /* * If dir_index is not enabled but there's dir with INDEX flag set, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); ret = -EFSCORRUPTED; goto bad_inode; } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; #endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; /* * NOTE! The in-memory inode i_data array is in little-endian order * even on big-endian machines: we do NOT byteswap the block numbers! */ for (block = 0; block < EXT4_N_BLOCKS; block++) ei->i_data[block] = raw_inode->i_block[block]; INIT_LIST_HEAD(&ei->i_orphan); ext4_fc_init_inode(&ei->vfs_inode); /* * Set transaction id's of transactions that have to be committed * to finish f[data]sync. We set them to currently running transaction * as we cannot be sure that the inode or some of its metadata isn't * part of the transaction - the inode could have been reclaimed and * now it is reread from disk. */ if (journal) { transaction_t *transaction; tid_t tid; read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else transaction = journal->j_committing_transaction; if (transaction) tid = transaction->t_tid; else tid = journal->j_commit_sequence; read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { ret = ext4_iget_extra_inode(inode, raw_inode, ei); if (ret) goto bad_inode; } } EXT4_INODE_GET_CTIME(inode, raw_inode); EXT4_INODE_GET_ATIME(inode, raw_inode); EXT4_INODE_GET_MTIME(inode, raw_inode); EXT4_EINODE_GET_XTIME(i_crtime, ei, raw_inode); if (likely(!test_opt2(inode->i_sb, HURD_COMPAT))) { u64 ivers = le32_to_cpu(raw_inode->i_disk_version); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (EXT4_FITS_IN_INODE(raw_inode, ei, i_version_hi)) ivers |= (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32; } ext4_inode_set_iversion_queried(inode, ivers); } ret = 0; if (ei->i_file_acl && !ext4_inode_block_valid(inode, ei->i_file_acl, 1)) { ext4_error_inode(inode, function, line, 0, "iget: bad extended attribute block %llu", ei->i_file_acl); ret = -EFSCORRUPTED; goto bad_inode; } else if (!ext4_has_inline_data(inode)) { /* validate the block references in the inode */ if (!(EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) && (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode)))) { if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) ret = ext4_ext_check_inode(inode); else ret = ext4_ind_check_inode(inode); } } if (ret) goto bad_inode; if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { /* VFS does not allow setting these so must be corruption */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { ext4_error_inode(inode, function, line, 0, "iget: immutable or append flags " "not allowed on symlinks"); ret = -EFSCORRUPTED; goto bad_inode; } if (IS_ENCRYPTED(inode)) { inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); } else { inode->i_op = &ext4_symlink_inode_operations; } } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; if (raw_inode->i_block[0]) init_special_inode(inode, inode->i_mode, old_decode_dev(le32_to_cpu(raw_inode->i_block[0]))); else init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } else if (ino == EXT4_BOOT_LOADER_INO) { make_bad_inode(inode); } else { ret = -EFSCORRUPTED; ext4_error_inode(inode, function, line, 0, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); ret = -EFSCORRUPTED; goto bad_inode; } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; goto bad_inode; } brelse(iloc.bh); unlock_new_inode(inode); return inode; bad_inode: brelse(iloc.bh); iget_failed(inode); return ERR_PTR(ret); } static void __ext4_update_other_inode_time(struct super_block *sb, unsigned long orig_ino, unsigned long ino, struct ext4_inode *raw_inode) { struct inode *inode; inode = find_inode_by_ino_rcu(sb, ino); if (!inode) return; if (!inode_is_dirtytime_only(inode)) return; spin_lock(&inode->i_lock); if (inode_is_dirtytime_only(inode)) { struct ext4_inode_info *ei = EXT4_I(inode); inode->i_state &= ~I_DIRTY_TIME; spin_unlock(&inode->i_lock); spin_lock(&ei->i_raw_lock); EXT4_INODE_SET_CTIME(inode, raw_inode); EXT4_INODE_SET_MTIME(inode, raw_inode); EXT4_INODE_SET_ATIME(inode, raw_inode); ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); trace_ext4_other_inode_update_time(inode, orig_ino); return; } spin_unlock(&inode->i_lock); } /* * Opportunistically update the other time fields for other inodes in * the same inode table block. */ static void ext4_update_other_inodes_time(struct super_block *sb, unsigned long orig_ino, char *buf) { unsigned long ino; int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; int inode_size = EXT4_INODE_SIZE(sb); /* * Calculate the first inode in the inode table block. Inode * numbers are one-based. That is, the first inode in a block * (assuming 4k blocks and 256 byte inodes) is (n*16 + 1). */ ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1; rcu_read_lock(); for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { if (ino == orig_ino) continue; __ext4_update_other_inode_time(sb, orig_ino, ino, (struct ext4_inode *)buf); } rcu_read_unlock(); } /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the * buffer_head in the inode location struct. * * The caller must have write access to iloc->bh. */ static int ext4_do_update_inode(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { struct ext4_inode *raw_inode = ext4_raw_inode(iloc); struct ext4_inode_info *ei = EXT4_I(inode); struct buffer_head *bh = iloc->bh; struct super_block *sb = inode->i_sb; int err; int need_datasync = 0, set_large_file = 0; spin_lock(&ei->i_raw_lock); /* * For fields not tracked in the in-memory inode, initialise them * to zero for new inodes. */ if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); if (READ_ONCE(ei->i_disksize) != ext4_isize(inode->i_sb, raw_inode)) need_datasync = 1; if (ei->i_disksize > 0x7fffffffULL) { if (!ext4_has_feature_large_file(sb) || EXT4_SB(sb)->s_es->s_rev_level == cpu_to_le32(EXT4_GOOD_OLD_REV)) set_large_file = 1; } err = ext4_fill_raw_inode(inode, raw_inode); spin_unlock(&ei->i_raw_lock); if (err) { EXT4_ERROR_INODE(inode, "corrupted inode contents"); goto out_brelse; } if (inode->i_sb->s_flags & SB_LAZYTIME) ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, NULL, bh); if (err) goto out_error; ext4_clear_inode_state(inode, EXT4_STATE_NEW); if (set_large_file) { BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access"); err = ext4_journal_get_write_access(handle, sb, EXT4_SB(sb)->s_sbh, EXT4_JTR_NONE); if (err) goto out_error; lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_error: ext4_std_error(inode->i_sb, err); out_brelse: brelse(bh); return err; } /* * ext4_write_inode() * * We are called from a few places: * * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * * - Within flush work (sys_sync(), kupdate and such). * We wait on commit, if told to. * * - Within iput_final() -> write_inode_now() * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in * which we are interested. * * It would be a bug for them to not do this. The code: * * mark_inode_dirty(inode) * stuff(); * inode->i_size = expr; * * is in error because write_inode() could occur while `stuff()' is running, * and the new i_size will be lost. Plus the inode will no longer be on the * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { ext4_debug("called recursively, non-PF_MEMALLOC!\n"); dump_stack(); return -EIO; } /* * No need to force transaction in WB_SYNC_NONE mode. Also * ext4_sync_fs() will force the commit after everything is * written. */ if (wbc->sync_mode != WB_SYNC_ALL || wbc->for_sync) return 0; err = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, EXT4_I(inode)->i_sync_tid); } else { struct ext4_iloc iloc; err = __ext4_get_inode_loc_noinmem(inode, &iloc); if (err) return err; /* * sync(2) will flush the whole buffer cache. No need to do * it here separately for each inode. */ if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { ext4_error_inode_block(inode, iloc.bh->b_blocknr, EIO, "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); } return err; } /* * In data=journal mode ext4_journalled_invalidate_folio() may fail to invalidate * buffers that are attached to a folio straddling i_size and are undergoing * commit. In that case we have to wait for commit to finish and try again. */ static void ext4_wait_for_tail_page_commit(struct inode *inode) { unsigned offset; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = 0; int ret; offset = inode->i_size & (PAGE_SIZE - 1); /* * If the folio is fully truncated, we don't need to wait for any commit * (and we even should not as __ext4_journalled_invalidate_folio() may * strip all buffers from the folio but keep the folio dirty which can then * confuse e.g. concurrent ext4_writepages() seeing dirty folio without * buffers). Also we don't need to wait for any commit if all buffers in * the folio remain valid. This is most beneficial for the common case of * blocksize == PAGESIZE. */ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode))) return; while (1) { struct folio *folio = filemap_lock_folio(inode->i_mapping, inode->i_size >> PAGE_SHIFT); if (IS_ERR(folio)) return; ret = __ext4_journalled_invalidate_folio(folio, offset, folio_size(folio) - offset); folio_unlock(folio); folio_put(folio); if (ret != -EBUSY) return; commit_tid = 0; read_lock(&journal->j_state_lock); if (journal->j_committing_transaction) commit_tid = journal->j_committing_transaction->t_tid; read_unlock(&journal->j_state_lock); if (commit_tid) jbd2_log_wait_commit(journal, commit_tid); } } /* * ext4_setattr() * * Called from notify_change. * * We want to trap VFS attempts to truncate the file as soon as * possible. In particular, we want to make sure that when the VFS * shrinks i_size, we put the inode on the orphan list and modify * i_disksize immediately, so that during the subsequent flushing of * dirty pages and freeing of disk blocks, we can guarantee that any * commit will leave the blocks being flushed in an unused state on * disk. (On recovery, the inode will get truncated and the blocks will * be freed, so we have a strong guarantee that no future commit will * leave these blocks visible to the user.) * * Another thing we have to assure is that if we are in ordered mode * and inode is still attached to the committing transaction, we must * we start writeout of all the dirty pages which are being truncated. * This way we are sure that all the data written in the previous * transaction are already on disk (truncate waits for pages under * writeback). * * Called with inode->i_rwsem down. */ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int error, rc = 0; int orphan = 0; const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; if (unlikely(IS_APPEND(inode) && (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)))) return -EPERM; error = setattr_prepare(idmap, dentry, attr); if (error) return error; error = fscrypt_prepare_setattr(dentry, attr); if (error) return error; error = fsverity_prepare_setattr(dentry, attr); if (error) return error; if (is_quota_modification(idmap, inode, attr)) { error = dquot_initialize(inode); if (error) return error; } if (i_uid_needs_update(idmap, attr, inode) || i_gid_needs_update(idmap, attr, inode)) { handle_t *handle; /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb) + EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)) + 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } /* dquot_transfer() calls back ext4_get_inode_usage() which * counts xattr inode references. */ down_read(&EXT4_I(inode)->xattr_sem); error = dquot_trans